diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 4357bd3eb6b..cf5f575e3c7 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -4,7 +4,6 @@ self-hosted-runner: - func-tester - func-tester-aarch64 - fuzzer-unit-tester - - stress-tester - style-checker - style-checker-aarch64 - release-maker diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index 794aca4a515..15e07142025 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -229,18 +229,26 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Stress test (tsan) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} ############################################################################################# ############################# INTEGRATION TESTS ############################################# ############################################################################################# - IntegrationTestsRelease: - needs: [RunConfig, BuilderDebRelease] + IntegrationTestsAsanOldAnalyzer: + needs: [RunConfig, BuilderDebAsan] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_test.yml with: - test_name: Integration tests (release) - runner_type: stress-tester + test_name: Integration tests (asan, old analyzer) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} + IntegrationTestsTsan: + needs: [RunConfig, BuilderDebTsan] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Integration tests (tsan) + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} FinishCheck: if: ${{ !cancelled() }} @@ -250,7 +258,8 @@ jobs: - FunctionalStatelessTestAsan - FunctionalStatefulTestDebug - StressTestTsan - - IntegrationTestsRelease + - IntegrationTestsTsan + - IntegrationTestsAsanOldAnalyzer - CompatibilityCheckX86 - CompatibilityCheckAarch64 runs-on: [self-hosted, style-checker] diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index b884ebfe7a0..0a84d093197 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -374,7 +374,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Stress test (asan) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} StressTestTsan: needs: [RunConfig, BuilderDebTsan] @@ -382,7 +382,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Stress test (tsan) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} StressTestMsan: needs: [RunConfig, BuilderDebMsan] @@ -390,7 +390,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Stress test (msan) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} StressTestUBsan: needs: [RunConfig, BuilderDebUBsan] @@ -398,7 +398,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Stress test (ubsan) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} StressTestDebug: needs: [RunConfig, BuilderDebDebug] @@ -406,7 +406,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Stress test (debug) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} 
############################################################################################# ############################# INTEGRATION TESTS ############################################# @@ -417,7 +417,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Integration tests (asan) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} IntegrationTestsAnalyzerAsan: needs: [RunConfig, BuilderDebAsan] @@ -425,7 +425,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Integration tests (asan, old analyzer) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} IntegrationTestsTsan: needs: [RunConfig, BuilderDebTsan] @@ -433,7 +433,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Integration tests (tsan) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} IntegrationTestsRelease: needs: [RunConfig, BuilderDebRelease] @@ -441,7 +441,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Integration tests (release) - runner_type: stress-tester + runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} FinishCheck: if: ${{ !cancelled() }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 6abf48a6927..644ebf3a3dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -339,7 +339,6 @@ set (CMAKE_ASM_FLAGS_RELWITHDEBINFO "${CMAKE_ASM_FLAGS_RELWITHDEBINFO} -O3 set (CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}") if (OS_DARWIN) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-U,_inside_main") endif() diff --git a/ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt b/ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt index 8ec2e001a73..e2966898be2 100644 --- a/ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt +++ b/ci_v2/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt @@ -1420,8 +1420,6 @@ config configs conformant congruential -conjuction -conjuctive connectionId const contrib @@ -1698,7 +1696,6 @@ formatReadableSize formatReadableTimeDelta formatRow formatRowNoNewline -formated formatschema formatter formatters @@ -3048,3 +3045,89 @@ znode znodes zookeeperSessionUptime zstd +ArrowCompression +CapnProtoEnumComparingMode +DateTimeInputFormat +DateTimeOutputFormat +DateTimeOverflowBehavior +deserialize +dotall +EachRow +EscapingRule +IdentifierQuotingRule +IdentifierQuotingStyle +IntervalOutputFormat +MsgPackUUIDRepresentation +ORCCompression +ParquetCompression +ParquetVersion +SchemaInferenceMode +alloc +CacheWarmer +conjuctive +cors +CORS +countIf +DefaultTableEngine +dereference +DistributedDDLOutputMode +DistributedProductMode +formatdatetime +inequal +INVOKER +ITION +JoinAlgorithm +JoinStrictness +keepalive +ListObject +ListObjects +LoadBalancing +LocalFSReadMethod +LogQueriesType +LogsLevel +MaxThreads +MemorySample +multibuffer +multiif +multiread +multithreading +MySQLDataTypesSupport +nonconst +NonZeroUInt +nullptr +OverflowMode +OverflowModeGroupBy +ParallelReplicasMode +param +parsedatetime +perf +PerfEventInfo +perkey +prefetched +prefetches +prefetching +preimage +QueryCacheNondeterministicFunctionHandling +QueryCacheSystemTableHandling +remerge +replcase +rerange +RetryStrategy +rowlist +SetOperationMode +ShortCircuitFunctionEvaluation +SQLSecurityType +sumIf +TCPHandler +throwif +TotalsMode 
+TransactionsWaitCSNMode +undelete +unmerged +DataPacket +DDLs +DistributedCacheLogMode +DistributedCachePoolBehaviourOnLimit +SharedJoin +ShareSet +unacked diff --git a/cmake/tools.cmake b/cmake/tools.cmake index 5c7da54b779..d95f1fbb29b 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -48,6 +48,8 @@ if (NOT LINKER_NAME) find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "ld.lld") elseif (OS_DARWIN) find_program (LLD_PATH NAMES "ld") + # Duplicate libraries passed to the linker is not a problem. + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-no_warn_duplicate_libraries") endif () if (LLD_PATH) if (OS_LINUX OR OS_DARWIN) diff --git a/contrib/grpc b/contrib/grpc index 7bc3abe952a..62e871c36fa 160000 --- a/contrib/grpc +++ b/contrib/grpc @@ -1 +1 @@ -Subproject commit 7bc3abe952aba1dc7bce7f2f790dc781cb51a41e +Subproject commit 62e871c36fa93c0af939bd31762845265214fe3d diff --git a/contrib/libdivide b/contrib/libdivide index 3bd34388573..01526031eb7 160000 --- a/contrib/libdivide +++ b/contrib/libdivide @@ -1 +1 @@ -Subproject commit 3bd34388573681ce563348cdf04fe15d24770d04 +Subproject commit 01526031eb79375dc85e0212c966d2c514a01234 diff --git a/contrib/simdjson b/contrib/simdjson index 6060be2fdf6..e341c8b4386 160000 --- a/contrib/simdjson +++ b/contrib/simdjson @@ -1 +1 @@ -Subproject commit 6060be2fdf62edf4a8f51a8b0883d57d09397b30 +Subproject commit e341c8b43861b43de29c48ab65f292d997096953 diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 7825e3edd98..dfe6a420260 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.9.1.3278" +ARG VERSION="24.9.2.42" ARG PACKAGES="clickhouse-keeper" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 6a33023592c..991c25ad142 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -35,7 +35,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.9.1.3278" +ARG VERSION="24.9.2.42" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index f7c80286fe3..5dc88b49e31 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="24.9.1.3278" +ARG VERSION="24.9.2.42" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" #docker-official-library:off diff --git a/docker/test/integration/runner/misc/rabbitmq/enabled_plugins b/docker/test/integration/runner/misc/rabbitmq/enabled_plugins new file mode 100644 index 00000000000..a30892ff929 --- /dev/null +++ b/docker/test/integration/runner/misc/rabbitmq/enabled_plugins @@ -0,0 +1 @@ +[rabbitmq_consistent_hash_exchange]. 
\ No newline at end of file diff --git a/docker/test/integration/runner/misc/rabbitmq/rabbitmq.conf b/docker/test/integration/runner/misc/rabbitmq/rabbitmq.conf index 258a282907a..6da3758b08d 100644 --- a/docker/test/integration/runner/misc/rabbitmq/rabbitmq.conf +++ b/docker/test/integration/runner/misc/rabbitmq/rabbitmq.conf @@ -13,3 +13,5 @@ ssl_options.fail_if_no_peer_cert = false ssl_options.cacertfile = /etc/rabbitmq/ca-cert.pem ssl_options.certfile = /etc/rabbitmq/server-cert.pem ssl_options.keyfile = /etc/rabbitmq/server-key.pem + +vm_memory_high_watermark.absolute = 2GB diff --git a/docs/changelogs/v21.3.13.9-lts.md b/docs/changelogs/v21.3.13.9-lts.md index 05830ae38a0..c79955ab1d7 100644 --- a/docs/changelogs/v21.3.13.9-lts.md +++ b/docs/changelogs/v21.3.13.9-lts.md @@ -41,7 +41,7 @@ sidebar_label: 2022 * Backported in [#25364](https://github.com/ClickHouse/ClickHouse/issues/25364): On ZooKeeper connection loss `ReplicatedMergeTree` table might wait for background operations to complete before trying to reconnect. It's fixed, now background operations are stopped forcefully. [#25306](https://github.com/ClickHouse/ClickHouse/pull/25306) ([Alexander Tokmakov](https://github.com/tavplubix)). * Backported in [#25387](https://github.com/ClickHouse/ClickHouse/issues/25387): Fix the possibility of non-deterministic behaviour of the `quantileDeterministic` function and similar. This closes [#20480](https://github.com/ClickHouse/ClickHouse/issues/20480). [#25313](https://github.com/ClickHouse/ClickHouse/pull/25313) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Backported in [#25455](https://github.com/ClickHouse/ClickHouse/issues/25455): Fix lost `WHERE` condition in expression-push-down optimization of query plan (setting `query_plan_filter_push_down = 1` by default). Fixes [#25368](https://github.com/ClickHouse/ClickHouse/issues/25368). [#25370](https://github.com/ClickHouse/ClickHouse/pull/25370) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Backported in [#25406](https://github.com/ClickHouse/ClickHouse/issues/25406): Fix `REPLACE` column transformer when used in DDL by correctly quoting the formated query. This fixes [#23925](https://github.com/ClickHouse/ClickHouse/issues/23925). [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#25406](https://github.com/ClickHouse/ClickHouse/issues/25406): Fix `REPLACE` column transformer when used in DDL by correctly quoting the formatted query. This fixes [#23925](https://github.com/ClickHouse/ClickHouse/issues/23925). [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) ([Amos Bird](https://github.com/amosbird)). * Backported in [#25505](https://github.com/ClickHouse/ClickHouse/issues/25505): Fix segfault when sharding_key is absent in task config for copier. [#25419](https://github.com/ClickHouse/ClickHouse/pull/25419) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). #### NO CL ENTRY diff --git a/docs/changelogs/v21.5.7.9-stable.md b/docs/changelogs/v21.5.7.9-stable.md index 9b4109e8f43..ea9f03e7eb2 100644 --- a/docs/changelogs/v21.5.7.9-stable.md +++ b/docs/changelogs/v21.5.7.9-stable.md @@ -40,7 +40,7 @@ sidebar_label: 2022 * Backported in [#25362](https://github.com/ClickHouse/ClickHouse/issues/25362): On ZooKeeper connection loss `ReplicatedMergeTree` table might wait for background operations to complete before trying to reconnect. It's fixed, now background operations are stopped forcefully. 
[#25306](https://github.com/ClickHouse/ClickHouse/pull/25306) ([Alexander Tokmakov](https://github.com/tavplubix)). * Backported in [#25386](https://github.com/ClickHouse/ClickHouse/issues/25386): Fix the possibility of non-deterministic behaviour of the `quantileDeterministic` function and similar. This closes [#20480](https://github.com/ClickHouse/ClickHouse/issues/20480). [#25313](https://github.com/ClickHouse/ClickHouse/pull/25313) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Backported in [#25456](https://github.com/ClickHouse/ClickHouse/issues/25456): Fix lost `WHERE` condition in expression-push-down optimization of query plan (setting `query_plan_filter_push_down = 1` by default). Fixes [#25368](https://github.com/ClickHouse/ClickHouse/issues/25368). [#25370](https://github.com/ClickHouse/ClickHouse/pull/25370) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Backported in [#25408](https://github.com/ClickHouse/ClickHouse/issues/25408): Fix `REPLACE` column transformer when used in DDL by correctly quoting the formated query. This fixes [#23925](https://github.com/ClickHouse/ClickHouse/issues/23925). [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#25408](https://github.com/ClickHouse/ClickHouse/issues/25408): Fix `REPLACE` column transformer when used in DDL by correctly quoting the formatted query. This fixes [#23925](https://github.com/ClickHouse/ClickHouse/issues/23925). [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) ([Amos Bird](https://github.com/amosbird)). * Backported in [#25504](https://github.com/ClickHouse/ClickHouse/issues/25504): Fix segfault when sharding_key is absent in task config for copier. [#25419](https://github.com/ClickHouse/ClickHouse/pull/25419) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). #### NO CL ENTRY diff --git a/docs/changelogs/v21.6.5.37-stable.md b/docs/changelogs/v21.6.5.37-stable.md index fde95d30370..e977d373b73 100644 --- a/docs/changelogs/v21.6.5.37-stable.md +++ b/docs/changelogs/v21.6.5.37-stable.md @@ -24,7 +24,7 @@ sidebar_label: 2022 * Backported in [#25363](https://github.com/ClickHouse/ClickHouse/issues/25363): On ZooKeeper connection loss `ReplicatedMergeTree` table might wait for background operations to complete before trying to reconnect. It's fixed, now background operations are stopped forcefully. [#25306](https://github.com/ClickHouse/ClickHouse/pull/25306) ([Alexander Tokmakov](https://github.com/tavplubix)). * Backported in [#25388](https://github.com/ClickHouse/ClickHouse/issues/25388): Fix the possibility of non-deterministic behaviour of the `quantileDeterministic` function and similar. This closes [#20480](https://github.com/ClickHouse/ClickHouse/issues/20480). [#25313](https://github.com/ClickHouse/ClickHouse/pull/25313) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Backported in [#25448](https://github.com/ClickHouse/ClickHouse/issues/25448): Fix lost `WHERE` condition in expression-push-down optimization of query plan (setting `query_plan_filter_push_down = 1` by default). Fixes [#25368](https://github.com/ClickHouse/ClickHouse/issues/25368). [#25370](https://github.com/ClickHouse/ClickHouse/pull/25370) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Backported in [#25407](https://github.com/ClickHouse/ClickHouse/issues/25407): Fix `REPLACE` column transformer when used in DDL by correctly quoting the formated query. 
This fixes [#23925](https://github.com/ClickHouse/ClickHouse/issues/23925). [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#25407](https://github.com/ClickHouse/ClickHouse/issues/25407): Fix `REPLACE` column transformer when used in DDL by correctly quoting the formatted query. This fixes [#23925](https://github.com/ClickHouse/ClickHouse/issues/23925). [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) ([Amos Bird](https://github.com/amosbird)). #### NOT FOR CHANGELOG / INSIGNIFICANT diff --git a/docs/changelogs/v21.7.1.7283-prestable.md b/docs/changelogs/v21.7.1.7283-prestable.md index 50565b636b9..6fc6c681e4d 100644 --- a/docs/changelogs/v21.7.1.7283-prestable.md +++ b/docs/changelogs/v21.7.1.7283-prestable.md @@ -133,7 +133,7 @@ sidebar_label: 2022 * On ZooKeeper connection loss `ReplicatedMergeTree` table might wait for background operations to complete before trying to reconnect. It's fixed, now background operations are stopped forcefully. [#25306](https://github.com/ClickHouse/ClickHouse/pull/25306) ([Alexander Tokmakov](https://github.com/tavplubix)). * Fix the possibility of non-deterministic behaviour of the `quantileDeterministic` function and similar. This closes [#20480](https://github.com/ClickHouse/ClickHouse/issues/20480). [#25313](https://github.com/ClickHouse/ClickHouse/pull/25313) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Fix lost `WHERE` condition in expression-push-down optimization of query plan (setting `query_plan_filter_push_down = 1` by default). Fixes [#25368](https://github.com/ClickHouse/ClickHouse/issues/25368). [#25370](https://github.com/ClickHouse/ClickHouse/pull/25370) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix `REPLACE` column transformer when used in DDL by correctly quoting the formated query. This fixes [#23925](https://github.com/ClickHouse/ClickHouse/issues/23925). [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) ([Amos Bird](https://github.com/amosbird)). +* Fix `REPLACE` column transformer when used in DDL by correctly quoting the formatted query. This fixes [#23925](https://github.com/ClickHouse/ClickHouse/issues/23925). [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) ([Amos Bird](https://github.com/amosbird)). * Fix segfault when sharding_key is absent in task config for copier. [#25419](https://github.com/ClickHouse/ClickHouse/pull/25419) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). * Fix excessive underscore before the names of the preprocessed configuration files. [#25431](https://github.com/ClickHouse/ClickHouse/pull/25431) ([Vitaly Baranov](https://github.com/vitlibar)). * Fix convertion of datetime with timezone for MySQL, PostgreSQL, ODBC. Closes [#5057](https://github.com/ClickHouse/ClickHouse/issues/5057). [#25528](https://github.com/ClickHouse/ClickHouse/pull/25528) ([Kseniia Sumarokova](https://github.com/kssenii)). 
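For context on the construct mentioned in several backport entries above: the `REPLACE` column transformer rewrites selected columns inside a `SELECT *`, and the quoting fix in [#25391](https://github.com/ClickHouse/ClickHouse/pull/25391) concerns how such a query is re-serialized when it appears in DDL. Below is a minimal, hypothetical sketch of that kind of statement (table and column names are illustrative and not taken from the patch):

```sql
-- Illustrative only: a SELECT * REPLACE column transformer embedded in DDL,
-- the construct whose formatting/quoting the backported fix addresses.
CREATE TABLE events_copy ENGINE = MergeTree ORDER BY id AS
SELECT * REPLACE (toUInt32(id) AS id)
FROM events;
```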
diff --git a/docs/changelogs/v24.9.2.42-stable.md b/docs/changelogs/v24.9.2.42-stable.md new file mode 100644 index 00000000000..c6754cfc303 --- /dev/null +++ b/docs/changelogs/v24.9.2.42-stable.md @@ -0,0 +1,33 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.9.2.42-stable (de7c791a2ea) FIXME as compared to v24.9.1.3278-stable (6d058d82a8e) + +#### Improvement +* Backported in [#70091](https://github.com/ClickHouse/ClickHouse/issues/70091): Add `show_create_query_identifier_quoting_rule` to define identifier quoting behavior of the show create query result. Possible values: - `user_display`: When the identifiers is a keyword. - `when_necessary`: When the identifiers is one of `{"distinct", "all", "table"}`, or it can cause ambiguity: column names, dictionary attribute names. - `always`: Always quote identifiers. [#69448](https://github.com/ClickHouse/ClickHouse/pull/69448) ([tuanpach](https://github.com/tuanpach)). +* Backported in [#70100](https://github.com/ClickHouse/ClickHouse/issues/70100): Follow-up to https://github.com/ClickHouse/ClickHouse/pull/69346 Point 4 described there will work now as well:. [#69563](https://github.com/ClickHouse/ClickHouse/pull/69563) ([Vitaly Baranov](https://github.com/vitlibar)). +* Backported in [#70048](https://github.com/ClickHouse/ClickHouse/issues/70048): Add new column readonly_duration to the system.replicas table. Needed to be able to distinguish actual readonly replicas from sentinel ones in alerts. [#69871](https://github.com/ClickHouse/ClickHouse/pull/69871) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). + +#### Bug Fix (user-visible misbehavior in an official stable release) +* Backported in [#70193](https://github.com/ClickHouse/ClickHouse/issues/70193): Fix crash when executing `create view t as (with recursive 42 as ttt select ttt);`. [#69676](https://github.com/ClickHouse/ClickHouse/pull/69676) ([Han Fei](https://github.com/hanfei1991)). +* Backported in [#70083](https://github.com/ClickHouse/ClickHouse/issues/70083): Closes [#69752](https://github.com/ClickHouse/ClickHouse/issues/69752). [#69985](https://github.com/ClickHouse/ClickHouse/pull/69985) ([pufit](https://github.com/pufit)). +* Backported in [#70070](https://github.com/ClickHouse/ClickHouse/issues/70070): Fixes `Block structure mismatch` for queries with nested views and `WHERE` condition. Fixes [#66209](https://github.com/ClickHouse/ClickHouse/issues/66209). [#70054](https://github.com/ClickHouse/ClickHouse/pull/70054) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#70168](https://github.com/ClickHouse/ClickHouse/issues/70168): Fix wrong LOGICAL_ERROR when replacing literals in ranges. [#70122](https://github.com/ClickHouse/ClickHouse/pull/70122) ([Pablo Marcos](https://github.com/pamarcos)). +* Backported in [#70238](https://github.com/ClickHouse/ClickHouse/issues/70238): Check for Nullable(Nothing) type during ALTER TABLE MODIFY COLUMN/QUERY to prevent tables with such data type. [#70123](https://github.com/ClickHouse/ClickHouse/pull/70123) ([Pavel Kruglov](https://github.com/Avogar)). +* Backported in [#70205](https://github.com/ClickHouse/ClickHouse/issues/70205): Fix wrong result with skipping index. [#70127](https://github.com/ClickHouse/ClickHouse/pull/70127) ([Raúl Marín](https://github.com/Algunenano)). 
+* Backported in [#70185](https://github.com/ClickHouse/ClickHouse/issues/70185): Fix data race in ColumnObject/ColumnTuple decompress method that could lead to heap use after free. [#70137](https://github.com/ClickHouse/ClickHouse/pull/70137) ([Pavel Kruglov](https://github.com/Avogar)). +* Backported in [#70253](https://github.com/ClickHouse/ClickHouse/issues/70253): Fix possible hung in ALTER COLUMN with Dynamic type. [#70144](https://github.com/ClickHouse/ClickHouse/pull/70144) ([Pavel Kruglov](https://github.com/Avogar)). +* Backported in [#70230](https://github.com/ClickHouse/ClickHouse/issues/70230): Use correct `max_types` parameter during Dynamic type creation for JSON subcolumn. [#70147](https://github.com/ClickHouse/ClickHouse/pull/70147) ([Pavel Kruglov](https://github.com/Avogar)). +* Backported in [#70217](https://github.com/ClickHouse/ClickHouse/issues/70217): Fix the password being displayed in `system.query_log` for users with bcrypt password authentication method. [#70148](https://github.com/ClickHouse/ClickHouse/pull/70148) ([Nikolay Degterinsky](https://github.com/evillique)). +* Backported in [#70267](https://github.com/ClickHouse/ClickHouse/issues/70267): Respect setting allow_simdjson in JSON type parser. [#70218](https://github.com/ClickHouse/ClickHouse/pull/70218) ([Pavel Kruglov](https://github.com/Avogar)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#70052](https://github.com/ClickHouse/ClickHouse/issues/70052): Improve stateless test runner. [#69864](https://github.com/ClickHouse/ClickHouse/pull/69864) ([Alexey Katsman](https://github.com/alexkats)). +* Backported in [#70284](https://github.com/ClickHouse/ClickHouse/issues/70284): Improve pipdeptree generator for docker images. - Update requirements.txt for the integration tests runner container - Remove some small dependencies, improve `helpers/retry_decorator.py` - Upgrade docker-compose from EOL version 1 to version 2. [#70146](https://github.com/ClickHouse/ClickHouse/pull/70146) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#70261](https://github.com/ClickHouse/ClickHouse/issues/70261): Update test_storage_s3_queue/test.py. [#70159](https://github.com/ClickHouse/ClickHouse/pull/70159) ([Kseniia Sumarokova](https://github.com/kssenii)). + diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 247a3e30293..66291014ed7 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -195,6 +195,9 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va - `--print-profile-events` – Print `ProfileEvents` packets. - `--profile-events-delay-ms` – Delay between printing `ProfileEvents` packets (-1 - print only totals, 0 - print every single packet). - `--jwt` – If specified, enables authorization via JSON Web Token. Server JWT authorization is available only in ClickHouse Cloud. +- `--progress` – Print progress of query execution. Possible values: 'tty|on|1|true|yes' - outputs to TTY in interactive mode; 'err' - outputs to STDERR in non-interactive mode; 'off|0|false|no' - disables the progress printing. Default: TTY in interactive mode, disabled in non-interactive. +- `--progress-table` – Print a progress table with changing metrics during query execution. Possible values: 'tty|on|1|true|yes' - outputs to TTY in interactive mode; 'err' - outputs to STDERR in non-interactive mode; 'off|0|false|no' - disables the progress table. Default: TTY in interactive mode, disabled in non-interactive.
+- `--enable-progress-table-toggle` – Enable toggling of the progress table by pressing the control key (Space). Only applicable in interactive mode with the progress table printing enabled. Default: 'true'. Instead of `--host`, `--port`, `--user` and `--password` options, ClickHouse client also supports connection strings (see next section). diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index 8a106720ee0..4863858358d 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -1057,12 +1057,12 @@ Default value: throw ## deduplicate_merge_projection_mode -Whether to allow create projection for the table with non-classic MergeTree, that is not (Replicated, Shared) MergeTree. If allowed, what is the action when merge projections, either drop or rebuild. So classic MergeTree would ignore this setting. +Whether to allow creating a projection for a table with a non-classic MergeTree engine, that is, not a (Replicated, Shared) MergeTree. The `ignore` option exists purely for compatibility and may produce incorrect results. Otherwise, if projections are allowed, the setting defines what happens to them when parts are merged: either drop or rebuild. Classic MergeTree therefore ignores this setting. It also controls `OPTIMIZE DEDUPLICATE` as well, but has effect on all MergeTree family members. Similar to the option `lightweight_mutation_projection_mode`, it is also part level. Possible values: -- throw, drop, rebuild +- ignore, throw, drop, rebuild Default value: throw diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index c012d065574..30113f6b0fd 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -7,206 +7,44 @@ toc_max_heading_level: 2 # Format settings {#format-settings} -## format_display_secrets_in_show_and_select {#format_display_secrets_in_show_and_select} +## bool_false_representation {#bool_false_representation} -Enables or disables showing secrets in `SHOW` and `SELECT` queries for tables, databases, -table functions, and dictionaries. +Type: String -User wishing to see secrets must also have -[`display_secrets_in_show_and_select` server setting](../server-configuration-parameters/settings#display_secrets_in_show_and_select) -turned on and a -[`displaySecretsInShowAndSelect`](../../sql-reference/statements/grant#display-secrets) privilege. +Default value: false -Possible values: +Text to represent false bool value in TSV/CSV/Vertical/Pretty formats. -- 0 — Disabled. -- 1 — Enabled. +## bool_true_representation {#bool_true_representation} -Default value: 0. +Type: String -## input_format_skip_unknown_fields {#input_format_skip_unknown_fields} +Default value: true -Enables or disables skipping insertion of extra data. - -When writing data, ClickHouse throws an exception if input data contain columns that do not exist in the target table. If skipping is enabled, ClickHouse does not insert extra data and does not throw an exception. - -Supported formats: - -- [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) (and other JSON formats) -- [BSONEachRow](../../interfaces/formats.md/#bsoneachrow) (and other JSON formats) -- [TSKV](../../interfaces/formats.md/#tskv) -- All formats with suffixes WithNames/WithNamesAndTypes -- [MySQLDump](../../interfaces/formats.md/#mysqldump) -- [Native](../../interfaces/formats.md/#native) - -Possible values: - -- 0 — Disabled. - 1 — Enabled.
- -Default value: 1. - -## input_format_with_names_use_header {#input_format_with_names_use_header} - -Enables or disables checking the column order when inserting data. - -To improve insert performance, we recommend disabling this check if you are sure that the column order of the input data is the same as in the target table. - -Supported formats: - -- [CSVWithNames](../../interfaces/formats.md/#csvwithnames) -- [CSVWithNamesAndTypes](../../interfaces/formats.md/#csvwithnamesandtypes) -- [TabSeparatedWithNames](../../interfaces/formats.md/#tabseparatedwithnames) -- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md/#tabseparatedwithnamesandtypes) -- [JSONCompactEachRowWithNames](../../interfaces/formats.md/#jsoncompacteachrowwithnames) -- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompacteachrowwithnamesandtypes) -- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md/#jsoncompactstringseachrowwithnames) -- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompactstringseachrowwithnamesandtypes) -- [RowBinaryWithNames](../../interfaces/formats.md/#rowbinarywithnames) -- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md/#rowbinarywithnamesandtypes) -- [CustomSeparatedWithNames](../../interfaces/formats.md/#customseparatedwithnames) -- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md/#customseparatedwithnamesandtypes) - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 1. - -## input_format_with_types_use_header {#input_format_with_types_use_header} - -Controls whether format parser should check if data types from the input data match data types from the target table. - -Supported formats: - -- [CSVWithNamesAndTypes](../../interfaces/formats.md/#csvwithnamesandtypes) -- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md/#tabseparatedwithnamesandtypes) -- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompacteachrowwithnamesandtypes) -- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompactstringseachrowwithnamesandtypes) -- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md/#rowbinarywithnamesandtypes-rowbinarywithnamesandtypes) -- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md/#customseparatedwithnamesandtypes) - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 1. - -## input_format_defaults_for_omitted_fields {#input_format_defaults_for_omitted_fields} - -When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option applies to [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) (and other JSON formats), [CSV](../../interfaces/formats.md/#csv), [TabSeparated](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [Parquet](../../interfaces/formats.md/#parquet), [Arrow](../../interfaces/formats.md/#arrow), [Avro](../../interfaces/formats.md/#avro), [ORC](../../interfaces/formats.md/#orc), [Native](../../interfaces/formats.md/#native) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes. - -:::note -When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. -::: - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 1. 
- -## input_format_null_as_default {#input_format_null_as_default} - -Enables or disables the initialization of [NULL](../../sql-reference/syntax.md/#null-literal) fields with [default values](../../sql-reference/statements/create/table.md/#create-default-values), if data type of these fields is not [nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable). -If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. - -This setting is applicable for most input formats. - -For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. - -Possible values: - -- 0 — Inserting `NULL` into a not nullable column causes an exception. -- 1 — `NULL` fields are initialized with default column values. - -Default value: `1`. - -## input_format_allow_seeks {#input_format_allow_seeks} - -Allow seeks while reading in ORC/Parquet/Arrow input formats. - -Enabled by default. - -## input_format_max_rows_to_read_for_schema_inference {#input_format_max_rows_to_read_for_schema_inference} - -The maximum rows of data to read for automatic schema inference. - -Default value: `25'000`. - -## input_format_max_bytes_to_read_for_schema_inference {#input_format_max_bytes_to_read_for_schema_inference} - -The maximum amount of data in bytes to read for automatic schema inference. - -Default value: `33554432` (32 Mb). +Text to represent true bool value in TSV/CSV/Vertical/Pretty formats. ## column_names_for_schema_inference {#column_names_for_schema_inference} +Type: String + +Default value: + The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...' -## schema_inference_hints {#schema_inference_hints} +## cross_to_inner_join_rewrite {#cross_to_inner_join_rewrite} -The list of column names and types to use as hints in schema inference for formats without schema. +Type: UInt64 -Example: +Default value: 1 -Query: -```sql -desc format(JSONEachRow, '{"x" : 1, "y" : "String", "z" : "0.0.0.0" }') settings schema_inference_hints='x UInt8, z IPv4'; -``` - -Result: -```sql -x UInt8 -y Nullable(String) -z IPv4 -``` - -:::note -If the `schema_inference_hints` is not formated properly, or if there is a typo or a wrong datatype, etc... the whole schema_inference_hints will be ignored. -::: - -## schema_inference_make_columns_nullable {#schema_inference_make_columns_nullable} - -Controls making inferred types `Nullable` in schema inference. -If the setting is enabled, all inferred type will be `Nullable`, if disabled, the inferred type will never be `Nullable`, if set to `auto`, the inferred type will be `Nullable` only if the column contains `NULL` in a sample that is parsed during schema inference or file metadata contains information about column nullability. - -Default value: `true`. - -## input_format_try_infer_integers {#input_format_try_infer_integers} - -If enabled, ClickHouse will try to infer integers instead of floats in schema inference for text formats. If all numbers in the column from input data are integers, the result type will be `Int64`, if at least one number is float, the result type will be `Float64`. - -Enabled by default. - -## input_format_try_infer_dates {#input_format_try_infer_dates} - -If enabled, ClickHouse will try to infer type `Date` from string fields in schema inference for text formats. 
If all fields from a column in input data were successfully parsed as dates, the result type will be `Date`, if at least one field was not parsed as date, the result type will be `String`. - -Enabled by default. - -## input_format_try_infer_datetimes {#input_format_try_infer_datetimes} - -If enabled, ClickHouse will try to infer type `DateTime64` from string fields in schema inference for text formats. If all fields from a column in input data were successfully parsed as datetimes, the result type will be `DateTime64`, if at least one field was not parsed as datetime, the result type will be `String`. - -Enabled by default. - -## input_format_try_infer_variants {#input_format_try_infer_variants} - -If enabled, ClickHouse will try to infer type [`Variant`](../../sql-reference/data-types/variant.md) in schema inference for text formats when there is more than one possible type for column/array elements. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: `0`. +Use inner join instead of comma/cross join if there are joining expressions in the WHERE section. Values: 0 - no rewrite, 1 - apply if possible for comma/cross, 2 - force rewrite all comma joins, cross - if possible ## date_time_input_format {#date_time_input_format} +Type: DateTimeInputFormat + +Default value: basic + Allows choosing a parser of the text representation of date and time. The setting does not apply to [date and time functions](../../sql-reference/functions/date-time-functions.md). @@ -221,8 +59,6 @@ Possible values: ClickHouse can parse only the basic `YYYY-MM-DD HH:MM:SS` or `YYYY-MM-DD` format. For example, `2019-08-20 10:18:56` or `2019-08-20`. -Default value: `'basic'`. - Cloud default value: `'best_effort'`. See also: @@ -232,6 +68,10 @@ See also: ## date_time_output_format {#date_time_output_format} +Type: DateTimeOutputFormat + +Default value: simple + Allows choosing different output formats of the text representation of date and time. Possible values: @@ -248,65 +88,326 @@ Possible values: ClickHouse output date and time in [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time) format. For example `1566285536`. -Default value: `simple`. - See also: - [DateTime data type.](../../sql-reference/data-types/datetime.md) - [Functions for working with dates and times.](../../sql-reference/functions/date-time-functions.md) -## interval_output_format {#interval_output_format} +## date_time_overflow_behavior {#date_time_overflow_behavior} -Allows choosing different output formats of the text representation of interval types. +Type: DateTimeOverflowBehavior + +Default value: ignore + +Overflow mode for Date, Date32, DateTime, DateTime64 types. Possible values: 'ignore', 'throw', 'saturate'. + +## dictionary_use_async_executor {#dictionary_use_async_executor} + +Type: Bool + +Default value: 0 + +Execute a pipeline for reading dictionary source in several threads. It's supported only by dictionaries with local CLICKHOUSE source. + +## errors_output_format {#errors_output_format} + +Type: String + +Default value: CSV + +Method to write Errors to text output. + +## exact_rows_before_limit {#exact_rows_before_limit} + +Type: Bool + +Default value: 0 + +When enabled, ClickHouse will provide exact value for rows_before_limit_at_least statistic, but with the cost that the data before limit will have to be read completely + +## format_avro_schema_registry_url {#format_avro_schema_registry_url} + +Type: URI + +Default value: + +For AvroConfluent format: Confluent Schema Registry URL. 
+ +## format_binary_max_array_size {#format_binary_max_array_size} + +Type: UInt64 + +Default value: 1073741824 + +The maximum allowed size for Array in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 0 means there is no limit + +## format_binary_max_string_size {#format_binary_max_string_size} + +Type: UInt64 + +Default value: 1073741824 + +The maximum allowed size for String in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 0 means there is no limit + +## format_capn_proto_enum_comparising_mode {#format_capn_proto_enum_comparising_mode} + +Type: CapnProtoEnumComparingMode + +Default value: by_values + +How to map ClickHouse Enum and CapnProto Enum + +## format_capn_proto_use_autogenerated_schema {#format_capn_proto_use_autogenerated_schema} + +Type: Bool + +Default value: 1 + +Use autogenerated CapnProto schema when format_schema is not set + +## format_csv_allow_double_quotes {#format_csv_allow_double_quotes} + +Type: Bool + +Default value: 1 + +If it is set to true, allow strings in double quotes. + +## format_csv_allow_single_quotes {#format_csv_allow_single_quotes} + +Type: Bool + +Default value: 0 + +If it is set to true, allow strings in single quotes. + +## format_csv_delimiter {#format_csv_delimiter} + +Type: Char + +Default value: , + +The character to be considered as a delimiter in CSV data. If setting with a string, a string has to have a length of 1. + +## format_csv_null_representation {#format_csv_null_representation} + +Type: String + +Default value: \N + +Custom NULL representation in CSV format + +## format_custom_escaping_rule {#format_custom_escaping_rule} + +Type: EscapingRule + +Default value: Escaped + +Field escaping rule (for CustomSeparated format) + +## format_custom_field_delimiter {#format_custom_field_delimiter} + +Type: String + +Default value: + +Delimiter between fields (for CustomSeparated format) + +## format_custom_result_after_delimiter {#format_custom_result_after_delimiter} + +Type: String + +Default value: + +Suffix after result set (for CustomSeparated format) + +## format_custom_result_before_delimiter {#format_custom_result_before_delimiter} + +Type: String + +Default value: + +Prefix before result set (for CustomSeparated format) + +## format_custom_row_after_delimiter {#format_custom_row_after_delimiter} + +Type: String + +Default value: + + +Delimiter after field of the last column (for CustomSeparated format) + +## format_custom_row_before_delimiter {#format_custom_row_before_delimiter} + +Type: String + +Default value: + +Delimiter before field of the first column (for CustomSeparated format) + +## format_custom_row_between_delimiter {#format_custom_row_between_delimiter} + +Type: String + +Default value: + +Delimiter between rows (for CustomSeparated format) + +## format_display_secrets_in_show_and_select {#format_display_secrets_in_show_and_select} + +Type: Bool + +Default value: 0 + +Enables or disables showing secrets in `SHOW` and `SELECT` queries for tables, databases, +table functions, and dictionaries. + +User wishing to see secrets must also have +[`display_secrets_in_show_and_select` server setting](../server-configuration-parameters/settings#display_secrets_in_show_and_select) +turned on and a +[`displaySecretsInShowAndSelect`](../../sql-reference/statements/grant#display-secrets) privilege. Possible values: -- `kusto` - KQL-style output format. +- 0 — Disabled. +- 1 — Enabled. 
- ClickHouse outputs intervals in [KQL format](https://learn.microsoft.com/en-us/dotnet/standard/base-types/standard-timespan-format-strings#the-constant-c-format-specifier). For example, `toIntervalDay(2)` would be formatted as `2.00:00:00`. Please note that for interval types of varying length (ie. `IntervalMonth` and `IntervalYear`) the average number of seconds per interval is taken into account. +## format_json_object_each_row_column_for_object_name {#format_json_object_each_row_column_for_object_name} -- `numeric` - Numeric output format. +Type: String - ClickHouse outputs intervals as their underlying numeric representation. For example, `toIntervalDay(2)` would be formatted as `2`. +Default value: -Default value: `numeric`. +The name of column that will be used for storing/writing object names in [JSONObjectEachRow](../../interfaces/formats.md/#jsonobjecteachrow) format. +Column type should be String. If value is empty, default names `row_{i}`will be used for object names. -See also: +### input_format_json_compact_allow_variable_number_of_columns {#input_format_json_compact_allow_variable_number_of_columns} -- [Interval](../../sql-reference/data-types/special-data-types/interval.md) - -## input_format_ipv4_default_on_conversion_error {#input_format_ipv4_default_on_conversion_error} - -Deserialization of IPv4 will use default values instead of throwing exception on conversion error. +Allow variable number of columns in rows in JSONCompact/JSONCompactEachRow input formats. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. Disabled by default. -## input_format_ipv6_default_on_conversion_error {#input_format_ipv6_default_on_conversion_error} +### output_format_markdown_escape_special_characters {#output_format_markdown_escape_special_characters} -Deserialization of IPV6 will use default values instead of throwing exception on conversion error. +When enabled, escape special characters in Markdown. -Disabled by default. +[Common Mark](https://spec.commonmark.org/0.30/#example-12) defines the following special characters that can be escaped by \: -## bool_true_representation {#bool_true_representation} +``` +! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ +``` -Text to represent true bool value in TSV/CSV/Vertical/Pretty formats. +Possible values: -Default value: `true` ++ 0 — Disable. ++ 1 — Enable. -## bool_false_representation {#bool_false_representation} +### input_format_json_empty_as_default {#input_format_json_empty_as_default} -Text to represent false bool value in TSV/CSV/Vertical/Pretty formats. +When enabled, replace empty input fields in JSON with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. -Default value: `false` +Possible values: -## output_format_decimal_trailing_zeros {#output_format_decimal_trailing_zeros} ++ 0 — Disable. ++ 1 — Enable. -Output trailing zeros when printing Decimal values. E.g. 1.230000 instead of 1.23. +## format_protobuf_use_autogenerated_schema {#format_protobuf_use_autogenerated_schema} -Disabled by default. 
+Type: Bool + +Default value: 1 + +Use autogenerated Protobuf when format_schema is not set + +## format_regexp {#format_regexp} + +Type: String + +Default value: + +Regular expression (for Regexp format) + +## format_regexp_escaping_rule {#format_regexp_escaping_rule} + +Type: EscapingRule + +Default value: Raw + +Field escaping rule (for Regexp format) + +## format_regexp_skip_unmatched {#format_regexp_skip_unmatched} + +Type: Bool + +Default value: 0 + +Skip lines unmatched by regular expression (for Regexp format) + +## format_schema {#format_schema} + +Type: String + +Default value: + +This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format. + +## format_template_resultset {#format_template_resultset} + +Type: String + +Default value: + +Path to file which contains format string for result set (for Template format) + +## format_template_resultset_format {#format_template_resultset_format} + +Type: String + +Default value: + +Format string for result set (for Template format) + +## format_template_row {#format_template_row} + +Type: String + +Default value: + +Path to file which contains format string for rows (for Template format) + +## format_template_row_format {#format_template_row_format} + +Type: String + +Default value: + +Format string for rows (for Template format) + +## format_template_rows_between_delimiter {#format_template_rows_between_delimiter} + +Type: String + +Default value: + + +Delimiter between rows (for Template format) + +## format_tsv_null_representation {#format_tsv_null_representation} + +Type: String + +Default value: \N + +Custom NULL representation in TSV format ## input_format_allow_errors_num {#input_format_allow_errors_num} +Type: UInt64 + +Default value: 0 + Sets the maximum number of acceptable errors when reading from text formats (CSV, TSV, etc.). The default value is 0. @@ -319,6 +420,10 @@ If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` ar ## input_format_allow_errors_ratio {#input_format_allow_errors_ratio} +Type: Float + +Default value: 0 + Sets the maximum percentage of errors allowed when reading from text formats (CSV, TSV, etc.). The percentage of errors is set as a floating-point number between 0 and 1. @@ -330,43 +435,290 @@ If an error occurred while reading rows but the error counter is still less than If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` are exceeded, ClickHouse throws an exception. -## format_schema {#format-schema} +## input_format_allow_seeks {#input_format_allow_seeks} -This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format. +Type: Bool -## output_format_schema {#output-format-schema} +Default value: 1 -The path to the file where the automatically generated schema will be saved in [Cap’n Proto](../../interfaces/formats.md#capnproto-capnproto) or [Protobuf](../../interfaces/formats.md#protobuf-protobuf) formats. +Allow seeks while reading in ORC/Parquet/Arrow input formats. -## output_format_enable_streaming {#output_format_enable_streaming} +Enabled by default. -Enable streaming in output formats that support it. 
+## input_format_arrow_allow_missing_columns {#input_format_arrow_allow_missing_columns} + +Type: Bool + +Default value: 1 + +Allow missing columns while reading Arrow input formats + +## input_format_arrow_case_insensitive_column_matching {#input_format_arrow_case_insensitive_column_matching} + +Type: Bool + +Default value: 0 + +Ignore case when matching Arrow columns with CH columns. + +## input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference {#input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference} + +Type: Bool + +Default value: 0 + +Skip columns with unsupported types while schema inference for format Arrow + +## input_format_avro_allow_missing_fields {#input_format_avro_allow_missing_fields} + +Type: Bool + +Default value: 0 + +For Avro/AvroConfluent format: when field is not found in schema use default value instead of error + +## input_format_avro_null_as_default {#input_format_avro_null_as_default} + +Type: Bool + +Default value: 0 + +For Avro/AvroConfluent format: insert default in case of null and non Nullable column + +## input_format_binary_decode_types_in_binary_format {#input_format_binary_decode_types_in_binary_format} + +Type: Bool + +Default value: 0 + +Read data types in binary format instead of type names in RowBinaryWithNamesAndTypes input format + +## input_format_bson_skip_fields_with_unsupported_types_in_schema_inference {#input_format_bson_skip_fields_with_unsupported_types_in_schema_inference} + +Type: Bool + +Default value: 0 + +Skip fields with unsupported types while schema inference for format BSON. + +## input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference {#input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference} + +Type: Bool + +Default value: 0 + +Skip columns with unsupported types while schema inference for format CapnProto + +## input_format_csv_allow_cr_end_of_line {#input_format_csv_allow_cr_end_of_line} + +Type: Bool + +Default value: 0 + +If it is set true, \\r will be allowed at end of line not followed by \\n + +## input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns} + +Type: Bool + +Default value: 0 + +Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values + +## input_format_csv_allow_whitespace_or_tab_as_delimiter {#input_format_csv_allow_whitespace_or_tab_as_delimiter} + +Type: Bool + +Default value: 0 + +Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings + +## input_format_csv_arrays_as_nested_csv {#input_format_csv_arrays_as_nested_csv} + +Type: Bool + +Default value: 0 + +When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: \"[\"\"Hello\"\", \"\"world\"\", \"\"42\"\"\"\" TV\"\"]\". Braces around array can be omitted. + +## input_format_csv_deserialize_separate_columns_into_tuple {#input_format_csv_deserialize_separate_columns_into_tuple} + +Type: Bool + +Default value: 1 + +If it set to true, then separate columns written in CSV format can be deserialized to Tuple column. + +## input_format_csv_detect_header {#input_format_csv_detect_header} + +Type: Bool + +Default value: 1 + +Automatically detect header with names and types in CSV format + +## input_format_csv_empty_as_default {#input_format_csv_empty_as_default} + +Type: Bool + +Default value: 1 + +Treat empty fields in CSV input as default values. 
+ +## input_format_csv_enum_as_number {#input_format_csv_enum_as_number} + +Type: Bool + +Default value: 0 + +Treat inserted enum values in CSV formats as enum indices + +## input_format_csv_skip_first_lines {#input_format_csv_skip_first_lines} + +Type: UInt64 + +Default value: 0 + +Skip specified number of lines at the beginning of data in CSV format + +## input_format_csv_skip_trailing_empty_lines {#input_format_csv_skip_trailing_empty_lines} + +Type: Bool + +Default value: 0 + +Skip trailing empty lines in CSV format + +## input_format_csv_trim_whitespaces {#input_format_csv_trim_whitespaces} + +Type: Bool + +Default value: 1 + +Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings + +## input_format_csv_try_infer_numbers_from_strings {#input_format_csv_try_infer_numbers_from_strings} + +Type: Bool + +Default value: 0 + +If enabled, during schema inference ClickHouse will try to infer numbers from string fields. +It can be useful if CSV data contains quoted UInt64 numbers. Disabled by default. -## output_format_write_statistics {#output_format_write_statistics} +## input_format_csv_try_infer_strings_from_quoted_tuples {#input_format_csv_try_infer_strings_from_quoted_tuples} -Write statistics about read rows, bytes, time elapsed in suitable output formats. +Type: Bool -Enabled by default +Default value: 1 -## insert_distributed_one_random_shard {#insert_distributed_one_random_shard} +Interpret quoted tuples in the input data as a value of type String. -Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table when there is no distributed key. +## input_format_csv_use_best_effort_in_schema_inference {#input_format_csv_use_best_effort_in_schema_inference} -By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. +Type: Bool + +Default value: 1 + +Use some tweaks and heuristics to infer schema in CSV format + +## input_format_csv_use_default_on_bad_values {#input_format_csv_use_default_on_bad_values} + +Type: Bool + +Default value: 0 + +Allow to set default value to column when CSV field deserialization failed on bad value + +## input_format_custom_allow_variable_number_of_columns {#input_format_custom_allow_variable_number_of_columns} + +Type: Bool + +Default value: 0 + +Ignore extra columns in CustomSeparated input (if file has more columns than expected) and treat missing fields in CustomSeparated input as default values + +## input_format_custom_detect_header {#input_format_custom_detect_header} + +Type: Bool + +Default value: 1 + +Automatically detect header with names and types in CustomSeparated format + +## input_format_custom_skip_trailing_empty_lines {#input_format_custom_skip_trailing_empty_lines} + +Type: Bool + +Default value: 0 + +Skip trailing empty lines in CustomSeparated format + +## input_format_defaults_for_omitted_fields {#input_format_defaults_for_omitted_fields} + +Type: Bool + +Default value: 1 + +When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. 
This option applies to [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) (and other JSON formats), [CSV](../../interfaces/formats.md/#csv), [TabSeparated](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [Parquet](../../interfaces/formats.md/#parquet), [Arrow](../../interfaces/formats.md/#arrow), [Avro](../../interfaces/formats.md/#avro), [ORC](../../interfaces/formats.md/#orc), [Native](../../interfaces/formats.md/#native) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes. + +:::note +When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. +::: Possible values: -- 0 — Insertion is rejected if there are multiple shards and no distributed key is given. -- 1 — Insertion is done randomly among all available shards when no distributed key is given. +- 0 — Disabled. +- 1 — Enabled. -Default value: `0`. +## input_format_force_null_for_omitted_fields {#input_format_force_null_for_omitted_fields} -## JSON formats settings {#json-formats-settings} +Type: Bool + +Default value: 0 + +Force initialize omitted fields with null values + +## input_format_hive_text_allow_variable_number_of_columns {#input_format_hive_text_allow_variable_number_of_columns} + +Type: Bool + +Default value: 1 + +Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values + +## input_format_hive_text_collection_items_delimiter {#input_format_hive_text_collection_items_delimiter} + +Type: Char + +Default value:  + +Delimiter between collection(array or map) items in Hive Text File + +## input_format_hive_text_fields_delimiter {#input_format_hive_text_fields_delimiter} + +Type: Char + +Default value:  + +Delimiter between fields in Hive Text File + +## input_format_hive_text_map_keys_delimiter {#input_format_hive_text_map_keys_delimiter} + +Type: Char + +Default value:  + +Delimiter between a pair of map key/values in Hive Text File ## input_format_import_nested_json {#input_format_import_nested_json} +Type: Bool + +Default value: 0 + Enables or disables the insertion of JSON data with nested objects. Supported formats: @@ -378,104 +730,81 @@ Possible values: - 0 — Disabled. - 1 — Enabled. -Default value: 0. - See also: - [Usage of Nested Structures](../../interfaces/formats.md/#jsoneachrow-nested) with the `JSONEachRow` format. -## input_format_json_read_bools_as_numbers {#input_format_json_read_bools_as_numbers} +## input_format_ipv4_default_on_conversion_error {#input_format_ipv4_default_on_conversion_error} -Allow parsing bools as numbers in JSON input formats. +Type: Bool -Enabled by default. +Default value: 0 -## input_format_json_read_bools_as_strings {#input_format_json_read_bools_as_strings} - -Allow parsing bools as strings in JSON input formats. - -Enabled by default. - -## input_format_json_read_numbers_as_strings {#input_format_json_read_numbers_as_strings} - -Allow parsing numbers as strings in JSON input formats. - -Enabled by default. - -## input_format_json_try_infer_numbers_from_strings {#input_format_json_try_infer_numbers_from_strings} - -If enabled, during schema inference ClickHouse will try to infer numbers from string fields. -It can be useful if JSON data contains quoted UInt64 numbers. +Deserialization of IPv4 will use default values instead of throwing exception on conversion error. Disabled by default. 
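+
+For instance, with this setting enabled a malformed address in text input is deserialized as the default `IPv4` value instead of raising an exception. A minimal sketch using the `format` table function (the sample data is illustrative, and `0.0.0.0` is assumed to be the default here):
+
+```sql
+SET input_format_ipv4_default_on_conversion_error = 1;
+SELECT * FROM format(CSV, 'addr IPv4', 'not-an-address\n127.0.0.1\n');
+```
+
+The first row is expected to come back as `0.0.0.0` and the second as `127.0.0.1`; with the setting disabled, the first row would raise a conversion error.
+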
-## input_format_json_read_objects_as_strings {#input_format_json_read_objects_as_strings} +## input_format_ipv6_default_on_conversion_error {#input_format_ipv6_default_on_conversion_error} -Allow parsing JSON objects as strings in JSON input formats. +Type: Bool -Example: +Default value: 0 -```sql -SET input_format_json_read_objects_as_strings = 1; -CREATE TABLE test (id UInt64, obj String, date Date) ENGINE=Memory(); -INSERT INTO test FORMAT JSONEachRow {"id" : 1, "obj" : {"a" : 1, "b" : "Hello"}, "date" : "2020-01-01"}; -SELECT * FROM test; -``` +Deserialization of IPV6 will use default values instead of throwing exception on conversion error. -Result: +Disabled by default. -``` -┌─id─┬─obj──────────────────────┬───────date─┐ -│ 1 │ {"a" : 1, "b" : "Hello"} │ 2020-01-01 │ -└────┴──────────────────────────┴────────────┘ -``` +## input_format_json_compact_allow_variable_number_of_columns {#input_format_json_compact_allow_variable_number_of_columns} + +Type: Bool + +Default value: 0 + +Ignore extra columns in JSONCompact(EachRow) input (if file has more columns than expected) and treat missing fields in JSONCompact(EachRow) input as default values + +## input_format_json_defaults_for_missing_elements_in_named_tuple {#input_format_json_defaults_for_missing_elements_in_named_tuple} + +Type: Bool + +Default value: 1 + +Insert default values for missing elements in JSON object while parsing named tuple. +This setting works only when setting `input_format_json_named_tuples_as_objects` is enabled. Enabled by default. -## input_format_json_try_infer_named_tuples_from_objects {#input_format_json_try_infer_named_tuples_from_objects} +## input_format_json_empty_as_default {#input_format_json_empty_as_default} -If enabled, during schema inference ClickHouse will try to infer named Tuple from JSON objects. -The resulting named Tuple will contain all elements from all corresponding JSON objects from sample data. +Type: Bool -Example: +Default value: 0 -```sql -SET input_format_json_try_infer_named_tuples_from_objects = 1; -DESC format(JSONEachRow, '{"obj" : {"a" : 42, "b" : "Hello"}}, {"obj" : {"a" : 43, "c" : [1, 2, 3]}}, {"obj" : {"d" : {"e" : 42}}}') -``` +Treat empty fields in JSON input as default values. -Result: +## input_format_json_ignore_unknown_keys_in_named_tuple {#input_format_json_ignore_unknown_keys_in_named_tuple} -``` -┌─name─┬─type───────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ -│ obj │ Tuple(a Nullable(Int64), b Nullable(String), c Array(Nullable(Int64)), d Tuple(e Nullable(Int64))) │ │ │ │ │ │ -└──────┴────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ -``` +Type: Bool + +Default value: 1 + +Ignore unknown keys in json object for named tuples. Enabled by default. -## input_format_json_read_arrays_as_strings {#input_format_json_read_arrays_as_strings} +## input_format_json_ignore_unnecessary_fields {#input_format_json_ignore_unnecessary_fields} -Allow parsing JSON arrays as strings in JSON input formats. 
+Type: Bool -Example: +Default value: 1 -```sql -SET input_format_json_read_arrays_as_strings = 1; -SELECT arr, toTypeName(arr), JSONExtractArrayRaw(arr)[3] from format(JSONEachRow, 'arr String', '{"arr" : [1, "Hello", [1,2,3]]}'); -``` - -Result: -``` -┌─arr───────────────────┬─toTypeName(arr)─┬─arrayElement(JSONExtractArrayRaw(arr), 3)─┐ -│ [1, "Hello", [1,2,3]] │ String │ [1,2,3] │ -└───────────────────────┴─────────────────┴───────────────────────────────────────────┘ -``` - -Enabled by default. +Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields ## input_format_json_infer_incomplete_types_as_strings {#input_format_json_infer_incomplete_types_as_strings} +Type: Bool + +Default value: 1 + Allow to use String type for JSON keys that contain only `Null`/`{}`/`[]` in data sample during schema inference. In JSON formats any value can be read as String, and we can avoid errors like `Cannot determine type for column 'column_name' by first 25000 rows of data, most likely this column contains only Nulls or empty Arrays/Maps` during schema inference by using String type for keys with unknown types. @@ -501,15 +830,874 @@ Result: Enabled by default. +## input_format_json_max_depth {#input_format_json_max_depth} + +Type: UInt64 + +Default value: 1000 + +Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely. + +## input_format_json_named_tuples_as_objects {#input_format_json_named_tuples_as_objects} + +Type: Bool + +Default value: 1 + +Parse named tuple columns as JSON objects. + +Enabled by default. + +## input_format_json_read_arrays_as_strings {#input_format_json_read_arrays_as_strings} + +Type: Bool + +Default value: 1 + +Allow parsing JSON arrays as strings in JSON input formats. + +Example: + +```sql +SET input_format_json_read_arrays_as_strings = 1; +SELECT arr, toTypeName(arr), JSONExtractArrayRaw(arr)[3] from format(JSONEachRow, 'arr String', '{"arr" : [1, "Hello", [1,2,3]]}'); +``` + +Result: +``` +┌─arr───────────────────┬─toTypeName(arr)─┬─arrayElement(JSONExtractArrayRaw(arr), 3)─┐ +│ [1, "Hello", [1,2,3]] │ String │ [1,2,3] │ +└───────────────────────┴─────────────────┴───────────────────────────────────────────┘ +``` + +Enabled by default. + +## input_format_json_read_bools_as_numbers {#input_format_json_read_bools_as_numbers} + +Type: Bool + +Default value: 1 + +Allow parsing bools as numbers in JSON input formats. + +Enabled by default. + +## input_format_json_read_bools_as_strings {#input_format_json_read_bools_as_strings} + +Type: Bool + +Default value: 1 + +Allow parsing bools as strings in JSON input formats. + +Enabled by default. + +## input_format_json_read_numbers_as_strings {#input_format_json_read_numbers_as_strings} + +Type: Bool + +Default value: 1 + +Allow parsing numbers as strings in JSON input formats. + +Enabled by default. + +## input_format_json_read_objects_as_strings {#input_format_json_read_objects_as_strings} + +Type: Bool + +Default value: 1 + +Allow parsing JSON objects as strings in JSON input formats. 
+ +Example: + +```sql +SET input_format_json_read_objects_as_strings = 1; +CREATE TABLE test (id UInt64, obj String, date Date) ENGINE=Memory(); +INSERT INTO test FORMAT JSONEachRow {"id" : 1, "obj" : {"a" : 1, "b" : "Hello"}, "date" : "2020-01-01"}; +SELECT * FROM test; +``` + +Result: + +``` +┌─id─┬─obj──────────────────────┬───────date─┐ +│ 1 │ {"a" : 1, "b" : "Hello"} │ 2020-01-01 │ +└────┴──────────────────────────┴────────────┘ +``` + +Enabled by default. + +## input_format_json_throw_on_bad_escape_sequence {#input_format_json_throw_on_bad_escape_sequence} + +Type: Bool + +Default value: 1 + +Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data. + +Enabled by default. + +## input_format_json_try_infer_named_tuples_from_objects {#input_format_json_try_infer_named_tuples_from_objects} + +Type: Bool + +Default value: 1 + +If enabled, during schema inference ClickHouse will try to infer named Tuple from JSON objects. +The resulting named Tuple will contain all elements from all corresponding JSON objects from sample data. + +Example: + +```sql +SET input_format_json_try_infer_named_tuples_from_objects = 1; +DESC format(JSONEachRow, '{"obj" : {"a" : 42, "b" : "Hello"}}, {"obj" : {"a" : 43, "c" : [1, 2, 3]}}, {"obj" : {"d" : {"e" : 42}}}') +``` + +Result: + +``` +┌─name─┬─type───────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ +│ obj │ Tuple(a Nullable(Int64), b Nullable(String), c Array(Nullable(Int64)), d Tuple(e Nullable(Int64))) │ │ │ │ │ │ +└──────┴────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ +``` + +Enabled by default. + +## input_format_json_try_infer_numbers_from_strings {#input_format_json_try_infer_numbers_from_strings} + +Type: Bool + +Default value: 0 + +If enabled, during schema inference ClickHouse will try to infer numbers from string fields. +It can be useful if JSON data contains quoted UInt64 numbers. + +Disabled by default. + +## input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects {#input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects} + +Type: Bool + +Default value: 0 + +Use String type instead of an exception in case of ambiguous paths in JSON objects during named tuples inference + ## input_format_json_validate_types_from_metadata {#input_format_json_validate_types_from_metadata} +Type: Bool + +Default value: 1 + For JSON/JSONCompact/JSONColumnsWithMetadata input formats, if this setting is set to 1, the types from metadata in input data will be compared with the types of the corresponding columns from the table. Enabled by default. +## input_format_max_bytes_to_read_for_schema_inference {#input_format_max_bytes_to_read_for_schema_inference} + +Type: UInt64 + +Default value: 33554432 + +The maximum amount of data in bytes to read for automatic schema inference. + +## input_format_max_rows_to_read_for_schema_inference {#input_format_max_rows_to_read_for_schema_inference} + +Type: UInt64 + +Default value: 25000 + +The maximum rows of data to read for automatic schema inference. 
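+
+Both of these limits bound how much of the input is examined during automatic schema inference: reading stops once either the row limit or the byte limit is reached. A minimal sketch (the inline sample data is illustrative only):
+
+```sql
+SET input_format_max_rows_to_read_for_schema_inference = 100;
+DESC format(JSONEachRow, '{"id" : 1, "name" : "Alice"}\n{"id" : 2, "name" : "Bob"}');
+```
+
+Here only the first 100 rows (or `input_format_max_bytes_to_read_for_schema_inference` bytes, whichever is hit first) would be inspected when inferring the column types, which should come out as `Nullable(Int64)` for `id` and `Nullable(String)` for `name`.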
+ +## input_format_msgpack_number_of_columns {#input_format_msgpack_number_of_columns} + +Type: UInt64 + +Default value: 0 + +The number of columns in inserted MsgPack data. Used for automatic schema inference from data. + +## input_format_mysql_dump_map_column_names {#input_format_mysql_dump_map_column_names} + +Type: Bool + +Default value: 1 + +Match columns from table in MySQL dump and columns from ClickHouse table by names + +## input_format_mysql_dump_table_name {#input_format_mysql_dump_table_name} + +Type: String + +Default value: + +Name of the table in MySQL dump from which to read data + +## input_format_native_allow_types_conversion {#input_format_native_allow_types_conversion} + +Type: Bool + +Default value: 1 + +Allow data types conversion in Native input format + +## input_format_native_decode_types_in_binary_format {#input_format_native_decode_types_in_binary_format} + +Type: Bool + +Default value: 0 + +Read data types in binary format instead of type names in Native input format + +## input_format_null_as_default {#input_format_null_as_default} + +Type: Bool + +Default value: 1 + +Enables or disables the initialization of [NULL](../../sql-reference/syntax.md/#null-literal) fields with [default values](../../sql-reference/statements/create/table.md/#create-default-values), if data type of these fields is not [nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable). +If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. + +This setting is applicable for most input formats. + +For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. + +Possible values: + +- 0 — Inserting `NULL` into a not nullable column causes an exception. +- 1 — `NULL` fields are initialized with default column values. + +## input_format_orc_allow_missing_columns {#input_format_orc_allow_missing_columns} + +Type: Bool + +Default value: 1 + +Allow missing columns while reading ORC input formats + +## input_format_orc_case_insensitive_column_matching {#input_format_orc_case_insensitive_column_matching} + +Type: Bool + +Default value: 0 + +Ignore case when matching ORC columns with CH columns. + +## input_format_orc_filter_push_down {#input_format_orc_filter_push_down} + +Type: Bool + +Default value: 1 + +When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata. + +## input_format_orc_reader_time_zone_name {#input_format_orc_reader_time_zone_name} + +Type: String + +Default value: GMT + +The time zone name for ORC row reader, the default ORC row reader's time zone is GMT. + +## input_format_orc_row_batch_size {#input_format_orc_row_batch_size} + +Type: Int64 + +Default value: 100000 + +Batch size when reading ORC stripes. + +## input_format_orc_skip_columns_with_unsupported_types_in_schema_inference {#input_format_orc_skip_columns_with_unsupported_types_in_schema_inference} + +Type: Bool + +Default value: 0 + +Skip columns with unsupported types while schema inference for format ORC + +## input_format_orc_use_fast_decoder {#input_format_orc_use_fast_decoder} + +Type: Bool + +Default value: 1 + +Use a faster ORC decoder implementation. 
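+
+Several of the ORC reader settings above can be combined in a single query. A hypothetical sketch using the `file` table function (the file name and the `id` column are assumptions, not part of the reference):
+
+```sql
+SELECT count()
+FROM file('data.orc', 'ORC')
+WHERE id > 1000000
+SETTINGS input_format_orc_filter_push_down = 1, input_format_orc_row_batch_size = 50000;
+```
+
+With filter push-down enabled, stripes and row groups whose min/max statistics cannot match the `WHERE` condition may be skipped entirely.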
+ +## input_format_parquet_allow_missing_columns {#input_format_parquet_allow_missing_columns} + +Type: Bool + +Default value: 1 + +Allow missing columns while reading Parquet input formats + +## input_format_parquet_case_insensitive_column_matching {#input_format_parquet_case_insensitive_column_matching} + +Type: Bool + +Default value: 0 + +Ignore case when matching Parquet columns with CH columns. + +## input_format_parquet_filter_push_down {#input_format_parquet_filter_push_down} + +Type: Bool + +Default value: 1 + +When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata. + +## input_format_parquet_local_file_min_bytes_for_seek {#input_format_parquet_local_file_min_bytes_for_seek} + +Type: UInt64 + +Default value: 8192 + +Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format + +## input_format_parquet_max_block_size {#input_format_parquet_max_block_size} + +Type: UInt64 + +Default value: 65409 + +Max block size for parquet reader. + +## input_format_parquet_prefer_block_bytes {#input_format_parquet_prefer_block_bytes} + +Type: UInt64 + +Default value: 16744704 + +Average block bytes output by parquet reader + +## input_format_parquet_preserve_order {#input_format_parquet_preserve_order} + +Type: Bool + +Default value: 0 + +Avoid reordering rows when reading from Parquet files. Usually makes it much slower. + +## input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference {#input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference} + +Type: Bool + +Default value: 0 + +Skip columns with unsupported types while schema inference for format Parquet + +## input_format_parquet_use_native_reader {#input_format_parquet_use_native_reader} + +Type: Bool + +Default value: 0 + +When reading Parquet files, to use native reader instead of arrow reader. + +## input_format_protobuf_flatten_google_wrappers {#input_format_protobuf_flatten_google_wrappers} + +Type: Bool + +Default value: 0 + +Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls + +## input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference {#input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference} + +Type: Bool + +Default value: 0 + +Skip fields with unsupported types while schema inference for format Protobuf + +## input_format_record_errors_file_path {#input_format_record_errors_file_path} + +Type: String + +Default value: + +Path of the file used to record errors while reading text formats (CSV, TSV). + +## input_format_skip_unknown_fields {#input_format_skip_unknown_fields} + +Type: Bool + +Default value: 1 + +Enables or disables skipping insertion of extra data. + +When writing data, ClickHouse throws an exception if input data contain columns that do not exist in the target table. If skipping is enabled, ClickHouse does not insert extra data and does not throw an exception. 
+ +Supported formats: + +- [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) (and other JSON formats) +- [BSONEachRow](../../interfaces/formats.md/#bsoneachrow) (and other JSON formats) +- [TSKV](../../interfaces/formats.md/#tskv) +- All formats with suffixes WithNames/WithNamesAndTypes +- [MySQLDump](../../interfaces/formats.md/#mysqldump) +- [Native](../../interfaces/formats.md/#native) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +## input_format_try_infer_dates {#input_format_try_infer_dates} + +Type: Bool + +Default value: 1 + +If enabled, ClickHouse will try to infer type `Date` from string fields in schema inference for text formats. If all fields from a column in input data were successfully parsed as dates, the result type will be `Date`, if at least one field was not parsed as date, the result type will be `String`. + +Enabled by default. + +## input_format_try_infer_datetimes {#input_format_try_infer_datetimes} + +Type: Bool + +Default value: 1 + +If enabled, ClickHouse will try to infer type `DateTime64` from string fields in schema inference for text formats. If all fields from a column in input data were successfully parsed as datetimes, the result type will be `DateTime64`, if at least one field was not parsed as datetime, the result type will be `String`. + +Enabled by default. + +## input_format_try_infer_datetimes_only_datetime64 {#input_format_try_infer_datetimes_only_datetime64} + +Type: Bool + +Default value: 0 + +When input_format_try_infer_datetimes is enabled, infer only DateTime64 but not DateTime types + +## input_format_try_infer_exponent_floats {#input_format_try_infer_exponent_floats} + +Type: Bool + +Default value: 0 + +Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred) + +## input_format_try_infer_integers {#input_format_try_infer_integers} + +Type: Bool + +Default value: 1 + +If enabled, ClickHouse will try to infer integers instead of floats in schema inference for text formats. If all numbers in the column from input data are integers, the result type will be `Int64`, if at least one number is float, the result type will be `Float64`. + +Enabled by default. + +## input_format_try_infer_variants {#input_format_try_infer_variants} + +Type: Bool + +Default value: 0 + +If enabled, ClickHouse will try to infer type [`Variant`](../../sql-reference/data-types/variant.md) in schema inference for text formats when there is more than one possible type for column/array elements. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +## input_format_tsv_allow_variable_number_of_columns {#input_format_tsv_allow_variable_number_of_columns} + +Type: Bool + +Default value: 0 + +Ignore extra columns in TSV input (if file has more columns than expected) and treat missing fields in TSV input as default values + +## input_format_tsv_crlf_end_of_line {#input_format_tsv_crlf_end_of_line} + +Type: Bool + +Default value: 0 + +If it is set true, file function will read TSV format with \\r\\n instead of \\n. + +## input_format_tsv_detect_header {#input_format_tsv_detect_header} + +Type: Bool + +Default value: 1 + +Automatically detect header with names and types in TSV format + +## input_format_tsv_empty_as_default {#input_format_tsv_empty_as_default} + +Type: Bool + +Default value: 0 + +Treat empty fields in TSV input as default values. 
+ +## input_format_tsv_enum_as_number {#input_format_tsv_enum_as_number} + +Type: Bool + +Default value: 0 + +Treat inserted enum values in TSV formats as enum indices. + +## input_format_tsv_skip_first_lines {#input_format_tsv_skip_first_lines} + +Type: UInt64 + +Default value: 0 + +Skip specified number of lines at the beginning of data in TSV format + +## input_format_tsv_skip_trailing_empty_lines {#input_format_tsv_skip_trailing_empty_lines} + +Type: Bool + +Default value: 0 + +Skip trailing empty lines in TSV format + +## input_format_tsv_use_best_effort_in_schema_inference {#input_format_tsv_use_best_effort_in_schema_inference} + +Type: Bool + +Default value: 1 + +Use some tweaks and heuristics to infer schema in TSV format + +## input_format_values_accurate_types_of_literals {#input_format_values_accurate_types_of_literals} + +Type: Bool + +Default value: 1 + +For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues. + +## input_format_values_deduce_templates_of_expressions {#input_format_values_deduce_templates_of_expressions} + +Type: Bool + +Default value: 1 + +For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows. + +## input_format_values_interpret_expressions {#input_format_values_interpret_expressions} + +Type: Bool + +Default value: 1 + +For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression. + +## input_format_with_names_use_header {#input_format_with_names_use_header} + +Type: Bool + +Default value: 1 + +Enables or disables checking the column order when inserting data. + +To improve insert performance, we recommend disabling this check if you are sure that the column order of the input data is the same as in the target table. + +Supported formats: + +- [CSVWithNames](../../interfaces/formats.md/#csvwithnames) +- [CSVWithNamesAndTypes](../../interfaces/formats.md/#csvwithnamesandtypes) +- [TabSeparatedWithNames](../../interfaces/formats.md/#tabseparatedwithnames) +- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md/#tabseparatedwithnamesandtypes) +- [JSONCompactEachRowWithNames](../../interfaces/formats.md/#jsoncompacteachrowwithnames) +- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompacteachrowwithnamesandtypes) +- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md/#jsoncompactstringseachrowwithnames) +- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompactstringseachrowwithnamesandtypes) +- [RowBinaryWithNames](../../interfaces/formats.md/#rowbinarywithnames) +- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md/#rowbinarywithnamesandtypes) +- [CustomSeparatedWithNames](../../interfaces/formats.md/#customseparatedwithnames) +- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md/#customseparatedwithnamesandtypes) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +## input_format_with_types_use_header {#input_format_with_types_use_header} + +Type: Bool + +Default value: 1 + +Controls whether format parser should check if data types from the input data match data types from the target table. 
+ +Supported formats: + +- [CSVWithNamesAndTypes](../../interfaces/formats.md/#csvwithnamesandtypes) +- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md/#tabseparatedwithnamesandtypes) +- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompacteachrowwithnamesandtypes) +- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompactstringseachrowwithnamesandtypes) +- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md/#rowbinarywithnamesandtypes-rowbinarywithnamesandtypes) +- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md/#customseparatedwithnamesandtypes) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +## insert_distributed_one_random_shard {#insert_distributed_one_random_shard} + +Type: Bool + +Default value: 0 + +Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table when there is no distributed key. + +By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. + +Possible values: + +- 0 — Insertion is rejected if there are multiple shards and no distributed key is given. +- 1 — Insertion is done randomly among all available shards when no distributed key is given. + +## interval_output_format {#interval_output_format} + +Type: IntervalOutputFormat + +Default value: numeric + +Allows choosing different output formats of the text representation of interval types. + +Possible values: + +- `kusto` - KQL-style output format. + + ClickHouse outputs intervals in [KQL format](https://learn.microsoft.com/en-us/dotnet/standard/base-types/standard-timespan-format-strings#the-constant-c-format-specifier). For example, `toIntervalDay(2)` would be formatted as `2.00:00:00`. Please note that for interval types of varying length (ie. `IntervalMonth` and `IntervalYear`) the average number of seconds per interval is taken into account. + +- `numeric` - Numeric output format. + + ClickHouse outputs intervals as their underlying numeric representation. For example, `toIntervalDay(2)` would be formatted as `2`. + +See also: + +- [Interval](../../sql-reference/data-types/special-data-types/interval.md) + +## output_format_arrow_compression_method {#output_format_arrow_compression_method} + +Type: ArrowCompression + +Default value: lz4_frame + +Compression method for Arrow output format. Supported codecs: lz4_frame, zstd, none (uncompressed) + +## output_format_arrow_fixed_string_as_fixed_byte_array {#output_format_arrow_fixed_string_as_fixed_byte_array} + +Type: Bool + +Default value: 1 + +Use Arrow FIXED_SIZE_BINARY type instead of Binary for FixedString columns. 
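+
+The Arrow output settings are applied when exporting query results in the Arrow format. A minimal sketch (the output file name is arbitrary; run from a command-line client):
+
+```sql
+SET output_format_arrow_compression_method = 'zstd';
+SELECT toFixedString('ab', 2) AS code, number AS id
+FROM numbers(3)
+INTO OUTFILE 'data.arrow'
+FORMAT Arrow;
+```
+
+With `output_format_arrow_fixed_string_as_fixed_byte_array` left at its default of 1, the `code` column is written as Arrow FIXED_SIZE_BINARY, and the data is written with the `zstd` codec selected above.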
+ +## output_format_arrow_low_cardinality_as_dictionary {#output_format_arrow_low_cardinality_as_dictionary} + +Type: Bool + +Default value: 0 + +Enable output LowCardinality type as Dictionary Arrow type + +## output_format_arrow_string_as_string {#output_format_arrow_string_as_string} + +Type: Bool + +Default value: 1 + +Use Arrow String type instead of Binary for String columns + +## output_format_arrow_use_64_bit_indexes_for_dictionary {#output_format_arrow_use_64_bit_indexes_for_dictionary} + +Type: Bool + +Default value: 0 + +Always use 64 bit integers for dictionary indexes in Arrow format + +## output_format_arrow_use_signed_indexes_for_dictionary {#output_format_arrow_use_signed_indexes_for_dictionary} + +Type: Bool + +Default value: 1 + +Use signed integers for dictionary indexes in Arrow format + +## output_format_avro_codec {#output_format_avro_codec} + +Type: String + +Default value: + +Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'. + +## output_format_avro_rows_in_file {#output_format_avro_rows_in_file} + +Type: UInt64 + +Default value: 1 + +Max rows in a file (if permitted by storage) + +## output_format_avro_string_column_pattern {#output_format_avro_string_column_pattern} + +Type: String + +Default value: + +For Avro format: regexp of String columns to select as AVRO string. + +## output_format_avro_sync_interval {#output_format_avro_sync_interval} + +Type: UInt64 + +Default value: 16384 + +Sync interval in bytes. + +## output_format_binary_encode_types_in_binary_format {#output_format_binary_encode_types_in_binary_format} + +Type: Bool + +Default value: 0 + +Write data types in binary format instead of type names in RowBinaryWithNamesAndTypes output format + +## output_format_bson_string_as_string {#output_format_bson_string_as_string} + +Type: Bool + +Default value: 0 + +Use BSON String type instead of Binary for String columns. + +## output_format_csv_crlf_end_of_line {#output_format_csv_crlf_end_of_line} + +Type: Bool + +Default value: 0 + +If it is set true, end of line in CSV format will be \\r\\n instead of \\n. + +## output_format_csv_serialize_tuple_into_separate_columns {#output_format_csv_serialize_tuple_into_separate_columns} + +Type: Bool + +Default value: 1 + +If it set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost) + +## output_format_decimal_trailing_zeros {#output_format_decimal_trailing_zeros} + +Type: Bool + +Default value: 0 + +Output trailing zeros when printing Decimal values. E.g. 1.230000 instead of 1.23. + +Disabled by default. + +## output_format_enable_streaming {#output_format_enable_streaming} + +Type: Bool + +Default value: 0 + +Enable streaming in output formats that support it. + +Disabled by default. + +## output_format_json_array_of_rows {#output_format_json_array_of_rows} + +Type: Bool + +Default value: 0 + +Enables the ability to output all rows as a JSON array in the [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) format. + +Possible values: + +- 1 — ClickHouse outputs all rows as an array, each row in the `JSONEachRow` format. +- 0 — ClickHouse outputs each row separately in the `JSONEachRow` format. 
+ +**Example of a query with the enabled setting** + +Query: + +```sql +SET output_format_json_array_of_rows = 1; +SELECT number FROM numbers(3) FORMAT JSONEachRow; +``` + +Result: + +```text +[ +{"number":"0"}, +{"number":"1"}, +{"number":"2"} +] +``` + +**Example of a query with the disabled setting** + +Query: + +```sql +SET output_format_json_array_of_rows = 0; +SELECT number FROM numbers(3) FORMAT JSONEachRow; +``` + +Result: + +```text +{"number":"0"} +{"number":"1"} +{"number":"2"} +``` + +## output_format_json_escape_forward_slashes {#output_format_json_escape_forward_slashes} + +Type: Bool + +Default value: 1 + +Controls escaping forward slashes for string outputs in JSON output format. This is intended for compatibility with JavaScript. Don't confuse with backslashes that are always escaped. + +Enabled by default. + +## output_format_json_named_tuples_as_objects {#output_format_json_named_tuples_as_objects} + +Type: Bool + +Default value: 1 + +Serialize named tuple columns as JSON objects. + +Enabled by default. + +## output_format_json_quote_64bit_floats {#output_format_json_quote_64bit_floats} + +Type: Bool + +Default value: 0 + +Controls quoting of 64-bit [floats](../../sql-reference/data-types/float.md) when they are output in JSON* formats. + +Disabled by default. + ## output_format_json_quote_64bit_integers {#output_format_json_quote_64bit_integers} +Type: Bool + +Default value: 1 + Controls quoting of 64-bit or bigger [integers](../../sql-reference/data-types/int-uint.md) (like `UInt64` or `Int128`) when they are output in a [JSON](../../interfaces/formats.md/#json) format. Such integers are enclosed in quotes by default. This behavior is compatible with most JavaScript implementations. @@ -518,16 +1706,22 @@ Possible values: - 0 — Integers are output without quotes. - 1 — Integers are enclosed in quotes. -Default value: 1. +## output_format_json_quote_decimals {#output_format_json_quote_decimals} -## output_format_json_quote_64bit_floats {#output_format_json_quote_64bit_floats} +Type: Bool -Controls quoting of 64-bit [floats](../../sql-reference/data-types/float.md) when they are output in JSON* formats. +Default value: 0 + +Controls quoting of decimals in JSON output formats. Disabled by default. ## output_format_json_quote_denormals {#output_format_json_quote_denormals} +Type: Bool + +Default value: 0 + Enables `+nan`, `-nan`, `+inf`, `-inf` outputs in [JSON](../../interfaces/formats.md/#json) output format. Possible values: @@ -535,8 +1729,6 @@ Possible values: - 0 — Disabled. - 1 — Enabled. -Default value: 0. - **Example** Consider the following table `account_orders`: @@ -625,1117 +1817,189 @@ When `output_format_json_quote_denormals = 1`, the query returns: } ``` -## output_format_json_quote_decimals {#output_format_json_quote_decimals} +## output_format_json_skip_null_value_in_named_tuples {#output_format_json_skip_null_value_in_named_tuples} -Controls quoting of decimals in JSON output formats. +Type: Bool -Disabled by default. +Default value: 0 -## output_format_json_escape_forward_slashes {#output_format_json_escape_forward_slashes} - -Controls escaping forward slashes for string outputs in JSON output format. This is intended for compatibility with JavaScript. Don't confuse with backslashes that are always escaped. - -Enabled by default. - -## output_format_json_named_tuples_as_objects {#output_format_json_named_tuples_as_objects} - -Serialize named tuple columns as JSON objects. - -Enabled by default. 
- -## input_format_json_named_tuples_as_objects {#input_format_json_named_tuples_as_objects} - -Parse named tuple columns as JSON objects. - -Enabled by default. - -## input_format_json_ignore_unknown_keys_in_named_tuple {#input_format_json_ignore_unknown_keys_in_named_tuple} - -Ignore unknown keys in json object for named tuples. - -Enabled by default. - -## input_format_json_defaults_for_missing_elements_in_named_tuple {#input_format_json_defaults_for_missing_elements_in_named_tuple} - -Insert default values for missing elements in JSON object while parsing named tuple. -This setting works only when setting `input_format_json_named_tuples_as_objects` is enabled. - -Enabled by default. - -## input_format_json_throw_on_bad_escape_sequence {#input_format_json_throw_on_bad_escape_sequence} - -Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data. - -Enabled by default. - -## output_format_json_array_of_rows {#output_format_json_array_of_rows} - -Enables the ability to output all rows as a JSON array in the [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) format. - -Possible values: - -- 1 — ClickHouse outputs all rows as an array, each row in the `JSONEachRow` format. -- 0 — ClickHouse outputs each row separately in the `JSONEachRow` format. - -Default value: `0`. - -**Example of a query with the enabled setting** - -Query: - -```sql -SET output_format_json_array_of_rows = 1; -SELECT number FROM numbers(3) FORMAT JSONEachRow; -``` - -Result: - -```text -[ -{"number":"0"}, -{"number":"1"}, -{"number":"2"} -] -``` - -**Example of a query with the disabled setting** - -Query: - -```sql -SET output_format_json_array_of_rows = 0; -SELECT number FROM numbers(3) FORMAT JSONEachRow; -``` - -Result: - -```text -{"number":"0"} -{"number":"1"} -{"number":"2"} -``` +Skip key value pairs with null value when serialize named tuple columns as JSON objects. It is only valid when output_format_json_named_tuples_as_objects is true. ## output_format_json_validate_utf8 {#output_format_json_validate_utf8} +Type: Bool + +Default value: 0 + Controls validation of UTF-8 sequences in JSON output formats, doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate UTF-8. Disabled by default. -## format_json_object_each_row_column_for_object_name {#format_json_object_each_row_column_for_object_name} +## output_format_markdown_escape_special_characters {#output_format_markdown_escape_special_characters} -The name of column that will be used for storing/writing object names in [JSONObjectEachRow](../../interfaces/formats.md/#jsonobjecteachrow) format. -Column type should be String. If value is empty, default names `row_{i}`will be used for object names. +Type: Bool -Default value: ''. +Default value: 0 -### input_format_json_compact_allow_variable_number_of_columns {#input_format_json_compact_allow_variable_number_of_columns} +Escape special characters in Markdown -Allow variable number of columns in rows in JSONCompact/JSONCompactEachRow input formats. -Ignore extra columns in rows with more columns than expected and treat missing columns as default values. +## output_format_msgpack_uuid_representation {#output_format_msgpack_uuid_representation} -Disabled by default. +Type: MsgPackUUIDRepresentation -### output_format_markdown_escape_special_characters {#output_format_markdown_escape_special_characters} +Default value: ext -When enabled, escape special characters in Markdown. 
+The way how to output UUID in MsgPack format. -[Common Mark](https://spec.commonmark.org/0.30/#example-12) defines the following special characters that can be escaped by \: +## output_format_native_encode_types_in_binary_format {#output_format_native_encode_types_in_binary_format} -``` -! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ -``` +Type: Bool -Possible values: +Default value: 0 -+ 0 — Disable. -+ 1 — Enable. +Write data types in binary format instead of type names in Native output format -Default value: 0. +## output_format_orc_compression_method {#output_format_orc_compression_method} -### input_format_json_empty_as_default {#input_format_json_empty_as_default} +Type: ORCCompression -When enabled, replace empty input fields in JSON with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. +Default value: zstd -Possible values: +Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed) -+ 0 — Disable. -+ 1 — Enable. +## output_format_orc_dictionary_key_size_threshold {#output_format_orc_dictionary_key_size_threshold} -Default value: 0. +Type: Double -## TSV format settings {#tsv-format-settings} +Default value: 0 -### input_format_tsv_empty_as_default {#input_format_tsv_empty_as_default} +For a string column in ORC output format, if the number of distinct values is greater than this fraction of the total number of non-null rows, turn off dictionary encoding. Otherwise dictionary encoding is enabled -When enabled, replace empty input fields in TSV with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. +## output_format_orc_row_index_stride {#output_format_orc_row_index_stride} -Disabled by default. +Type: UInt64 -### input_format_tsv_enum_as_number {#input_format_tsv_enum_as_number} +Default value: 10000 -When enabled, always treat enum values as enum ids for TSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing. +Target row index stride in ORC output format -Possible values: +## output_format_orc_string_as_string {#output_format_orc_string_as_string} -- 0 — Enum values are parsed as values or as enum IDs. -- 1 — Enum values are parsed only as enum IDs. +Type: Bool -Default value: 0. +Default value: 1 -**Example** +Use ORC String type instead of Binary for String columns -Consider the table: +## output_format_parquet_batch_size {#output_format_parquet_batch_size} -```sql -CREATE TABLE table_with_enum_column_for_tsv_insert (Id Int32,Value Enum('first' = 1, 'second' = 2)) ENGINE=Memory(); -``` +Type: UInt64 -When the `input_format_tsv_enum_as_number` setting is enabled: +Default value: 1024 -Query: +Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs. -```sql -SET input_format_tsv_enum_as_number = 1; -INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2; -SELECT * FROM table_with_enum_column_for_tsv_insert; -``` +## output_format_parquet_compliant_nested_types {#output_format_parquet_compliant_nested_types} -Result: +Type: Bool -```text -┌──Id─┬─Value──┐ -│ 102 │ second │ -└─────┴────────┘ -``` +Default value: 1 -Query: +In parquet file schema, use name 'element' instead of 'item' for list elements. This is a historical artifact of Arrow library implementation. Generally increases compatibility, except perhaps with some old versions of Arrow. 
-```sql -SET input_format_tsv_enum_as_number = 1; -INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first'; -``` +## output_format_parquet_compression_method {#output_format_parquet_compression_method} -throws an exception. +Type: ParquetCompression -When the `input_format_tsv_enum_as_number` setting is disabled: +Default value: zstd -Query: +Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed) -```sql -SET input_format_tsv_enum_as_number = 0; -INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2; -INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first'; -SELECT * FROM table_with_enum_column_for_tsv_insert; -``` +## output_format_parquet_data_page_size {#output_format_parquet_data_page_size} -Result: +Type: UInt64 -```text -┌──Id─┬─Value──┐ -│ 102 │ second │ -└─────┴────────┘ -┌──Id─┬─Value──┐ -│ 103 │ first │ -└─────┴────────┘ -``` +Default value: 1048576 -### input_format_tsv_use_best_effort_in_schema_inference {#input_format_tsv_use_best_effort_in_schema_inference} +Target page size in bytes, before compression. -Use some tweaks and heuristics to infer schema in TSV format. If disabled, all fields will be treated as String. +## output_format_parquet_fixed_string_as_fixed_byte_array {#output_format_parquet_fixed_string_as_fixed_byte_array} -Enabled by default. +Type: Bool -### input_format_tsv_skip_first_lines {#input_format_tsv_skip_first_lines} +Default value: 1 -The number of lines to skip at the beginning of data in TSV input format. +Use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary for FixedString columns. -Default value: `0`. +## output_format_parquet_parallel_encoding {#output_format_parquet_parallel_encoding} -### output_format_tsv_crlf_end_of_line {#output_format_tsv_crlf_end_of_line} +Type: Bool -Use DOS/Windows-style line separator (CRLF) in TSV instead of Unix style (LF). +Default value: 1 -Disabled by default. +Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder. -### input_format_tsv_crlf_end_of_line {#input_format_tsv_crlf_end_of_line} +## output_format_parquet_row_group_size {#output_format_parquet_row_group_size} -Use DOS/Windows-style line separator (CRLF) for TSV input files instead of Unix style (LF). +Type: UInt64 -Disabled by default. +Default value: 1000000 -### format_tsv_null_representation {#format_tsv_null_representation} +Target row group size in rows. -Defines the representation of `NULL` for [TSV](../../interfaces/formats.md/#tabseparated) output and input formats. User can set any string as a value, for example, `My NULL`. +## output_format_parquet_row_group_size_bytes {#output_format_parquet_row_group_size_bytes} -Default value: `\N`. +Type: UInt64 -**Examples** +Default value: 536870912 -Query +Target row group size in bytes, before compression. -```sql -SELECT * FROM tsv_custom_null FORMAT TSV; -``` +## output_format_parquet_string_as_string {#output_format_parquet_string_as_string} -Result +Type: Bool -```text -788 -\N -\N -``` - -Query - -```sql -SET format_tsv_null_representation = 'My NULL'; -SELECT * FROM tsv_custom_null FORMAT TSV; -``` - -Result - -```text -788 -My NULL -My NULL -``` - -### input_format_tsv_skip_trailing_empty_lines {input_format_tsv_skip_trailing_empty_lines} - -When enabled, trailing empty lines at the end of TSV file will be skipped. - -Disabled by default. 
- -### input_format_tsv_allow_variable_number_of_columns {#input_format_tsv_allow_variable_number_of_columns} - -Allow variable number of columns in rows in TSV input format. -Ignore extra columns in rows with more columns than expected and treat missing columns as default values. - -Disabled by default. - -## CSV format settings {#csv-format-settings} - -### format_csv_delimiter {#format_csv_delimiter} - -The character is interpreted as a delimiter in the CSV data. - -Default value: `,`. - -### format_csv_allow_single_quotes {#format_csv_allow_single_quotes} - -If it is set to true, allow strings in single quotes. - -Disabled by default. - -### format_csv_allow_double_quotes {#format_csv_allow_double_quotes} - -If it is set to true, allow strings in double quotes. - -Enabled by default. - -### output_format_csv_crlf_end_of_line {#output_format_csv_crlf_end_of_line} - -Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF). - -Disabled by default. - -### input_format_csv_allow_cr_end_of_line {#input_format_csv_allow_cr_end_of_line} - -If it is set true, CR(\\r) will be allowed at end of line not followed by LF(\\n) - -Disabled by default. - -### input_format_csv_enum_as_number {#input_format_csv_enum_as_number} - -When enabled, always treat enum values as enum ids for CSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing. - -Possible values: - -- 0 — Enum values are parsed as values or as enum IDs. -- 1 — Enum values are parsed only as enum IDs. - -Default value: 0. - -**Examples** - -Consider the table: - -```sql -CREATE TABLE table_with_enum_column_for_csv_insert (Id Int32,Value Enum('first' = 1, 'second' = 2)) ENGINE=Memory(); -``` - -When the `input_format_csv_enum_as_number` setting is enabled: - -Query: - -```sql -SET input_format_csv_enum_as_number = 1; -INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2 -``` - -Result: - -```text -┌──Id─┬─Value──┐ -│ 102 │ second │ -└─────┴────────┘ -``` - -Query: - -```sql -SET input_format_csv_enum_as_number = 1; -INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first' -``` - -throws an exception. - -When the `input_format_csv_enum_as_number` setting is disabled: - -Query: - -```sql -SET input_format_csv_enum_as_number = 0; -INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2 -INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first' -SELECT * FROM table_with_enum_column_for_csv_insert; -``` - -Result: - -```text -┌──Id─┬─Value──┐ -│ 102 │ second │ -└─────┴────────┘ -┌──Id─┬─Value─┐ -│ 103 │ first │ -└─────┴───────┘ -``` - -### input_format_csv_arrays_as_nested_csv {#input_format_csv_arrays_as_nested_csv} - -When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted. - -Disabled by default. - -### input_format_csv_empty_as_default {#input_format_csv_empty_as_default} - -When enabled, replace empty input fields in CSV with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. - -Enabled by default. - -### input_format_csv_use_best_effort_in_schema_inference {#input_format_csv_use_best_effort_in_schema_inference} - -Use some tweaks and heuristics to infer schema in CSV format. If disabled, all fields will be treated as String. - -Enabled by default. 
- -### input_format_csv_skip_first_lines {#input_format_csv_skip_first_lines} - -The number of lines to skip at the beginning of data in CSV input format. - -Default value: `0`. - -### format_csv_null_representation {#format_csv_null_representation} - -Defines the representation of `NULL` for [CSV](../../interfaces/formats.md/#csv) output and input formats. User can set any string as a value, for example, `My NULL`. - -Default value: `\N`. - -**Examples** - -Query - -```sql -SELECT * from csv_custom_null FORMAT CSV; -``` - -Result - -```text -788 -\N -\N -``` - -Query - -```sql -SET format_csv_null_representation = 'My NULL'; -SELECT * FROM csv_custom_null FORMAT CSV; -``` - -Result - -```text -788 -My NULL -My NULL -``` - -### input_format_csv_skip_trailing_empty_lines {input_format_csv_skip_trailing_empty_lines} - -When enabled, trailing empty lines at the end of CSV file will be skipped. - -Disabled by default. - -### input_format_csv_trim_whitespaces {#input_format_csv_trim_whitespaces} - -Trims spaces and tabs in non-quoted CSV strings. - -Default value: `true`. - -**Examples** - -Query - -```bash -echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=true -``` - -Result - -```text -"string" -``` - -Query - -```bash -echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_trim_whitespaces=false -``` - -Result - -```text -" string " -``` - -### input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns} - -Allow variable number of columns in rows in CSV input format. -Ignore extra columns in rows with more columns than expected and treat missing columns as default values. - -Disabled by default. - -### input_format_csv_allow_whitespace_or_tab_as_delimiter {#input_format_csv_allow_whitespace_or_tab_as_delimiter} - -Allow to use whitespace or tab as field delimiter in CSV strings. - -Default value: `false`. - -**Examples** - -Query - -```bash -echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_allow_whitespace_or_tab_as_delimiter=true --format_csv_delimiter=' ' -``` - -Result - -```text -a b -``` - -Query - -```bash -echo 'a b' | ./clickhouse local -q "select * from table FORMAT CSV" --input-format="CSV" --input_format_csv_allow_whitespace_or_tab_as_delimiter=true --format_csv_delimiter='\t' -``` - -Result - -```text -a b -``` - -### input_format_csv_use_default_on_bad_values {#input_format_csv_use_default_on_bad_values} - -Allow to set default value to column when CSV field deserialization failed on bad value - -Default value: `false`. - -**Examples** - -Query - -```bash -./clickhouse local -q "create table test_tbl (x String, y UInt32, z Date) engine=MergeTree order by x" -echo 'a,b,c' | ./clickhouse local -q "INSERT INTO test_tbl SETTINGS input_format_csv_use_default_on_bad_values=true FORMAT CSV" -./clickhouse local -q "select * from test_tbl" -``` - -Result - -```text -a 0 1971-01-01 -``` - -## input_format_csv_try_infer_numbers_from_strings {#input_format_csv_try_infer_numbers_from_strings} - -If enabled, during schema inference ClickHouse will try to infer numbers from string fields. -It can be useful if CSV data contains quoted UInt64 numbers. - -Disabled by default. 
- -## Values format settings {#values-format-settings} - -### input_format_values_interpret_expressions {#input_format_values_interpret_expressions} - -Enables or disables the full SQL parser if the fast stream parser can’t parse the data. This setting is used only for the [Values](../../interfaces/formats.md/#data-format-values) format at the data insertion. For more information about syntax parsing, see the [Syntax](../../sql-reference/syntax.md) section. - -Possible values: - -- 0 — Disabled. - - In this case, you must provide formatted data. See the [Formats](../../interfaces/formats.md) section. - -- 1 — Enabled. - - In this case, you can use an SQL expression as a value, but data insertion is much slower this way. If you insert only formatted data, then ClickHouse behaves as if the setting value is 0. - -Default value: 1. - -Example of Use - -Insert the [DateTime](../../sql-reference/data-types/datetime.md) type value with the different settings. - -``` sql -SET input_format_values_interpret_expressions = 0; -INSERT INTO datetime_t VALUES (now()) -``` - -``` text -Exception on client: -Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row 1) -``` - -``` sql -SET input_format_values_interpret_expressions = 1; -INSERT INTO datetime_t VALUES (now()) -``` - -``` text -Ok. -``` - -The last query is equivalent to the following: - -``` sql -SET input_format_values_interpret_expressions = 0; -INSERT INTO datetime_t SELECT now() -``` - -``` text -Ok. -``` - -### input_format_values_deduce_templates_of_expressions {#input_format_values_deduce_templates_of_expressions} - -Enables or disables template deduction for SQL expressions in [Values](../../interfaces/formats.md/#data-format-values) format. It allows parsing and interpreting expressions in `Values` much faster if expressions in consecutive rows have the same structure. ClickHouse tries to deduce the template of an expression, parse the following rows using this template and evaluate the expression on a batch of successfully parsed rows. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 1. - -For the following query: - -``` sql -INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (upper('Values')), ... -``` - -- If `input_format_values_interpret_expressions=1` and `format_values_deduce_templates_of_expressions=0`, expressions are interpreted separately for each row (this is very slow for large number of rows). -- If `input_format_values_interpret_expressions=0` and `format_values_deduce_templates_of_expressions=1`, expressions in the first, second and third rows are parsed using template `lower(String)` and interpreted together, expression in the forth row is parsed with another template (`upper(String)`). -- If `input_format_values_interpret_expressions=1` and `format_values_deduce_templates_of_expressions=1`, the same as in previous case, but also allows fallback to interpreting expressions separately if it’s not possible to deduce template. - -### input_format_values_accurate_types_of_literals {#input_format_values_accurate_types_of_literals} - -This setting is used only when `input_format_values_deduce_templates_of_expressions = 1`. Expressions for some column may have the same structure, but contain numeric literals of different types, e.g. - -``` sql -(..., abs(0), ...), -- UInt64 literal -(..., abs(3.141592654), ...), -- Float64 literal -(..., abs(-1), ...), -- Int64 literal -``` - -Possible values: - -- 0 — Disabled. 
- - In this case, ClickHouse may use a more general type for some literals (e.g., `Float64` or `Int64` instead of `UInt64` for `42`), but it may cause overflow and precision issues. - -- 1 — Enabled. - - In this case, ClickHouse checks the actual type of literal and uses an expression template of the corresponding type. In some cases, it may significantly slow down expression evaluation in `Values`. - -Default value: 1. - -## Arrow format settings {#arrow-format-settings} - -### input_format_arrow_case_insensitive_column_matching {#input_format_arrow_case_insensitive_column_matching} - -Ignore case when matching Arrow column names with ClickHouse column names. - -Disabled by default. - -### input_format_arrow_allow_missing_columns {#input_format_arrow_allow_missing_columns} - -While importing data, when column is not found in schema default value will be used instead of error. - -Disabled by default. - -### input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference {#input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference} - -Allow skipping columns with unsupported types while schema inference for format Arrow. - -Disabled by default. - -### output_format_arrow_low_cardinality_as_dictionary {#output_format_arrow_low_cardinality_as_dictionary} - -Allows to convert the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) type to the `DICTIONARY` type of the [Arrow](../../interfaces/formats.md/#data-format-arrow) format for `SELECT` queries. - -Possible values: - -- 0 — The `LowCardinality` type is not converted to the `DICTIONARY` type. -- 1 — The `LowCardinality` type is converted to the `DICTIONARY` type. - -Default value: `0`. - -### output_format_arrow_use_signed_indexes_for_dictionary {#output_format_arrow_use_signed_indexes_for_dictionary} - -Use signed integer types instead of unsigned in `DICTIONARY` type of the [Arrow](../../interfaces/formats.md/#data-format-arrow) format during [LowCardinality](../../sql-reference/data-types/lowcardinality.md) output when `output_format_arrow_low_cardinality_as_dictionary` is enabled. - -Possible values: - -- 0 — Unsigned integer types are used for indexes in `DICTIONARY` type. -- 1 — Signed integer types are used for indexes in `DICTIONARY` type. - -Default value: `1`. - -### output_format_arrow_use_64_bit_indexes_for_dictionary {#output_format_arrow_use_64_bit_indexes_for_dictionary} - -Use 64-bit integer type in `DICTIONARY` type of the [Arrow](../../interfaces/formats.md/#data-format-arrow) format during [LowCardinality](../../sql-reference/data-types/lowcardinality.md) output when `output_format_arrow_low_cardinality_as_dictionary` is enabled. - -Possible values: - -- 0 — Type for indexes in `DICTIONARY` type is determined automatically. -- 1 — 64-bit integer type is used for indexes in `DICTIONARY` type. - -Default value: `0`. - -### output_format_arrow_string_as_string {#output_format_arrow_string_as_string} - -Use Arrow String type instead of Binary for String columns. - -Disabled by default. - -### output_format_arrow_fixed_string_as_fixed_byte_array (#output_format_arrow_fixed_string_as_fixed_byte_array) - -Use Arrow FIXED_SIZE_BINARY type instead of Binary/String for FixedString columns. - -Enabled by default. - -### output_format_arrow_compression_method {#output_format_arrow_compression_method} - -Compression method used in output Arrow format. Supported codecs: `lz4_frame`, `zstd`, `none` (uncompressed) - -Default value: `lz4_frame`. 
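+
+**Example**
+
+A minimal sketch of choosing a codec for an Arrow export (assumes a hypothetical table `events`; any existing table works):
+
+```sql
+SET output_format_arrow_compression_method = 'zstd';
+
+-- write a zstd-compressed Arrow file from clickhouse-client or clickhouse-local
+SELECT * FROM events INTO OUTFILE 'events.arrow' FORMAT Arrow;
+```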
- -## ORC format settings {#orc-format-settings} - -### input_format_orc_row_batch_size {#input_format_orc_row_batch_size} - -Batch size when reading ORC stripes. - -Default value: `100'000` - -### input_format_orc_case_insensitive_column_matching {#input_format_orc_case_insensitive_column_matching} - -Ignore case when matching ORC column names with ClickHouse column names. - -Disabled by default. - -### input_format_orc_allow_missing_columns {#input_format_orc_allow_missing_columns} - -While importing data, when column is not found in schema default value will be used instead of error. - -Disabled by default. - -### input_format_orc_skip_columns_with_unsupported_types_in_schema_inference {#input_format_orc_skip_columns_with_unsupported_types_in_schema_inference} - -Allow skipping columns with unsupported types while schema inference for format Arrow. - -Disabled by default. - -### output_format_orc_string_as_string {#output_format_orc_string_as_string} - -Use ORC String type instead of Binary for String columns. - -Disabled by default. - -### output_format_orc_compression_method {#output_format_orc_compression_method} - -Compression method used in output ORC format. Supported codecs: `lz4`, `snappy`, `zlib`, `zstd`, `none` (uncompressed) - -Default value: `none`. - -## Parquet format settings {#parquet-format-settings} - -### input_format_parquet_case_insensitive_column_matching {#input_format_parquet_case_insensitive_column_matching} - -Ignore case when matching Parquet column names with ClickHouse column names. - -Disabled by default. - -### output_format_parquet_row_group_size {#output_format_parquet_row_group_size} - -Row group size in rows. - -Default value: `1'000'000`. - -### input_format_parquet_allow_missing_columns {#input_format_parquet_allow_missing_columns} - -While importing data, when column is not found in schema default value will be used instead of error. - -Enabled by default. - -### input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference {#input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference} - -Allow skipping columns with unsupported types while schema inference for format Parquet. - -Disabled by default. - -### input_format_parquet_local_file_min_bytes_for_seek {#input_format_parquet_local_file_min_bytes_for_seek} - -min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format. - -Default value - `8192`. - -### output_format_parquet_string_as_string {#output_format_parquet_string_as_string} +Default value: 1 Use Parquet String type instead of Binary for String columns. -Disabled by default. +## output_format_parquet_use_custom_encoder {#output_format_parquet_use_custom_encoder} -### output_format_parquet_fixed_string_as_fixed_byte_array (#output_format_parquet_fixed_string_as_fixed_byte_array) +Type: Bool -Use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary/String for FixedString columns. +Default value: 1 -Enabled by default. +Use a faster Parquet encoder implementation. -### output_format_parquet_version {#output_format_parquet_version} +## output_format_parquet_version {#output_format_parquet_version} -The version of Parquet format used in output format. Supported versions: `1.0`, `2.4`, `2.6` and `2.latest`. +Type: ParquetVersion -Default value: `2.latest`. +Default value: 2.latest -### output_format_parquet_compression_method {#output_format_parquet_compression_method} +Parquet format version for output format. 
Supported versions: 1.0, 2.4, 2.6 and 2.latest (default) -Compression method used in output Parquet format. Supported codecs: `snappy`, `lz4`, `brotli`, `zstd`, `gzip`, `none` (uncompressed) +## output_format_parquet_write_page_index {#output_format_parquet_write_page_index} -Default value: `lz4`. +Type: Bool -### input_format_parquet_max_block_size {#input_format_parquet_max_block_size} -Max block row size for parquet reader. By controlling the number of rows in each block, you can control the memory usage, -and in some operators that cache blocks, you can improve the accuracy of the operator's memory control。 +Default value: 1 -Default value: `65409`. +Add a possibility to write page index into parquet files. -### input_format_parquet_prefer_block_bytes {#input_format_parquet_prefer_block_bytes} -Average block bytes output by parquet reader. Lowering the configuration in the case of reading some high compression parquet relieves the memory pressure. +## output_format_pretty_color {#output_format_pretty_color} -Default value: `65409 * 256 = 16744704` +Type: UInt64Auto -### output_format_parquet_write_page_index {#input_format_parquet_max_block_size} +Default value: auto -Could add page index into parquet files. To enable this, need set `output_format_parquet_use_custom_encoder`=`false` and -`output_format_parquet_write_page_index`=`true`. +Use ANSI escape sequences in Pretty formats. 0 - disabled, 1 - enabled, 'auto' - enabled if a terminal. -Enable by default. +## output_format_pretty_display_footer_column_names {#output_format_pretty_display_footer_column_names} -## Hive format settings {#hive-format-settings} +Type: UInt64 -### input_format_hive_text_fields_delimiter {#input_format_hive_text_fields_delimiter} - -Delimiter between fields in Hive Text File. - -Default value: `\x01`. - -### input_format_hive_text_collection_items_delimiter {#input_format_hive_text_collection_items_delimiter} - -Delimiter between collection(array or map) items in Hive Text File. - -Default value: `\x02`. - -### input_format_hive_text_map_keys_delimiter {#input_format_hive_text_map_keys_delimiter} - -Delimiter between a pair of map key/values in Hive Text File. - -Default value: `\x03`. - -## MsgPack format settings {#msgpack-format-settings} - -### input_format_msgpack_number_of_columns {#input_format_msgpack_number_of_columns} - -The number of columns in inserted MsgPack data. Used for automatic schema inference from data. - -Default value: `0`. - -### output_format_msgpack_uuid_representation {#output_format_msgpack_uuid_representation} - -The way how to output UUID in MsgPack format. -Possible values: - -- `bin` - as 16-bytes binary. -- `str` - as a string of 36 bytes. -- `ext` - as extension with ExtType = 2. - -Default value: `ext`. - - -## Protobuf format settings {#protobuf-format-settings} - -### input_format_protobuf_flatten_google_wrappers {#input_format_protobuf_flatten_google_wrappers} - -Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls. - -Disabled by default. - -### output_format_protobuf_nullables_with_google_wrappers {#output_format_protobuf_nullables_with_google_wrappers} - -When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized. - -Disabled by default. 
- -### format_protobuf_use_autogenerated_schema {#format_capn_proto_use_autogenerated_schema} - -Use autogenerated Protobuf schema when [format_schema](#formatschema-format-schema) is not set. -The schema is generated from ClickHouse table structure using function [structureToProtobufSchema](../../sql-reference/functions/other-functions.md#structure_to_protobuf_schema) - -## Avro format settings {#avro-format-settings} - -### input_format_avro_allow_missing_fields {#input_format_avro_allow_missing_fields} - -Enables using fields that are not specified in [Avro](../../interfaces/formats.md/#data-format-avro) or [AvroConfluent](../../interfaces/formats.md/#data-format-avro-confluent) format schema. When a field is not found in the schema, ClickHouse uses the default value instead of throwing an exception. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 0. - -### format_avro_schema_registry_url {#format_avro_schema_registry_url} - -Sets [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/index.html) URL to use with [AvroConfluent](../../interfaces/formats.md/#data-format-avro-confluent) format. - -Format: -``` text -http://[user:password@]machine[:port]" -``` - -Examples: -``` text -http://registry.example.com:8081 -http://admin:secret@registry.example.com:8081 -``` - -Default value: `Empty`. - -### output_format_avro_codec {#output_format_avro_codec} - -Sets the compression codec used for output Avro file. - -Type: string - -Possible values: - -- `null` — No compression -- `deflate` — Compress with Deflate (zlib) -- `snappy` — Compress with [Snappy](https://google.github.io/snappy/) - -Default value: `snappy` (if available) or `deflate`. - -### output_format_avro_sync_interval {#output_format_avro_sync_interval} - -Sets minimum data size (in bytes) between synchronization markers for output Avro file. - -Type: unsigned int - -Possible values: 32 (32 bytes) - 1073741824 (1 GiB) - -Default value: 32768 (32 KiB) - -### output_format_avro_string_column_pattern {#output_format_avro_string_column_pattern} - -Regexp of column names of type String to output as Avro `string` (default is `bytes`). -RE2 syntax is supported. - -Type: string - -### output_format_avro_rows_in_file {#output_format_avro_rows_in_file} - -Max rows in a file (if permitted by storage). - -Default value: `1`. - -## Pretty formats settings {#pretty-formats-settings} - -### output_format_pretty_max_rows {#output_format_pretty_max_rows} - -Rows limit for Pretty formats. - -Default value: `10'000`. - -### output_format_pretty_max_column_pad_width {#output_format_pretty_max_column_pad_width} - -Maximum width to pad all values in a column in Pretty formats. - -Default value: `250`. - -### output_format_pretty_max_value_width {#output_format_pretty_max_value_width} - -Limits the width of value displayed in [Pretty](../../interfaces/formats.md/#pretty) formats. If the value width exceeds the limit, the value is cut. - -Possible values: - -- Positive integer. -- 0 — The value is cut completely. - -Default value: `10000` symbols. 
- -**Examples** - -Query: -```sql -SET output_format_pretty_max_value_width = 10; -SELECT range(number) FROM system.numbers LIMIT 10 FORMAT PrettyCompactNoEscapes; -``` -Result: -```text -┌─range(number)─┐ -│ [] │ -│ [0] │ -│ [0,1] │ -│ [0,1,2] │ -│ [0,1,2,3] │ -│ [0,1,2,3,4⋯ │ -│ [0,1,2,3,4⋯ │ -│ [0,1,2,3,4⋯ │ -│ [0,1,2,3,4⋯ │ -│ [0,1,2,3,4⋯ │ -└───────────────┘ -``` - -Query with zero width: -```sql -SET output_format_pretty_max_value_width = 0; -SELECT range(number) FROM system.numbers LIMIT 5 FORMAT PrettyCompactNoEscapes; -``` -Result: -```text -┌─range(number)─┐ -│ ⋯ │ -│ ⋯ │ -│ ⋯ │ -│ ⋯ │ -│ ⋯ │ -└───────────────┘ -``` - -### output_format_pretty_color {#output_format_pretty_color} - -Use ANSI escape sequences to paint colors in Pretty formats. - -possible values: - -- `0` — Disabled. Pretty formats do not use ANSI escape sequences. -- `1` — Enabled. Pretty formats will use ANSI escape sequences except for `NoEscapes` formats. -- `auto` - Enabled if `stdout` is a terminal except for `NoEscapes` formats. - -Default value is `auto`. - -### output_format_pretty_grid_charset {#output_format_pretty_grid_charset} - -Allows changing a charset which is used for printing grids borders. Available charsets are UTF-8, ASCII. - -**Example** - -``` text -SET output_format_pretty_grid_charset = 'UTF-8'; -SELECT * FROM a; -┌─a─┐ -│ 1 │ -└───┘ - -SET output_format_pretty_grid_charset = 'ASCII'; -SELECT * FROM a; -+-a-+ -| 1 | -+---+ -``` - -### output_format_pretty_row_numbers {#output_format_pretty_row_numbers} - -Adds row numbers to output in the [Pretty](../../interfaces/formats.md/#pretty) format. - -Possible values: - -- 0 — Output without row numbers. -- 1 — Output with row numbers. - -Default value: `1`. - -**Example** - -Query: - -```sql -SET output_format_pretty_row_numbers = 1; -SELECT TOP 3 name, value FROM system.settings; -``` - -Result: -```text - ┌─name────────────────────┬─value───┐ -1. │ min_compress_block_size │ 65536 │ -2. │ max_compress_block_size │ 1048576 │ -3. │ max_block_size │ 65505 │ - └─────────────────────────┴─────────┘ -``` - -### output_format_pretty_single_large_number_tip_threshold {#output_format_pretty_single_large_number_tip_threshold} - -Print a readable number tip on the right side of the table if the block consists of a single number which exceeds -this value (except 0). - -Possible values: - -- 0 — The readable number tip will not be printed. -- Positive integer — The readable number tip will be printed if the single number exceeds this value. - -Default value: `1000000`. - -**Example** - -Query: - -```sql -SELECT 1000000000 as a; -``` - -Result: -```text -┌──────────a─┐ -│ 1000000000 │ -- 1.00 billion -└────────────┘ -``` - -## output_format_pretty_display_footer_column_names +Default value: 1 Display column names in the footer if there are many table rows. @@ -1744,8 +2008,6 @@ Possible values: - 0 — No column names are displayed in the footer. - 1 — Column names are displayed in the footer if row count is greater than or equal to the threshold value set by [output_format_pretty_display_footer_column_names_min_rows](#output_format_pretty_display_footer_column_names_min_rows) (50 by default). -Default value: `1`. - **Example** Query: @@ -1766,241 +2028,274 @@ Result: 1000. 
│ 999 │ UInt64 │ └─number─┴─toTypeName(number)─┘ ``` -## output_format_pretty_display_footer_column_names_min_rows + +## output_format_pretty_display_footer_column_names_min_rows {#output_format_pretty_display_footer_column_names_min_rows} + +Type: UInt64 + +Default value: 50 Sets the minimum number of rows for which a footer with column names will be displayed if setting [output_format_pretty_display_footer_column_names](#output_format_pretty_display_footer_column_names) is enabled. -Default value: `50`. +## output_format_pretty_grid_charset {#output_format_pretty_grid_charset} -## Template format settings {#template-format-settings} +Type: String -### format_template_resultset {#format_template_resultset} +Default value: UTF-8 -Path to file which contains format string for result set (for Template format). +Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one). -### format_template_resultset_format {#format_template_resultset_format} +## output_format_pretty_highlight_digit_groups {#output_format_pretty_highlight_digit_groups} -Format string for result set (for Template format) +Type: Bool -### format_template_row {#format_template_row} +Default value: 1 -Path to file which contains format string for rows (for Template format). +If enabled and if output is a terminal, highlight every digit corresponding to the number of thousands, millions, etc. with underline. -### format_template_rows_between_delimiter {#format_template_rows_between_delimiter} +## output_format_pretty_max_column_pad_width {#output_format_pretty_max_column_pad_width} -Delimiter between rows (for Template format). +Type: UInt64 -### format_template_row_format {#format_template_row_format} +Default value: 250 -Format string for rows (for Template format) +Maximum width to pad all values in a column in Pretty formats. -## CustomSeparated format settings {custom-separated-format-settings} +## output_format_pretty_max_rows {#output_format_pretty_max_rows} -### format_custom_escaping_rule {#format_custom_escaping_rule} +Type: UInt64 -Sets the field escaping rule for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. +Default value: 10000 -Possible values: +Rows limit for Pretty formats. -- `'Escaped'` — Similarly to [TSV](../../interfaces/formats.md/#tabseparated). -- `'Quoted'` — Similarly to [Values](../../interfaces/formats.md/#data-format-values). -- `'CSV'` — Similarly to [CSV](../../interfaces/formats.md/#csv). -- `'JSON'` — Similarly to [JSONEachRow](../../interfaces/formats.md/#jsoneachrow). -- `'XML'` — Similarly to [XML](../../interfaces/formats.md/#xml). -- `'Raw'` — Extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](../../interfaces/formats.md/#tabseparatedraw). +## output_format_pretty_max_value_width {#output_format_pretty_max_value_width} -Default value: `'Escaped'`. +Type: UInt64 -### format_custom_field_delimiter {#format_custom_field_delimiter} +Default value: 10000 -Sets the character that is interpreted as a delimiter between the fields for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. +Maximum width of value to display in Pretty formats. If greater - it will be cut. -Default value: `'\t'`. 
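+
+**Example**
+
+Values longer than the limit are cut, and the cut is marked with a `⋯` character:
+
+```sql
+SET output_format_pretty_max_value_width = 10;
+SELECT range(number) FROM system.numbers LIMIT 10 FORMAT PrettyCompactNoEscapes;
+```
+
+```text
+┌─range(number)─┐
+│ []            │
+│ [0]           │
+│ [0,1]         │
+│ [0,1,2]       │
+│ [0,1,2,3]     │
+│ [0,1,2,3,4⋯   │
+│ [0,1,2,3,4⋯   │
+│ [0,1,2,3,4⋯   │
+│ [0,1,2,3,4⋯   │
+│ [0,1,2,3,4⋯   │
+└───────────────┘
+```
+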
+## output_format_pretty_max_value_width_apply_for_single_value {#output_format_pretty_max_value_width_apply_for_single_value} -### format_custom_row_before_delimiter {#format_custom_row_before_delimiter} +Type: UInt64 -Sets the character that is interpreted as a delimiter before the field of the first column for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. +Default value: 0 -Default value: `''`. +Only cut values (see the `output_format_pretty_max_value_width` setting) when it is not a single value in a block. Otherwise output it entirely, which is useful for the `SHOW CREATE TABLE` query. -### format_custom_row_after_delimiter {#format_custom_row_after_delimiter} +## output_format_pretty_row_numbers {#output_format_pretty_row_numbers} -Sets the character that is interpreted as a delimiter after the field of the last column for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. +Type: Bool -Default value: `'\n'`. +Default value: 1 -### format_custom_row_between_delimiter {#format_custom_row_between_delimiter} +Add row numbers before each row for pretty output format -Sets the character that is interpreted as a delimiter between the rows for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. +## output_format_pretty_single_large_number_tip_threshold {#output_format_pretty_single_large_number_tip_threshold} -Default value: `''`. +Type: UInt64 -### format_custom_result_before_delimiter {#format_custom_result_before_delimiter} +Default value: 1000000 -Sets the character that is interpreted as a prefix before the result set for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. +Print a readable number tip on the right side of the table if the block consists of a single number which exceeds this value (except 0) -Default value: `''`. +## output_format_protobuf_nullables_with_google_wrappers {#output_format_protobuf_nullables_with_google_wrappers} -### format_custom_result_after_delimiter {#format_custom_result_after_delimiter} +Type: Bool -Sets the character that is interpreted as a suffix after the result set for [CustomSeparated](../../interfaces/formats.md/#format-customseparated) data format. +Default value: 0 -Default value: `''`. +When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized -### input_format_custom_skip_trailing_empty_lines {input_format_custom_skip_trailing_empty_lines} +## output_format_schema {#output_format_schema} -When enabled, trailing empty lines at the end of file in CustomSeparated format will be skipped. +Type: String -Disabled by default. +Default value: -### input_format_custom_allow_variable_number_of_columns {#input_format_custom_allow_variable_number_of_columns} +The path to the file where the automatically generated schema will be saved in [Cap’n Proto](../../interfaces/formats.md#capnproto-capnproto) or [Protobuf](../../interfaces/formats.md#protobuf-protobuf) formats. -Allow variable number of columns in rows in CustomSeparated input format. -Ignore extra columns in rows with more columns than expected and treat missing columns as default values. +## output_format_sql_insert_include_column_names {#output_format_sql_insert_include_column_names} -Disabled by default. 
+Type: Bool -## Regexp format settings {#regexp-format-settings} +Default value: 1 -### format_regexp_escaping_rule {#format_regexp_escaping_rule} +Include column names in INSERT query -Field escaping rule. +## output_format_sql_insert_max_batch_size {#output_format_sql_insert_max_batch_size} -Possible values: +Type: UInt64 -- `'Escaped'` — Similarly to [TSV](../../interfaces/formats.md/#tabseparated). -- `'Quoted'` — Similarly to [Values](../../interfaces/formats.md/#data-format-values). -- `'CSV'` — Similarly to [CSV](../../interfaces/formats.md/#csv). -- `'JSON'` — Similarly to [JSONEachRow](../../interfaces/formats.md/#jsoneachrow). -- `'XML'` — Similarly to [XML](../../interfaces/formats.md/#xml). -- `'Raw'` — Extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](../../interfaces/formats.md/#tabseparatedraw). +Default value: 65409 -Default value: `Raw`. +The maximum number of rows in one INSERT statement. -### format_regexp_skip_unmatched {#format_regexp_skip_unmatched} +## output_format_sql_insert_quote_names {#output_format_sql_insert_quote_names} -Skip lines unmatched by regular expression. +Type: Bool -Disabled by default. +Default value: 1 -## CapnProto format settings {#capn-proto-format-settings} +Quote column names with '`' characters -### format_capn_proto_enum_comparising_mode {#format_capn_proto_enum_comparising_mode} +## output_format_sql_insert_table_name {#output_format_sql_insert_table_name} -Determines how to map ClickHouse `Enum` data type and [CapnProto](../../interfaces/formats.md/#capnproto) `Enum` data type from schema. +Type: String -Possible values: +Default value: table -- `'by_values'` — Values in enums should be the same, names can be different. -- `'by_names'` — Names in enums should be the same, values can be different. -- `'by_name_case_insensitive'` — Names in enums should be the same case-insensitive, values can be different. +The name of table in the output INSERT query -Default value: `'by_values'`. +## output_format_sql_insert_use_replace {#output_format_sql_insert_use_replace} -### format_capn_proto_use_autogenerated_schema {#format_capn_proto_use_autogenerated_schema} +Type: Bool -Use autogenerated CapnProto schema when [format_schema](#formatschema-format-schema) is not set. -The schema is generated from ClickHouse table structure using function [structureToCapnProtoSchema](../../sql-reference/functions/other-functions.md#structure_to_capnproto_schema) +Default value: 0 -## MySQLDump format settings {#musqldump-format-settings} +Use REPLACE statement instead of INSERT -### input_format_mysql_dump_table_name (#input_format_mysql_dump_table_name) +## output_format_tsv_crlf_end_of_line {#output_format_tsv_crlf_end_of_line} -The name of the table from which to read data from in MySQLDump input format. +Type: Bool -### input_format_mysql_dump_map_columns (#input_format_mysql_dump_map_columns) +Default value: 0 -Enables matching columns from table in MySQL dump and columns from ClickHouse table by names in MySQLDump input format. +If it is set true, end of line in TSV format will be \\r\\n instead of \\n. -Possible values: +## output_format_values_escape_quote_with_quote {#output_format_values_escape_quote_with_quote} -- 0 — Disabled. -- 1 — Enabled. +Type: Bool -Default value: 1. 
+Default value: 0 -## SQLInsert format settings {#sqlinsert-format-settings} +If true escape ' with '', otherwise quoted with \\' -### output_format_sql_insert_max_batch_size {#output_format_sql_insert_max_batch_size} +## output_format_write_statistics {#output_format_write_statistics} -The maximum number of rows in one INSERT statement. +Type: Bool -Default value: `65505`. +Default value: 1 -### output_format_sql_insert_table_name {#output_format_sql_insert_table_name} +Write statistics about read rows, bytes, time elapsed in suitable output formats. -The name of table that will be used in the output INSERT statement. +Enabled by default -Default value: `table`. +## precise_float_parsing {#precise_float_parsing} -### output_format_sql_insert_include_column_names {#output_format_sql_insert_include_column_names} +Type: Bool -Include column names in INSERT statement. +Default value: 0 -Default value: `true`. +Prefer more precise (but slower) float parsing algorithm -### output_format_sql_insert_use_replace {#output_format_sql_insert_use_replace} +## regexp_dict_allow_hyperscan {#regexp_dict_allow_hyperscan} -Use REPLACE keyword instead of INSERT. +Type: Bool -Default value: `false`. +Default value: 1 -### output_format_sql_insert_quote_names {#output_format_sql_insert_quote_names} +Allow regexp_tree dictionary using Hyperscan library. -Quote column names with "`" characters +## regexp_dict_flag_case_insensitive {#regexp_dict_flag_case_insensitive} -Default value: `true`. +Type: Bool -## BSONEachRow format settings {#bson-each-row-format-settings} +Default value: 0 -### output_format_bson_string_as_string {#output_format_bson_string_as_string} +Use case-insensitive matching for a regexp_tree dictionary. Can be overridden in individual expressions with (?i) and (?-i). -Use BSON String type instead of Binary for String columns. +## regexp_dict_flag_dotall {#regexp_dict_flag_dotall} -Disabled by default. +Type: Bool -### input_format_bson_skip_fields_with_unsupported_types_in_schema_inference {#input_format_bson_skip_fields_with_unsupported_types_in_schema_inference} +Default value: 0 -Allow skipping columns with unsupported types while schema inference for format BSONEachRow. +Allow '.' to match newline characters for a regexp_tree dictionary. -Disabled by default. +## rows_before_aggregation {#rows_before_aggregation} -## RowBinary format settings {#row-binary-format-settings} +Type: Bool -### format_binary_max_string_size {#format_binary_max_string_size} +Default value: 0 -The maximum allowed size for String in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 0 means there is no limit. +When enabled, ClickHouse will provide exact value for rows_before_aggregation statistic, represents the number of rows read before aggregation -Default value: `1GiB`. +## schema_inference_hints {#schema_inference_hints} -### output_format_binary_encode_types_in_binary_format {#output_format_binary_encode_types_in_binary_format} +Type: String -Write data types in [binary format](../../sql-reference/data-types/data-types-binary-encoding.md) instead of type names in RowBinaryWithNamesAndTypes output format. +Default value: -Disabled by default. +The list of column names and types to use as hints in schema inference for formats without schema. 
-
-### input_format_binary_decode_types_in_binary_format {#input_format_binary_decode_types_in_binary_format}
+Example:
 
-Read data types in [binary format](../../sql-reference/data-types/data-types-binary-encoding.md) instead of type names in RowBinaryWithNamesAndTypes input format.
+Query:
+```sql
+desc format(JSONEachRow, '{"x" : 1, "y" : "String", "z" : "0.0.0.0" }') settings schema_inference_hints='x UInt8, z IPv4';
+```
 
-Disabled by default.
+Result:
+```sql
+x UInt8
+y Nullable(String)
+z IPv4
+```
 
-## Native format settings {#native-format-settings}
+:::note
+If `schema_inference_hints` is not formatted properly, or if there is a typo or a wrong data type, the whole `schema_inference_hints` value will be ignored.
+:::
 
-### input_format_native_allow_types_conversion {#input_format_native_allow_types_conversion}
+## schema_inference_make_columns_nullable {#schema_inference_make_columns_nullable}
 
-Allow types conversion in Native input format between columns from input data and requested columns.
+Type: UInt64Auto
 
-Enabled by default.
+Default value: 1
 
-### output_format_native_encode_types_in_binary_format {#output_format_native_encode_types_in_binary_format}
+Controls making inferred types `Nullable` in schema inference.
+If the setting is enabled, all inferred types will be `Nullable`; if disabled, the inferred type will never be `Nullable`; if set to `auto`, the inferred type will be `Nullable` only if the column contains `NULL` in a sample that is parsed during schema inference, or if the file metadata contains information about column nullability.
 
-Write data types in [binary format](../../sql-reference/data-types/data-types-binary-encoding.md) instead of type names in Native output format.
+## schema_inference_mode {#schema_inference_mode}
 
-Disabled by default.
+Type: SchemaInferenceMode
 
-### input_format_native_decode_types_in_binary_format {#input_format_native_decode_types_in_binary_format}
+Default value: default
+
+Mode of schema inference. 'default' - assume that all files have the same schema and the schema can be inferred from any file, 'union' - files can have different schemas and the resulting schema should be a union of the schemas of all files.
+
+## show_create_query_identifier_quoting_rule {#show_create_query_identifier_quoting_rule}
+
+Type: IdentifierQuotingRule
+
+Default value: when_necessary
+
+Set the quoting rule for identifiers in the SHOW CREATE query.
+
+## show_create_query_identifier_quoting_style {#show_create_query_identifier_quoting_style}
+
+Type: IdentifierQuotingStyle
+
+Default value: Backticks
+
+Set the quoting style for identifiers in the SHOW CREATE query.
+
+## type_json_skip_duplicated_paths {#type_json_skip_duplicated_paths}
+
+Type: Bool
+
+Default value: 0
+
+When enabled, duplicated paths encountered while parsing a JSON object into the JSON type will be ignored, and only the first occurrence will be inserted instead of throwing an exception.
+
+## validate_experimental_and_suspicious_types_inside_nested_types {#validate_experimental_and_suspicious_types_inside_nested_types}
+
+Type: Bool
+
+Default value: 1
+
+Validate usage of experimental and suspicious types inside nested types like Array/Map/Tuple.
 
-Read data types in [binary format](../../sql-reference/data-types/data-types-binary-encoding.md) instead of type names in Native input format.
-Disabled by default.
\ No newline at end of file diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 7b4cc770931..821d08cad7b 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -9,47 +9,23 @@ toc_max_heading_level: 2 All below settings are also available in table [system.settings](/docs/en/operations/system-tables/settings). -## additional_table_filters +## add_http_cors_header {#add_http_cors_header} -An additional filter expression that is applied after reading -from the specified table. +Type: Bool -Default value: 0. +Default value: 0 -**Example** +Write add http CORS header. -``` sql -INSERT INTO table_1 VALUES (1, 'a'), (2, 'bb'), (3, 'ccc'), (4, 'dddd'); -SELECT * FROM table_1; -``` -```response -┌─x─┬─y────┐ -│ 1 │ a │ -│ 2 │ bb │ -│ 3 │ ccc │ -│ 4 │ dddd │ -└───┴──────┘ -``` -```sql -SELECT * -FROM table_1 -SETTINGS additional_table_filters = {'table_1': 'x != 2'} -``` -```response -┌─x─┬─y────┐ -│ 1 │ a │ -│ 3 │ ccc │ -│ 4 │ dddd │ -└───┴──────┘ -``` +## additional_result_filter {#additional_result_filter} -## additional_result_filter +Type: String + +Default value: An additional filter expression to apply to the result of `SELECT` query. This setting is not applied to any subquery. -Default value: `''`. - **Example** ``` sql @@ -77,14 +53,503 @@ SETTINGS additional_result_filter = 'x != 2' └───┴──────┘ ``` +## additional_table_filters {#additional_table_filters} + +Type: Map + +Default value: {} + +An additional filter expression that is applied after reading +from the specified table. + +**Example** + +``` sql +INSERT INTO table_1 VALUES (1, 'a'), (2, 'bb'), (3, 'ccc'), (4, 'dddd'); +SELECT * FROM table_1; +``` +```response +┌─x─┬─y────┐ +│ 1 │ a │ +│ 2 │ bb │ +│ 3 │ ccc │ +│ 4 │ dddd │ +└───┴──────┘ +``` +```sql +SELECT * +FROM table_1 +SETTINGS additional_table_filters = {'table_1': 'x != 2'} +``` +```response +┌─x─┬─y────┐ +│ 1 │ a │ +│ 3 │ ccc │ +│ 4 │ dddd │ +└───┴──────┘ +``` + +## aggregate_functions_null_for_empty {#aggregate_functions_null_for_empty} + +Type: Bool + +Default value: 0 + +Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md/#agg-functions-combinator-ornull) suffix to them. Enable it for SQL standard compatibility. +It is implemented via query rewrite (similar to [count_distinct_implementation](#count_distinct_implementation) setting) to get consistent results for distributed queries. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +**Example** + +Consider the following query with aggregate functions: +```sql +SELECT SUM(-1), MAX(0) FROM system.one WHERE 0; +``` + +With `aggregate_functions_null_for_empty = 0` it would produce: +```text +┌─SUM(-1)─┬─MAX(0)─┐ +│ 0 │ 0 │ +└─────────┴────────┘ +``` + +With `aggregate_functions_null_for_empty = 1` the result would be: +```text +┌─SUMOrNull(-1)─┬─MAXOrNull(0)─┐ +│ NULL │ NULL │ +└───────────────┴──────────────┘ +``` + +## aggregation_in_order_max_block_bytes {#aggregation_in_order_max_block_bytes} + +Type: UInt64 + +Default value: 50000000 + +Maximal size of block in bytes accumulated during aggregation in order of primary key. Lower block size allows to parallelize more final merge stage of aggregation. + +## aggregation_memory_efficient_merge_threads {#aggregation_memory_efficient_merge_threads} + +Type: UInt64 + +Default value: 0 + +Number of threads to use for merge intermediate aggregation results in memory efficient mode. 
When bigger, then more memory is consumed. 0 means - same as 'max_threads'. + +## allow_aggregate_partitions_independently {#allow_aggregate_partitions_independently} + +Type: Bool + +Default value: 0 + +Enable independent aggregation of partitions on separate threads when partition key suits group by key. Beneficial when number of partitions close to number of cores and partitions have roughly the same size + +## allow_archive_path_syntax {#allow_archive_path_syntax} + +Type: Bool + +Default value: 1 + +File/S3 engines/table function will parse paths with '::' as '\\ :: \\' if archive has correct extension + +## allow_asynchronous_read_from_io_pool_for_merge_tree {#allow_asynchronous_read_from_io_pool_for_merge_tree} + +Type: Bool + +Default value: 0 + +Use background I/O pool to read from MergeTree tables. This setting may increase performance for I/O bound queries + +## allow_changing_replica_until_first_data_packet {#allow_changing_replica_until_first_data_packet} + +Type: Bool + +Default value: 0 + +If it's enabled, in hedged requests we can start new connection until receiving first data packet even if we have already made some progress +(but progress haven't updated for `receive_data_timeout` timeout), otherwise we disable changing replica after the first time we made progress. + +## allow_create_index_without_type {#allow_create_index_without_type} + +Type: Bool + +Default value: 0 + +Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests. + +## allow_custom_error_code_in_throwif {#allow_custom_error_code_in_throwif} + +Type: Bool + +Default value: 0 + +Enable custom error code in function throwIf(). If true, thrown exceptions may have unexpected error codes. + +## allow_ddl {#allow_ddl} + +Type: Bool + +Default value: 1 + +If it is set to true, then a user is allowed to executed DDL queries. + +## allow_deprecated_database_ordinary {#allow_deprecated_database_ordinary} + +Type: Bool + +Default value: 0 + +Allow to create databases with deprecated Ordinary engine + +## allow_deprecated_error_prone_window_functions {#allow_deprecated_error_prone_window_functions} + +Type: Bool + +Default value: 0 + +Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference) + +## allow_deprecated_snowflake_conversion_functions {#allow_deprecated_snowflake_conversion_functions} + +Type: Bool + +Default value: 0 + +Functions `snowflakeToDateTime`, `snowflakeToDateTime64`, `dateTimeToSnowflake`, and `dateTime64ToSnowflake` are deprecated and disabled by default. +Please use functions `snowflakeIDToDateTime`, `snowflakeIDToDateTime64`, `dateTimeToSnowflakeID`, and `dateTime64ToSnowflakeID` instead. + +To re-enable the deprecated functions (e.g., during a transition period), please set this setting to `true`. + +## allow_deprecated_syntax_for_merge_tree {#allow_deprecated_syntax_for_merge_tree} + +Type: Bool + +Default value: 0 + +Allow to create *MergeTree tables with deprecated engine definition syntax + +## allow_distributed_ddl {#allow_distributed_ddl} + +Type: Bool + +Default value: 1 + +If it is set to true, then a user is allowed to executed distributed DDL queries. + +## allow_drop_detached {#allow_drop_detached} + +Type: Bool + +Default value: 0 + +Allow ALTER TABLE ... DROP DETACHED PART[ITION] ... 
queries + +## allow_execute_multiif_columnar {#allow_execute_multiif_columnar} + +Type: Bool + +Default value: 1 + +Allow execute multiIf function columnar + +## allow_experimental_analyzer {#allow_experimental_analyzer} + +Type: Bool + +Default value: 1 + +Allow new query analyzer. + +## allow_experimental_codecs {#allow_experimental_codecs} + +Type: Bool + +Default value: 0 + +If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing). + +## allow_experimental_database_materialized_mysql {#allow_experimental_database_materialized_mysql} + +Type: Bool + +Default value: 0 + +Allow to create database with Engine=MaterializedMySQL(...). + +## allow_experimental_database_materialized_postgresql {#allow_experimental_database_materialized_postgresql} + +Type: Bool + +Default value: 0 + +Allow to create database with Engine=MaterializedPostgreSQL(...). + +## allow_experimental_dynamic_type {#allow_experimental_dynamic_type} + +Type: Bool + +Default value: 0 + +Allow Dynamic data type + +## allow_experimental_full_text_index {#allow_experimental_full_text_index} + +Type: Bool + +Default value: 0 + +If it is set to true, allow to use experimental full-text index. + +## allow_experimental_funnel_functions {#allow_experimental_funnel_functions} + +Type: Bool + +Default value: 0 + +Enable experimental functions for funnel analysis. + +## allow_experimental_hash_functions {#allow_experimental_hash_functions} + +Type: Bool + +Default value: 0 + +Enable experimental hash functions + +## allow_experimental_inverted_index {#allow_experimental_inverted_index} + +Type: Bool + +Default value: 0 + +If it is set to true, allow to use experimental inverted index. + +## allow_experimental_join_condition {#allow_experimental_join_condition} + +Type: Bool + +Default value: 0 + +Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y. + +## allow_experimental_join_right_table_sorting {#allow_experimental_join_right_table_sorting} + +Type: Bool + +Default value: 0 + +If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join. + +## allow_experimental_json_type {#allow_experimental_json_type} + +Type: Bool + +Default value: 0 + +Allow JSON data type + +## allow_experimental_kafka_offsets_storage_in_keeper {#allow_experimental_kafka_offsets_storage_in_keeper} + +Type: Bool + +Default value: 0 + +Allow experimental feature to store Kafka related offsets in ClickHouse Keeper. When enabled a ClickHouse Keeper path and replica name can be specified to the Kafka table engine. As a result instead of the regular Kafka engine, a new type of storage engine will be used that stores the committed offsets primarily in ClickHouse Keeper + +## allow_experimental_live_view {#allow_experimental_live_view} + +Type: Bool + +Default value: 0 + +Allows creation of a deprecated LIVE VIEW. + +Possible values: + +- 0 — Working with live views is disabled. +- 1 — Working with live views is enabled. + +## allow_experimental_materialized_postgresql_table {#allow_experimental_materialized_postgresql_table} + +Type: Bool + +Default value: 0 + +Allows to use the MaterializedPostgreSQL table engine. 
Disabled by default, because this feature is experimental + +## allow_experimental_nlp_functions {#allow_experimental_nlp_functions} + +Type: Bool + +Default value: 0 + +Enable experimental functions for natural language processing. + +## allow_experimental_object_type {#allow_experimental_object_type} + +Type: Bool + +Default value: 0 + +Allow Object and JSON data types + +## allow_experimental_parallel_reading_from_replicas {#allow_experimental_parallel_reading_from_replicas} + +Type: UInt64 + +Default value: 0 + +Use up to `max_parallel_replicas` the number of replicas from each shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure + +## allow_experimental_query_deduplication {#allow_experimental_query_deduplication} + +Type: Bool + +Default value: 0 + +Experimental data deduplication for SELECT queries based on part UUIDs + +## allow_experimental_refreshable_materialized_view {#allow_experimental_refreshable_materialized_view} + +Type: Bool + +Default value: 0 + +Allow refreshable materialized views (CREATE MATERIALIZED VIEW \\ REFRESH ...). + +## allow_experimental_shared_set_join {#allow_experimental_shared_set_join} + +Type: Bool + +Default value: 1 + +Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin + +## allow_experimental_statistics {#allow_experimental_statistics} + +Type: Bool + +Default value: 0 + +Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulate statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics). + +## allow_experimental_time_series_table {#allow_experimental_time_series_table} + +Type: Bool + +Default value: 0 + +Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine. + +Possible values: + +- 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled. +- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled. + +## allow_experimental_variant_type {#allow_experimental_variant_type} + +Type: Bool + +Default value: 0 + +Allows creation of experimental [Variant](../../sql-reference/data-types/variant.md). + +## allow_experimental_vector_similarity_index {#allow_experimental_vector_similarity_index} + +Type: Bool + +Default value: 0 + +Allow experimental vector similarity index + +## allow_experimental_window_view {#allow_experimental_window_view} + +Type: Bool + +Default value: 0 + +Enable WINDOW VIEW. Not mature enough. + +## allow_get_client_http_header {#allow_get_client_http_header} + +Type: Bool + +Default value: 0 + +Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function. + +## allow_hyperscan {#allow_hyperscan} + +Type: Bool + +Default value: 1 + +Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage. 
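+
+**Example**
+
+A minimal sketch, using one of the multi-pattern search functions (such as `multiMatchAny`) that are backed by the Hyperscan library:
+
+```sql
+SET allow_hyperscan = 1;
+SELECT multiMatchAny('ClickHouse', ['Click', 'House']); -- returns 1
+
+SET allow_hyperscan = 0;
+SELECT multiMatchAny('ClickHouse', ['Click', 'House']); -- expected to be rejected with an exception
+```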
+ +## allow_introspection_functions {#allow_introspection_functions} + +Type: Bool + +Default value: 0 + +Enables or disables [introspection functions](../../sql-reference/functions/introspection.md) for query profiling. + +Possible values: + +- 1 — Introspection functions enabled. +- 0 — Introspection functions disabled. + +**See Also** + +- [Sampling Query Profiler](../../operations/optimizing-performance/sampling-query-profiler.md) +- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) + +## allow_materialized_view_with_bad_select {#allow_materialized_view_with_bad_select} + +Type: Bool + +Default value: 1 + +Allow CREATE MATERIALIZED VIEW with SELECT query that references nonexistent tables or columns. It must still be syntactically valid. Doesn't apply to refreshable MVs. Doesn't apply if the MV schema needs to be inferred from the SELECT query (i.e. if the CREATE has no column list and no TO table). Can be used for creating MV before its source table. + +## allow_named_collection_override_by_default {#allow_named_collection_override_by_default} + +Type: Bool + +Default value: 1 + +Allow named collections' fields override by default. + +## allow_non_metadata_alters {#allow_non_metadata_alters} + +Type: Bool + +Default value: 1 + +Allow to execute alters which affects not only tables metadata, but also data on disk + +## allow_nonconst_timezone_arguments {#allow_nonconst_timezone_arguments} + +Type: Bool + +Default value: 0 + +Allow non-const timezone arguments in certain time-related functions like toTimeZone(), fromUnixTimestamp*(), snowflakeToDateTime*() + ## allow_nondeterministic_mutations {#allow_nondeterministic_mutations} +Type: Bool + +Default value: 0 + User-level setting that allows mutations on replicated tables to make use of non-deterministic functions such as `dictGet`. Given that, for example, dictionaries, can be out of sync across nodes, mutations that pull values from them are disallowed on replicated tables by default. Enabling this setting allows this behavior, making it the user's responsibility to ensure that the data used is in sync across all nodes. -Default value: 0. - **Example** ``` xml @@ -100,249 +565,985 @@ Default value: 0. ``` -## mutations_execute_nondeterministic_on_initiator {#mutations_execute_nondeterministic_on_initiator} +## allow_nondeterministic_optimize_skip_unused_shards {#allow_nondeterministic_optimize_skip_unused_shards} -If true constant nondeterministic functions (e.g. function `now()`) are executed on initiator and replaced to literals in `UPDATE` and `DELETE` queries. It helps to keep data in sync on replicas while executing mutations with constant nondeterministic functions. Default value: `false`. +Type: Bool -## mutations_execute_subqueries_on_initiator {#mutations_execute_subqueries_on_initiator} +Default value: 0 -If true scalar subqueries are executed on initiator and replaced to literals in `UPDATE` and `DELETE` queries. Default value: `false`. - -## mutations_max_literal_size_to_replace {#mutations_max_literal_size_to_replace} - -The maximum size of serialized literal in bytes to replace in `UPDATE` and `DELETE` queries. Takes effect only if at least one the two settings above is enabled. Default value: 16384 (16 KiB). - -## distributed_product_mode {#distributed-product-mode} - -Changes the behaviour of [distributed subqueries](../../sql-reference/operators/in.md). - -ClickHouse applies this setting when the query contains the product of distributed tables, i.e. 
when the query for a distributed table contains a non-GLOBAL subquery for the distributed table. - -Restrictions: - -- Only applied for IN and JOIN subqueries. -- Only if the FROM section uses a distributed table containing more than one shard. -- If the subquery concerns a distributed table containing more than one shard. -- Not used for a table-valued [remote](../../sql-reference/table-functions/remote.md) function. +Allow nondeterministic (like `rand` or `dictGet`, since later has some caveats with updates) functions in sharding key. Possible values: -- `deny` — Default value. Prohibits using these types of subqueries (returns the “Double-distributed in/JOIN subqueries is denied” exception). -- `local` — Replaces the database and table in the subquery with local ones for the destination server (shard), leaving the normal `IN`/`JOIN.` -- `global` — Replaces the `IN`/`JOIN` query with `GLOBAL IN`/`GLOBAL JOIN.` -- `allow` — Allows the use of these types of subqueries. +- 0 — Disallowed. +- 1 — Allowed. -## prefer_global_in_and_join {#prefer-global-in-and-join} +## allow_prefetched_read_pool_for_local_filesystem {#allow_prefetched_read_pool_for_local_filesystem} -Enables the replacement of `IN`/`JOIN` operators with `GLOBAL IN`/`GLOBAL JOIN`. +Type: Bool -Possible values: +Default value: 0 -- 0 — Disabled. `IN`/`JOIN` operators are not replaced with `GLOBAL IN`/`GLOBAL JOIN`. -- 1 — Enabled. `IN`/`JOIN` operators are replaced with `GLOBAL IN`/`GLOBAL JOIN`. +Prefer prefetched threadpool if all parts are on local filesystem -Default value: `0`. +## allow_prefetched_read_pool_for_remote_filesystem {#allow_prefetched_read_pool_for_remote_filesystem} -**Usage** +Type: Bool -Although `SET distributed_product_mode=global` can change the queries behavior for the distributed tables, it's not suitable for local tables or tables from external resources. Here is when the `prefer_global_in_and_join` setting comes into play. +Default value: 1 -For example, we have query serving nodes that contain local tables, which are not suitable for distribution. We need to scatter their data on the fly during distributed processing with the `GLOBAL` keyword — `GLOBAL IN`/`GLOBAL JOIN`. +Prefer prefetched threadpool if all parts are on remote filesystem -Another use case of `prefer_global_in_and_join` is accessing tables created by external engines. This setting helps to reduce the number of calls to external sources while joining such tables: only one call per query. +## allow_push_predicate_when_subquery_contains_with {#allow_push_predicate_when_subquery_contains_with} -**See also:** +Type: Bool -- [Distributed subqueries](../../sql-reference/operators/in.md/#select-distributed-subqueries) for more information on how to use `GLOBAL IN`/`GLOBAL JOIN` +Default value: 1 -## enable_optimize_predicate_expression {#enable-optimize-predicate-expression} +Allows push predicate when subquery contains WITH clause -Turns on predicate pushdown in `SELECT` queries. +## allow_settings_after_format_in_insert {#allow_settings_after_format_in_insert} -Predicate pushdown may significantly reduce network traffic for distributed queries. +Type: Bool -Possible values: +Default value: 0 -- 0 — Disabled. -- 1 — Enabled. +Control whether `SETTINGS` after `FORMAT` in `INSERT` queries is allowed or not. It is not recommended to use this, since this may interpret part of `SETTINGS` as values. -Default value: 1. - -Usage - -Consider the following queries: - -1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` -2. 
`SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` - -If `enable_optimize_predicate_expression = 1`, then the execution time of these queries is equal because ClickHouse applies `WHERE` to the subquery when processing it. - -If `enable_optimize_predicate_expression = 0`, then the execution time of the second query is much longer because the `WHERE` clause applies to all the data after the subquery finishes. - -## fallback_to_stale_replicas_for_distributed_queries {#fallback_to_stale_replicas_for_distributed_queries} - -Forces a query to an out-of-date replica if updated data is not available. See [Replication](../../engines/table-engines/mergetree-family/replication.md). - -ClickHouse selects the most relevant from the outdated replicas of the table. - -Used when performing `SELECT` from a distributed table that points to replicated tables. - -By default, 1 (enabled). - -## force_index_by_date {#force_index_by_date} - -Disables query execution if the index can’t be used by date. - -Works with tables in the MergeTree family. - -If `force_index_by_date=1`, ClickHouse checks whether the query has a date key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For example, the condition `Date != ' 2000-01-01 '` is acceptable even when it matches all the data in the table (i.e., running the query requires a full scan). For more information about ranges of data in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). - -## force_primary_key {#force-primary-key} - -Disables query execution if indexing by the primary key is not possible. - -Works with tables in the MergeTree family. - -If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For more information about data ranges in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). - -## use_skip_indexes {#use_skip_indexes} - -Use data skipping indexes during query execution. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 1. - -## use_skip_indexes_if_final {#use_skip_indexes_if_final} - -Controls whether skipping indexes are used when executing a query with the FINAL modifier. - -By default, this setting is disabled because skip indexes may exclude rows (granules) containing the latest data, which could lead to incorrect results. When enabled, skipping indexes are applied even with the FINAL modifier, potentially improving performance but with the risk of missing recent updates. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 0. - -## force_data_skipping_indices {#force_data_skipping_indices} - -Disables query execution if passed data skipping indices wasn't used. - -Consider the following example: +Example: ```sql -CREATE TABLE data -( - key Int, - d1 Int, - d1_null Nullable(Int), - INDEX d1_idx d1 TYPE minmax GRANULARITY 1, - INDEX d1_null_idx assumeNotNull(d1_null) TYPE minmax GRANULARITY 1 -) -Engine=MergeTree() -ORDER BY key; - -SELECT * FROM data_01515; -SELECT * FROM data_01515 SETTINGS force_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error. 
-SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_idx'; -- query will produce INDEX_NOT_USED error. -SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx'; -- Ok. -SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`'; -- Ok (example of full featured parser). -SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- query will produce INDEX_NOT_USED error, since d1_null_idx is not used. -SELECT * FROM data_01515 WHERE d1 = 0 AND assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- Ok. +INSERT INTO FUNCTION null('foo String') SETTINGS max_threads=1 VALUES ('bar'); ``` -## ignore_data_skipping_indices {#ignore_data_skipping_indices} - -Ignores the skipping indexes specified if used by the query. - -Consider the following example: +But the following query will work only with `allow_settings_after_format_in_insert`: ```sql -CREATE TABLE data -( - key Int, - x Int, - y Int, - INDEX x_idx x TYPE minmax GRANULARITY 1, - INDEX y_idx y TYPE minmax GRANULARITY 1, - INDEX xy_idx (x,y) TYPE minmax GRANULARITY 1 -) -Engine=MergeTree() -ORDER BY key; - -INSERT INTO data VALUES (1, 2, 3); - -SELECT * FROM data; -SELECT * FROM data SETTINGS ignore_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error. -SELECT * FROM data SETTINGS ignore_data_skipping_indices='x_idx'; -- Ok. -SELECT * FROM data SETTINGS ignore_data_skipping_indices='na_idx'; -- Ok. - -SELECT * FROM data WHERE x = 1 AND y = 1 SETTINGS ignore_data_skipping_indices='xy_idx',force_data_skipping_indices='xy_idx' ; -- query will produce INDEX_NOT_USED error, since xy_idx is explictly ignored. -SELECT * FROM data WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; +SET allow_settings_after_format_in_insert=1; +INSERT INTO FUNCTION null('foo String') VALUES ('bar') SETTINGS max_threads=1; ``` -The query without ignoring any indexes: +Possible values: + +- 0 — Disallow. +- 1 — Allow. + +:::note +Use this setting only for backward compatibility if your use cases depend on old syntax. +::: + +## allow_simdjson {#allow_simdjson} + +Type: Bool + +Default value: 1 + +Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used. + +## allow_statistics_optimize {#allow_statistics_optimize} + +Type: Bool + +Default value: 0 + +Allows using statistics to optimize queries + +## allow_suspicious_codecs {#allow_suspicious_codecs} + +Type: Bool + +Default value: 0 + +If it is set to true, allow to specify meaningless compression codecs. + +## allow_suspicious_fixed_string_types {#allow_suspicious_fixed_string_types} + +Type: Bool + +Default value: 0 + +In CREATE TABLE statement allows creating columns of type FixedString(n) with n > 256. FixedString with length >= 256 is suspicious and most likely indicates a misuse + +## allow_suspicious_indices {#allow_suspicious_indices} + +Type: Bool + +Default value: 0 + +Reject primary/secondary indexes and sorting keys with identical expressions + +## allow_suspicious_low_cardinality_types {#allow_suspicious_low_cardinality_types} + +Type: Bool + +Default value: 0 + +Allows or restricts using [LowCardinality](../../sql-reference/data-types/lowcardinality.md) with data types with fixed size of 8 bytes or less: numeric data types and `FixedString(8_bytes_or_less)`. 
+ +For small fixed values using of `LowCardinality` is usually inefficient, because ClickHouse stores a numeric index for each row. As a result: + +- Disk space usage can rise. +- RAM consumption can be higher, depending on a dictionary size. +- Some functions can work slower due to extra coding/encoding operations. + +Merge times in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine tables can grow due to all the reasons described above. + +Possible values: + +- 1 — Usage of `LowCardinality` is not restricted. +- 0 — Usage of `LowCardinality` is restricted. + +## allow_suspicious_primary_key {#allow_suspicious_primary_key} + +Type: Bool + +Default value: 0 + +Allow suspicious `PRIMARY KEY`/`ORDER BY` for MergeTree (i.e. SimpleAggregateFunction). + +## allow_suspicious_ttl_expressions {#allow_suspicious_ttl_expressions} + +Type: Bool + +Default value: 0 + +Reject TTL expressions that don't depend on any of table's columns. It indicates a user error most of the time. + +## allow_suspicious_variant_types {#allow_suspicious_variant_types} + +Type: Bool + +Default value: 0 + +In CREATE TABLE statement allows specifying Variant type with similar variant types (for example, with different numeric or date types). Enabling this setting may introduce some ambiguity when working with values with similar types. + +## allow_suspicious_types_in_group_by {#allow_suspicious_types_in_group_by} + +Type: Bool + +Default value: 0 + +Allows or restricts using [Variant](../../sql-reference/data-types/variant.md) and [Dynamic](../../sql-reference/data-types/dynamic.md) types in GROUP BY keys. + +## allow_suspicious_types_in_order_by {#allow_suspicious_types_in_order_by} + +Type: Bool + +Default value: 0 + +Allows or restricts using [Variant](../../sql-reference/data-types/variant.md) and [Dynamic](../../sql-reference/data-types/dynamic.md) types in ORDER BY keys. + +## allow_unrestricted_reads_from_keeper {#allow_unrestricted_reads_from_keeper} + +Type: Bool + +Default value: 0 + +Allow unrestricted (without condition on path) reads from system.zookeeper table, can be handy, but is not safe for zookeeper + +## alter_move_to_space_execute_async {#alter_move_to_space_execute_async} + +Type: Bool + +Default value: 0 + +Execute ALTER TABLE MOVE ... TO [DISK|VOLUME] asynchronously + +## alter_partition_verbose_result {#alter_partition_verbose_result} + +Type: Bool + +Default value: 0 + +Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied. +Applicable to [ATTACH PARTITION|PART](../../sql-reference/statements/alter/partition.md/#alter_attach-partition) and to [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md/#alter_freeze-partition). + +Possible values: + +- 0 — disable verbosity. +- 1 — enable verbosity. 
+ +**Example** + ```sql -EXPLAIN indexes = 1 SELECT * FROM data WHERE x = 1 AND y = 2; +CREATE TABLE test(a Int64, d Date, s String) ENGINE = MergeTree PARTITION BY toYYYYMM(d) ORDER BY a; +INSERT INTO test VALUES(1, '2021-01-01', ''); +INSERT INTO test VALUES(1, '2021-01-01', ''); +ALTER TABLE test DETACH PARTITION ID '202101'; -Expression ((Projection + Before ORDER BY)) - Filter (WHERE) - ReadFromMergeTree (default.data) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 1/1 - Skip - Name: x_idx - Description: minmax GRANULARITY 1 - Parts: 0/1 - Granules: 0/1 - Skip - Name: y_idx - Description: minmax GRANULARITY 1 - Parts: 0/0 - Granules: 0/0 - Skip - Name: xy_idx - Description: minmax GRANULARITY 1 - Parts: 0/0 - Granules: 0/0 +ALTER TABLE test ATTACH PARTITION ID '202101' SETTINGS alter_partition_verbose_result = 1; + +┌─command_type─────┬─partition_id─┬─part_name────┬─old_part_name─┐ +│ ATTACH PARTITION │ 202101 │ 202101_7_7_0 │ 202101_5_5_0 │ +│ ATTACH PARTITION │ 202101 │ 202101_8_8_0 │ 202101_6_6_0 │ +└──────────────────┴──────────────┴──────────────┴───────────────┘ + +ALTER TABLE test FREEZE SETTINGS alter_partition_verbose_result = 1; + +┌─command_type─┬─partition_id─┬─part_name────┬─backup_name─┬─backup_path───────────────────┬─part_backup_path────────────────────────────────────────────┐ +│ FREEZE ALL │ 202101 │ 202101_7_7_0 │ 8 │ /var/lib/clickhouse/shadow/8/ │ /var/lib/clickhouse/shadow/8/data/default/test/202101_7_7_0 │ +│ FREEZE ALL │ 202101 │ 202101_8_8_0 │ 8 │ /var/lib/clickhouse/shadow/8/ │ /var/lib/clickhouse/shadow/8/data/default/test/202101_8_8_0 │ +└──────────────┴──────────────┴──────────────┴─────────────┴───────────────────────────────┴─────────────────────────────────────────────────────────────┘ ``` -Ignoring the `xy_idx` index: +## alter_sync {#alter_sync} + +Type: UInt64 + +Default value: 1 + +Allows to set up waiting for actions to be executed on replicas by [ALTER](../../sql-reference/statements/alter/index.md), [OPTIMIZE](../../sql-reference/statements/optimize.md) or [TRUNCATE](../../sql-reference/statements/truncate.md) queries. + +Possible values: + +- 0 — Do not wait. +- 1 — Wait for own execution. +- 2 — Wait for everyone. + +Cloud default value: `0`. + +:::note +`alter_sync` is applicable to `Replicated` tables only, it does nothing to alters of not `Replicated` tables. +::: + +## analyze_index_with_space_filling_curves {#analyze_index_with_space_filling_curves} + +Type: Bool + +Default value: 1 + +If a table has a space-filling curve in its index, e.g. `ORDER BY mortonEncode(x, y)` or `ORDER BY hilbertEncode(x, y)`, and the query has conditions on its arguments, e.g. `x >= 10 AND x <= 20 AND y >= 20 AND y <= 30`, use the space-filling curve for index analysis. + +## analyzer_compatibility_join_using_top_level_identifier {#analyzer_compatibility_join_using_top_level_identifier} + +Type: Bool + +Default value: 0 + +Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather then `t1.b = t2.b`). + +## any_join_distinct_right_table_keys {#any_join_distinct_right_table_keys} + +Type: Bool + +Default value: 0 + +Enables legacy ClickHouse server behaviour in `ANY INNER|LEFT JOIN` operations. + +:::note +Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour. 
+::: + +When the legacy behaviour is enabled: + +- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are not equal because ClickHouse uses the logic with many-to-one left-to-right table keys mapping. +- Results of `ANY INNER JOIN` operations contain all rows from the left table like the `SEMI LEFT JOIN` operations do. + +When the legacy behaviour is disabled: + +- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are equal because ClickHouse uses the logic which provides one-to-many keys mapping in `ANY RIGHT JOIN` operations. +- Results of `ANY INNER JOIN` operations contain one row per key from both the left and right tables. + +Possible values: + +- 0 — Legacy behaviour is disabled. +- 1 — Legacy behaviour is enabled. + +See also: + +- [JOIN strictness](../../sql-reference/statements/select/join.md/#join-settings) + +## apply_deleted_mask {#apply_deleted_mask} + +Type: Bool + +Default value: 1 + +Enables filtering out rows deleted with lightweight DELETE. If disabled, a query will be able to read those rows. This is useful for debugging and "undelete" scenarios. + +## apply_mutations_on_fly {#apply_mutations_on_fly} + +Type: Bool + +Default value: 0 + +If true, mutations (UPDATEs and DELETEs) which are not materialized in a data part will be applied on SELECTs. Only available in ClickHouse Cloud. + +## asterisk_include_alias_columns {#asterisk_include_alias_columns} + +Type: Bool + +Default value: 0 + +Include [ALIAS](../../sql-reference/statements/create/table.md#alias) columns for wildcard query (`SELECT *`). + +Possible values: + +- 0 - disabled +- 1 - enabled + +## asterisk_include_materialized_columns {#asterisk_include_materialized_columns} + +Type: Bool + +Default value: 0 + +Include [MATERIALIZED](../../sql-reference/statements/create/table.md#materialized) columns for wildcard query (`SELECT *`). + +Possible values: + +- 0 - disabled +- 1 - enabled + +## async_insert {#async_insert} + +Type: Bool + +Default value: 0 + +If true, data from INSERT queries is stored in a queue and later flushed to the table in the background. If wait_for_async_insert is false, the INSERT query is processed almost instantly; otherwise, the client will wait until the data is flushed to the table. + +## async_insert_busy_timeout_decrease_rate {#async_insert_busy_timeout_decrease_rate} + +Type: Double + +Default value: 0.2 + +The exponential rate at which the adaptive asynchronous insert timeout decreases + +## async_insert_busy_timeout_increase_rate {#async_insert_busy_timeout_increase_rate} + +Type: Double + +Default value: 0.2 + +The exponential rate at which the adaptive asynchronous insert timeout increases + +## async_insert_busy_timeout_max_ms {#async_insert_busy_timeout_max_ms} + +Type: Milliseconds + +Default value: 200 + +Maximum time to wait before dumping collected data per query since the first data appeared. + +## async_insert_busy_timeout_min_ms {#async_insert_busy_timeout_min_ms} + +Type: Milliseconds + +Default value: 50 + +If auto-adjusting is enabled through async_insert_use_adaptive_busy_timeout, minimum time to wait before dumping collected data per query since the first data appeared.
It also serves as the initial value for the adaptive algorithm + +## async_insert_deduplicate {#async_insert_deduplicate} + +Type: Bool + +Default value: 0 + +For async INSERT queries in the replicated table, specifies that deduplication of inserting blocks should be performed + +## async_insert_max_data_size {#async_insert_max_data_size} + +Type: UInt64 + +Default value: 10485760 + +Maximum size in bytes of unparsed data collected per query before being inserted + +## async_insert_max_query_number {#async_insert_max_query_number} + +Type: UInt64 + +Default value: 450 + +Maximum number of insert queries before being inserted + +## async_insert_poll_timeout_ms {#async_insert_poll_timeout_ms} + +Type: Milliseconds + +Default value: 10 + +Timeout for polling data from asynchronous insert queue + +## async_insert_use_adaptive_busy_timeout {#async_insert_use_adaptive_busy_timeout} + +Type: Bool + +Default value: 1 + +If it is set to true, use adaptive busy timeout for asynchronous inserts + +## async_query_sending_for_remote {#async_query_sending_for_remote} + +Type: Bool + +Default value: 1 + +Enables asynchronous connection creation and query sending while executing remote query. + +Enabled by default. + +## async_socket_for_remote {#async_socket_for_remote} + +Type: Bool + +Default value: 1 + +Enables asynchronous read from socket while executing remote query. + +Enabled by default. + +## azure_allow_parallel_part_upload {#azure_allow_parallel_part_upload} + +Type: Bool + +Default value: 1 + +Use multiple threads for azure multipart upload. + +## azure_create_new_file_on_insert {#azure_create_new_file_on_insert} + +Type: Bool + +Default value: 0 + +Enables or disables creating a new file on each insert in azure engine tables + +## azure_ignore_file_doesnt_exist {#azure_ignore_file_doesnt_exist} + +Type: Bool + +Default value: 0 + +Ignore absence of file if it does not exist when reading certain keys. + +Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. + +## azure_list_object_keys_size {#azure_list_object_keys_size} + +Type: UInt64 + +Default value: 1000 + +Maximum number of files that could be returned in batch by ListObject request + +## azure_max_blocks_in_multipart_upload {#azure_max_blocks_in_multipart_upload} + +Type: UInt64 + +Default value: 50000 + +Maximum number of blocks in multipart upload for Azure. + +## azure_max_inflight_parts_for_one_file {#azure_max_inflight_parts_for_one_file} + +Type: UInt64 + +Default value: 20 + +The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited. + +## azure_max_single_part_copy_size {#azure_max_single_part_copy_size} + +Type: UInt64 + +Default value: 268435456 + +The maximum size of object to copy using single part copy to Azure blob storage. + +## azure_max_single_part_upload_size {#azure_max_single_part_upload_size} + +Type: UInt64 + +Default value: 104857600 + +The maximum size of object to upload using singlepart upload to Azure blob storage. + +## azure_max_single_read_retries {#azure_max_single_read_retries} + +Type: UInt64 + +Default value: 4 + +The maximum number of retries during single Azure blob storage read. 
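+
+The Azure-related settings above are ordinary query-level settings, so they can be tuned in a `SETTINGS` clause. The sketch below is only illustrative: `azure_backed_table` is a hypothetical table stored on Azure Blob Storage, and the values are examples rather than recommendations.
+
+```sql
+-- Tune the listing batch size and read retry count for a single query
+-- against a (hypothetical) table whose data lives in Azure Blob Storage.
+SELECT count()
+FROM azure_backed_table
+SETTINGS
+    azure_list_object_keys_size = 5000,
+    azure_max_single_read_retries = 8;
+```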
+ +## azure_max_unexpected_write_error_retries {#azure_max_unexpected_write_error_retries} + +Type: UInt64 + +Default value: 4 + +The maximum number of retries in case of unexpected errors during Azure blob storage write + +## azure_max_upload_part_size {#azure_max_upload_part_size} + +Type: UInt64 + +Default value: 5368709120 + +The maximum size of part to upload during multipart upload to Azure blob storage. + +## azure_min_upload_part_size {#azure_min_upload_part_size} + +Type: UInt64 + +Default value: 16777216 + +The minimum size of part to upload during multipart upload to Azure blob storage. + +## azure_sdk_max_retries {#azure_sdk_max_retries} + +Type: UInt64 + +Default value: 10 + +Maximum number of retries in azure sdk + +## azure_sdk_retry_initial_backoff_ms {#azure_sdk_retry_initial_backoff_ms} + +Type: UInt64 + +Default value: 10 + +Minimal backoff between retries in azure sdk + +## azure_sdk_retry_max_backoff_ms {#azure_sdk_retry_max_backoff_ms} + +Type: UInt64 + +Default value: 1000 + +Maximal backoff between retries in azure sdk + +## azure_skip_empty_files {#azure_skip_empty_files} + +Type: Bool + +Default value: 0 + +Enables or disables skipping empty files in S3 engine. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +## azure_strict_upload_part_size {#azure_strict_upload_part_size} + +Type: UInt64 + +Default value: 0 + +The exact size of part to upload during multipart upload to Azure blob storage. + +## azure_throw_on_zero_files_match {#azure_throw_on_zero_files_match} + +Type: Bool + +Default value: 0 + +Throw an error if matched zero files according to glob expansion rules. + +Possible values: +- 1 — `SELECT` throws an exception. +- 0 — `SELECT` returns empty result. + +## azure_truncate_on_insert {#azure_truncate_on_insert} + +Type: Bool + +Default value: 0 + +Enables or disables truncate before insert in azure engine tables. + +## azure_upload_part_size_multiply_factor {#azure_upload_part_size_multiply_factor} + +Type: UInt64 + +Default value: 2 + +Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage. + +## azure_upload_part_size_multiply_parts_count_threshold {#azure_upload_part_size_multiply_parts_count_threshold} + +Type: UInt64 + +Default value: 500 + +Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor. + +## backup_restore_batch_size_for_keeper_multi {#backup_restore_batch_size_for_keeper_multi} + +Type: UInt64 + +Default value: 1000 + +Maximum size of batch for multi request to [Zoo]Keeper during backup or restore + +## backup_restore_batch_size_for_keeper_multiread {#backup_restore_batch_size_for_keeper_multiread} + +Type: UInt64 + +Default value: 10000 + +Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore + +## backup_restore_keeper_fault_injection_probability {#backup_restore_keeper_fault_injection_probability} + +Type: Float + +Default value: 0 + +Approximate probability of failure for a keeper request during backup or restore. 
Valid value is in interval [0.0f, 1.0f] + +## backup_restore_keeper_fault_injection_seed {#backup_restore_keeper_fault_injection_seed} + +Type: UInt64 + +Default value: 0 + +0 - random seed, otherwise the setting value + +## backup_restore_keeper_max_retries {#backup_restore_keeper_max_retries} + +Type: UInt64 + +Default value: 20 + +Max retries for keeper operations during backup or restore + +## backup_restore_keeper_retry_initial_backoff_ms {#backup_restore_keeper_retry_initial_backoff_ms} + +Type: UInt64 + +Default value: 100 + +Initial backoff timeout for [Zoo]Keeper operations during backup or restore + +## backup_restore_keeper_retry_max_backoff_ms {#backup_restore_keeper_retry_max_backoff_ms} + +Type: UInt64 + +Default value: 5000 + +Max backoff timeout for [Zoo]Keeper operations during backup or restore + +## backup_restore_keeper_value_max_size {#backup_restore_keeper_value_max_size} + +Type: UInt64 + +Default value: 1048576 + +Maximum size of data of a [Zoo]Keeper's node during backup + +## backup_restore_s3_retry_attempts {#backup_restore_s3_retry_attempts} + +Type: UInt64 + +Default value: 1000 + +Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries. It takes place only for backup/restore. + +## cache_warmer_threads {#cache_warmer_threads} + +Type: UInt64 + +Default value: 4 + +Only available in ClickHouse Cloud. Number of background threads for speculatively downloading new data parts into file cache, when cache_populated_by_fetch is enabled. Zero to disable. + +## calculate_text_stack_trace {#calculate_text_stack_trace} + +Type: Bool + +Default value: 1 + +Calculate text stack trace in case of exceptions during query execution. This is the default. It requires symbol lookups that may slow down fuzzing tests when a huge amount of wrong queries are executed. In normal cases, you should not disable this option. + +## cancel_http_readonly_queries_on_client_close {#cancel_http_readonly_queries_on_client_close} + +Type: Bool + +Default value: 0 + +Cancels HTTP read-only queries (e.g. SELECT) when a client closes the connection without waiting for the response. + +Cloud default value: `1`. + +## cast_ipv4_ipv6_default_on_conversion_error {#cast_ipv4_ipv6_default_on_conversion_error} + +Type: Bool + +Default value: 0 + +CAST operator into IPv4, CAST operator into IPV6 type, toIPv4, toIPv6 functions will return default value instead of throwing exception on conversion error. + +## cast_keep_nullable {#cast_keep_nullable} + +Type: Bool + +Default value: 0 + +Enables or disables keeping of the `Nullable` data type in [CAST](../../sql-reference/functions/type-conversion-functions.md/#castx-t) operations. + +When the setting is enabled and the argument of `CAST` function is `Nullable`, the result is also transformed to `Nullable` type. When the setting is disabled, the result always has the destination type exactly. + +Possible values: + +- 0 — The `CAST` result has exactly the destination type specified. +- 1 — If the argument type is `Nullable`, the `CAST` result is transformed to `Nullable(DestinationDataType)`. 
+ +**Examples** + +The following query results in the destination data type exactly: + ```sql -EXPLAIN indexes = 1 SELECT * FROM data WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; - -Expression ((Projection + Before ORDER BY)) - Filter (WHERE) - ReadFromMergeTree (default.data) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 1/1 - Skip - Name: x_idx - Description: minmax GRANULARITY 1 - Parts: 0/1 - Granules: 0/1 - Skip - Name: y_idx - Description: minmax GRANULARITY 1 - Parts: 0/0 - Granules: 0/0 +SET cast_keep_nullable = 0; +SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); ``` -Works with tables in the MergeTree family. +Result: + +```text +┌─x─┬─toTypeName(CAST(toNullable(toInt32(0)), 'Int32'))─┐ +│ 0 │ Int32 │ +└───┴───────────────────────────────────────────────────┘ +``` + +The following query results in the `Nullable` modification on the destination data type: + +```sql +SET cast_keep_nullable = 1; +SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); +``` + +Result: + +```text +┌─x─┬─toTypeName(CAST(toNullable(toInt32(0)), 'Int32'))─┐ +│ 0 │ Nullable(Int32) │ +└───┴───────────────────────────────────────────────────┘ +``` + +**See Also** + +- [CAST](../../sql-reference/functions/type-conversion-functions.md/#type_conversion_function-cast) function + +## cast_string_to_dynamic_use_inference {#cast_string_to_dynamic_use_inference} + +Type: Bool + +Default value: 0 + +Use types inference during String to Dynamic conversion + +## check_query_single_value_result {#check_query_single_value_result} + +Type: Bool + +Default value: 1 + +Defines the level of detail for the [CHECK TABLE](../../sql-reference/statements/check-table.md/#checking-mergetree-tables) query result for `MergeTree` family engines . + +Possible values: + +- 0 — the query shows a check status for every individual data part of a table. +- 1 — the query shows the general table check status. + +## check_referential_table_dependencies {#check_referential_table_dependencies} + +Type: Bool + +Default value: 0 + +Check that DDL query (such as DROP TABLE or RENAME) will not break referential dependencies + +## check_table_dependencies {#check_table_dependencies} + +Type: Bool + +Default value: 1 + +Check that DDL query (such as DROP TABLE or RENAME) will not break dependencies + +## checksum_on_read {#checksum_on_read} + +Type: Bool + +Default value: 1 + +Validate checksums on reading. It is enabled by default and should be always enabled in production. Please do not expect any benefits in disabling this setting. It may only be used for experiments and benchmarks. The setting is only applicable for tables of MergeTree family. Checksums are always validated for other table engines and when receiving data over the network. + +## cloud_mode {#cloud_mode} + +Type: Bool + +Default value: 0 + +Cloud mode + +## cloud_mode_database_engine {#cloud_mode_database_engine} + +Type: UInt64 + +Default value: 1 + +The database engine allowed in Cloud. 1 - rewrite DDLs to use Replicated database, 2 - rewrite DDLs to use Shared database + +## cloud_mode_engine {#cloud_mode_engine} + +Type: UInt64 + +Default value: 1 + +The engine family allowed in Cloud. 0 - allow everything, 1 - rewrite DDLs to use *ReplicatedMergeTree, 2 - rewrite DDLs to use SharedMergeTree. 
UInt64 to minimize public part + +## cluster_for_parallel_replicas {#cluster_for_parallel_replicas} + +Type: String + +Default value: + +Cluster for a shard in which current server is located + +## collect_hash_table_stats_during_aggregation {#collect_hash_table_stats_during_aggregation} + +Type: Bool + +Default value: 1 + +Enable collecting hash table statistics to optimize memory allocation + +## collect_hash_table_stats_during_joins {#collect_hash_table_stats_during_joins} + +Type: Bool + +Default value: 1 + +Enable collecting hash table statistics to optimize memory allocation + +## compatibility {#compatibility} + +Type: String + +Default value: + +The `compatibility` setting causes ClickHouse to use the default settings of a previous version of ClickHouse, where the previous version is provided as the setting. + +If settings are set to non-default values, then those settings are honored (only settings that have not been modified are affected by the `compatibility` setting). + +This setting takes a ClickHouse version number as a string, like `22.3`, `22.8`. An empty value means that this setting is disabled. + +Disabled by default. + +:::note +In ClickHouse Cloud the compatibility setting must be set by ClickHouse Cloud support. Please [open a case](https://clickhouse.cloud/support) to have it set. +::: + +## compatibility_ignore_auto_increment_in_create_table {#compatibility_ignore_auto_increment_in_create_table} + +Type: Bool + +Default value: 0 + +Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL + +## compatibility_ignore_collation_in_create_table {#compatibility_ignore_collation_in_create_table} + +Type: Bool + +Default value: 1 + +Compatibility ignore collation in create table + +## compile_aggregate_expressions {#compile_aggregate_expressions} + +Type: Bool + +Default value: 1 + +Enables or disables JIT-compilation of aggregate functions to native code. Enabling this setting can improve the performance. + +Possible values: + +- 0 — Aggregation is done without JIT compilation. +- 1 — Aggregation is done using JIT compilation. + +**See Also** + +- [min_count_to_compile_aggregate_expression](#min_count_to_compile_aggregate_expression) + +## compile_expressions {#compile_expressions} + +Type: Bool + +Default value: 0 + +Compile some scalar functions and operators to native code. Due to a bug in the LLVM compiler infrastructure, on AArch64 machines, it is known to lead to a nullptr dereference and, consequently, server crash. Do not enable this setting. + +## compile_sort_description {#compile_sort_description} + +Type: Bool + +Default value: 1 + +Compile sort description to native code. + +## connect_timeout {#connect_timeout} + +Type: Seconds + +Default value: 10 + +Connection timeout if there are no replicas. + +## connect_timeout_with_failover_ms {#connect_timeout_with_failover_ms} + +Type: Milliseconds + +Default value: 1000 + +The timeout in milliseconds for connecting to a remote server for a Distributed table engine, if the ‘shard’ and ‘replica’ sections are used in the cluster definition. +If unsuccessful, several attempts are made to connect to various replicas. + +## connect_timeout_with_failover_secure_ms {#connect_timeout_with_failover_secure_ms} + +Type: Milliseconds + +Default value: 1000 + +Connection timeout for selecting first healthy replica (for secure connections). 
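+
+As a brief illustration of the `compatibility` setting described above, a session can be pinned to the defaults of an older release; the version string below is only an example.
+
+```sql
+-- Settings that were not explicitly changed in this session fall back to
+-- the defaults that ClickHouse 22.8 shipped with.
+SET compatibility = '22.8';
+
+-- Inspect which settings are considered changed as a result.
+SELECT name, value FROM system.settings WHERE changed;
+```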
+ +## connection_pool_max_wait_ms {#connection_pool_max_wait_ms} + +Type: Milliseconds + +Default value: 0 + +The wait time in milliseconds for a connection when the connection pool is full. + +Possible values: + +- Positive integer. +- 0 — Infinite timeout. + +## connections_with_failover_max_tries {#connections_with_failover_max_tries} + +Type: UInt64 + +Default value: 3 + +The maximum number of connection attempts with each replica for the Distributed table engine. ## convert_query_to_cnf {#convert_query_to_cnf} +Type: Bool + +Default value: 0 + When set to `true`, a `SELECT` query will be converted to conjuctive normal form (CNF). There are scenarios where rewriting a query in CNF may execute faster (view this [Github issue](https://github.com/ClickHouse/ClickHouse/issues/11749) for an explanation). For example, notice how the following `SELECT` query is not modified (the default behavior): @@ -407,32 +1608,916 @@ Notice the `WHERE` clause is rewritten in CNF, but the result set is the identic Possible values: true, false -Default value: false +## count_distinct_implementation {#count_distinct_implementation} +Type: String -## fsync_metadata {#fsync-metadata} +Default value: uniqExact -Enables or disables [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) when writing `.sql` files. Enabled by default. +Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction. -It makes sense to disable it if the server has millions of tiny tables that are constantly being created and destroyed. +Possible values: -## function_range_max_elements_in_block {#function_range_max_elements_in_block} +- [uniq](../../sql-reference/aggregate-functions/reference/uniq.md/#agg_function-uniq) +- [uniqCombined](../../sql-reference/aggregate-functions/reference/uniqcombined.md/#agg_function-uniqcombined) +- [uniqCombined64](../../sql-reference/aggregate-functions/reference/uniqcombined64.md/#agg_function-uniqcombined64) +- [uniqHLL12](../../sql-reference/aggregate-functions/reference/uniqhll12.md/#agg_function-uniqhll12) +- [uniqExact](../../sql-reference/aggregate-functions/reference/uniqexact.md/#agg_function-uniqexact) -Sets the safety threshold for data volume generated by function [range](../../sql-reference/functions/array-functions.md/#range). Defines the maximum number of values generated by function per block of data (sum of array sizes for every row in a block). +## count_distinct_optimization {#count_distinct_optimization} + +Type: Bool + +Default value: 0 + +Rewrite count distinct to subquery of group by + +## create_if_not_exists {#create_if_not_exists} + +Type: Bool + +Default value: 0 + +Enable `IF NOT EXISTS` for `CREATE` statement by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown. + +## create_index_ignore_unique {#create_index_ignore_unique} + +Type: Bool + +Default value: 0 + +Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests. 
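+
+As a small sketch of `count_distinct_implementation` (the table and column below are hypothetical): choosing an approximate `uniq*` function trades exactness for speed and memory.
+
+```sql
+-- COUNT(DISTINCT ...) is executed with the configured uniq* function,
+-- here the approximate uniqCombined instead of the default uniqExact.
+SET count_distinct_implementation = 'uniqCombined';
+SELECT count(DISTINCT user_id) FROM visits;
+```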
+ +## create_replicated_merge_tree_fault_injection_probability {#create_replicated_merge_tree_fault_injection_probability} + +Type: Float + +Default value: 0 + +The probability of a fault injection during table creation after creating metadata in ZooKeeper + +## create_table_empty_primary_key_by_default {#create_table_empty_primary_key_by_default} + +Type: Bool + +Default value: 0 + +Allow to create *MergeTree tables with empty primary key when ORDER BY and PRIMARY KEY not specified + +## cross_join_min_bytes_to_compress {#cross_join_min_bytes_to_compress} + +Type: UInt64 + +Default value: 1073741824 + +Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. + +## cross_join_min_rows_to_compress {#cross_join_min_rows_to_compress} + +Type: UInt64 + +Default value: 10000000 + +Minimal count of rows to compress block in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. + +## data_type_default_nullable {#data_type_default_nullable} + +Type: Bool + +Default value: 0 + +Allows data types without explicit modifiers [NULL or NOT NULL](../../sql-reference/statements/create/table.md/#null-modifiers) in column definition will be [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable). + +Possible values: + +- 1 — The data types in column definitions are set to `Nullable` by default. +- 0 — The data types in column definitions are set to not `Nullable` by default. + +## database_atomic_wait_for_drop_and_detach_synchronously {#database_atomic_wait_for_drop_and_detach_synchronously} + +Type: Bool + +Default value: 0 + +Adds a modifier `SYNC` to all `DROP` and `DETACH` queries. + +Possible values: + +- 0 — Queries will be executed with delay. +- 1 — Queries will be executed without delay. + +## database_replicated_allow_explicit_uuid {#database_replicated_allow_explicit_uuid} + +Type: UInt64 + +Default value: 0 + +0 - Don't allow to explicitly specify UUIDs for tables in Replicated databases. 1 - Allow. 2 - Allow, but ignore the specified UUID and generate a random one instead. + +## database_replicated_allow_heavy_create {#database_replicated_allow_heavy_create} + +Type: Bool + +Default value: 0 + +Allow long-running DDL queries (CREATE AS SELECT and POPULATE) in Replicated database engine. Note that it can block DDL queue for a long time. + +## database_replicated_allow_only_replicated_engine {#database_replicated_allow_only_replicated_engine} + +Type: Bool + +Default value: 0 + +Allow to create only Replicated tables in database with engine Replicated + +## database_replicated_allow_replicated_engine_arguments {#database_replicated_allow_replicated_engine_arguments} + +Type: UInt64 + +Default value: 0 + +0 - Don't allow to explicitly specify ZooKeeper path and replica name for *MergeTree tables in Replicated databases. 1 - Allow. 2 - Allow, but ignore the specified path and use default one instead. 3 - Allow and don't log a warning. 
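+
+A minimal sketch of `data_type_default_nullable` (the table name is hypothetical):
+
+```sql
+SET data_type_default_nullable = 1;
+
+-- Both columns are created as Nullable(Int32) and Nullable(String),
+-- because no explicit NULL/NOT NULL modifier is given.
+CREATE TABLE t_nullable_by_default
+(
+    x Int32,
+    s String
+)
+ENGINE = MergeTree
+ORDER BY tuple();
+```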
+ +## database_replicated_always_detach_permanently {#database_replicated_always_detach_permanently} + +Type: Bool + +Default value: 0 + +Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated + +## database_replicated_enforce_synchronous_settings {#database_replicated_enforce_synchronous_settings} + +Type: Bool + +Default value: 0 + +Enforces synchronous waiting for some queries (see also database_atomic_wait_for_drop_and_detach_synchronously, mutation_sync, alter_sync). Not recommended to enable these settings. + +## database_replicated_initial_query_timeout_sec {#database_replicated_initial_query_timeout_sec} + +Type: UInt64 + +Default value: 300 + +Sets how long initial DDL query should wait for Replicated database to process previous DDL queue entries in seconds. Possible values: - Positive integer. +- 0 — Unlimited. -Default value: `500,000,000`. +## decimal_check_overflow {#decimal_check_overflow} + +Type: Bool + +Default value: 1 + +Check overflow of decimal arithmetic/comparison operations + +## deduplicate_blocks_in_dependent_materialized_views {#deduplicate_blocks_in_dependent_materialized_views} + +Type: Bool + +Default value: 0 + +Enables or disables the deduplication check for materialized views that receive data from Replicated\* tables. + +Possible values: + + 0 — Disabled. + 1 — Enabled. + +Usage + +By default, deduplication is not performed for materialized views but is done upstream, in the source table. +If an INSERTed block is skipped due to deduplication in the source table, there will be no insertion into attached materialized views. This behaviour exists to enable the insertion of highly aggregated data into materialized views, for cases where inserted blocks are the same after materialized view aggregation but derived from different INSERTs into the source table. +At the same time, this behaviour “breaks” `INSERT` idempotency. If an `INSERT` into the main table was successful and `INSERT` into a materialized view failed (e.g. because of communication failure with ClickHouse Keeper) a client will get an error and can retry the operation. However, the materialized view won’t receive the second insert because it will be discarded by deduplication in the main (source) table. The setting `deduplicate_blocks_in_dependent_materialized_views` allows for changing this behaviour. On retry, a materialized view will receive the repeat insert and will perform a deduplication check by itself, +ignoring check result for the source table, and will insert rows lost because of the first failure. + +## default_materialized_view_sql_security {#default_materialized_view_sql_security} + +Type: SQLSecurityType + +Default value: DEFINER + +Allows to set a default value for SQL SECURITY option when creating a materialized view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). + +The default value is `DEFINER`. + +## default_max_bytes_in_join {#default_max_bytes_in_join} + +Type: UInt64 + +Default value: 1000000000 + +Maximum size of right-side table if limit is required but max_bytes_in_join is not set. + +## default_normal_view_sql_security {#default_normal_view_sql_security} + +Type: SQLSecurityType + +Default value: INVOKER + +Allows to set default `SQL SECURITY` option while creating a normal view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). + +The default value is `INVOKER`. 
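+
+For illustration, the default SQL security of newly created normal views can be changed per session; the view and source table below are hypothetical.
+
+```sql
+-- New normal views are created with SQL SECURITY DEFINER
+-- instead of the default INVOKER.
+SET default_normal_view_sql_security = 'DEFINER';
+CREATE VIEW orders_view AS SELECT * FROM orders;
+```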
+ +## default_table_engine {#default_table_engine} + +Type: DefaultTableEngine + +Default value: MergeTree + +Default table engine to use when `ENGINE` is not set in a `CREATE` statement. + +Possible values: + +- a string representing any valid table engine name + +Cloud default value: `SharedMergeTree`. + +**Example** + +Query: + +```sql +SET default_table_engine = 'Log'; + +SELECT name, value, changed FROM system.settings WHERE name = 'default_table_engine'; +``` + +Result: + +```response +┌─name─────────────────┬─value─┬─changed─┐ +│ default_table_engine │ Log │ 1 │ +└──────────────────────┴───────┴─────────┘ +``` + +In this example, any new table that does not specify an `Engine` will use the `Log` table engine: + +Query: + +```sql +CREATE TABLE my_table ( + x UInt32, + y UInt32 +); + +SHOW CREATE TABLE my_table; +``` + +Result: + +```response +┌─statement────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.my_table +( + `x` UInt32, + `y` UInt32 +) +ENGINE = Log +└──────────────────────────────────────────────────────────────────────────┘ +``` + +## default_temporary_table_engine {#default_temporary_table_engine} + +Type: DefaultTableEngine + +Default value: Memory + +Same as [default_table_engine](#default_table_engine) but for temporary tables. + +In this example, any new temporary table that does not specify an `Engine` will use the `Log` table engine: + +Query: + +```sql +SET default_temporary_table_engine = 'Log'; + +CREATE TEMPORARY TABLE my_table ( + x UInt32, + y UInt32 +); + +SHOW CREATE TEMPORARY TABLE my_table; +``` + +Result: + +```response +┌─statement────────────────────────────────────────────────────────────────┐ +│ CREATE TEMPORARY TABLE default.my_table +( + `x` UInt32, + `y` UInt32 +) +ENGINE = Log +└──────────────────────────────────────────────────────────────────────────┘ +``` + +## default_view_definer {#default_view_definer} + +Type: String + +Default value: CURRENT_USER + +Allows to set default `DEFINER` option while creating a view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). + +The default value is `CURRENT_USER`. + +## describe_compact_output {#describe_compact_output} + +Type: Bool + +Default value: 0 + +If true, include only column names and types into result of DESCRIBE query + +## describe_extend_object_types {#describe_extend_object_types} + +Type: Bool + +Default value: 0 + +Deduce concrete type of columns of type Object in DESCRIBE query + +## describe_include_subcolumns {#describe_include_subcolumns} + +Type: Bool + +Default value: 0 + +Enables describing subcolumns for a [DESCRIBE](../../sql-reference/statements/describe-table.md) query. For example, members of a [Tuple](../../sql-reference/data-types/tuple.md) or subcolumns of a [Map](../../sql-reference/data-types/map.md/#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md/#finding-null) or an [Array](../../sql-reference/data-types/array.md/#array-size) data type. + +Possible values: + +- 0 — Subcolumns are not included in `DESCRIBE` queries. +- 1 — Subcolumns are included in `DESCRIBE` queries. + +**Example** + +See an example for the [DESCRIBE](../../sql-reference/statements/describe-table.md) statement. 
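+
+The `DESCRIBE`-related settings above can be combined in one statement; the sketch below uses a hypothetical table with a named `Tuple` column.
+
+```sql
+CREATE TABLE t_desc
+(
+    id UInt64,
+    point Tuple(x Float64, y Float64)
+)
+ENGINE = Memory;
+
+DESCRIBE TABLE t_desc SETTINGS describe_compact_output = 1;      -- only names and types
+DESCRIBE TABLE t_desc SETTINGS describe_include_subcolumns = 1;  -- also lists point.x and point.y
+```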
+ +## describe_include_virtual_columns {#describe_include_virtual_columns} + +Type: Bool + +Default value: 0 + +If true, virtual columns of the table will be included in the result of the DESCRIBE query + +## dialect {#dialect} + +Type: Dialect + +Default value: clickhouse + +Which dialect will be used to parse the query + +## dictionary_validate_primary_key_type {#dictionary_validate_primary_key_type} + +Type: Bool + +Default value: 0 + +Validate the primary key type for dictionaries. By default, the id type for simple layouts will be implicitly converted to UInt64. + +## distinct_overflow_mode {#distinct_overflow_mode} + +Type: OverflowMode + +Default value: throw + +What to do when the limit is exceeded. + +## distributed_aggregation_memory_efficient {#distributed_aggregation_memory_efficient} + +Type: Bool + +Default value: 1 + +Enables the memory-saving mode of distributed aggregation. + +## distributed_background_insert_batch {#distributed_background_insert_batch} + +Type: Bool + +Default value: 0 + +Enables/disables inserted data sending in batches. + +When batch sending is enabled, the [Distributed](../../engines/table-engines/special/distributed.md) table engine tries to send multiple files of inserted data in one operation instead of sending them separately. Batch sending improves cluster performance by better utilizing server and network resources. + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. + +## distributed_background_insert_max_sleep_time_ms {#distributed_background_insert_max_sleep_time_ms} + +Type: Milliseconds + +Default value: 30000 + +Maximum interval for the [Distributed](../../engines/table-engines/special/distributed.md) table engine to send data. Limits exponential growth of the interval set in the [distributed_background_insert_sleep_time_ms](#distributed_background_insert_sleep_time_ms) setting. + +Possible values: + +- A positive integer number of milliseconds. + +## distributed_background_insert_sleep_time_ms {#distributed_background_insert_sleep_time_ms} + +Type: Milliseconds + +Default value: 100 + +Base interval for the [Distributed](../../engines/table-engines/special/distributed.md) table engine to send data. The actual interval grows exponentially in the event of errors. + +Possible values: + +- A positive integer number of milliseconds. + +## distributed_background_insert_split_batch_on_failure {#distributed_background_insert_split_batch_on_failure} + +Type: Bool + +Default value: 0 + +Enables/disables splitting batches on failures. + +Sometimes sending a particular batch to the remote shard may fail because of a complex pipeline downstream (e.g. a `MATERIALIZED VIEW` with `GROUP BY`) and errors such as `Memory limit exceeded`. In this case, retrying will not help (and this will leave distributed sends for the table stuck), but sending the files from that batch one by one may allow the INSERT to succeed. + +So setting this to `1` disables batching for such batches (i.e. it temporarily disables `distributed_background_insert_batch` for failed batches). + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. + +:::note +This setting also affects broken batches (that may appear because of abnormal server (machine) termination and no `fsync_after_insert`/`fsync_directories` for the [Distributed](../../engines/table-engines/special/distributed.md) table engine). +::: + +:::note +You should not rely on automatic batch splitting, since this may hurt performance.
+::: + +## distributed_background_insert_timeout {#distributed_background_insert_timeout} + +Type: UInt64 + +Default value: 0 + +Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout. + +## distributed_cache_bypass_connection_pool {#distributed_cache_bypass_connection_pool} + +Type: Bool + +Default value: 0 + +Only in ClickHouse Cloud. Allow to bypass distributed cache connection pool + +## distributed_cache_connect_max_tries {#distributed_cache_connect_max_tries} + +Type: UInt64 + +Default value: 100 + +Only in ClickHouse Cloud. Number of tries to connect to distributed cache if unsuccessful + +## distributed_cache_data_packet_ack_window {#distributed_cache_data_packet_ack_window} + +Type: UInt64 + +Default value: 5 + +Only in ClickHouse Cloud. A window for sending ACK for DataPacket sequence in a single distributed cache read request + +## distributed_cache_fetch_metrics_only_from_current_az {#distributed_cache_fetch_metrics_only_from_current_az} + +Type: Bool + +Default value: 1 + +Only in ClickHouse Cloud. Fetch metrics only from current availability zone in system.distributed_cache_metrics, system.distributed_cache_events + +## distributed_cache_log_mode {#distributed_cache_log_mode} + +Type: DistributedCacheLogMode + +Default value: on_error + +Only in ClickHouse Cloud. Mode for writing to system.distributed_cache_log + +## distributed_cache_max_unacked_inflight_packets {#distributed_cache_max_unacked_inflight_packets} + +Type: UInt64 + +Default value: 10 + +Only in ClickHouse Cloud. A maximum number of unacknowledged in-flight packets in a single distributed cache read request + +## distributed_cache_pool_behaviour_on_limit {#distributed_cache_pool_behaviour_on_limit} + +Type: DistributedCachePoolBehaviourOnLimit + +Default value: allocate_bypassing_pool + +Only in ClickHouse Cloud. Identifies behaviour of distributed cache connection on pool limit reached + +## distributed_cache_read_alignment {#distributed_cache_read_alignment} + +Type: UInt64 + +Default value: 0 + +Only in ClickHouse Cloud. A setting for testing purposes, do not change it + +## distributed_cache_receive_response_wait_milliseconds {#distributed_cache_receive_response_wait_milliseconds} + +Type: UInt64 + +Default value: 60000 + +Only in ClickHouse Cloud. Wait time in milliseconds to receive data for request from distributed cache + +## distributed_cache_receive_timeout_milliseconds {#distributed_cache_receive_timeout_milliseconds} + +Type: UInt64 + +Default value: 10000 + +Only in ClickHouse Cloud. Wait time in milliseconds to receive any kind of response from distributed cache + +## distributed_cache_throw_on_error {#distributed_cache_throw_on_error} + +Type: Bool + +Default value: 0 + +Only in ClickHouse Cloud. Rethrow exception happened during communication with distributed cache or exception received from distributed cache. Otherwise fallback to skipping distributed cache on error + +## distributed_cache_wait_connection_from_pool_milliseconds {#distributed_cache_wait_connection_from_pool_milliseconds} + +Type: UInt64 + +Default value: 100 + +Only in ClickHouse Cloud. 
Wait time in milliseconds to receive connection from connection pool if distributed_cache_pool_behaviour_on_limit is wait + +## distributed_connections_pool_size {#distributed_connections_pool_size} + +Type: UInt64 + +Default value: 1024 + +The maximum number of simultaneous connections with remote servers for distributed processing of all queries to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. + +## distributed_ddl_entry_format_version {#distributed_ddl_entry_format_version} + +Type: UInt64 + +Default value: 5 + +Compatibility version of distributed DDL (ON CLUSTER) queries + +## distributed_ddl_output_mode {#distributed_ddl_output_mode} + +Type: DistributedDDLOutputMode + +Default value: throw + +Sets format of distributed DDL query result. + +Possible values: + +- `throw` — Returns result set with query execution status for all hosts where query is finished. If query has failed on some hosts, then it will rethrow the first exception. If query is not finished yet on some hosts and [distributed_ddl_task_timeout](#distributed_ddl_task_timeout) exceeded, then it throws `TIMEOUT_EXCEEDED` exception. +- `none` — Is similar to throw, but distributed DDL query returns no result set. +- `null_status_on_timeout` — Returns `NULL` as execution status in some rows of result set instead of throwing `TIMEOUT_EXCEEDED` if query is not finished on the corresponding hosts. +- `never_throw` — Do not throw `TIMEOUT_EXCEEDED` and do not rethrow exceptions if query has failed on some hosts. +- `none_only_active` - similar to `none`, but doesn't wait for inactive replicas of the `Replicated` database. Note: with this mode it's impossible to figure out that the query was not executed on some replica and will be executed in background. +- `null_status_on_timeout_only_active` — similar to `null_status_on_timeout`, but doesn't wait for inactive replicas of the `Replicated` database +- `throw_only_active` — similar to `throw`, but doesn't wait for inactive replicas of the `Replicated` database + +Cloud default value: `none`. + +## distributed_ddl_task_timeout {#distributed_ddl_task_timeout} + +Type: Int64 + +Default value: 180 + +Sets timeout for DDL query responses from all hosts in cluster. If a DDL request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. + +Possible values: + +- Positive integer. +- 0 — Async mode. +- Negative integer — infinite timeout. + +## distributed_foreground_insert {#distributed_foreground_insert} + +Type: Bool + +Default value: 0 + +Enables or disables synchronous data insertion into a [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table. + +By default, when inserting data into a `Distributed` table, the ClickHouse server sends data to cluster nodes in background mode. When `distributed_foreground_insert=1`, the data is processed synchronously, and the `INSERT` operation succeeds only after all the data is saved on all shards (at least one replica for each shard if `internal_replication` is true). + +Possible values: + +- 0 — Data is inserted in background mode. +- 1 — Data is inserted in synchronous mode. + +Cloud default value: `1`. 
**See Also** -- [max_block_size](#setting-max_block_size) -- [min_insert_block_size_rows](#min-insert-block-size-rows) +- [Distributed Table Engine](../../engines/table-engines/special/distributed.md/#distributed) +- [Managing Distributed Tables](../../sql-reference/statements/system.md/#query-language-system-distributed) + +## distributed_group_by_no_merge {#distributed_group_by_no_merge} + +Type: UInt64 + +Default value: 0 + +Do not merge aggregation states from different servers for distributed query processing, you can use this in case it is for certain that there are different keys on different shards + +Possible values: + +- `0` — Disabled (final query processing is done on the initiator node). +- `1` - Do not merge aggregation states from different servers for distributed query processing (query completely processed on the shard, initiator only proxy the data), can be used in case it is for certain that there are different keys on different shards. +- `2` - Same as `1` but applies `ORDER BY` and `LIMIT` (it is not possible when the query processed completely on the remote node, like for `distributed_group_by_no_merge=1`) on the initiator (can be used for queries with `ORDER BY` and/or `LIMIT`). + +**Example** + +```sql +SELECT * +FROM remote('127.0.0.{2,3}', system.one) +GROUP BY dummy +LIMIT 1 +SETTINGS distributed_group_by_no_merge = 1 +FORMAT PrettyCompactMonoBlock + +┌─dummy─┐ +│ 0 │ +│ 0 │ +└───────┘ +``` + +```sql +SELECT * +FROM remote('127.0.0.{2,3}', system.one) +GROUP BY dummy +LIMIT 1 +SETTINGS distributed_group_by_no_merge = 2 +FORMAT PrettyCompactMonoBlock + +┌─dummy─┐ +│ 0 │ +└───────┘ +``` + +## distributed_insert_skip_read_only_replicas {#distributed_insert_skip_read_only_replicas} + +Type: Bool + +Default value: 0 + +Enables skipping read-only replicas for INSERT queries into Distributed. + +Possible values: + +- 0 — INSERT was as usual, if it will go to read-only replica it will fail +- 1 — Initiator will skip read-only replicas before sending data to shards. + +## distributed_product_mode {#distributed_product_mode} + +Type: DistributedProductMode + +Default value: deny + +Changes the behaviour of [distributed subqueries](../../sql-reference/operators/in.md). + +ClickHouse applies this setting when the query contains the product of distributed tables, i.e. when the query for a distributed table contains a non-GLOBAL subquery for the distributed table. + +Restrictions: + +- Only applied for IN and JOIN subqueries. +- Only if the FROM section uses a distributed table containing more than one shard. +- If the subquery concerns a distributed table containing more than one shard. +- Not used for a table-valued [remote](../../sql-reference/table-functions/remote.md) function. + +Possible values: + +- `deny` — Default value. Prohibits using these types of subqueries (returns the “Double-distributed in/JOIN subqueries is denied” exception). +- `local` — Replaces the database and table in the subquery with local ones for the destination server (shard), leaving the normal `IN`/`JOIN.` +- `global` — Replaces the `IN`/`JOIN` query with `GLOBAL IN`/`GLOBAL JOIN.` +- `allow` — Allows the use of these types of subqueries. + +## distributed_push_down_limit {#distributed_push_down_limit} + +Type: UInt64 + +Default value: 1 + +Enables or disables [LIMIT](#limit) applying on each shard separately. + +This will allow to avoid: +- Sending extra rows over network; +- Processing rows behind the limit on the initiator. 
+ +Starting from 21.9 version you cannot get inaccurate results anymore, since `distributed_push_down_limit` changes query execution only if at least one of the conditions met: +- [distributed_group_by_no_merge](#distributed-group-by-no-merge) > 0. +- Query **does not have** `GROUP BY`/`DISTINCT`/`LIMIT BY`, but it has `ORDER BY`/`LIMIT`. +- Query **has** `GROUP BY`/`DISTINCT`/`LIMIT BY` with `ORDER BY`/`LIMIT` and: + - [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled. + - [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) is enabled. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +See also: + +- [distributed_group_by_no_merge](#distributed-group-by-no-merge) +- [optimize_skip_unused_shards](#optimize-skip-unused-shards) +- [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) + +## distributed_replica_error_cap {#distributed_replica_error_cap} + +Type: UInt64 + +Default value: 1000 + +- Type: unsigned int +- Default value: 1000 + +The error count of each replica is capped at this value, preventing a single replica from accumulating too many errors. + +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_half_life](#distributed_replica_error_half_life) +- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors) + +## distributed_replica_error_half_life {#distributed_replica_error_half_life} + +Type: Seconds + +Default value: 60 + +- Type: seconds +- Default value: 60 seconds + +Controls how fast errors in distributed tables are zeroed. If a replica is unavailable for some time, accumulates 5 errors, and distributed_replica_error_half_life is set to 1 second, then the replica is considered normal 3 seconds after the last error. + +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_cap](#distributed_replica_error_cap) +- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors) + +## distributed_replica_max_ignored_errors {#distributed_replica_max_ignored_errors} + +Type: UInt64 + +Default value: 0 + +- Type: unsigned int +- Default value: 0 + +The number of errors that will be ignored while choosing replicas (according to `load_balancing` algorithm). + +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_cap](#distributed_replica_error_cap) +- [distributed_replica_error_half_life](#distributed_replica_error_half_life) + +## do_not_merge_across_partitions_select_final {#do_not_merge_across_partitions_select_final} + +Type: Bool + +Default value: 0 + +Merge parts only in one partition in select final + +## empty_result_for_aggregation_by_constant_keys_on_empty_set {#empty_result_for_aggregation_by_constant_keys_on_empty_set} + +Type: Bool + +Default value: 1 + +Return empty result when aggregating by constant keys on empty set. + +## empty_result_for_aggregation_by_empty_set {#empty_result_for_aggregation_by_empty_set} + +Type: Bool + +Default value: 0 + +Return empty result when aggregating without keys on empty set. 
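+
+A short sketch contrasting the two behaviours of `empty_result_for_aggregation_by_empty_set`:
+
+```sql
+-- Default behaviour: aggregation without keys over an empty set returns one row with 0.
+SELECT count() FROM numbers(10) WHERE number > 100
+SETTINGS empty_result_for_aggregation_by_empty_set = 0;
+
+-- With the setting enabled, the same query returns an empty result set.
+SELECT count() FROM numbers(10) WHERE number > 100
+SETTINGS empty_result_for_aggregation_by_empty_set = 1;
+```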
+ +## enable_blob_storage_log {#enable_blob_storage_log} + +Type: Bool + +Default value: 1 + +Write information about blob storage operations to system.blob_storage_log table + +## enable_deflate_qpl_codec {#enable_deflate_qpl_codec} + +Type: Bool + +Default value: 0 + +If turned on, the DEFLATE_QPL codec may be used to compress columns. + +Possible values: + +- 0 - Disabled +- 1 - Enabled + +## enable_early_constant_folding {#enable_early_constant_folding} + +Type: Bool + +Default value: 1 + +Enable query optimization where we analyze function and subqueries results and rewrite query if there are constants there + +## enable_extended_results_for_datetime_functions {#enable_extended_results_for_datetime_functions} + +Type: Bool + +Default value: 0 + +Enables or disables returning results of type: +- `Date32` with extended range (compared to type `Date`) for functions [toStartOfYear](../../sql-reference/functions/date-time-functions.md#tostartofyear), [toStartOfISOYear](../../sql-reference/functions/date-time-functions.md#tostartofisoyear), [toStartOfQuarter](../../sql-reference/functions/date-time-functions.md#tostartofquarter), [toStartOfMonth](../../sql-reference/functions/date-time-functions.md#tostartofmonth), [toLastDayOfMonth](../../sql-reference/functions/date-time-functions.md#tolastdayofmonth), [toStartOfWeek](../../sql-reference/functions/date-time-functions.md#tostartofweek), [toLastDayOfWeek](../../sql-reference/functions/date-time-functions.md#tolastdayofweek) and [toMonday](../../sql-reference/functions/date-time-functions.md#tomonday). +- `DateTime64` with extended range (compared to type `DateTime`) for functions [toStartOfDay](../../sql-reference/functions/date-time-functions.md#tostartofday), [toStartOfHour](../../sql-reference/functions/date-time-functions.md#tostartofhour), [toStartOfMinute](../../sql-reference/functions/date-time-functions.md#tostartofminute), [toStartOfFiveMinutes](../../sql-reference/functions/date-time-functions.md#tostartoffiveminutes), [toStartOfTenMinutes](../../sql-reference/functions/date-time-functions.md#tostartoftenminutes), [toStartOfFifteenMinutes](../../sql-reference/functions/date-time-functions.md#tostartoffifteenminutes) and [timeSlot](../../sql-reference/functions/date-time-functions.md#timeslot). + +Possible values: + +- 0 — Functions return `Date` or `DateTime` for all types of arguments. +- 1 — Functions return `Date32` or `DateTime64` for `Date32` or `DateTime64` arguments and `Date` or `DateTime` otherwise. + +## enable_filesystem_cache {#enable_filesystem_cache} + +Type: Bool + +Default value: 1 + +Use cache for remote filesystem. This setting does not turn on/off cache for disks (must be done via disk config), but allows to bypass cache for some queries if intended + +## enable_filesystem_cache_log {#enable_filesystem_cache_log} + +Type: Bool + +Default value: 0 + +Allows to record the filesystem caching log for each query + +## enable_filesystem_cache_on_write_operations {#enable_filesystem_cache_on_write_operations} + +Type: Bool + +Default value: 0 + +Write into cache on write operations. To actually work this setting requires be added to disk config too + +## enable_filesystem_read_prefetches_log {#enable_filesystem_read_prefetches_log} + +Type: Bool + +Default value: 0 + +Log to system.filesystem prefetch_log during query. 
Should be used only for testing or debugging, not recommended to be turned on by default + +## enable_global_with_statement {#enable_global_with_statement} + +Type: Bool + +Default value: 1 + +Propagate WITH statements to UNION queries and all subqueries ## enable_http_compression {#enable_http_compression} +Type: Bool + +Default value: 0 + Enables or disables data compression in the response to an HTTP request. For more information, read the [HTTP interface description](../../interfaces/http.md). @@ -442,18 +2527,1132 @@ Possible values: - 0 — Disabled. - 1 — Enabled. -Default value: 0. +## enable_job_stack_trace {#enable_job_stack_trace} -## http_zlib_compression_level {#http_zlib_compression_level} +Type: Bool -Sets the level of data compression in the response to an HTTP request if [enable_http_compression = 1](#enable_http_compression). +Default value: 0 -Possible values: Numbers from 1 to 9. +Output stack trace of a job creator when job results in exception -Default value: 3. +## enable_lightweight_delete {#enable_lightweight_delete} + +Type: Bool + +Default value: 1 + +Enable lightweight DELETE mutations for mergetree tables. + +## enable_memory_bound_merging_of_aggregation_results {#enable_memory_bound_merging_of_aggregation_results} + +Type: Bool + +Default value: 1 + +Enable memory bound merging strategy for aggregation. + +## enable_multiple_prewhere_read_steps {#enable_multiple_prewhere_read_steps} + +Type: Bool + +Default value: 1 + +Move more conditions from WHERE to PREWHERE and do reads from disk and filtering in multiple steps if there are multiple conditions combined with AND + +## enable_named_columns_in_function_tuple {#enable_named_columns_in_function_tuple} + +Type: Bool + +Default value: 1 + +Generate named tuples in function tuple() when all names are unique and can be treated as unquoted identifiers. + +## enable_optimize_predicate_expression {#enable_optimize_predicate_expression} + +Type: Bool + +Default value: 1 + +Turns on predicate pushdown in `SELECT` queries. + +Predicate pushdown may significantly reduce network traffic for distributed queries. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Usage + +Consider the following queries: + +1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` +2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` + +If `enable_optimize_predicate_expression = 1`, then the execution time of these queries is equal because ClickHouse applies `WHERE` to the subquery when processing it. + +If `enable_optimize_predicate_expression = 0`, then the execution time of the second query is much longer because the `WHERE` clause applies to all the data after the subquery finishes. + +## enable_optimize_predicate_expression_to_final_subquery {#enable_optimize_predicate_expression_to_final_subquery} + +Type: Bool + +Default value: 1 + +Allow push predicate to final subquery. + +## enable_order_by_all {#enable_order_by_all} + +Type: Bool + +Default value: 1 + +Enables or disables sorting with `ORDER BY ALL` syntax, see [ORDER BY](../../sql-reference/statements/select/order-by.md). + +Possible values: + +- 0 — Disable ORDER BY ALL. +- 1 — Enable ORDER BY ALL. 
+ +**Example** + +Query: + +```sql +CREATE TABLE TAB(C1 Int, C2 Int, ALL Int) ENGINE=Memory(); + +INSERT INTO TAB VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20); + +SELECT * FROM TAB ORDER BY ALL; -- returns an error that ALL is ambiguous + +SELECT * FROM TAB ORDER BY ALL SETTINGS enable_order_by_all = 0; +``` + +Result: + +```text +┌─C1─┬─C2─┬─ALL─┐ +│ 20 │ 20 │ 10 │ +│ 30 │ 10 │ 20 │ +│ 10 │ 20 │ 30 │ +└────┴────┴─────┘ +``` + +## enable_parsing_to_custom_serialization {#enable_parsing_to_custom_serialization} + +Type: Bool + +Default value: 1 + +If true then data can be parsed directly to columns with custom serialization (e.g. Sparse) according to hints for serialization got from the table. + +## enable_positional_arguments {#enable_positional_arguments} + +Type: Bool + +Default value: 1 + +Enables or disables supporting positional arguments for [GROUP BY](../../sql-reference/statements/select/group-by.md), [LIMIT BY](../../sql-reference/statements/select/limit-by.md), [ORDER BY](../../sql-reference/statements/select/order-by.md) statements. + +Possible values: + +- 0 — Positional arguments aren't supported. +- 1 — Positional arguments are supported: column numbers can use instead of column names. + +**Example** + +Query: + +```sql +CREATE TABLE positional_arguments(one Int, two Int, three Int) ENGINE=Memory(); + +INSERT INTO positional_arguments VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20); + +SELECT * FROM positional_arguments ORDER BY 2,3; +``` + +Result: + +```text +┌─one─┬─two─┬─three─┐ +│ 30 │ 10 │ 20 │ +│ 20 │ 20 │ 10 │ +│ 10 │ 20 │ 30 │ +└─────┴─────┴───────┘ +``` + +## enable_reads_from_query_cache {#enable_reads_from_query_cache} + +Type: Bool + +Default value: 1 + +If turned on, results of `SELECT` queries are retrieved from the [query cache](../query-cache.md). + +Possible values: + +- 0 - Disabled +- 1 - Enabled + +## enable_s3_requests_logging {#enable_s3_requests_logging} + +Type: Bool + +Default value: 0 + +Enable very explicit logging of S3 requests. Makes sense for debug only. + +## enable_scalar_subquery_optimization {#enable_scalar_subquery_optimization} + +Type: Bool + +Default value: 1 + +If it is set to true, prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once. + +## enable_secure_identifiers {#enable_secure_identifiers} + +Type: Bool + +Default value: 0 + +If enabled, only allow secure identifiers which contain only underscore and alphanumeric characters + +## enable_sharing_sets_for_mutations {#enable_sharing_sets_for_mutations} + +Type: Bool + +Default value: 1 + +Allow sharing set objects build for IN subqueries between different tasks of the same mutation. This reduces memory usage and CPU consumption + +## enable_software_prefetch_in_aggregation {#enable_software_prefetch_in_aggregation} + +Type: Bool + +Default value: 1 + +Enable use of software prefetch in aggregation + +## enable_unaligned_array_join {#enable_unaligned_array_join} + +Type: Bool + +Default value: 0 + +Allow ARRAY JOIN with multiple arrays that have different sizes. When this settings is enabled, arrays will be resized to the longest one. + +## enable_url_encoding {#enable_url_encoding} + +Type: Bool + +Default value: 1 + +Allows to enable/disable decoding/encoding path in uri in [URL](../../engines/table-engines/special/url.md) engine tables. + +Enabled by default. 
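+
+A sketch of applying the setting per query (the endpoint below is a placeholder, not a real service):
+
+```sql
+-- enable_url_encoding controls whether the path component of the URI is
+-- decoded/re-encoded; with 0 the path is passed through exactly as written,
+-- so the literal %20 stays in the request.
+SELECT * FROM url('http://localhost:8123/some%20file.csv', 'CSV', 'c1 String')
+SETTINGS enable_url_encoding = 0;
+```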
+ +## enable_vertical_final {#enable_vertical_final} + +Type: Bool + +Default value: 1 + +If enable, remove duplicated rows during FINAL by marking rows as deleted and filtering them later instead of merging rows + +## enable_writes_to_query_cache {#enable_writes_to_query_cache} + +Type: Bool + +Default value: 1 + +If turned on, results of `SELECT` queries are stored in the [query cache](../query-cache.md). + +Possible values: + +- 0 - Disabled +- 1 - Enabled + +## enable_zstd_qat_codec {#enable_zstd_qat_codec} + +Type: Bool + +Default value: 0 + +If turned on, the ZSTD_QAT codec may be used to compress columns. + +Possible values: + +- 0 - Disabled +- 1 - Enabled + +## engine_file_allow_create_multiple_files {#engine_file_allow_create_multiple_files} + +Type: Bool + +Default value: 0 + +Enables or disables creating a new file on each insert in file engine tables if the format has the suffix (`JSON`, `ORC`, `Parquet`, etc.). If enabled, on each insert a new file will be created with a name following this pattern: + +`data.Parquet` -> `data.1.Parquet` -> `data.2.Parquet`, etc. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query creates a new file. + +## engine_file_empty_if_not_exists {#engine_file_empty_if_not_exists} + +Type: Bool + +Default value: 0 + +Allows to select data from a file engine table without file. + +Possible values: +- 0 — `SELECT` throws exception. +- 1 — `SELECT` returns empty result. + +## engine_file_skip_empty_files {#engine_file_skip_empty_files} + +Type: Bool + +Default value: 0 + +Enables or disables skipping empty files in [File](../../engines/table-engines/special/file.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +## engine_file_truncate_on_insert {#engine_file_truncate_on_insert} + +Type: Bool + +Default value: 0 + +Enables or disables truncate before insert in [File](../../engines/table-engines/special/file.md) engine tables. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query replaces existing content of the file with the new data. + +## engine_url_skip_empty_files {#engine_url_skip_empty_files} + +Type: Bool + +Default value: 0 + +Enables or disables skipping empty files in [URL](../../engines/table-engines/special/url.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +## except_default_mode {#except_default_mode} + +Type: SetOperationMode + +Default value: ALL + +Set default mode in EXCEPT query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception. + +## external_storage_connect_timeout_sec {#external_storage_connect_timeout_sec} + +Type: UInt64 + +Default value: 10 + +Connect timeout in seconds. Now supported only for MySQL + +## external_storage_max_read_bytes {#external_storage_max_read_bytes} + +Type: UInt64 + +Default value: 0 + +Limit maximum number of bytes when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. 
If equal to 0, this setting is disabled + +## external_storage_max_read_rows {#external_storage_max_read_rows} + +Type: UInt64 + +Default value: 0 + +Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled + +## external_storage_rw_timeout_sec {#external_storage_rw_timeout_sec} + +Type: UInt64 + +Default value: 300 + +Read/write timeout in seconds. Now supported only for MySQL + +## external_table_functions_use_nulls {#external_table_functions_use_nulls} + +Type: Bool + +Default value: 1 + +Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md) table functions use Nullable columns. + +Possible values: + +- 0 — The table function explicitly uses Nullable columns. +- 1 — The table function implicitly uses Nullable columns. + +**Usage** + +If the setting is set to `0`, the table function does not make Nullable columns and inserts default values instead of NULL. This is also applicable for NULL values inside arrays. + +## external_table_strict_query {#external_table_strict_query} + +Type: Bool + +Default value: 0 + +If it is set to true, transforming expression to local filter is forbidden for queries to external tables. + +## extract_key_value_pairs_max_pairs_per_row {#extract_key_value_pairs_max_pairs_per_row} + +Type: UInt64 + +Default value: 1000 + +Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory. + +## extremes {#extremes} + +Type: Bool + +Default value: 0 + +Whether to count extreme values (the minimums and maximums in columns of a query result). Accepts 0 or 1. By default, 0 (disabled). +For more information, see the section “Extreme values”. + +## fallback_to_stale_replicas_for_distributed_queries {#fallback_to_stale_replicas_for_distributed_queries} + +Type: Bool + +Default value: 1 + +Forces a query to an out-of-date replica if updated data is not available. See [Replication](../../engines/table-engines/mergetree-family/replication.md). + +ClickHouse selects the most relevant from the outdated replicas of the table. + +Used when performing `SELECT` from a distributed table that points to replicated tables. + +By default, 1 (enabled). + +## filesystem_cache_max_download_size {#filesystem_cache_max_download_size} + +Type: UInt64 + +Default value: 137438953472 + +Max remote filesystem cache size that can be downloaded by a single query + +## filesystem_cache_reserve_space_wait_lock_timeout_milliseconds {#filesystem_cache_reserve_space_wait_lock_timeout_milliseconds} + +Type: UInt64 + +Default value: 1000 + +Wait time to lock cache for space reservation in filesystem cache + +## filesystem_cache_segments_batch_size {#filesystem_cache_segments_batch_size} + +Type: UInt64 + +Default value: 20 + +Limit on size of a single batch of file segments that a read buffer can request from cache. Too low value will lead to excessive requests to cache, too large may slow down eviction from cache + +## filesystem_prefetch_max_memory_usage {#filesystem_prefetch_max_memory_usage} + +Type: UInt64 + +Default value: 1073741824 + +Maximum memory usage for prefetches. + +## filesystem_prefetch_step_bytes {#filesystem_prefetch_step_bytes} + +Type: UInt64 + +Default value: 0 + +Prefetch step in bytes. 
Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task + +## filesystem_prefetch_step_marks {#filesystem_prefetch_step_marks} + +Type: UInt64 + +Default value: 0 + +Prefetch step in marks. Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task + +## filesystem_prefetches_limit {#filesystem_prefetches_limit} + +Type: UInt64 + +Default value: 200 + +Maximum number of prefetches. Zero means unlimited. A setting `filesystem_prefetches_max_memory_usage` is more recommended if you want to limit the number of prefetches + +## final {#final} + +Type: Bool + +Default value: 0 + +Automatically applies [FINAL](../../sql-reference/statements/select/from.md#final-modifier) modifier to all tables in a query, to tables where [FINAL](../../sql-reference/statements/select/from.md#final-modifier) is applicable, including joined tables and tables in sub-queries, and +distributed tables. + +Possible values: + +- 0 - disabled +- 1 - enabled + +Example: + +```sql +CREATE TABLE test +( + key Int64, + some String +) +ENGINE = ReplacingMergeTree +ORDER BY key; + +INSERT INTO test FORMAT Values (1, 'first'); +INSERT INTO test FORMAT Values (1, 'second'); + +SELECT * FROM test; +┌─key─┬─some───┐ +│ 1 │ second │ +└─────┴────────┘ +┌─key─┬─some──┐ +│ 1 │ first │ +└─────┴───────┘ + +SELECT * FROM test SETTINGS final = 1; +┌─key─┬─some───┐ +│ 1 │ second │ +└─────┴────────┘ + +SET final = 1; +SELECT * FROM test; +┌─key─┬─some───┐ +│ 1 │ second │ +└─────┴────────┘ +``` + +## flatten_nested {#flatten_nested} + +Type: Bool + +Default value: 1 + +Sets the data format of a [nested](../../sql-reference/data-types/nested-data-structures/index.md) columns. + +Possible values: + +- 1 — Nested column is flattened to separate arrays. +- 0 — Nested column stays a single array of tuples. + +**Usage** + +If the setting is set to `0`, it is possible to use an arbitrary level of nesting. 
+ +**Examples** + +Query: + +``` sql +SET flatten_nested = 1; +CREATE TABLE t_nest (`n` Nested(a UInt32, b UInt32)) ENGINE = MergeTree ORDER BY tuple(); + +SHOW CREATE TABLE t_nest; +``` + +Result: + +``` text +┌─statement───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.t_nest +( + `n.a` Array(UInt32), + `n.b` Array(UInt32) +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS index_granularity = 8192 │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SET flatten_nested = 0; + +CREATE TABLE t_nest (`n` Nested(a UInt32, b UInt32)) ENGINE = MergeTree ORDER BY tuple(); + +SHOW CREATE TABLE t_nest; +``` + +Result: + +``` text +┌─statement──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.t_nest +( + `n` Nested(a UInt32, b UInt32) +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS index_granularity = 8192 │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +## force_aggregate_partitions_independently {#force_aggregate_partitions_independently} + +Type: Bool + +Default value: 0 + +Force the use of optimization when it is applicable, but heuristics decided not to use it + +## force_aggregation_in_order {#force_aggregation_in_order} + +Type: Bool + +Default value: 0 + +The setting is used by the server itself to support distributed queries. Do not change it manually, because it will break normal operations. (Forces use of aggregation in order on remote nodes during distributed aggregation). + +## force_data_skipping_indices {#force_data_skipping_indices} + +Type: String + +Default value: + +Disables query execution if passed data skipping indices wasn't used. + +Consider the following example: + +```sql +CREATE TABLE data +( + key Int, + d1 Int, + d1_null Nullable(Int), + INDEX d1_idx d1 TYPE minmax GRANULARITY 1, + INDEX d1_null_idx assumeNotNull(d1_null) TYPE minmax GRANULARITY 1 +) +Engine=MergeTree() +ORDER BY key; + +SELECT * FROM data_01515; +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error. +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_idx'; -- query will produce INDEX_NOT_USED error. +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx'; -- Ok. +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`'; -- Ok (example of full featured parser). +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- query will produce INDEX_NOT_USED error, since d1_null_idx is not used. +SELECT * FROM data_01515 WHERE d1 = 0 AND assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- Ok. +``` + +## force_grouping_standard_compatibility {#force_grouping_standard_compatibility} + +Type: Bool + +Default value: 1 + +Make GROUPING function to return 1 when argument is not used as an aggregation key + +## force_index_by_date {#force_index_by_date} + +Type: Bool + +Default value: 0 + +Disables query execution if the index can’t be used by date. + +Works with tables in the MergeTree family. 
+
+If `force_index_by_date=1`, ClickHouse checks whether the query has a date key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For example, the condition `Date != '2000-01-01'` is acceptable even when it matches all the data in the table (i.e., running the query requires a full scan). For more information about ranges of data in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md).
+
+## force_optimize_projection {#force_optimize_projection}
+
+Type: Bool
+
+Default value: 0
+
+Enables or disables the obligatory use of [projections](../../engines/table-engines/mergetree-family/mergetree.md/#projections) in `SELECT` queries, when projection optimization is enabled (see the [optimize_use_projections](#optimize_use_projections) setting).
+
+Possible values:
+
+- 0 — Projection optimization is not obligatory.
+- 1 — Projection optimization is obligatory.
+
+## force_optimize_projection_name {#force_optimize_projection_name}
+
+Type: String
+
+Default value:
+
+If set to a non-empty string, ClickHouse checks that this projection is used in the query at least once.
+
+Possible values:
+
+- string: the name of a projection that is used in the query
+
+## force_optimize_skip_unused_shards {#force_optimize_skip_unused_shards}
+
+Type: UInt64
+
+Default value: 0
+
+Enables or disables query execution if [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled and skipping of unused shards is not possible. If the skipping is not possible and the setting is enabled, an exception will be thrown.
+
+Possible values:
+
+- 0 — Disabled. ClickHouse does not throw an exception.
+- 1 — Enabled. Query execution is disabled only if the table has a sharding key.
+- 2 — Enabled. Query execution is disabled regardless of whether a sharding key is defined for the table.
+
+## force_optimize_skip_unused_shards_nesting {#force_optimize_skip_unused_shards_nesting}
+
+Type: UInt64
+
+Default value: 0
+
+Controls [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards) (hence it still requires [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards)) depending on the nesting level of the distributed query (the case when you have a `Distributed` table that looks into another `Distributed` table).
+
+Possible values:
+
+- 0 — Disabled, `force_optimize_skip_unused_shards` always applies.
+- 1 — Enables `force_optimize_skip_unused_shards` only for the first level.
+- 2 — Enables `force_optimize_skip_unused_shards` up to the second level.
+
+## force_primary_key {#force_primary_key}
+
+Type: Bool
+
+Default value: 0
+
+Disables query execution if indexing by the primary key is not possible.
+
+Works with tables in the MergeTree family.
+
+If `force_primary_key=1`, ClickHouse checks whether the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For more information about data ranges in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md).
+
+## force_remove_data_recursively_on_drop {#force_remove_data_recursively_on_drop}
+
+Type: Bool
+
+Default value: 0
+
+Recursively removes data on a DROP query.
Avoids 'Directory not empty' error, but may silently remove detached data + +## formatdatetime_f_prints_single_zero {#formatdatetime_f_prints_single_zero} + +Type: Bool + +Default value: 0 + +Formatter '%f' in function 'formatDateTime()' prints a single zero instead of six zeros if the formatted value has no fractional seconds. + +## formatdatetime_format_without_leading_zeros {#formatdatetime_format_without_leading_zeros} + +Type: Bool + +Default value: 0 + +Formatters '%c', '%l' and '%k' in function 'formatDateTime()' print months and hours without leading zeros. + +## formatdatetime_parsedatetime_m_is_month_name {#formatdatetime_parsedatetime_m_is_month_name} + +Type: Bool + +Default value: 1 + +Formatter '%M' in functions 'formatDateTime()' and 'parseDateTime()' print/parse the month name instead of minutes. + +## fsync_metadata {#fsync_metadata} + +Type: Bool + +Default value: 1 + +Enables or disables [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) when writing `.sql` files. Enabled by default. + +It makes sense to disable it if the server has millions of tiny tables that are constantly being created and destroyed. + +## function_implementation {#function_implementation} + +Type: String + +Default value: + +Choose function implementation for specific target or variant (experimental). If empty enable all of them. + +## function_json_value_return_type_allow_complex {#function_json_value_return_type_allow_complex} + +Type: Bool + +Default value: 0 + +Control whether allow to return complex type (such as: struct, array, map) for json_value function. + +```sql +SELECT JSON_VALUE('{"hello":{"world":"!"}}', '$.hello') settings function_json_value_return_type_allow_complex=true + +┌─JSON_VALUE('{"hello":{"world":"!"}}', '$.hello')─┐ +│ {"world":"!"} │ +└──────────────────────────────────────────────────┘ + +1 row in set. Elapsed: 0.001 sec. +``` + +Possible values: + +- true — Allow. +- false — Disallow. + +## function_json_value_return_type_allow_nullable {#function_json_value_return_type_allow_nullable} + +Type: Bool + +Default value: 0 + +Control whether allow to return `NULL` when value is not exist for JSON_VALUE function. + +```sql +SELECT JSON_VALUE('{"hello":"world"}', '$.b') settings function_json_value_return_type_allow_nullable=true; + +┌─JSON_VALUE('{"hello":"world"}', '$.b')─┐ +│ ᴺᵁᴸᴸ │ +└────────────────────────────────────────┘ + +1 row in set. Elapsed: 0.001 sec. +``` + +Possible values: + +- true — Allow. +- false — Disallow. + +## function_locate_has_mysql_compatible_argument_order {#function_locate_has_mysql_compatible_argument_order} + +Type: Bool + +Default value: 1 + +Controls the order of arguments in function [locate](../../sql-reference/functions/string-search-functions.md#locate). + +Possible values: + +- 0 — Function `locate` accepts arguments `(haystack, needle[, start_pos])`. +- 1 — Function `locate` accepts arguments `(needle, haystack, [, start_pos])` (MySQL-compatible behavior) + +## function_range_max_elements_in_block {#function_range_max_elements_in_block} + +Type: UInt64 + +Default value: 500000000 + +Sets the safety threshold for data volume generated by function [range](../../sql-reference/functions/array-functions.md/#range). Defines the maximum number of values generated by function per block of data (sum of array sizes for every row in a block). + +Possible values: + +- Positive integer. 
+ +**See Also** + +- [max_block_size](#setting-max_block_size) +- [min_insert_block_size_rows](#min-insert-block-size-rows) + +## function_sleep_max_microseconds_per_block {#function_sleep_max_microseconds_per_block} + +Type: UInt64 + +Default value: 3000000 + +Maximum number of microseconds the function `sleep` is allowed to sleep for each block. If a user called it with a larger value, it throws an exception. It is a safety threshold. + +## function_visible_width_behavior {#function_visible_width_behavior} + +Type: UInt64 + +Default value: 1 + +The version of `visibleWidth` behavior. 0 - only count the number of code points; 1 - correctly count zero-width and combining characters, count full-width characters as two, estimate the tab width, count delete characters. + +## geo_distance_returns_float64_on_float64_arguments {#geo_distance_returns_float64_on_float64_arguments} + +Type: Bool + +Default value: 1 + +If all four arguments to `geoDistance`, `greatCircleDistance`, `greatCircleAngle` functions are Float64, return Float64 and use double precision for internal calculations. In previous ClickHouse versions, the functions always returned Float32. + +## glob_expansion_max_elements {#glob_expansion_max_elements} + +Type: UInt64 + +Default value: 1000 + +Maximum number of allowed addresses (For external storages, table functions, etc). + +## grace_hash_join_initial_buckets {#grace_hash_join_initial_buckets} + +Type: UInt64 + +Default value: 1 + +Initial number of grace hash join buckets + +## grace_hash_join_max_buckets {#grace_hash_join_max_buckets} + +Type: UInt64 + +Default value: 1024 + +Limit on the number of grace hash join buckets + +## group_by_overflow_mode {#group_by_overflow_mode} + +Type: OverflowModeGroupBy + +Default value: throw + +What to do when the limit is exceeded. + +## group_by_two_level_threshold {#group_by_two_level_threshold} + +Type: UInt64 + +Default value: 100000 + +From what number of keys, a two-level aggregation starts. 0 - the threshold is not set. + +## group_by_two_level_threshold_bytes {#group_by_two_level_threshold_bytes} + +Type: UInt64 + +Default value: 50000000 + +From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered. + +## group_by_use_nulls {#group_by_use_nulls} + +Type: Bool + +Default value: 0 + +Changes the way the [GROUP BY clause](/docs/en/sql-reference/statements/select/group-by.md) treats the types of aggregation keys. +When the `ROLLUP`, `CUBE`, or `GROUPING SETS` specifiers are used, some aggregation keys may not be used to produce some result rows. +Columns for these keys are filled with either default value or `NULL` in corresponding rows depending on this setting. + +Possible values: + +- 0 — The default value for the aggregation key type is used to produce missing values. +- 1 — ClickHouse executes `GROUP BY` the same way as the SQL standard says. The types of aggregation keys are converted to [Nullable](/docs/en/sql-reference/data-types/nullable.md/#data_type-nullable). Columns for corresponding aggregation keys are filled with [NULL](/docs/en/sql-reference/syntax.md) for rows that didn't use it. + +See also: + +- [GROUP BY clause](/docs/en/sql-reference/statements/select/group-by.md) + +## handshake_timeout_ms {#handshake_timeout_ms} + +Type: Milliseconds + +Default value: 10000 + +Timeout in milliseconds for receiving Hello packet from replicas during handshake. 
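+
+Returning to [group_by_use_nulls](#group_by_use_nulls) described above, a short illustrative sketch of its effect with `ROLLUP`:
+
+```sql
+-- With group_by_use_nulls = 0 (default), the ROLLUP total row gets the default
+-- value of the key type (0 here), which looks the same as the real key 0.
+-- With group_by_use_nulls = 1, the key becomes Nullable and the total row gets NULL.
+SELECT number % 2 AS key, count() AS cnt
+FROM numbers(4)
+GROUP BY ROLLUP(key)
+SETTINGS group_by_use_nulls = 1;
+```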
+ +## hdfs_create_new_file_on_insert {#hdfs_create_new_file_on_insert} + +Type: Bool + +Default value: 0 + +Enables or disables creating a new file on each insert in HDFS engine tables. If enabled, on each insert a new HDFS file will be created with the name, similar to this pattern: + +initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query creates a new file. + +## hdfs_ignore_file_doesnt_exist {#hdfs_ignore_file_doesnt_exist} + +Type: Bool + +Default value: 0 + +Ignore absence of file if it does not exist when reading certain keys. + +Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. + +## hdfs_replication {#hdfs_replication} + +Type: UInt64 + +Default value: 0 + +The actual number of replications can be specified when the hdfs file is created. + +## hdfs_skip_empty_files {#hdfs_skip_empty_files} + +Type: Bool + +Default value: 0 + +Enables or disables skipping empty files in [HDFS](../../engines/table-engines/integrations/hdfs.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +## hdfs_throw_on_zero_files_match {#hdfs_throw_on_zero_files_match} + +Type: Bool + +Default value: 0 + +Throw an error if matched zero files according to glob expansion rules. + +Possible values: +- 1 — `SELECT` throws an exception. +- 0 — `SELECT` returns empty result. + +## hdfs_truncate_on_insert {#hdfs_truncate_on_insert} + +Type: Bool + +Default value: 0 + +Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query replaces existing content of the file with the new data. + +## hedged_connection_timeout_ms {#hedged_connection_timeout_ms} + +Type: Milliseconds + +Default value: 50 + +Connection timeout for establishing connection with replica for Hedged requests + +## hsts_max_age {#hsts_max_age} + +Type: UInt64 + +Default value: 0 + +Expired time for HSTS. 0 means disable HSTS. + +## http_connection_timeout {#http_connection_timeout} + +Type: Seconds + +Default value: 1 + +HTTP connection timeout (in seconds). + +Possible values: + +- Any positive integer. +- 0 - Disabled (infinite timeout). + +## http_headers_progress_interval_ms {#http_headers_progress_interval_ms} + +Type: UInt64 + +Default value: 100 + +Do not send HTTP headers X-ClickHouse-Progress more frequently than at each specified interval. + +## http_make_head_request {#http_make_head_request} + +Type: Bool + +Default value: 1 + +The `http_make_head_request` setting allows the execution of a `HEAD` request while reading data from HTTP to retrieve information about the file to be read, such as its size. Since it's enabled by default, it may be desirable to disable this setting in cases where the server does not support `HEAD` requests. 
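+
+For example, a sketch of disabling it for a single read from a server that rejects `HEAD` requests (the URL is a placeholder):
+
+```sql
+-- Skip the preliminary HEAD request and rely on GET only.
+SELECT * FROM url('https://example.com/data.csv', 'CSVWithNames')
+SETTINGS http_make_head_request = 0;
+```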
+ +## http_max_field_name_size {#http_max_field_name_size} + +Type: UInt64 + +Default value: 131072 + +Maximum length of field name in HTTP header + +## http_max_field_value_size {#http_max_field_value_size} + +Type: UInt64 + +Default value: 131072 + +Maximum length of field value in HTTP header + +## http_max_fields {#http_max_fields} + +Type: UInt64 + +Default value: 1000000 + +Maximum number of fields in HTTP header + +## http_max_multipart_form_data_size {#http_max_multipart_form_data_size} + +Type: UInt64 + +Default value: 1073741824 + +Limit on size of multipart/form-data content. This setting cannot be parsed from URL parameters and should be set in a user profile. Note that content is parsed and external tables are created in memory before the start of query execution. And this is the only limit that has an effect on that stage (limits on max memory usage and max execution time have no effect while reading HTTP form data). + +## http_max_request_param_data_size {#http_max_request_param_data_size} + +Type: UInt64 + +Default value: 10485760 + +Limit on size of request data used as a query parameter in predefined HTTP requests. + +## http_max_tries {#http_max_tries} + +Type: UInt64 + +Default value: 10 + +Max attempts to read via http. + +## http_max_uri_size {#http_max_uri_size} + +Type: UInt64 + +Default value: 1048576 + +Sets the maximum URI length of an HTTP request. + +Possible values: + +- Positive integer. ## http_native_compression_disable_checksumming_on_decompress {#http_native_compression_disable_checksumming_on_decompress} +Type: Bool + +Default value: 0 + Enables or disables checksum verification when decompressing the HTTP POST data from the client. Used only for ClickHouse native compression format (not used with `gzip` or `deflate`). For more information, read the [HTTP interface description](../../interfaces/http.md). @@ -463,72 +3662,425 @@ Possible values: - 0 — Disabled. - 1 — Enabled. -Default value: 0. +## http_receive_timeout {#http_receive_timeout} -## http_max_uri_size {#http-max-uri-size} +Type: Seconds -Sets the maximum URI length of an HTTP request. +Default value: 30 + +HTTP receive timeout (in seconds). Possible values: -- Positive integer. +- Any positive integer. +- 0 - Disabled (infinite timeout). -Default value: 1048576. +## http_response_buffer_size {#http_response_buffer_size} -## http_make_head_request {#http-make-head-request} +Type: UInt64 -The `http_make_head_request` setting allows the execution of a `HEAD` request while reading data from HTTP to retrieve information about the file to be read, such as its size. Since it's enabled by default, it may be desirable to disable this setting in cases where the server does not support `HEAD` requests. +Default value: 0 -Default value: `true`. +The number of bytes to buffer in the server memory before sending a HTTP response to the client or flushing to disk (when http_wait_end_of_query is enabled). -## table_function_remote_max_addresses {#table_function_remote_max_addresses} +## http_retry_initial_backoff_ms {#http_retry_initial_backoff_ms} -Sets the maximum number of addresses generated from patterns for the [remote](../../sql-reference/table-functions/remote.md) function. 
+Type: UInt64 + +Default value: 100 + +Min milliseconds for backoff, when retrying read via http + +## http_retry_max_backoff_ms {#http_retry_max_backoff_ms} + +Type: UInt64 + +Default value: 10000 + +Max milliseconds for backoff, when retrying read via http + +## http_send_timeout {#http_send_timeout} + +Type: Seconds + +Default value: 30 + +HTTP send timeout (in seconds). Possible values: -- Positive integer. +- Any positive integer. +- 0 - Disabled (infinite timeout). -Default value: `1000`. +:::note +It's applicable only to the default profile. A server reboot is required for the changes to take effect. +::: -## glob_expansion_max_elements {#glob_expansion_max_elements} +## http_skip_not_found_url_for_globs {#http_skip_not_found_url_for_globs} -Sets the maximum number of addresses generated from patterns for external storages and table functions (like [url](../../sql-reference/table-functions/url.md)) except the `remote` function. +Type: Bool + +Default value: 1 + +Skip URLs for globs with HTTP_NOT_FOUND error + +## http_wait_end_of_query {#http_wait_end_of_query} + +Type: Bool + +Default value: 0 + +Enable HTTP response buffering on the server-side. + +## http_write_exception_in_output_format {#http_write_exception_in_output_format} + +Type: Bool + +Default value: 1 + +Write exception in output format to produce valid output. Works with JSON and XML formats. + +## http_zlib_compression_level {#http_zlib_compression_level} + +Type: Int64 + +Default value: 3 + +Sets the level of data compression in the response to an HTTP request if [enable_http_compression = 1](#enable_http_compression). + +Possible values: Numbers from 1 to 9. + +## iceberg_engine_ignore_schema_evolution {#iceberg_engine_ignore_schema_evolution} + +Type: Bool + +Default value: 0 + +Allow to ignore schema evolution in Iceberg table engine and read all data using schema specified by the user on table creation or latest schema parsed from metadata on table creation. + +:::note +Enabling this setting can lead to incorrect result as in case of evolved schema all data files will be read using the same schema. +::: + +## idle_connection_timeout {#idle_connection_timeout} + +Type: UInt64 + +Default value: 3600 + +Timeout to close idle TCP connections after specified number of seconds. Possible values: -- Positive integer. +- Positive integer (0 - close immediately, after 0 seconds). -Default value: `1000`. +## ignore_cold_parts_seconds {#ignore_cold_parts_seconds} -## send_progress_in_http_headers {#send_progress_in_http_headers} +Type: Int64 -Enables or disables `X-ClickHouse-Progress` HTTP response headers in `clickhouse-server` responses. +Default value: 0 -For more information, read the [HTTP interface description](../../interfaces/http.md). +Only available in ClickHouse Cloud. Exclude new data parts from SELECT queries until they're either pre-warmed (see cache_populated_by_fetch) or this many seconds old. Only for Replicated-/SharedMergeTree. + +## ignore_data_skipping_indices {#ignore_data_skipping_indices} + +Type: String + +Default value: + +Ignores the skipping indexes specified if used by the query. 
+ +Consider the following example: + +```sql +CREATE TABLE data +( + key Int, + x Int, + y Int, + INDEX x_idx x TYPE minmax GRANULARITY 1, + INDEX y_idx y TYPE minmax GRANULARITY 1, + INDEX xy_idx (x,y) TYPE minmax GRANULARITY 1 +) +Engine=MergeTree() +ORDER BY key; + +INSERT INTO data VALUES (1, 2, 3); + +SELECT * FROM data; +SELECT * FROM data SETTINGS ignore_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error. +SELECT * FROM data SETTINGS ignore_data_skipping_indices='x_idx'; -- Ok. +SELECT * FROM data SETTINGS ignore_data_skipping_indices='na_idx'; -- Ok. + +SELECT * FROM data WHERE x = 1 AND y = 1 SETTINGS ignore_data_skipping_indices='xy_idx',force_data_skipping_indices='xy_idx' ; -- query will produce INDEX_NOT_USED error, since xy_idx is explicitly ignored. +SELECT * FROM data WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; +``` + +The query without ignoring any indexes: +```sql +EXPLAIN indexes = 1 SELECT * FROM data WHERE x = 1 AND y = 2; + +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + ReadFromMergeTree (default.data) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 + Skip + Name: xy_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 +``` + +Ignoring the `xy_idx` index: +```sql +EXPLAIN indexes = 1 SELECT * FROM data WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; + +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + ReadFromMergeTree (default.data) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 +``` + +Works with tables in the MergeTree family. + +## ignore_drop_queries_probability {#ignore_drop_queries_probability} + +Type: Float + +Default value: 0 + +If enabled, server will ignore all DROP table queries with specified probability (for Memory and JOIN engines it will replcase DROP to TRUNCATE). Used for testing purposes + +## ignore_materialized_views_with_dropped_target_table {#ignore_materialized_views_with_dropped_target_table} + +Type: Bool + +Default value: 0 + +Ignore MVs with dropped target table during pushing to views + +## ignore_on_cluster_for_replicated_access_entities_queries {#ignore_on_cluster_for_replicated_access_entities_queries} + +Type: Bool + +Default value: 0 + +Ignore ON CLUSTER clause for replicated access entities management queries. + +## ignore_on_cluster_for_replicated_named_collections_queries {#ignore_on_cluster_for_replicated_named_collections_queries} + +Type: Bool + +Default value: 0 + +Ignore ON CLUSTER clause for replicated named collections management queries. + +## ignore_on_cluster_for_replicated_udf_queries {#ignore_on_cluster_for_replicated_udf_queries} + +Type: Bool + +Default value: 0 + +Ignore ON CLUSTER clause for replicated UDF management queries. + +## implicit_transaction {#implicit_transaction} + +Type: Bool + +Default value: 0 + +If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback) + +## input_format_parallel_parsing {#input_format_parallel_parsing} + +Type: Bool + +Default value: 1 + +Enables or disables order-preserving parallel parsing of data formats. 
Supported only for [TSV](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [CSV](../../interfaces/formats.md/#csv) and [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) formats. + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. + +## insert_allow_materialized_columns {#insert_allow_materialized_columns} + +Type: Bool + +Default value: 0 + +If setting is enabled, Allow materialized columns in INSERT. + +## insert_deduplicate {#insert_deduplicate} + +Type: Bool + +Default value: 1 + +Enables or disables block deduplication of `INSERT` (for Replicated\* tables). Possible values: - 0 — Disabled. - 1 — Enabled. -Default value: 0. +By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). +For the replicated tables by default the only 100 of the most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)). +For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window). -## max_http_get_redirects {#setting-max_http_get_redirects} +## insert_deduplication_token {#insert_deduplication_token} -Limits the maximum number of HTTP GET redirect hops for [URL](../../engines/table-engines/special/url.md)-engine tables. The setting applies to both types of tables: those created by the [CREATE TABLE](../../sql-reference/statements/create/table.md) query and by the [url](../../sql-reference/table-functions/url.md) table function. +Type: String + +Default value: + +The setting allows a user to provide own deduplication semantic in MergeTree/ReplicatedMergeTree +For example, by providing a unique value for the setting in each INSERT statement, +user can avoid the same inserted data being deduplicated. Possible values: -- Any positive integer number of hops. -- 0 — No hops allowed. +- Any string -Default value: `0`. +`insert_deduplication_token` is used for deduplication _only_ when not empty. -Cloud default value: `10`. +For the replicated tables by default the only 100 of the most recent inserts for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)). +For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window). + +:::note +`insert_deduplication_token` works on a partition level (the same as `insert_deduplication` checksum). Multiple partitions can have the same `insert_deduplication_token`. 
+::: + +Example: + +```sql +CREATE TABLE test_table +( A Int64 ) +ENGINE = MergeTree +ORDER BY A +SETTINGS non_replicated_deduplication_window = 100; + +INSERT INTO test_table SETTINGS insert_deduplication_token = 'test' VALUES (1); + +-- the next insert won't be deduplicated because insert_deduplication_token is different +INSERT INTO test_table SETTINGS insert_deduplication_token = 'test1' VALUES (1); + +-- the next insert will be deduplicated because insert_deduplication_token +-- is the same as one of the previous +INSERT INTO test_table SETTINGS insert_deduplication_token = 'test' VALUES (2); + +SELECT * FROM test_table + +┌─A─┐ +│ 1 │ +└───┘ +┌─A─┐ +│ 1 │ +└───┘ +``` + +## insert_keeper_fault_injection_probability {#insert_keeper_fault_injection_probability} + +Type: Float + +Default value: 0 + +Approximate probability of failure for a keeper request during insert. Valid value is in interval [0.0f, 1.0f] + +## insert_keeper_fault_injection_seed {#insert_keeper_fault_injection_seed} + +Type: UInt64 + +Default value: 0 + +0 - random seed, otherwise the setting value + +## insert_keeper_max_retries {#insert_keeper_max_retries} + +Type: UInt64 + +Default value: 20 + +The setting sets the maximum number of retries for ClickHouse Keeper (or ZooKeeper) requests during insert into replicated MergeTree. Only Keeper requests which failed due to network error, Keeper session timeout, or request timeout are considered for retries. + +Possible values: + +- Positive integer. +- 0 — Retries are disabled + +Cloud default value: `20`. + +Keeper request retries are done after some timeout. The timeout is controlled by the following settings: `insert_keeper_retry_initial_backoff_ms`, `insert_keeper_retry_max_backoff_ms`. +The first retry is done after `insert_keeper_retry_initial_backoff_ms` timeout. The consequent timeouts will be calculated as follows: +``` +timeout = min(insert_keeper_retry_max_backoff_ms, latest_timeout * 2) +``` + +For example, if `insert_keeper_retry_initial_backoff_ms=100`, `insert_keeper_retry_max_backoff_ms=10000` and `insert_keeper_max_retries=8` then timeouts will be `100, 200, 400, 800, 1600, 3200, 6400, 10000`. + +Apart from fault tolerance, the retries aim to provide a better user experience - they allow to avoid returning an error during INSERT execution if Keeper is restarted, for example, due to an upgrade. + +## insert_keeper_retry_initial_backoff_ms {#insert_keeper_retry_initial_backoff_ms} + +Type: UInt64 + +Default value: 100 + +Initial timeout(in milliseconds) to retry a failed Keeper request during INSERT query execution + +Possible values: + +- Positive integer. +- 0 — No timeout + +## insert_keeper_retry_max_backoff_ms {#insert_keeper_retry_max_backoff_ms} + +Type: UInt64 + +Default value: 10000 + +Maximum timeout (in milliseconds) to retry a failed Keeper request during INSERT query execution + +Possible values: + +- Positive integer. +- 0 — Maximum timeout is not limited ## insert_null_as_default {#insert_null_as_default} +Type: Bool + +Default value: 1 + Enables or disables the insertion of [default values](../../sql-reference/statements/create/table.md/#create-default-values) instead of [NULL](../../sql-reference/syntax.md/#null-literal) into columns with not [nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable) data type. If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. 
@@ -539,23 +4091,147 @@ Possible values: - 0 — Inserting `NULL` into a not nullable column causes an exception. - 1 — Default column value is inserted instead of `NULL`. -Default value: `1`. +## insert_quorum {#insert_quorum} -## join_default_strictness {#join_default_strictness} +Type: UInt64Auto -Sets default strictness for [JOIN clauses](../../sql-reference/statements/select/join.md/#select-join). +Default value: 0 + +:::note +This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information. +::: + +Enables the quorum writes. + +- If `insert_quorum < 2`, the quorum writes are disabled. +- If `insert_quorum >= 2`, the quorum writes are enabled. +- If `insert_quorum = 'auto'`, use majority number (`number_of_replicas / 2 + 1`) as quorum number. + +Quorum writes + +`INSERT` succeeds only when ClickHouse manages to correctly write data to the `insert_quorum` of replicas during the `insert_quorum_timeout`. If for any reason the number of replicas with successful writes does not reach the `insert_quorum`, the write is considered failed and ClickHouse will delete the inserted block from all the replicas where data has already been written. + +When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#select_sequential_consistency). + +ClickHouse generates an exception: + +- If the number of available replicas at the time of the query is less than the `insert_quorum`. +- When `insert_quorum_parallel` is disabled and an attempt to write data is made when the previous block has not yet been inserted in `insert_quorum` of replicas. This situation may occur if the user tries to perform another `INSERT` query to the same table before the previous one with `insert_quorum` is completed. + +See also: + +- [insert_quorum_timeout](#insert_quorum_timeout) +- [insert_quorum_parallel](#insert_quorum_parallel) +- [select_sequential_consistency](#select_sequential_consistency) + +## insert_quorum_parallel {#insert_quorum_parallel} + +Type: Bool + +Default value: 1 + +:::note +This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information. +::: + +Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected. Possible values: -- `ALL` — If the right table has several matching rows, ClickHouse creates a [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) from matching rows. This is the normal `JOIN` behaviour from standard SQL. -- `ANY` — If the right table has several matching rows, only the first one found is joined. If the right table has only one matching row, the results of `ANY` and `ALL` are the same. -- `ASOF` — For joining sequences with an uncertain match. -- `Empty string` — If `ALL` or `ANY` is not specified in the query, ClickHouse throws an exception. +- 0 — Disabled. +- 1 — Enabled. -Default value: `ALL`. 
+See also: + +- [insert_quorum](#insert_quorum) +- [insert_quorum_timeout](#insert_quorum_timeout) +- [select_sequential_consistency](#select_sequential_consistency) + +## insert_quorum_timeout {#insert_quorum_timeout} + +Type: Milliseconds + +Default value: 600000 + +Write to a quorum timeout in milliseconds. If the timeout has passed and no write has taken place yet, ClickHouse will generate an exception and the client must repeat the query to write the same block to the same or any other replica. + +See also: + +- [insert_quorum](#insert_quorum) +- [insert_quorum_parallel](#insert_quorum_parallel) +- [select_sequential_consistency](#select_sequential_consistency) + +## insert_shard_id {#insert_shard_id} + +Type: UInt64 + +Default value: 0 + +If not `0`, specifies the shard of [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table into which the data will be inserted synchronously. + +If `insert_shard_id` value is incorrect, the server will throw an exception. + +To get the number of shards on `requested_cluster`, you can check server config or use this query: + +``` sql +SELECT uniq(shard_num) FROM system.clusters WHERE cluster = 'requested_cluster'; +``` + +Possible values: + +- 0 — Disabled. +- Any number from `1` to `shards_num` of corresponding [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table. + +**Example** + +Query: + +```sql +CREATE TABLE x AS system.numbers ENGINE = MergeTree ORDER BY number; +CREATE TABLE x_dist AS x ENGINE = Distributed('test_cluster_two_shards_localhost', currentDatabase(), x); +INSERT INTO x_dist SELECT * FROM numbers(5) SETTINGS insert_shard_id = 1; +SELECT * FROM x_dist ORDER BY number ASC; +``` + +Result: + +``` text +┌─number─┐ +│ 0 │ +│ 0 │ +│ 1 │ +│ 1 │ +│ 2 │ +│ 2 │ +│ 3 │ +│ 3 │ +│ 4 │ +│ 4 │ +└────────┘ +``` + +## interactive_delay {#interactive_delay} + +Type: UInt64 + +Default value: 100000 + +The interval in microseconds for checking whether request execution has been canceled and sending the progress. + +## intersect_default_mode {#intersect_default_mode} + +Type: SetOperationMode + +Default value: ALL + +Set default mode in INTERSECT query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception. ## join_algorithm {#join_algorithm} +Type: JoinAlgorithm + +Default value: default + Specifies which [JOIN](../../sql-reference/statements/select/join.md) algorithm is used. Several algorithms can be specified, and an available one would be chosen for a particular query based on kind/strictness and table engine. @@ -610,9 +4286,12 @@ Possible values: ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`. - ## join_any_take_last_row {#join_any_take_last_row} +Type: Bool + +Default value: 0 + Changes the behaviour of join operations with `ANY` strictness. :::note @@ -624,73 +4303,33 @@ Possible values: - 0 — If the right table has more than one matching row, only the first one found is joined. - 1 — If the right table has more than one matching row, only the last one found is joined. -Default value: 0. 
- See also: - [JOIN clause](../../sql-reference/statements/select/join.md/#select-join) - [Join table engine](../../engines/table-engines/special/join.md) - [join_default_strictness](#join_default_strictness) -## join_use_nulls {#join_use_nulls} +## join_default_strictness {#join_default_strictness} -Sets the type of [JOIN](../../sql-reference/statements/select/join.md) behaviour. When merging tables, empty cells may appear. ClickHouse fills them differently based on this setting. +Type: JoinStrictness + +Default value: ALL + +Sets default strictness for [JOIN clauses](../../sql-reference/statements/select/join.md/#select-join). Possible values: -- 0 — The empty cells are filled with the default value of the corresponding field type. -- 1 — `JOIN` behaves the same way as in standard SQL. The type of the corresponding field is converted to [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable), and empty cells are filled with [NULL](../../sql-reference/syntax.md). - -Default value: 0. - -## group_by_use_nulls {#group_by_use_nulls} - -Changes the way the [GROUP BY clause](/docs/en/sql-reference/statements/select/group-by.md) treats the types of aggregation keys. -When the `ROLLUP`, `CUBE`, or `GROUPING SETS` specifiers are used, some aggregation keys may not be used to produce some result rows. -Columns for these keys are filled with either default value or `NULL` in corresponding rows depending on this setting. - -Possible values: - -- 0 — The default value for the aggregation key type is used to produce missing values. -- 1 — ClickHouse executes `GROUP BY` the same way as the SQL standard says. The types of aggregation keys are converted to [Nullable](/docs/en/sql-reference/data-types/nullable.md/#data_type-nullable). Columns for corresponding aggregation keys are filled with [NULL](/docs/en/sql-reference/syntax.md) for rows that didn't use it. - -Default value: 0. - -See also: - -- [GROUP BY clause](/docs/en/sql-reference/statements/select/group-by.md) - -## partial_merge_join_optimizations {#partial_merge_join_optimizations} - -Disables optimizations in partial merge join algorithm for [JOIN](../../sql-reference/statements/select/join.md) queries. - -By default, this setting enables improvements that could lead to wrong results. If you see suspicious results in your queries, disable optimizations by this setting. Optimizations can be different in different versions of the ClickHouse server. - -Possible values: - -- 0 — Optimizations disabled. -- 1 — Optimizations enabled. - -Default value: 1. - -## partial_merge_join_rows_in_right_blocks {#partial_merge_join_rows_in_right_blocks} - -Limits sizes of right-hand join data blocks in partial merge join algorithm for [JOIN](../../sql-reference/statements/select/join.md) queries. - -ClickHouse server: - -1. Splits right-hand join data into blocks with up to the specified number of rows. -2. Indexes each block with its minimum and maximum values. -3. Unloads prepared blocks to disk if it is possible. - -Possible values: - -- Any positive integer. Recommended range of values: \[1000, 100000\]. - -Default value: 65536. +- `ALL` — If the right table has several matching rows, ClickHouse creates a [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) from matching rows. This is the normal `JOIN` behaviour from standard SQL. +- `ANY` — If the right table has several matching rows, only the first one found is joined. If the right table has only one matching row, the results of `ANY` and `ALL` are the same. 
+- `ASOF` — For joining sequences with an uncertain match. +- `Empty string` — If `ALL` or `ANY` is not specified in the query, ClickHouse throws an exception. ## join_on_disk_max_files_to_merge {#join_on_disk_max_files_to_merge} +Type: UInt64 + +Default value: 64 + Limits the number of files allowed for parallel sorting in MergeJoin operations when they are executed on disk. The bigger the value of the setting, the more RAM is used and the less disk I/O is needed. @@ -699,652 +4338,72 @@ Possible values: - Any positive integer, starting from 2. -Default value: 64. +## join_output_by_rowlist_perkey_rows_threshold {#join_output_by_rowlist_perkey_rows_threshold} -## any_join_distinct_right_table_keys {#any_join_distinct_right_table_keys} +Type: UInt64 -Enables legacy ClickHouse server behaviour in `ANY INNER|LEFT JOIN` operations. +Default value: 5 -:::note -Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour. -::: +The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join. -When the legacy behaviour is enabled: +## join_overflow_mode {#join_overflow_mode} -- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are not equal because ClickHouse uses the logic with many-to-one left-to-right table keys mapping. -- Results of `ANY INNER JOIN` operations contain all rows from the left table like the `SEMI LEFT JOIN` operations do. +Type: OverflowMode -When the legacy behaviour is disabled: +Default value: throw -- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are equal because ClickHouse uses the logic which provides one-to-many keys mapping in `ANY RIGHT JOIN` operations. -- Results of `ANY INNER JOIN` operations contain one row per key from both the left and right tables. +What to do when the limit is exceeded. + +## join_to_sort_maximum_table_rows {#join_to_sort_maximum_table_rows} + +Type: UInt64 + +Default value: 10000 + +The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join. + +## join_to_sort_minimum_perkey_rows {#join_to_sort_minimum_perkey_rows} + +Type: UInt64 + +Default value: 40 + +The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys + +## join_use_nulls {#join_use_nulls} + +Type: Bool + +Default value: 0 + +Sets the type of [JOIN](../../sql-reference/statements/select/join.md) behaviour. When merging tables, empty cells may appear. ClickHouse fills them differently based on this setting. Possible values: -- 0 — Legacy behaviour is disabled. -- 1 — Legacy behaviour is enabled. +- 0 — The empty cells are filled with the default value of the corresponding field type. +- 1 — `JOIN` behaves the same way as in standard SQL. The type of the corresponding field is converted to [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable), and empty cells are filled with [NULL](../../sql-reference/syntax.md). -Default value: 0. +## joined_subquery_requires_alias {#joined_subquery_requires_alias} -See also: +Type: Bool -- [JOIN strictness](../../sql-reference/statements/select/join.md/#join-settings) +Default value: 1 -## max_rows_in_set_to_optimize_join +Force joined subqueries and table functions to have aliases for correct name qualification. 
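+
+For illustration, with the default value of `1` an unaliased joined subquery is rejected while an aliased one is accepted (a minimal sketch):
+
+```sql
+-- Throws an exception: the joined subquery has no alias
+SELECT * FROM numbers(3) AS t
+JOIN (SELECT number FROM numbers(3)) USING number;
+
+-- Works: the joined subquery is aliased
+SELECT * FROM numbers(3) AS t
+JOIN (SELECT number FROM numbers(3)) AS s USING number;
+```
+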
-Maximal size of the set to filter joined tables by each other's row sets before joining. +## kafka_disable_num_consumers_limit {#kafka_disable_num_consumers_limit} -Possible values: +Type: Bool -- 0 — Disable. -- Any positive integer. +Default value: 0 -Default value: 100000. +Disable limit on kafka_num_consumers that depends on the number of available CPU cores. -## temporary_files_codec {#temporary_files_codec} +## kafka_max_wait_ms {#kafka_max_wait_ms} -Sets compression codec for temporary files used in sorting and joining operations on disk. +Type: Milliseconds -Possible values: - -- LZ4 — [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression is applied. -- NONE — No compression is applied. - -Default value: LZ4. - -## max_block_size {#setting-max_block_size} - -In ClickHouse, data is processed by blocks, which are sets of column parts. The internal processing cycles for a single block are efficient but there are noticeable costs when processing each block. - -The `max_block_size` setting indicates the recommended maximum number of rows to include in a single block when loading data from tables. Blocks the size of `max_block_size` are not always loaded from the table: if ClickHouse determines that less data needs to be retrieved, a smaller block is processed. - -The block size should not be too small to avoid noticeable costs when processing each block. It should also not be too large to ensure that queries with a LIMIT clause execute quickly after processing the first block. When setting `max_block_size`, the goal should be to avoid consuming too much memory when extracting a large number of columns in multiple threads and to preserve at least some cache locality. - -Default value: `65,409` - -## preferred_block_size_bytes {#preferred-block-size-bytes} - -Used for the same purpose as `max_block_size`, but it sets the recommended block size in bytes by adapting it to the number of rows in the block. -However, the block size cannot be more than `max_block_size` rows. -By default: 1,000,000. It only works when reading from MergeTree engines. - -## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} - -The maximum number of simultaneously processed queries per user. - -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `0`. - -**Example** - -``` xml -5 -``` - -## max_concurrent_queries_for_all_users {#max-concurrent-queries-for-all-users} - -Throw exception if the value of this setting is less or equal than the current number of simultaneously processed queries. - -Example: `max_concurrent_queries_for_all_users` can be set to 99 for all users and database administrator can set it to 100 for itself to run queries for investigation even when the server is overloaded. - -Modifying the setting for one query or user does not affect other queries. - -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `0`. - -**Example** - -``` xml -99 -``` - -**See Also** - -- [max_concurrent_queries](/docs/en/operations/server-configuration-parameters/settings.md/#max_concurrent_queries) - -## merge_tree_min_rows_for_concurrent_read {#setting-merge-tree-min-rows-for-concurrent-read} - -If the number of rows to be read from a file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `merge_tree_min_rows_for_concurrent_read` then ClickHouse tries to perform a concurrent reading from this file on several threads. - -Possible values: - -- Positive integer. - -Default value: `163840`. 
- -## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem} - -The minimum number of lines to read from one file before the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. - -Possible values: - -- Positive integer. - -Default value: `163840`. - -## merge_tree_min_bytes_for_concurrent_read {#setting-merge-tree-min-bytes-for-concurrent-read} - -If the number of bytes to read from one file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine table exceeds `merge_tree_min_bytes_for_concurrent_read`, then ClickHouse tries to concurrently read from this file in several threads. - -Possible value: - -- Positive integer. - -Default value: `251658240`. - -## merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem {#merge-tree-min-bytes-for-concurrent-read-for-remote-filesystem} - -The minimum number of bytes to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. - -Possible values: - -- Positive integer. - -Default value: `251658240`. - -## merge_tree_min_rows_for_seek {#setting-merge-tree-min-rows-for-seek} - -If the distance between two data blocks to be read in one file is less than `merge_tree_min_rows_for_seek` rows, then ClickHouse does not seek through the file but reads the data sequentially. - -Possible values: - -- Any positive integer. - -Default value: 0. - -## merge_tree_min_bytes_for_seek {#setting-merge-tree-min-bytes-for-seek} - -If the distance between two data blocks to be read in one file is less than `merge_tree_min_bytes_for_seek` bytes, then ClickHouse sequentially reads a range of file that contains both blocks, thus avoiding extra seek. - -Possible values: - -- Any positive integer. - -Default value: 0. - -## merge_tree_coarse_index_granularity {#setting-merge-tree-coarse-index-granularity} - -When searching for data, ClickHouse checks the data marks in the index file. If ClickHouse finds that required keys are in some range, it divides this range into `merge_tree_coarse_index_granularity` subranges and searches the required keys there recursively. - -Possible values: - -- Any positive even integer. - -Default value: 8. - -## merge_tree_max_rows_to_use_cache {#setting-merge-tree-max-rows-to-use-cache} - -If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it does not use the cache of uncompressed blocks. - -The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. - -Possible values: - -- Any positive integer. - -Default value: 128 ✕ 8192. - -## merge_tree_max_bytes_to_use_cache {#setting-merge-tree-max-bytes-to-use-cache} - -If ClickHouse should read more than `merge_tree_max_bytes_to_use_cache` bytes in one query, it does not use the cache of uncompressed blocks. - -The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. 
This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. - -Possible values: - -- Any positive integer. - -Default value: 2013265920. - -## min_bytes_to_use_direct_io {#min-bytes-to-use-direct-io} - -The minimum data volume required for using direct I/O access to the storage disk. - -ClickHouse uses this setting when reading data from tables. If the total storage volume of all the data to be read exceeds `min_bytes_to_use_direct_io` bytes, then ClickHouse reads the data from the storage disk with the `O_DIRECT` option. - -Possible values: - -- 0 — Direct I/O is disabled. -- Positive integer. - -Default value: 0. - -## network_compression_method {#network_compression_method} - -Sets the method of data compression that is used for communication between servers and between server and [clickhouse-client](../../interfaces/cli.md). - -Possible values: - -- `LZ4` — sets LZ4 compression method. -- `ZSTD` — sets ZSTD compression method. - -Default value: `LZ4`. - -**See Also** - -- [network_zstd_compression_level](#network_zstd_compression_level) - -## network_zstd_compression_level {#network_zstd_compression_level} - -Adjusts the level of ZSTD compression. Used only when [network_compression_method](#network_compression_method) is set to `ZSTD`. - -Possible values: - -- Positive integer from 1 to 15. - -Default value: `1`. - -## log_queries {#log-queries} - -Setting up query logging. - -Queries sent to ClickHouse with this setup are logged according to the rules in the [query_log](../../operations/server-configuration-parameters/settings.md/#query-log) server configuration parameter. - -Example: - -``` text -log_queries=1 -``` - -## log_queries_min_query_duration_ms {#log-queries-min-query-duration-ms} - -If enabled (non-zero), queries faster than the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables: - -- `system.query_log` -- `system.query_thread_log` - -Only the queries with the following type will get to the log: - -- `QUERY_FINISH` -- `EXCEPTION_WHILE_PROCESSING` - -- Type: milliseconds -- Default value: 0 (any query) - -## log_queries_min_type {#log-queries-min-type} - -`query_log` minimal type to log. - -Possible values: -- `QUERY_START` (`=1`) -- `QUERY_FINISH` (`=2`) -- `EXCEPTION_BEFORE_START` (`=3`) -- `EXCEPTION_WHILE_PROCESSING` (`=4`) - -Default value: `QUERY_START`. - -Can be used to limit which entities will go to `query_log`, say you are interested only in errors, then you can use `EXCEPTION_WHILE_PROCESSING`: - -``` text -log_queries_min_type='EXCEPTION_WHILE_PROCESSING' -``` - -## log_query_threads {#log-query-threads} - -Setting up query threads logging. - -Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#query_thread_log) server configuration parameter. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: `1`. 
- -**Example** - -``` text -log_query_threads=1 -``` - -## log_query_views {#log-query-views} - -Setting up query views logging. - -When a query run by ClickHouse with this setting enabled has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#query_views_log) server configuration parameter. - -Example: - -``` text -log_query_views=1 -``` - -## log_formatted_queries {#log-formatted-queries} - -Allows to log formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table (populates `formatted_query` column in the [system.query_log](../../operations/system-tables/query_log.md)). - -Possible values: - -- 0 — Formatted queries are not logged in the system table. -- 1 — Formatted queries are logged in the system table. - -Default value: `0`. - -## log_comment {#log-comment} - -Specifies the value for the `log_comment` field of the [system.query_log](../system-tables/query_log.md) table and comment text for the server log. - -It can be used to improve the readability of server logs. Additionally, it helps to select queries related to the test from the `system.query_log` after running [clickhouse-test](../../development/tests.md). - -Possible values: - -- Any string no longer than [max_query_size](#max_query_size). If the max_query_size is exceeded, the server throws an exception. - -Default value: empty string. - -**Example** - -Query: - -``` sql -SET log_comment = 'log_comment test', log_queries = 1; -SELECT 1; -SYSTEM FLUSH LOGS; -SELECT type, query FROM system.query_log WHERE log_comment = 'log_comment test' AND event_date >= yesterday() ORDER BY event_time DESC LIMIT 2; -``` - -Result: - -``` text -┌─type────────┬─query─────┐ -│ QueryStart │ SELECT 1; │ -│ QueryFinish │ SELECT 1; │ -└─────────────┴───────────┘ -``` - -## log_processors_profiles {#log_processors_profiles} - -Write time that processor spent during execution/waiting for data to `system.processors_profile_log` table. - -See also: - -- [`system.processors_profile_log`](../../operations/system-tables/processors_profile_log.md) -- [`EXPLAIN PIPELINE`](../../sql-reference/statements/explain.md#explain-pipeline) - -## max_insert_block_size {#max_insert_block_size} - -The size of blocks (in a count of rows) to form for insertion into a table. -This setting only applies in cases when the server forms the blocks. -For example, for an INSERT via the HTTP interface, the server parses the data format and forms blocks of the specified size. -But when using clickhouse-client, the client parses the data itself, and the ‘max_insert_block_size’ setting on the server does not affect the size of the inserted blocks. -The setting also does not have a purpose when using INSERT SELECT, since data is inserted using the same blocks that are formed after SELECT. - -Default value: 1,048,576. - -The default is slightly more than `max_block_size`. The reason for this is that certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allow sorting more data in RAM. - -## min_insert_block_size_rows {#min-insert-block-size-rows} - -Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. - -Possible values: - -- Positive integer. -- 0 — Squashing disabled. 
- -Default value: 1048576. - -## min_insert_block_size_bytes {#min-insert-block-size-bytes} - -Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. - -Possible values: - -- Positive integer. -- 0 — Squashing disabled. - -Default value: 268435456. - -## max_replica_delay_for_distributed_queries {#max_replica_delay_for_distributed_queries} - -Disables lagging replicas for distributed queries. See [Replication](../../engines/table-engines/mergetree-family/replication.md). - -Sets the time in seconds. If a replica's lag is greater than or equal to the set value, this replica is not used. - -Possible values: - -- Positive integer. -- 0 — Replica lags are not checked. - -To prevent the use of any replica with a non-zero lag, set this parameter to 1. - -Default value: 300. - -Used when performing `SELECT` from a distributed table that points to replicated tables. - -## max_threads {#max_threads} - -The maximum number of query processing threads, excluding threads for retrieving data from remote servers (see the ‘max_distributed_connections’ parameter). - -This parameter applies to threads that perform the same stages of the query processing pipeline in parallel. -For example, when reading from a table, if it is possible to evaluate expressions with functions, filter with WHERE and pre-aggregate for GROUP BY in parallel using at least ‘max_threads’ number of threads, then ‘max_threads’ are used. - -Default value: the number of physical CPU cores. - -For queries that are completed quickly because of a LIMIT, you can set a lower ‘max_threads’. For example, if the necessary number of entries are located in every block and max_threads = 8, then 8 blocks are retrieved, although it would have been enough to read just one. - -The smaller the `max_threads` value, the less memory is consumed. - -## max_insert_threads {#max-insert-threads} - -The maximum number of threads to execute the `INSERT SELECT` query. - -Possible values: - -- 0 (or 1) — `INSERT SELECT` no parallel execution. -- Positive integer. Bigger than 1. - -Default value: `0`. - -Cloud default value: from `2` to `4`, depending on the service size. - -Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max_threads](#max_threads) setting. -Higher values will lead to higher memory usage. - -## max_compress_block_size {#max-compress-block-size} - -The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying a smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. - -:::note -This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. -::: - -Don’t confuse blocks for compression (a chunk of memory consisting of bytes) with blocks for query processing (a set of rows from a table). - -## min_compress_block_size {#min-compress-block-size} - -For [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. In order to reduce latency when processing queries, a block is compressed when writing the next mark if its size is at least `min_compress_block_size`. By default, 65,536. 
- -The actual size of the block, if the uncompressed data is less than `max_compress_block_size`, is no less than this value and no less than the volume of data for one mark. - -Let’s look at an example. Assume that `index_granularity` was set to 8192 during table creation. - -We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, the total will be 32 KB of data. Since min_compress_block_size = 65,536, a compressed block will be formed for every two marks. - -We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won’t be decompressed. - -:::note -This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. -::: - -## max_query_size {#max_query_size} - -The maximum number of bytes of a query string parsed by the SQL parser. -Data in the VALUES clause of INSERT queries is processed by a separate stream parser (that consumes O(1) RAM) and not affected by this restriction. - -Default value: 262144 (= 256 KiB). - -:::note -`max_query_size` cannot be set within an SQL query (e.g., `SELECT now() SETTINGS max_query_size=10000`) because ClickHouse needs to allocate a buffer to parse the query, and this buffer size is determined by the `max_query_size` setting, which must be configured before the query is executed. -::: - -## max_parser_depth {#max_parser_depth} - -Limits maximum recursion depth in the recursive descent parser. Allows controlling the stack size. - -Possible values: - -- Positive integer. -- 0 — Recursion depth is unlimited. - -Default value: 1000. - -## interactive_delay {#interactive-delay} - -The interval in microseconds for checking whether request execution has been canceled and sending the progress. - -Default value: 100,000 (checks for cancelling and sends the progress ten times per second). - -## idle_connection_timeout {#idle_connection_timeout} - -Timeout to close idle TCP connections after specified number of seconds. - -Possible values: - -- Positive integer (0 - close immediately, after 0 seconds). - -Default value: 3600. - -## connect_timeout, receive_timeout, send_timeout {#connect-timeout-receive-timeout-send-timeout} - -Timeouts in seconds on the socket used for communicating with the client. - -Default value: 10, 300, 300. - -## handshake_timeout_ms {#handshake-timeout-ms} - -Timeout in milliseconds for receiving Hello packet from replicas during handshake. - -Default value: 10000. - -## cancel_http_readonly_queries_on_client_close {#cancel-http-readonly-queries-on-client-close} - -Cancels HTTP read-only queries (e.g. SELECT) when a client closes the connection without waiting for the response. - -Default value: `0`. - -Cloud default value: `1`. - -## poll_interval {#poll-interval} - -Lock in a wait loop for the specified number of seconds. - -Default value: 10. - -## max_distributed_connections {#max-distributed-connections} - -The maximum number of simultaneous connections with remote servers for distributed processing of a single query to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. - -Default value: 1024. 
- -The following parameters are only used when creating Distributed tables (and when launching a server), so there is no reason to change them at runtime. - -## distributed_connections_pool_size {#distributed-connections-pool-size} - -The maximum number of simultaneous connections with remote servers for distributed processing of all queries to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. - -Default value: 1024. - -## max_distributed_depth {#max-distributed-depth} - -Limits the maximum depth of recursive queries for [Distributed](../../engines/table-engines/special/distributed.md) tables. - -If the value is exceeded, the server throws an exception. - -Possible values: - -- Positive integer. -- 0 — Unlimited depth. - -Default value: `5`. - -## max_replicated_fetches_network_bandwidth_for_server {#max_replicated_fetches_network_bandwidth_for_server} - -Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) fetches for the server. Only has meaning at server startup. You can also limit the speed for a particular table with [max_replicated_fetches_network_bandwidth](../../operations/settings/merge-tree-settings.md/#max_replicated_fetches_network_bandwidth) setting. - -The setting isn't followed perfectly accurately. - -Possible values: - -- Positive integer. -- 0 — Unlimited. - -Default value: `0`. - -**Usage** - -Could be used for throttling speed when replicating the data to add or replace new nodes. - -:::note -60000000 bytes/s approximately corresponds to 457 Mbps (60000000 / 1024 / 1024 * 8). -::: - -## max_replicated_sends_network_bandwidth_for_server {#max_replicated_sends_network_bandwidth_for_server} - -Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) sends for the server. Only has meaning at server startup. You can also limit the speed for a particular table with [max_replicated_sends_network_bandwidth](../../operations/settings/merge-tree-settings.md/#max_replicated_sends_network_bandwidth) setting. - -The setting isn't followed perfectly accurately. - -Possible values: - -- Positive integer. -- 0 — Unlimited. - -Default value: `0`. - -**Usage** - -Could be used for throttling speed when replicating the data to add or replace new nodes. - -:::note -60000000 bytes/s approximately corresponds to 457 Mbps (60000000 / 1024 / 1024 * 8). -::: - -## connect_timeout_with_failover_ms {#connect-timeout-with-failover-ms} - -The timeout in milliseconds for connecting to a remote server for a Distributed table engine, if the ‘shard’ and ‘replica’ sections are used in the cluster definition. -If unsuccessful, several attempts are made to connect to various replicas. - -Default value: 1000. - -## connect_timeout_with_failover_secure_ms - -Connection timeout for selecting first healthy replica (for secure connections) - -Default value: 1000. - -## connection_pool_max_wait_ms {#connection-pool-max-wait-ms} - -The wait time in milliseconds for a connection when the connection pool is full. - -Possible values: - -- Positive integer. -- 0 — Infinite timeout. - -Default value: 0. - -## connections_with_failover_max_tries {#connections-with-failover-max-tries} - -The maximum number of connection attempts with each replica for the Distributed table engine. - -Default value: 3. 
- -## extremes {#extremes} - -Whether to count extreme values (the minimums and maximums in columns of a query result). Accepts 0 or 1. By default, 0 (disabled). -For more information, see the section “Extreme values”. - -## kafka_max_wait_ms {#kafka-max-wait-ms} +Default value: 5000 The wait time in milliseconds for reading messages from [Kafka](../../engines/table-engines/integrations/kafka.md/#kafka) before retry. @@ -1353,106 +4412,96 @@ Possible values: - Positive integer. - 0 — Infinite timeout. -Default value: 5000. - See also: - [Apache Kafka](https://kafka.apache.org/) -## kafka_disable_num_consumers_limit {#kafka-disable-num-consumers-limit} +## keeper_map_strict_mode {#keeper_map_strict_mode} -Disable limit on kafka_num_consumers that depends on the number of available CPU cores. +Type: Bool -Default value: false. +Default value: 0 -## postgresql_connection_pool_size {#postgresql-connection-pool-size} +Enforce additional checks during operations on KeeperMap. E.g. throw an exception on an insert for already existing key -Connection pool size for PostgreSQL table engine and database engine. +## keeper_max_retries {#keeper_max_retries} -Default value: 16 +Type: UInt64 -## postgresql_connection_attempt_timeout {#postgresql-connection-attempt-timeout} +Default value: 10 -Connection timeout in seconds of a single attempt to connect PostgreSQL end-point. -The value is passed as a `connect_timeout` parameter of the connection URL. +Max retries for general keeper operations -Default value: `2`. +## keeper_retry_initial_backoff_ms {#keeper_retry_initial_backoff_ms} -## postgresql_connection_pool_wait_timeout {#postgresql-connection-pool-wait-timeout} +Type: UInt64 -Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on empty pool. +Default value: 100 + +Initial backoff timeout for general keeper operations + +## keeper_retry_max_backoff_ms {#keeper_retry_max_backoff_ms} + +Type: UInt64 Default value: 5000 -## postgresql_connection_pool_retries {#postgresql-connection-pool-retries} +Max backoff timeout for general keeper operations -The maximum number of retries to establish a connection with the PostgreSQL end-point. +## legacy_column_name_of_tuple_literal {#legacy_column_name_of_tuple_literal} -Default value: `2`. +Type: Bool -## postgresql_connection_pool_auto_close_connection {#postgresql-connection-pool-auto-close-connection} +Default value: 0 -Close connection before returning connection to the pool. +List all names of element of large tuple literals in their column names instead of hash. This settings exists only for compatibility reasons. It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher. -Default value: false. +## lightweight_deletes_sync {#lightweight_deletes_sync} -## odbc_bridge_connection_pool_size {#odbc-bridge-connection-pool-size} +Type: UInt64 -Connection pool size for each connection settings string in ODBC bridge. +Default value: 2 -Default value: 16 - -## odbc_bridge_use_connection_pooling {#odbc-bridge-use-connection-pooling} - -Use connection pooling in ODBC bridge. If set to false, a new connection is created every time. - -Default value: true - -## use_uncompressed_cache {#setting-use_uncompressed_cache} - -Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled). 
-Using the uncompressed cache (only for tables in the MergeTree family) can significantly reduce latency and increase throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. - -For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically to save space for truly small queries. This means that you can keep the ‘use_uncompressed_cache’ setting always set to 1. - -## replace_running_query {#replace-running-query} - -When using the HTTP interface, the ‘query_id’ parameter can be passed. This is any string that serves as the query identifier. -If a query from the same user with the same ‘query_id’ already exists at this time, the behaviour depends on the ‘replace_running_query’ parameter. - -`0` (default) – Throw an exception (do not allow the query to run if a query with the same ‘query_id’ is already running). - -`1` – Cancel the old query and start running the new one. - -Set this parameter to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn’t finished yet, it should be cancelled. - -## replace_running_query_max_wait_ms {#replace-running-query-max-wait-ms} - -The wait time for running the query with the same `query_id` to finish, when the [replace_running_query](#replace-running-query) setting is active. +The same as [`mutations_sync`](#mutations_sync), but controls only execution of lightweight deletes. Possible values: +- 0 - Mutations execute asynchronously. +- 1 - The query waits for the lightweight deletes to complete on the current server. +- 2 - The query waits for the lightweight deletes to complete on all replicas (if they exist). + +**See Also** + +- [Synchronicity of ALTER Queries](../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) +- [Mutations](../../sql-reference/statements/alter/index.md#mutations) + +## limit {#limit} + +Type: UInt64 + +Default value: 0 + +Sets the maximum number of rows to get from the query result. It adjusts the value set by the [LIMIT](../../sql-reference/statements/select/limit.md/#limit-clause) clause, so that the limit, specified in the query, cannot exceed the limit, set by this setting. + +Possible values: + +- 0 — The number of rows is not limited. - Positive integer. -- 0 — Throwing an exception that does not allow to run a new query if the server already executes a query with the same `query_id`. -Default value: 5000. +## live_view_heartbeat_interval {#live_view_heartbeat_interval} -## stream_flush_interval_ms {#stream-flush-interval-ms} +Type: Seconds -Works for tables with streaming in the case of a timeout, or when a thread generates [max_insert_block_size](#max_insert_block_size) rows. +Default value: 15 -The default value is 7500. - -The smaller the value, the more often data is flushed into the table. Setting the value too low leads to poor performance. - -## stream_poll_timeout_ms {#stream_poll_timeout_ms} - -Timeout for polling data from/to streaming storages. - -Default value: 500. 
+The heartbeat interval in seconds to indicate live query is alive. ## load_balancing {#load_balancing} +Type: LoadBalancing + +Default value: random + Specifies the algorithm of replicas selection that is used for distributed query processing. ClickHouse supports the following algorithms of choosing replicas: @@ -1539,43 +4588,887 @@ load_balancing = round_robin This algorithm uses a round-robin policy across replicas with the same number of errors (only the queries with `round_robin` policy is accounted). -## prefer_localhost_replica {#prefer-localhost-replica} +## load_balancing_first_offset {#load_balancing_first_offset} -Enables/disables preferable using the localhost replica when processing distributed queries. +Type: UInt64 + +Default value: 0 + +Which replica to preferably send a query when FIRST_OR_RANDOM load balancing strategy is used. + +## load_marks_asynchronously {#load_marks_asynchronously} + +Type: Bool + +Default value: 0 + +Load MergeTree marks asynchronously + +## local_filesystem_read_method {#local_filesystem_read_method} + +Type: String + +Default value: pread_threadpool + +Method of reading data from local filesystem, one of: read, pread, mmap, io_uring, pread_threadpool. The 'io_uring' method is experimental and does not work for Log, TinyLog, StripeLog, File, Set and Join, and other tables with append-able files in presence of concurrent reads and writes. + +## local_filesystem_read_prefetch {#local_filesystem_read_prefetch} + +Type: Bool + +Default value: 0 + +Should use prefetching when reading data from local filesystem. + +## lock_acquire_timeout {#lock_acquire_timeout} + +Type: Seconds + +Default value: 120 + +Defines how many seconds a locking request waits before failing. + +Locking timeout is used to protect from deadlocks while executing read/write operations with tables. When the timeout expires and the locking request fails, the ClickHouse server throws an exception "Locking attempt timed out! Possible deadlock avoided. Client should retry." with error code `DEADLOCK_AVOIDED`. Possible values: -- 1 — ClickHouse always sends a query to the localhost replica if it exists. -- 0 — ClickHouse uses the balancing strategy specified by the [load_balancing](#load_balancing) setting. +- Positive integer (in seconds). +- 0 — No locking timeout. -Default value: 1. +## log_comment {#log_comment} + +Type: String + +Default value: + +Specifies the value for the `log_comment` field of the [system.query_log](../system-tables/query_log.md) table and comment text for the server log. + +It can be used to improve the readability of server logs. Additionally, it helps to select queries related to the test from the `system.query_log` after running [clickhouse-test](../../development/tests.md). + +Possible values: + +- Any string no longer than [max_query_size](#max_query_size). If the max_query_size is exceeded, the server throws an exception. 
+ +**Example** + +Query: + +``` sql +SET log_comment = 'log_comment test', log_queries = 1; +SELECT 1; +SYSTEM FLUSH LOGS; +SELECT type, query FROM system.query_log WHERE log_comment = 'log_comment test' AND event_date >= yesterday() ORDER BY event_time DESC LIMIT 2; +``` + +Result: + +``` text +┌─type────────┬─query─────┐ +│ QueryStart │ SELECT 1; │ +│ QueryFinish │ SELECT 1; │ +└─────────────┴───────────┘ +``` + +## log_formatted_queries {#log_formatted_queries} + +Type: Bool + +Default value: 0 + +Allows to log formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table (populates `formatted_query` column in the [system.query_log](../../operations/system-tables/query_log.md)). + +Possible values: + +- 0 — Formatted queries are not logged in the system table. +- 1 — Formatted queries are logged in the system table. + +## log_processors_profiles {#log_processors_profiles} + +Type: Bool + +Default value: 1 + +Write time that processor spent during execution/waiting for data to `system.processors_profile_log` table. + +See also: + +- [`system.processors_profile_log`](../../operations/system-tables/processors_profile_log.md) +- [`EXPLAIN PIPELINE`](../../sql-reference/statements/explain.md#explain-pipeline) + +## log_profile_events {#log_profile_events} + +Type: Bool + +Default value: 1 + +Log query performance statistics into the query_log, query_thread_log and query_views_log. + +## log_queries {#log_queries} + +Type: Bool + +Default value: 1 + +Setting up query logging. + +Queries sent to ClickHouse with this setup are logged according to the rules in the [query_log](../../operations/server-configuration-parameters/settings.md/#query-log) server configuration parameter. + +Example: + +``` text +log_queries=1 +``` + +## log_queries_cut_to_length {#log_queries_cut_to_length} + +Type: UInt64 + +Default value: 100000 + +If query length is greater than a specified threshold (in bytes), then cut query when writing to query log. Also limit the length of printed query in ordinary text log. + +## log_queries_min_query_duration_ms {#log_queries_min_query_duration_ms} + +Type: Milliseconds + +Default value: 0 + +If enabled (non-zero), queries faster than the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables: + +- `system.query_log` +- `system.query_thread_log` + +Only the queries with the following type will get to the log: + +- `QUERY_FINISH` +- `EXCEPTION_WHILE_PROCESSING` + +- Type: milliseconds +- Default value: 0 (any query) + +## log_queries_min_type {#log_queries_min_type} + +Type: LogQueriesType + +Default value: QUERY_START + +`query_log` minimal type to log. 
+ +Possible values: +- `QUERY_START` (`=1`) +- `QUERY_FINISH` (`=2`) +- `EXCEPTION_BEFORE_START` (`=3`) +- `EXCEPTION_WHILE_PROCESSING` (`=4`) + +Can be used to limit which entities will go to `query_log`, say you are interested only in errors, then you can use `EXCEPTION_WHILE_PROCESSING`: + +``` text +log_queries_min_type='EXCEPTION_WHILE_PROCESSING' +``` + +## log_queries_probability {#log_queries_probability} + +Type: Float + +Default value: 1 + +Allows a user to write to [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), and [query_views_log](../../operations/system-tables/query_views_log.md) system tables only a sample of queries selected randomly with the specified probability. It helps to reduce the load with a large volume of queries in a second. + +Possible values: + +- 0 — Queries are not logged in the system tables. +- Positive floating-point number in the range [0..1]. For example, if the setting value is `0.5`, about half of the queries are logged in the system tables. +- 1 — All queries are logged in the system tables. + +## log_query_settings {#log_query_settings} + +Type: Bool + +Default value: 1 + +Log query settings into the query_log. + +## log_query_threads {#log_query_threads} + +Type: Bool + +Default value: 0 + +Setting up query threads logging. + +Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#query_thread_log) server configuration parameter. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +**Example** + +``` text +log_query_threads=1 +``` + +## log_query_views {#log_query_views} + +Type: Bool + +Default value: 1 + +Setting up query views logging. + +When a query run by ClickHouse with this setting enabled has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#query_views_log) server configuration parameter. + +Example: + +``` text +log_query_views=1 +``` + +## low_cardinality_allow_in_native_format {#low_cardinality_allow_in_native_format} + +Type: Bool + +Default value: 1 + +Allows or restricts using the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type with the [Native](../../interfaces/formats.md/#native) format. + +If usage of `LowCardinality` is restricted, ClickHouse server converts `LowCardinality`-columns to ordinary ones for `SELECT` queries, and convert ordinary columns to `LowCardinality`-columns for `INSERT` queries. + +This setting is required mainly for third-party clients which do not support `LowCardinality` data type. + +Possible values: + +- 1 — Usage of `LowCardinality` is not restricted. +- 0 — Usage of `LowCardinality` is restricted. + +## low_cardinality_max_dictionary_size {#low_cardinality_max_dictionary_size} + +Type: UInt64 + +Default value: 8192 + +Sets a maximum size in rows of a shared global dictionary for the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type that can be written to a storage file system. This setting prevents issues with RAM in case of unlimited dictionary growth. All the data that can’t be encoded due to maximum dictionary size limitation ClickHouse writes in an ordinary method. 
+
+Possible values:
+
+- Any positive integer.
+
+## low_cardinality_use_single_dictionary_for_part {#low_cardinality_use_single_dictionary_for_part}
+
+Type: Bool
+
+Default value: 0
+
+Turns on or turns off using a single dictionary for the data part.
+
+By default, the ClickHouse server monitors the size of dictionaries and if a dictionary overflows then the server starts to write the next one. To prohibit creating several dictionaries set `low_cardinality_use_single_dictionary_for_part = 1`.
+
+Possible values:
+
+- 1 — Creating several dictionaries for the data part is prohibited.
+- 0 — Creating several dictionaries for the data part is not prohibited.
+
+## materialize_skip_indexes_on_insert {#materialize_skip_indexes_on_insert}
+
+Type: Bool
+
+Default value: 1
+
+If true, skip indexes are calculated on inserts; otherwise, skip indexes will be calculated only during merges.
+
+## materialize_statistics_on_insert {#materialize_statistics_on_insert}
+
+Type: Bool
+
+Default value: 1
+
+If true, statistics are calculated on inserts; otherwise, statistics will be calculated only during merges.
+
+## materialize_ttl_after_modify {#materialize_ttl_after_modify}
+
+Type: Bool
+
+Default value: 1
+
+Apply TTL for old data after an ALTER MODIFY TTL query.
+
+## materialized_views_ignore_errors {#materialized_views_ignore_errors}
+
+Type: Bool
+
+Default value: 0
+
+Allows ignoring errors for MATERIALIZED VIEWs and delivering the original block to the table regardless of MVs.
+
+## max_analyze_depth {#max_analyze_depth}
+
+Type: UInt64
+
+Default value: 5000
+
+Maximum number of analyses performed by the interpreter.
+
+## max_ast_depth {#max_ast_depth}
+
+Type: UInt64
+
+Default value: 1000
+
+Maximum depth of the query syntax tree. Checked after parsing.
+
+## max_ast_elements {#max_ast_elements}
+
+Type: UInt64
+
+Default value: 50000
+
+Maximum size of the query syntax tree in number of nodes. Checked after parsing.
+
+## max_backup_bandwidth {#max_backup_bandwidth}
+
+Type: UInt64
+
+Default value: 0
+
+The maximum read speed in bytes per second for a particular backup on the server. Zero means unlimited.
+
+## max_block_size {#max_block_size}
+
+Type: UInt64
+
+Default value: 65409
+
+In ClickHouse, data is processed by blocks, which are sets of column parts. The internal processing cycles for a single block are efficient, but there are noticeable costs when processing each block.
+
+The `max_block_size` setting indicates the recommended maximum number of rows to include in a single block when loading data from tables. Blocks the size of `max_block_size` are not always loaded from the table: if ClickHouse determines that less data needs to be retrieved, a smaller block is processed.
+
+The block size should not be too small, to avoid noticeable costs when processing each block. It should also not be too large, to ensure that queries with a LIMIT clause execute quickly after processing the first block. When setting `max_block_size`, the goal should be to avoid consuming too much memory when extracting a large number of columns in multiple threads and to preserve at least some cache locality.
+
+## max_bytes_before_external_group_by {#max_bytes_before_external_group_by}
+
+Type: UInt64
+
+Default value: 0
+
+If memory usage during a GROUP BY operation exceeds this threshold (in bytes), the 'external aggregation' mode is activated (data is spilled to disk). The recommended value is half of the available system memory.
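+
+For example, a memory-heavy aggregation could be allowed to spill to disk like this (the table name `big_table` is hypothetical and the thresholds are only illustrative, following the half-of-memory recommendation above):
+
+```sql
+SELECT key, count() AS cnt
+FROM big_table
+GROUP BY key
+SETTINGS max_bytes_before_external_group_by = 10000000000, max_memory_usage = 20000000000;
+```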
+ +## max_bytes_before_external_sort {#max_bytes_before_external_sort} + +Type: UInt64 + +Default value: 0 + +If memory usage during ORDER BY operation is exceeding this threshold in bytes, activate the 'external sorting' mode (spill data to disk). Recommended value is half of the available system memory. + +## max_bytes_before_remerge_sort {#max_bytes_before_remerge_sort} + +Type: UInt64 + +Default value: 1000000000 + +In case of ORDER BY with LIMIT, when memory usage is higher than specified threshold, perform additional steps of merging blocks before final merge to keep just top LIMIT rows. + +## max_bytes_in_distinct {#max_bytes_in_distinct} + +Type: UInt64 + +Default value: 0 + +Maximum total size of the state (in uncompressed bytes) in memory for the execution of DISTINCT. + +## max_bytes_in_join {#max_bytes_in_join} + +Type: UInt64 + +Default value: 0 + +Maximum size of the hash table for JOIN (in number of bytes in memory). + +## max_bytes_in_set {#max_bytes_in_set} + +Type: UInt64 + +Default value: 0 + +Maximum size of the set (in bytes in memory) resulting from the execution of the IN section. + +## max_bytes_to_read {#max_bytes_to_read} + +Type: UInt64 + +Default value: 0 + +Limit on read bytes (after decompression) from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server. + +## max_bytes_to_read_leaf {#max_bytes_to_read_leaf} + +Type: UInt64 + +Default value: 0 + +Limit on read bytes (after decompression) on the leaf nodes for distributed queries. Limit is applied for local reads only, excluding the final merge stage on the root node. Note, the setting is unstable with prefer_localhost_replica=1. + +## max_bytes_to_sort {#max_bytes_to_sort} + +Type: UInt64 + +Default value: 0 + +If more than the specified amount of (uncompressed) bytes have to be processed for ORDER BY operation, the behavior will be determined by the 'sort_overflow_mode' which by default is - throw an exception + +## max_bytes_to_transfer {#max_bytes_to_transfer} + +Type: UInt64 + +Default value: 0 + +Maximum size (in uncompressed bytes) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed. + +## max_columns_to_read {#max_columns_to_read} + +Type: UInt64 + +Default value: 0 + +If a query requires reading more than specified number of columns, exception is thrown. Zero value means unlimited. This setting is useful to prevent too complex queries. + +## max_compress_block_size {#max_compress_block_size} + +Type: UInt64 + +Default value: 1048576 + +The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying a smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. :::note -Disable this setting if you use [max_parallel_replicas](#max_parallel_replicas) without [parallel_replicas_custom_key](#parallel_replicas_custom_key). -If [parallel_replicas_custom_key](#parallel_replicas_custom_key) is set, disable this setting only if it's used on a cluster with multiple shards containing multiple replicas. -If it's used on a cluster with a single shard and multiple replicas, disabling this setting will have negative effects. +This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. 
::: -## totals_mode {#totals-mode} +Don’t confuse blocks for compression (a chunk of memory consisting of bytes) with blocks for query processing (a set of rows from a table). -How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = ‘any’ are present. -See the section “WITH TOTALS modifier”. +## max_concurrent_queries_for_all_users {#max_concurrent_queries_for_all_users} -## totals_auto_threshold {#totals-auto-threshold} +Type: UInt64 -The threshold for `totals_mode = 'auto'`. -See the section “WITH TOTALS modifier”. +Default value: 0 + +Throw exception if the value of this setting is less or equal than the current number of simultaneously processed queries. + +Example: `max_concurrent_queries_for_all_users` can be set to 99 for all users and database administrator can set it to 100 for itself to run queries for investigation even when the server is overloaded. + +Modifying the setting for one query or user does not affect other queries. + +Possible values: + +- Positive integer. +- 0 — No limit. + +**Example** + +``` xml +99 +``` + +**See Also** + +- [max_concurrent_queries](/docs/en/operations/server-configuration-parameters/settings.md/#max_concurrent_queries) + +## max_concurrent_queries_for_user {#max_concurrent_queries_for_user} + +Type: UInt64 + +Default value: 0 + +The maximum number of simultaneously processed queries per user. + +Possible values: + +- Positive integer. +- 0 — No limit. + +**Example** + +``` xml +5 +``` + +## max_distributed_connections {#max_distributed_connections} + +Type: UInt64 + +Default value: 1024 + +The maximum number of simultaneous connections with remote servers for distributed processing of a single query to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. + +The following parameters are only used when creating Distributed tables (and when launching a server), so there is no reason to change them at runtime. + +## max_distributed_depth {#max_distributed_depth} + +Type: UInt64 + +Default value: 5 + +Limits the maximum depth of recursive queries for [Distributed](../../engines/table-engines/special/distributed.md) tables. + +If the value is exceeded, the server throws an exception. + +Possible values: + +- Positive integer. +- 0 — Unlimited depth. + +## max_download_buffer_size {#max_download_buffer_size} + +Type: UInt64 + +Default value: 10485760 + +The maximal size of buffer for parallel downloading (e.g. for URL engine) per each thread. + +## max_download_threads {#max_download_threads} + +Type: MaxThreads + +Default value: 4 + +The maximum number of threads to download data (e.g. for URL engine). + +## max_estimated_execution_time {#max_estimated_execution_time} + +Type: Seconds + +Default value: 0 + +Maximum query estimate execution time in seconds. + +## max_execution_speed {#max_execution_speed} + +Type: UInt64 + +Default value: 0 + +Maximum number of execution rows per second. + +## max_execution_speed_bytes {#max_execution_speed_bytes} + +Type: UInt64 + +Default value: 0 + +Maximum number of execution bytes per second. + +## max_execution_time {#max_execution_time} + +Type: Seconds + +Default value: 0 + +If query runtime exceeds the specified number of seconds, the behavior will be determined by the 'timeout_overflow_mode', which by default is - throw an exception. Note that the timeout is checked and the query can stop only in designated places during data processing. 
It currently cannot stop during merging of aggregation states or during query analysis, and the actual run time will be higher than the value of this setting. + +## max_execution_time_leaf {#max_execution_time_leaf} + +Type: Seconds + +Default value: 0 + +Similar semantic to max_execution_time but only apply on leaf node for distributed queries, the time out behavior will be determined by 'timeout_overflow_mode_leaf' which by default is - throw an exception + +## max_expanded_ast_elements {#max_expanded_ast_elements} + +Type: UInt64 + +Default value: 500000 + +Maximum size of query syntax tree in number of nodes after expansion of aliases and the asterisk. + +## max_fetch_partition_retries_count {#max_fetch_partition_retries_count} + +Type: UInt64 + +Default value: 5 + +Amount of retries while fetching partition from another host. + +## max_final_threads {#max_final_threads} + +Type: MaxThreads + +Default value: 'auto(16)' + +Sets the maximum number of parallel threads for the `SELECT` query data read phase with the [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. + +Possible values: + +- Positive integer. +- 0 or 1 — Disabled. `SELECT` queries are executed in a single thread. + +## max_http_get_redirects {#max_http_get_redirects} + +Type: UInt64 + +Default value: 0 + +Max number of HTTP GET redirects hops allowed. Ensures additional security measures are in place to prevent a malicious server from redirecting your requests to unexpected services.\n\nIt is the case when an external server redirects to another address, but that address appears to be internal to the company's infrastructure, and by sending an HTTP request to an internal server, you could request an internal API from the internal network, bypassing the auth, or even query other services, such as Redis or Memcached. When you don't have an internal infrastructure (including something running on your localhost), or you trust the server, it is safe to allow redirects. Although keep in mind, that if the URL uses HTTP instead of HTTPS, and you will have to trust not only the remote server but also your ISP and every network in the middle. + +## max_hyperscan_regexp_length {#max_hyperscan_regexp_length} + +Type: UInt64 + +Default value: 0 + +Defines the maximum length for each regular expression in the [hyperscan multi-match functions](../../sql-reference/functions/string-search-functions.md/#multimatchanyhaystack-pattern1-pattern2-patternn). + +Possible values: + +- Positive integer. +- 0 - The length is not limited. + +**Example** + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bcd','c','d']) SETTINGS max_hyperscan_regexp_length = 3; +``` + +Result: + +```text +┌─multiMatchAny('abcd', ['ab', 'bcd', 'c', 'd'])─┐ +│ 1 │ +└────────────────────────────────────────────────┘ +``` + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bcd','c','d']) SETTINGS max_hyperscan_regexp_length = 2; +``` + +Result: + +```text +Exception: Regexp length too large. +``` + +**See Also** + +- [max_hyperscan_regexp_total_length](#max-hyperscan-regexp-total-length) + +## max_hyperscan_regexp_total_length {#max_hyperscan_regexp_total_length} + +Type: UInt64 + +Default value: 0 + +Sets the maximum length total of all regular expressions in each [hyperscan multi-match function](../../sql-reference/functions/string-search-functions.md/#multimatchanyhaystack-pattern1-pattern2-patternn). + +Possible values: + +- Positive integer. +- 0 - The length is not limited. 
+ +**Example** + +Query: + +```sql +SELECT multiMatchAny('abcd', ['a','b','c','d']) SETTINGS max_hyperscan_regexp_total_length = 5; +``` + +Result: + +```text +┌─multiMatchAny('abcd', ['a', 'b', 'c', 'd'])─┐ +│ 1 │ +└─────────────────────────────────────────────┘ +``` + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bc','c','d']) SETTINGS max_hyperscan_regexp_total_length = 5; +``` + +Result: + +```text +Exception: Total regexp lengths too large. +``` + +**See Also** + +- [max_hyperscan_regexp_length](#max-hyperscan-regexp-length) + +## max_insert_block_size {#max_insert_block_size} + +Type: UInt64 + +Default value: 1048449 + +The size of blocks (in a count of rows) to form for insertion into a table. +This setting only applies in cases when the server forms the blocks. +For example, for an INSERT via the HTTP interface, the server parses the data format and forms blocks of the specified size. +But when using clickhouse-client, the client parses the data itself, and the ‘max_insert_block_size’ setting on the server does not affect the size of the inserted blocks. +The setting also does not have a purpose when using INSERT SELECT, since data is inserted using the same blocks that are formed after SELECT. + +The default is slightly more than `max_block_size`. The reason for this is that certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allow sorting more data in RAM. + +## max_insert_delayed_streams_for_parallel_write {#max_insert_delayed_streams_for_parallel_write} + +Type: UInt64 + +Default value: 0 + +The maximum number of streams (columns) to delay final part flush. Default - auto (1000 in case of underlying storage supports parallel write, for example S3 and disabled otherwise) + +## max_insert_threads {#max_insert_threads} + +Type: UInt64 + +Default value: 0 + +The maximum number of threads to execute the `INSERT SELECT` query. + +Possible values: + +- 0 (or 1) — `INSERT SELECT` no parallel execution. +- Positive integer. Bigger than 1. + +Cloud default value: from `2` to `4`, depending on the service size. + +Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max_threads](#max_threads) setting. +Higher values will lead to higher memory usage. + +## max_joined_block_size_rows {#max_joined_block_size_rows} + +Type: UInt64 + +Default value: 65409 + +Maximum block size for JOIN result (if join algorithm supports it). 0 means unlimited. + +## max_limit_for_ann_queries {#max_limit_for_ann_queries} + +Type: UInt64 + +Default value: 1000000 + +SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes. + +## max_live_view_insert_blocks_before_refresh {#max_live_view_insert_blocks_before_refresh} + +Type: UInt64 + +Default value: 64 + +Limit maximum number of inserted blocks after which mergeable blocks are dropped and query is re-executed. + +## max_local_read_bandwidth {#max_local_read_bandwidth} + +Type: UInt64 + +Default value: 0 + +The maximum speed of local reads in bytes per second. + +## max_local_write_bandwidth {#max_local_write_bandwidth} + +Type: UInt64 + +Default value: 0 + +The maximum speed of local writes in bytes per second. + +## max_memory_usage {#max_memory_usage} + +Type: UInt64 + +Default value: 0 + +Maximum memory usage for processing of single query. Zero means unlimited. 
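+
+A minimal sketch of how this limit is typically applied, either per query via the `SETTINGS` clause or per session with `SET`; the 10 GB figure below is only illustrative:
+
+```sql
+-- Cap a single heavy aggregation at roughly 10 GB; the query is aborted if it needs more.
+SELECT uniqExact(number)
+FROM numbers_mt(100000000)
+SETTINGS max_memory_usage = 10000000000;
+
+-- Or apply the same cap to every subsequent query in this session.
+SET max_memory_usage = 10000000000;
+```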
+ +## max_memory_usage_for_user {#max_memory_usage_for_user} + +Type: UInt64 + +Default value: 0 + +Maximum memory usage for processing all concurrently running queries for the user. Zero means unlimited. + +## max_network_bandwidth {#max_network_bandwidth} + +Type: UInt64 + +Default value: 0 + +Limits the speed of the data exchange over the network in bytes per second. This setting applies to every query. + +Possible values: + +- Positive integer. +- 0 — Bandwidth control is disabled. + +## max_network_bandwidth_for_all_users {#max_network_bandwidth_for_all_users} + +Type: UInt64 + +Default value: 0 + +Limits the speed that data is exchanged at over the network in bytes per second. This setting applies to all concurrently running queries on the server. + +Possible values: + +- Positive integer. +- 0 — Control of the data speed is disabled. + +## max_network_bandwidth_for_user {#max_network_bandwidth_for_user} + +Type: UInt64 + +Default value: 0 + +Limits the speed of the data exchange over the network in bytes per second. This setting applies to all concurrently running queries performed by a single user. + +Possible values: + +- Positive integer. +- 0 — Control of the data speed is disabled. + +## max_network_bytes {#max_network_bytes} + +Type: UInt64 + +Default value: 0 + +Limits the data volume (in bytes) that is received or transmitted over the network when executing a query. This setting applies to every individual query. + +Possible values: + +- Positive integer. +- 0 — Data volume control is disabled. + +## max_number_of_partitions_for_independent_aggregation {#max_number_of_partitions_for_independent_aggregation} + +Type: UInt64 + +Default value: 128 + +Maximal number of partitions in table to apply optimization ## max_parallel_replicas {#max_parallel_replicas} +Type: NonZeroUInt64 + +Default value: 1 + The maximum number of replicas for each shard when executing a query. Possible values: - Positive integer. -Default value: `1`. - **Additional Info** This options will produce different results depending on the settings used. @@ -1597,78 +5490,682 @@ A query may be processed faster if it is executed on several servers in parallel This setting is useful for any replicated table. -## parallel_replicas_custom_key {#parallel_replicas_custom_key} +## max_parser_backtracks {#max_parser_backtracks} -An arbitrary integer expression that can be used to split work between replicas for a specific table. -The value can be any integer expression. +Type: UInt64 -Simple expressions using primary keys are preferred. +Default value: 1000000 -If the setting is used on a cluster that consists of a single shard with multiple replicas, those replicas will be converted into virtual shards. -Otherwise, it will behave same as for `SAMPLE` key, it will use multiple replicas of each shard. +Maximum parser backtracking (how many times it tries different alternatives in the recursive descend parsing process). -## parallel_replicas_custom_key_range_lower {#parallel_replicas_custom_key_range_lower} +## max_parser_depth {#max_parser_depth} -Allows the filter type `range` to split the work evenly between replicas based on the custom range `[parallel_replicas_custom_key_range_lower, INT_MAX]`. +Type: UInt64 -When used in conjuction with [parallel_replicas_custom_key_range_upper](#parallel_replicas_custom_key_range_upper), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`. 
+Default value: 1000 -Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing. - -## parallel_replicas_custom_key_range_upper {#parallel_replicas_custom_key_range_upper} - -Allows the filter type `range` to split the work evenly between replicas based on the custom range `[0, parallel_replicas_custom_key_range_upper]`. A value of 0 disables the upper bound, setting it the max value of the custom key expression. - -When used in conjuction with [parallel_replicas_custom_key_range_lower](#parallel_replicas_custom_key_range_lower), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`. - -Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing - -## enable_parallel_replicas - -Enables or disables sending SELECT queries to all replicas of a table (up to `max_parallel_replicas`). Reading is parallelized and coordinated dynamically. It will work for any kind of MergeTree table. +Limits maximum recursion depth in the recursive descent parser. Allows controlling the stack size. Possible values: -- 0 - Disabled. -- 1 - Enabled, silently disabled in case of failure. -- 2 - Enabled, throws an exception in case of failure. +- Positive integer. +- 0 — Recursion depth is unlimited. -Default value: `0`. +## max_parsing_threads {#max_parsing_threads} -## compile_expressions {#compile-expressions} +Type: MaxThreads -Enables or disables compilation of frequently used simple functions and operators to native code with LLVM at runtime. +Default value: 'auto(16)' + +The maximum number of threads to parse data in input formats that support parallel parsing. By default, it is determined automatically + +## max_partition_size_to_drop {#max_partition_size_to_drop} + +Type: UInt64 + +Default value: 50000000000 + +Restriction on dropping partitions in query time. The value 0 means that you can drop partitions without any restrictions. + +Cloud default value: 1 TB. + +:::note +This query setting overwrites its server setting equivalent, see [max_partition_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-partition-size-to-drop) +::: + +## max_partitions_per_insert_block {#max_partitions_per_insert_block} + +Type: UInt64 + +Default value: 100 + +Limit maximum number of partitions in the single INSERTed block. Zero means unlimited. Throw an exception if the block contains too many partitions. This setting is a safety threshold because using a large number of partitions is a common misconception. + +## max_partitions_to_read {#max_partitions_to_read} + +Type: Int64 + +Default value: -1 + +Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. + +## max_query_size {#max_query_size} + +Type: UInt64 + +Default value: 262144 + +The maximum number of bytes of a query string parsed by the SQL parser. +Data in the VALUES clause of INSERT queries is processed by a separate stream parser (that consumes O(1) RAM) and not affected by this restriction. 
+ +:::note +`max_query_size` cannot be set within an SQL query (e.g., `SELECT now() SETTINGS max_query_size=10000`) because ClickHouse needs to allocate a buffer to parse the query, and this buffer size is determined by the `max_query_size` setting, which must be configured before the query is executed. +::: + +## max_read_buffer_size {#max_read_buffer_size} + +Type: UInt64 + +Default value: 1048576 + +The maximum size of the buffer to read from the filesystem. + +## max_read_buffer_size_local_fs {#max_read_buffer_size_local_fs} + +Type: UInt64 + +Default value: 131072 + +The maximum size of the buffer to read from local filesystem. If set to 0 then max_read_buffer_size will be used. + +## max_read_buffer_size_remote_fs {#max_read_buffer_size_remote_fs} + +Type: UInt64 + +Default value: 0 + +The maximum size of the buffer to read from remote filesystem. If set to 0 then max_read_buffer_size will be used. + +## max_recursive_cte_evaluation_depth {#max_recursive_cte_evaluation_depth} + +Type: UInt64 + +Default value: 1000 + +Maximum limit on recursive CTE evaluation depth + +## max_remote_read_network_bandwidth {#max_remote_read_network_bandwidth} + +Type: UInt64 + +Default value: 0 + +The maximum speed of data exchange over the network in bytes per second for read. + +## max_remote_write_network_bandwidth {#max_remote_write_network_bandwidth} + +Type: UInt64 + +Default value: 0 + +The maximum speed of data exchange over the network in bytes per second for write. + +## max_replica_delay_for_distributed_queries {#max_replica_delay_for_distributed_queries} + +Type: UInt64 + +Default value: 300 + +Disables lagging replicas for distributed queries. See [Replication](../../engines/table-engines/mergetree-family/replication.md). + +Sets the time in seconds. If a replica's lag is greater than or equal to the set value, this replica is not used. Possible values: -- 0 — Disabled. -- 1 — Enabled. +- Positive integer. +- 0 — Replica lags are not checked. -Default value: `1`. +To prevent the use of any replica with a non-zero lag, set this parameter to 1. -## min_count_to_compile_expression {#min-count-to-compile-expression} +Used when performing `SELECT` from a distributed table that points to replicated tables. -Minimum count of executing same expression before it is get compiled. +## max_result_bytes {#max_result_bytes} -Default value: `3`. +Type: UInt64 -## compile_aggregate_expressions {#compile_aggregate_expressions} +Default value: 0 -Enables or disables JIT-compilation of aggregate functions to native code. Enabling this setting can improve the performance. +Limit on result size in bytes (uncompressed). The query will stop after processing a block of data if the threshold is met, but it will not cut the last block of the result, therefore the result size can be larger than the threshold. Caveats: the result size in memory is taken into account for this threshold. Even if the result size is small, it can reference larger data structures in memory, representing dictionaries of LowCardinality columns, and Arenas of AggregateFunction columns, so the threshold can be exceeded despite the small result size. The setting is fairly low level and should be used with caution. + +## max_result_rows {#max_result_rows} + +Type: UInt64 + +Default value: 0 + +Limit on result size in rows. The query will stop after processing a block of data if the threshold is met, but it will not cut the last block of the result, therefore the result size can be larger than the threshold. 
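+
+As an illustration (the values are arbitrary), the limit is usually combined with `result_overflow_mode`, which controls whether the query throws or returns a truncated result:
+
+```sql
+-- Throws an exception once the result would exceed 1000 rows (default overflow mode).
+SELECT number FROM numbers(100000) SETTINGS max_result_rows = 1000;
+
+-- Stops early and returns a truncated result instead (it may slightly exceed 1000 rows,
+-- since the last block is not cut).
+SELECT number FROM numbers(100000)
+SETTINGS max_result_rows = 1000, result_overflow_mode = 'break';
+```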
+ +## max_rows_in_distinct {#max_rows_in_distinct} + +Type: UInt64 + +Default value: 0 + +Maximum number of elements during execution of DISTINCT. + +## max_rows_in_join {#max_rows_in_join} + +Type: UInt64 + +Default value: 0 + +Maximum size of the hash table for JOIN (in number of rows). + +## max_rows_in_set {#max_rows_in_set} + +Type: UInt64 + +Default value: 0 + +Maximum size of the set (in number of elements) resulting from the execution of the IN section. + +## max_rows_in_set_to_optimize_join {#max_rows_in_set_to_optimize_join} + +Type: UInt64 + +Default value: 0 + +Maximal size of the set to filter joined tables by each other's row sets before joining. Possible values: -- 0 — Aggregation is done without JIT compilation. -- 1 — Aggregation is done using JIT compilation. +- 0 — Disable. +- Any positive integer. -Default value: `1`. +## max_rows_to_group_by {#max_rows_to_group_by} -**See Also** +Type: UInt64 -- [min_count_to_compile_aggregate_expression](#min_count_to_compile_aggregate_expression) +Default value: 0 + +If aggregation during GROUP BY is generating more than the specified number of rows (unique GROUP BY keys), the behavior will be determined by the 'group_by_overflow_mode' which by default is - throw an exception, but can be also switched to an approximate GROUP BY mode. + +## max_rows_to_read {#max_rows_to_read} + +Type: UInt64 + +Default value: 0 + +Limit on read rows from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server. + +## max_rows_to_read_leaf {#max_rows_to_read_leaf} + +Type: UInt64 + +Default value: 0 + +Limit on read rows on the leaf nodes for distributed queries. Limit is applied for local reads only, excluding the final merge stage on the root node. Note, the setting is unstable with prefer_localhost_replica=1. + +## max_rows_to_sort {#max_rows_to_sort} + +Type: UInt64 + +Default value: 0 + +If more than the specified amount of records have to be processed for ORDER BY operation, the behavior will be determined by the 'sort_overflow_mode' which by default is - throw an exception + +## max_rows_to_transfer {#max_rows_to_transfer} + +Type: UInt64 + +Default value: 0 + +Maximum size (in rows) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed. + +## max_sessions_for_user {#max_sessions_for_user} + +Type: UInt64 + +Default value: 0 + +Maximum number of simultaneous sessions for a user. + +## max_size_to_preallocate_for_aggregation {#max_size_to_preallocate_for_aggregation} + +Type: UInt64 + +Default value: 100000000 + +For how many elements it is allowed to preallocate space in all hash tables in total before aggregation + +## max_size_to_preallocate_for_joins {#max_size_to_preallocate_for_joins} + +Type: UInt64 + +Default value: 100000000 + +For how many elements it is allowed to preallocate space in all hash tables in total before join + +## max_streams_for_merge_tree_reading {#max_streams_for_merge_tree_reading} + +Type: UInt64 + +Default value: 0 + +If is not zero, limit the number of reading streams for MergeTree table. + +## max_streams_multiplier_for_merge_tables {#max_streams_multiplier_for_merge_tables} + +Type: Float + +Default value: 5 + +Ask more streams when reading from Merge table. Streams will be spread across tables that Merge table will use. This allows more even distribution of work across threads and is especially helpful when merged tables differ in size. 
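+
+A rough sketch of where this multiplier matters, assuming hypothetical per-year tables `hits_2023` and `hits_2024` with identical structure that are unified through the `Merge` table engine:
+
+```sql
+-- hits_all reads from every table in the current database matching the regexp.
+CREATE TABLE hits_all AS hits_2023 ENGINE = Merge(currentDatabase(), '^hits_20');
+
+-- Request more read streams so differently sized underlying tables
+-- are spread more evenly across the available threads.
+SELECT count()
+FROM hits_all
+SETTINGS max_streams_multiplier_for_merge_tables = 10;
+```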
+ +## max_streams_to_max_threads_ratio {#max_streams_to_max_threads_ratio} + +Type: Float + +Default value: 1 + +Allows you to use more sources than the number of threads - to more evenly distribute work across threads. It is assumed that this is a temporary solution since it will be possible in the future to make the number of sources equal to the number of threads, but for each source to dynamically select available work for itself. + +## max_subquery_depth {#max_subquery_depth} + +Type: UInt64 + +Default value: 100 + +If a query has more than the specified number of nested subqueries, throw an exception. This allows you to have a sanity check to protect the users of your cluster from going insane with their queries. + +## max_table_size_to_drop {#max_table_size_to_drop} + +Type: UInt64 + +Default value: 50000000000 + +Restriction on deleting tables in query time. The value 0 means that you can delete all tables without any restrictions. + +Cloud default value: 1 TB. + +:::note +This query setting overwrites its server setting equivalent, see [max_table_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-table-size-to-drop) +::: + +## max_temporary_columns {#max_temporary_columns} + +Type: UInt64 + +Default value: 0 + +If a query generates more than the specified number of temporary columns in memory as a result of intermediate calculation, an exception is thrown. Zero value means unlimited. This setting is useful to prevent too complex queries. + +## max_temporary_data_on_disk_size_for_query {#max_temporary_data_on_disk_size_for_query} + +Type: UInt64 + +Default value: 0 + +The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running queries. Zero means unlimited. + +## max_temporary_data_on_disk_size_for_user {#max_temporary_data_on_disk_size_for_user} + +Type: UInt64 + +Default value: 0 + +The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running user queries. Zero means unlimited. + +## max_temporary_non_const_columns {#max_temporary_non_const_columns} + +Type: UInt64 + +Default value: 0 + +Similar to the 'max_temporary_columns' setting but applies only to non-constant columns. This makes sense because constant columns are cheap and it is reasonable to allow more of them. + +## max_threads {#max_threads} + +Type: MaxThreads + +Default value: 'auto(16)' + +The maximum number of query processing threads, excluding threads for retrieving data from remote servers (see the ‘max_distributed_connections’ parameter). + +This parameter applies to threads that perform the same stages of the query processing pipeline in parallel. +For example, when reading from a table, if it is possible to evaluate expressions with functions, filter with WHERE and pre-aggregate for GROUP BY in parallel using at least ‘max_threads’ number of threads, then ‘max_threads’ are used. + +For queries that are completed quickly because of a LIMIT, you can set a lower ‘max_threads’. For example, if the necessary number of entries are located in every block and max_threads = 8, then 8 blocks are retrieved, although it would have been enough to read just one. + +The smaller the `max_threads` value, the less memory is consumed. + +## max_threads_for_indexes {#max_threads_for_indexes} + +Type: UInt64 + +Default value: 0 + +The maximum number of threads used to process indices.
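+
+As a rough illustration of the `max_threads` behaviour described above (the thread counts are arbitrary), the same aggregation can be pinned to one thread or allowed to use eight:
+
+```sql
+-- Single-threaded pipeline: lowest memory usage, longest wall-clock time.
+SELECT sum(number) FROM numbers_mt(1000000000) SETTINGS max_threads = 1;
+
+-- Up to 8 threads share the reading, filtering and pre-aggregation stages.
+SELECT sum(number) FROM numbers_mt(1000000000) SETTINGS max_threads = 8;
+```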
+ +## max_untracked_memory {#max_untracked_memory} + +Type: UInt64 + +Default value: 4194304 + +Small allocations and deallocations are grouped in thread local variable and tracked or profiled only when an amount (in absolute value) becomes larger than the specified value. If the value is higher than 'memory_profiler_step' it will be effectively lowered to 'memory_profiler_step'. + +## memory_overcommit_ratio_denominator {#memory_overcommit_ratio_denominator} + +Type: UInt64 + +Default value: 1073741824 + +It represents the soft memory limit when the hard limit is reached on the global level. +This value is used to compute the overcommit ratio for the query. +Zero means skip the query. +Read more about [memory overcommit](memory-overcommit.md). + +## memory_overcommit_ratio_denominator_for_user {#memory_overcommit_ratio_denominator_for_user} + +Type: UInt64 + +Default value: 1073741824 + +It represents the soft memory limit when the hard limit is reached on the user level. +This value is used to compute the overcommit ratio for the query. +Zero means skip the query. +Read more about [memory overcommit](memory-overcommit.md). + +## memory_profiler_sample_max_allocation_size {#memory_profiler_sample_max_allocation_size} + +Type: UInt64 + +Default value: 0 + +Collect random allocations of size less or equal than the specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected. + +## memory_profiler_sample_min_allocation_size {#memory_profiler_sample_min_allocation_size} + +Type: UInt64 + +Default value: 0 + +Collect random allocations of size greater or equal than the specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected. + +## memory_profiler_sample_probability {#memory_profiler_sample_probability} + +Type: Float + +Default value: 0 + +Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless of the size of the allocation (can be changed with `memory_profiler_sample_min_allocation_size` and `memory_profiler_sample_max_allocation_size`). Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine-grained sampling. + +## memory_profiler_step {#memory_profiler_step} + +Type: UInt64 + +Default value: 4194304 + +Sets the step of memory profiler. Whenever query memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stacktrace and will write it into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log). + +Possible values: + +- A positive integer number of bytes. + +- 0 for turning off the memory profiler. + +## memory_tracker_fault_probability {#memory_tracker_fault_probability} + +Type: Float + +Default value: 0 + +For testing of `exception safety` - throw an exception every time you allocate memory with the specified probability. + +## memory_usage_overcommit_max_wait_microseconds {#memory_usage_overcommit_max_wait_microseconds} + +Type: UInt64 + +Default value: 5000000 + +Maximum time thread will wait for memory to be freed in the case of memory overcommit on a user level. +If the timeout is reached and memory is not freed, an exception is thrown. 
+Read more about [memory overcommit](memory-overcommit.md). + +## merge_tree_coarse_index_granularity {#merge_tree_coarse_index_granularity} + +Type: UInt64 + +Default value: 8 + +When searching for data, ClickHouse checks the data marks in the index file. If ClickHouse finds that required keys are in some range, it divides this range into `merge_tree_coarse_index_granularity` subranges and searches the required keys there recursively. + +Possible values: + +- Any positive even integer. + +## merge_tree_compact_parts_min_granules_to_multibuffer_read {#merge_tree_compact_parts_min_granules_to_multibuffer_read} + +Type: UInt64 + +Default value: 16 + +Only available in ClickHouse Cloud. Number of granules in stripe of compact part of MergeTree tables to use multibuffer reader, which supports parallel reading and prefetch. When reading from a remote filesystem, using the multibuffer reader increases the number of read requests. + +## merge_tree_determine_task_size_by_prewhere_columns {#merge_tree_determine_task_size_by_prewhere_columns} + +Type: Bool + +Default value: 1 + +Whether to use only prewhere columns size to determine reading task size. + +## merge_tree_max_bytes_to_use_cache {#merge_tree_max_bytes_to_use_cache} + +Type: UInt64 + +Default value: 2013265920 + +If ClickHouse should read more than `merge_tree_max_bytes_to_use_cache` bytes in one query, it does not use the cache of uncompressed blocks. + +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from thrashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. + +Possible values: + +- Any positive integer. + +## merge_tree_max_rows_to_use_cache {#merge_tree_max_rows_to_use_cache} + +Type: UInt64 + +Default value: 1048576 + +If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it does not use the cache of uncompressed blocks. + +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from thrashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. + +Possible values: + +- Any positive integer. + +## merge_tree_min_bytes_for_concurrent_read {#merge_tree_min_bytes_for_concurrent_read} + +Type: UInt64 + +Default value: 251658240 + +If the number of bytes to read from one file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine table exceeds `merge_tree_min_bytes_for_concurrent_read`, then ClickHouse tries to concurrently read from this file in several threads. + +Possible values: + +- Positive integer. + +## merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem {#merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem} + +Type: UInt64 + +Default value: 251658240 + +The minimum number of bytes to read from one file before the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from a remote filesystem. + +Possible values: + +- Positive integer.
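+
+A minimal sketch of overriding these thresholds for a single query; `events` is a hypothetical MergeTree table and the 64 MiB value is only illustrative:
+
+```sql
+-- Lower the per-file thresholds so reads are parallelized for smaller files as well,
+-- both on local disks and on remote filesystems.
+SELECT count()
+FROM events
+SETTINGS
+    merge_tree_min_bytes_for_concurrent_read = 67108864,
+    merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem = 67108864;
+```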
+ +## merge_tree_min_bytes_for_seek {#merge_tree_min_bytes_for_seek} + +Type: UInt64 + +Default value: 0 + +If the distance between two data blocks to be read in one file is less than `merge_tree_min_bytes_for_seek` bytes, then ClickHouse sequentially reads a range of file that contains both blocks, thus avoiding extra seek. + +Possible values: + +- Any positive integer. + +## merge_tree_min_bytes_per_task_for_remote_reading {#merge_tree_min_bytes_per_task_for_remote_reading} + +Type: UInt64 + +Default value: 2097152 + +Min bytes to read per task. + +## merge_tree_min_rows_for_concurrent_read {#merge_tree_min_rows_for_concurrent_read} + +Type: UInt64 + +Default value: 163840 + +If the number of rows to be read from a file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `merge_tree_min_rows_for_concurrent_read` then ClickHouse tries to perform a concurrent reading from this file on several threads. + +Possible values: + +- Positive integer. + +## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge_tree_min_rows_for_concurrent_read_for_remote_filesystem} + +Type: UInt64 + +Default value: 163840 + +The minimum number of lines to read from one file before the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. + +## merge_tree_min_rows_for_seek {#merge_tree_min_rows_for_seek} + +Type: UInt64 + +Default value: 0 + +If the distance between two data blocks to be read in one file is less than `merge_tree_min_rows_for_seek` rows, then ClickHouse does not seek through the file but reads the data sequentially. + +Possible values: + +- Any positive integer. + +## merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability {#merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability} + +Type: Float + +Default value: 0 + +For testing of `PartsSplitter` - split read ranges into intersecting and non intersecting every time you read from MergeTree with the specified probability. + +## merge_tree_use_const_size_tasks_for_remote_reading {#merge_tree_use_const_size_tasks_for_remote_reading} + +Type: Bool + +Default value: 1 + +Whether to use constant size tasks for reading from a remote table. + +## metrics_perf_events_enabled {#metrics_perf_events_enabled} + +Type: Bool + +Default value: 0 + +If enabled, some of the perf events will be measured throughout queries' execution. + +## metrics_perf_events_list {#metrics_perf_events_list} + +Type: String + +Default value: + +Comma separated list of perf metrics that will be measured throughout queries' execution. Empty means all events. See PerfEventInfo in sources for the available events. + +## min_bytes_to_use_direct_io {#min_bytes_to_use_direct_io} + +Type: UInt64 + +Default value: 0 + +The minimum data volume required for using direct I/O access to the storage disk. + +ClickHouse uses this setting when reading data from tables. If the total storage volume of all the data to be read exceeds `min_bytes_to_use_direct_io` bytes, then ClickHouse reads the data from the storage disk with the `O_DIRECT` option. + +Possible values: + +- 0 — Direct I/O is disabled. +- Positive integer. + +## min_bytes_to_use_mmap_io {#min_bytes_to_use_mmap_io} + +Type: UInt64 + +Default value: 0 + +This is an experimental setting. Sets the minimum amount of memory for reading large files without copying data from the kernel to userspace. 
Recommended threshold is about 64 MB, because [mmap/munmap](https://en.wikipedia.org/wiki/Mmap) is slow. It makes sense only for large files and helps only if data reside in the page cache. + +Possible values: + +- Positive integer. +- 0 — Big files read with only copying data from kernel to userspace. + +## min_chunk_bytes_for_parallel_parsing {#min_chunk_bytes_for_parallel_parsing} + +Type: UInt64 + +Default value: 10485760 + +- Type: unsigned int +- Default value: 1 MiB + +The minimum chunk size in bytes, which each thread will parse in parallel. + +## min_compress_block_size {#min_compress_block_size} + +Type: UInt64 + +Default value: 65536 + +For [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. In order to reduce latency when processing queries, a block is compressed when writing the next mark if its size is at least `min_compress_block_size`. By default, 65,536. + +The actual size of the block, if the uncompressed data is less than `max_compress_block_size`, is no less than this value and no less than the volume of data for one mark. + +Let’s look at an example. Assume that `index_granularity` was set to 8192 during table creation. + +We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, the total will be 32 KB of data. Since min_compress_block_size = 65,536, a compressed block will be formed for every two marks. + +We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won’t be decompressed. + +:::note +This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +::: ## min_count_to_compile_aggregate_expression {#min_count_to_compile_aggregate_expression} +Type: UInt64 + +Default value: 3 + The minimum number of identical aggregate expressions to start JIT-compilation. Works only if the [compile_aggregate_expressions](#compile_aggregate_expressions) setting is enabled. Possible values: @@ -1676,811 +6173,463 @@ Possible values: - Positive integer. - 0 — Identical aggregate expressions are always JIT-compiled. -Default value: `3`. +## min_count_to_compile_expression {#min_count_to_compile_expression} -## use_query_cache {#use-query-cache} +Type: UInt64 -If turned on, `SELECT` queries may utilize the [query cache](../query-cache.md). Parameters [enable_reads_from_query_cache](#enable-reads-from-query-cache) -and [enable_writes_to_query_cache](#enable-writes-to-query-cache) control in more detail how the cache is used. +Default value: 3 -Possible values: +Minimum count of executing same expression before it is get compiled. -- 0 - Disabled -- 1 - Enabled +## min_count_to_compile_sort_description {#min_count_to_compile_sort_description} -Default value: `0`. +Type: UInt64 -## enable_reads_from_query_cache {#enable-reads-from-query-cache} +Default value: 3 -If turned on, results of `SELECT` queries are retrieved from the [query cache](../query-cache.md). +The number of identical sort descriptions before they are JIT-compiled -Possible values: +## min_execution_speed {#min_execution_speed} -- 0 - Disabled -- 1 - Enabled +Type: UInt64 -Default value: `1`. +Default value: 0 -## enable_writes_to_query_cache {#enable-writes-to-query-cache} +Minimum number of execution rows per second. 
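+
+A sketch of how this floor is typically used together with `timeout_before_checking_execution_speed` (both values below are arbitrary); the check kicks in only after the grace period has elapsed:
+
+```sql
+-- Abort the query if, after 10 seconds, it is processing fewer than 1,000,000 rows per second.
+SELECT count()
+FROM numbers_mt(100000000000)
+SETTINGS min_execution_speed = 1000000, timeout_before_checking_execution_speed = 10;
+```
+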
-If turned on, results of `SELECT` queries are stored in the [query cache](../query-cache.md). +## min_execution_speed_bytes {#min_execution_speed_bytes} -Possible values: +Type: UInt64 -- 0 - Disabled -- 1 - Enabled +Default value: 0 -Default value: `1`. +Minimum number of execution bytes per second. -## query_cache_nondeterministic_function_handling {#query-cache-nondeterministic-function-handling} +## min_external_table_block_size_bytes {#min_external_table_block_size_bytes} -Controls how the [query cache](../query-cache.md) handles `SELECT` queries with non-deterministic functions like `rand()` or `now()`. +Type: UInt64 -Possible values: +Default value: 268402944 -- `'throw'` - Throw an exception and don't cache the query result. -- `'save'` - Cache the query result. -- `'ignore'` - Don't cache the query result and don't throw an exception. +Squash blocks passed to the external table to a specified size in bytes, if blocks are not big enough. -Default value: `throw`. +## min_external_table_block_size_rows {#min_external_table_block_size_rows} -## query_cache_system_table_handling {#query-cache-system-table-handling} +Type: UInt64 -Controls how the [query cache](../query-cache.md) handles `SELECT` queries against system tables, i.e. tables in databases `system.*` and `information_schema.*`. +Default value: 1048449 -Possible values: +Squash blocks passed to external table to specified size in rows, if blocks are not big enough. -- `'throw'` - Throw an exception and don't cache the query result. -- `'save'` - Cache the query result. -- `'ignore'` - Don't cache the query result and don't throw an exception. +## min_free_disk_bytes_to_perform_insert {#min_free_disk_bytes_to_perform_insert} -Default value: `throw`. +Type: UInt64 -## query_cache_min_query_runs {#query-cache-min-query-runs} +Default value: 0 -Minimum number of times a `SELECT` query must run before its result is stored in the [query cache](../query-cache.md). +Minimum free disk space bytes to perform an insert. -Possible values: +## min_free_disk_ratio_to_perform_insert {#min_free_disk_ratio_to_perform_insert} -- Positive integer >= 0. +Type: Float -Default value: `0` +Default value: 0 -## query_cache_min_query_duration {#query-cache-min-query-duration} +Minimum free disk space ratio to perform an insert. -Minimum duration in milliseconds a query needs to run for its result to be stored in the [query cache](../query-cache.md). +## min_free_disk_space_for_temporary_data {#min_free_disk_space_for_temporary_data} -Possible values: +Type: UInt64 -- Positive integer >= 0. +Default value: 0 -Default value: `0` +The minimum disk space to keep while writing temporary data used in external sorting and aggregation. -## query_cache_compress_entries {#query-cache-compress-entries} +## min_hit_rate_to_use_consecutive_keys_optimization {#min_hit_rate_to_use_consecutive_keys_optimization} -Compress entries in the [query cache](../query-cache.md). Lessens the memory consumption of the query cache at the cost of slower inserts into / reads from it. +Type: Float -Possible values: +Default value: 0.5 -- 0 - Disabled -- 1 - Enabled +Minimal hit rate of a cache which is used for consecutive keys optimization in aggregation to keep it enabled -Default value: `1` +## min_insert_block_size_bytes {#min_insert_block_size_bytes} -## query_cache_squash_partial_results {#query-cache-squash-partial-results} +Type: UInt64 -Squash partial result blocks to blocks of size [max_block_size](#setting-max_block_size). 
Reduces performance of inserts into the [query cache](../query-cache.md) but improves the compressability of cache entries (see [query_cache_compress-entries](#query-cache-compress-entries)). +Default value: 268402944 -Possible values: - -- 0 - Disabled -- 1 - Enabled - -Default value: `1` - -## query_cache_ttl {#query-cache-ttl} - -After this time in seconds entries in the [query cache](../query-cache.md) become stale. - -Possible values: - -- Positive integer >= 0. - -Default value: `60` - -## query_cache_share_between_users {#query-cache-share-between-users} - -If turned on, the result of `SELECT` queries cached in the [query cache](../query-cache.md) can be read by other users. -It is not recommended to enable this setting due to security reasons. - -Possible values: - -- 0 - Disabled -- 1 - Enabled - -Default value: `0`. - -## query_cache_tag {#query-cache-tag} - -A string which acts as a label for [query cache](../query-cache.md) entries. -The same queries with different tags are considered different by the query cache. - -Possible values: - -- Any string - -Default value: `''` - -## query_cache_max_size_in_bytes {#query-cache-max-size-in-bytes} - -The maximum amount of memory (in bytes) the current user may allocate in the [query cache](../query-cache.md). 0 means unlimited. - -Possible values: - -- Positive integer >= 0. - -Default value: 0 (no restriction). - -## query_cache_max_entries {#query-cache-max-entries} - -The maximum number of query results the current user may store in the [query cache](../query-cache.md). 0 means unlimited. - -Possible values: - -- Positive integer >= 0. - -Default value: 0 (no restriction). - -## insert_quorum {#insert_quorum} - -:::note -This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information. -::: - -Enables the quorum writes. - -- If `insert_quorum < 2`, the quorum writes are disabled. -- If `insert_quorum >= 2`, the quorum writes are enabled. -- If `insert_quorum = 'auto'`, use majority number (`number_of_replicas / 2 + 1`) as quorum number. - -Default value: 0 - disabled. - -Quorum writes - -`INSERT` succeeds only when ClickHouse manages to correctly write data to the `insert_quorum` of replicas during the `insert_quorum_timeout`. If for any reason the number of replicas with successful writes does not reach the `insert_quorum`, the write is considered failed and ClickHouse will delete the inserted block from all the replicas where data has already been written. - -When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#select_sequential_consistency). - -ClickHouse generates an exception: - -- If the number of available replicas at the time of the query is less than the `insert_quorum`. -- When `insert_quorum_parallel` is disabled and an attempt to write data is made when the previous block has not yet been inserted in `insert_quorum` of replicas. This situation may occur if the user tries to perform another `INSERT` query to the same table before the previous one with `insert_quorum` is completed. 
- -See also: - -- [insert_quorum_timeout](#insert_quorum_timeout) -- [insert_quorum_parallel](#insert_quorum_parallel) -- [select_sequential_consistency](#select_sequential_consistency) - -## insert_quorum_timeout {#insert_quorum_timeout} - -Write to a quorum timeout in milliseconds. If the timeout has passed and no write has taken place yet, ClickHouse will generate an exception and the client must repeat the query to write the same block to the same or any other replica. - -Default value: 600 000 milliseconds (ten minutes). - -See also: - -- [insert_quorum](#insert_quorum) -- [insert_quorum_parallel](#insert_quorum_parallel) -- [select_sequential_consistency](#select_sequential_consistency) - -## insert_quorum_parallel {#insert_quorum_parallel} - -:::note -This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information. -::: - -Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 1. - -See also: - -- [insert_quorum](#insert_quorum) -- [insert_quorum_timeout](#insert_quorum_timeout) -- [select_sequential_consistency](#select_sequential_consistency) - -## select_sequential_consistency {#select_sequential_consistency} - -:::note -This setting differ in behavior between SharedMergeTree and ReplicatedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information about the behavior of `select_sequential_consistency` in SharedMergeTree. -::: - -Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default). - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 0. - -Usage - -When sequential consistency is enabled, ClickHouse allows the client to execute the `SELECT` query only for those replicas that contain data from all previous `INSERT` queries executed with `insert_quorum`. If the client refers to a partial replica, ClickHouse will generate an exception. The SELECT query will not include data that has not yet been written to the quorum of replicas. - -When `insert_quorum_parallel` is enabled (the default), then `select_sequential_consistency` does not work. This is because parallel `INSERT` queries can be written to different sets of quorum replicas so there is no guarantee a single replica will have received all writes. - -See also: - -- [insert_quorum](#insert_quorum) -- [insert_quorum_timeout](#insert_quorum_timeout) -- [insert_quorum_parallel](#insert_quorum_parallel) - -## insert_deduplicate {#insert-deduplicate} - -Enables or disables block deduplication of `INSERT` (for Replicated\* tables). - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 1. - -By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). -For the replicated tables by default the only 100 of the most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)). 
-For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window). - -## Asynchronous Insert settings - -### async_insert {#async-insert} - -Enables or disables asynchronous inserts. Note that deduplication is disabled by default, see [async_insert_deduplicate](#async-insert-deduplicate). - -If enabled, the data is combined into batches before the insertion into tables, so it is possible to do small and frequent insertions into ClickHouse (up to 15000 queries per second) without buffer tables. - -The data is inserted either after the [async_insert_max_data_size](#async-insert-max-data-size) is exceeded or after [async_insert_busy_timeout_ms](#async-insert-busy-timeout-ms) milliseconds since the first `INSERT` query. If the [async_insert_stale_timeout_ms](#async-insert-stale-timeout-ms) is set to a non-zero value, the data is inserted after `async_insert_stale_timeout_ms` milliseconds since the last query. Also the buffer will be flushed to disk if at least [async_insert_max_query_number](#async-insert-max-query-number) async insert queries per block were received. This last setting takes effect only if [async_insert_deduplicate](#async-insert-deduplicate) is enabled. - -If [wait_for_async_insert](#wait-for-async-insert) is enabled, every client will wait for the data to be processed and flushed to the table. Otherwise, the query would be processed almost instantly, even if the data is not inserted. - -Possible values: - -- 0 — Insertions are made synchronously, one after another. -- 1 — Multiple asynchronous insertions enabled. - -Default value: `0`. - -### async_insert_threads {#async-insert-threads} - -The maximum number of threads for background data parsing and insertion. +Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. Possible values: - Positive integer. -- 0 — Asynchronous insertions are disabled. +- 0 — Squashing disabled. -Default value: `16`. +## min_insert_block_size_bytes_for_materialized_views {#min_insert_block_size_bytes_for_materialized_views} -### wait_for_async_insert {#wait-for-async-insert} +Type: UInt64 -Enables or disables waiting for processing of asynchronous insertion. If enabled, server will return `OK` only after the data is inserted. Otherwise, it will return `OK` as soon it has received the data, but it might still fail to parse or insert it later (You can check in system.asynchronous_insert_log) +Default value: 0 -If you want to use asynchronous inserts, we need to also enable [`async_insert`](#async-insert). +Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. Possible values: -- 0 — Server returns `OK` even if the data is not yet inserted. -- 1 — Server returns `OK` only after the data is inserted. +- Any positive integer. +- 0 — Squashing disabled. -Default value: `1`. +**See also** -### wait_for_async_insert_timeout {#wait-for-async-insert-timeout} +- [min_insert_block_size_bytes](#min-insert-block-size-bytes) -The timeout in seconds for waiting for processing of asynchronous insertion. 
+## min_insert_block_size_rows {#min_insert_block_size_rows} + +Type: UInt64 + +Default value: 1048449 + +Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. Possible values: - Positive integer. -- 0 — Disabled. +- 0 — Squashing disabled. -Default value: [lock_acquire_timeout](#lock_acquire_timeout). +## min_insert_block_size_rows_for_materialized_views {#min_insert_block_size_rows_for_materialized_views} -### async_insert_max_data_size {#async-insert-max-data-size} +Type: UInt64 -The maximum size of the unparsed data in bytes collected per query before being inserted. +Default value: 0 + +Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. Possible values: +- Any positive integer. +- 0 — Squashing disabled. + +**See Also** + +- [min_insert_block_size_rows](#min-insert-block-size-rows) + +## mongodb_throw_on_unsupported_query {#mongodb_throw_on_unsupported_query} + +Type: Bool + +Default value: 1 + +If enabled, MongoDB tables will return an error when a MongoDB query cannot be built. Otherwise, ClickHouse reads the full table and processes it locally. This option does not apply to the legacy implementation or when 'allow_experimental_analyzer=0'. + +## move_all_conditions_to_prewhere {#move_all_conditions_to_prewhere} + +Type: Bool + +Default value: 1 + +Move all viable conditions from WHERE to PREWHERE + +## move_primary_key_columns_to_end_of_prewhere {#move_primary_key_columns_to_end_of_prewhere} + +Type: Bool + +Default value: 1 + +Move PREWHERE conditions containing primary key columns to the end of AND chain. It is likely that these conditions are taken into account during primary key analysis and thus will not contribute a lot to PREWHERE filtering. + +## multiple_joins_try_to_keep_original_names {#multiple_joins_try_to_keep_original_names} + +Type: Bool + +Default value: 0 + +Do not add aliases to top level expression list on multiple joins rewrite + +## mutations_execute_nondeterministic_on_initiator {#mutations_execute_nondeterministic_on_initiator} + +Type: Bool + +Default value: 0 + +If true constant nondeterministic functions (e.g. function `now()`) are executed on initiator and replaced to literals in `UPDATE` and `DELETE` queries. It helps to keep data in sync on replicas while executing mutations with constant nondeterministic functions. Default value: `false`. + +## mutations_execute_subqueries_on_initiator {#mutations_execute_subqueries_on_initiator} + +Type: Bool + +Default value: 0 + +If true scalar subqueries are executed on initiator and replaced to literals in `UPDATE` and `DELETE` queries. Default value: `false`. + +## mutations_max_literal_size_to_replace {#mutations_max_literal_size_to_replace} + +Type: UInt64 + +Default value: 16384 + +The maximum size of serialized literal in bytes to replace in `UPDATE` and `DELETE` queries. Takes effect only if at least one the two settings above is enabled. Default value: 16384 (16 KiB). + +## mutations_sync {#mutations_sync} + +Type: UInt64 + +Default value: 0 + +Allows to execute `ALTER TABLE ... 
UPDATE|DELETE|MATERIALIZE INDEX|MATERIALIZE PROJECTION|MATERIALIZE COLUMN` queries ([mutations](../../sql-reference/statements/alter/index.md#mutations)) synchronously. + +Possible values: + +- 0 - Mutations execute asynchronously. +- 1 - The query waits for all mutations to complete on the current server. +- 2 - The query waits for all mutations to complete on all replicas (if they exist). + +## mysql_datatypes_support_level {#mysql_datatypes_support_level} + +Type: MySQLDataTypesSupport + +Default value: + +Defines how MySQL types are converted to corresponding ClickHouse types. A comma separated list in any combination of `decimal`, `datetime64`, `date2Date32` or `date2String`. +- `decimal`: convert `NUMERIC` and `DECIMAL` types to `Decimal` when precision allows it. +- `datetime64`: convert `DATETIME` and `TIMESTAMP` types to `DateTime64` instead of `DateTime` when precision is not `0`. +- `date2Date32`: convert `DATE` to `Date32` instead of `Date`. Takes precedence over `date2String`. +- `date2String`: convert `DATE` to `String` instead of `Date`. Overridden by `datetime64`. + +## mysql_map_fixed_string_to_text_in_show_columns {#mysql_map_fixed_string_to_text_in_show_columns} + +Type: Bool + +Default value: 1 + +When enabled, [FixedString](../../sql-reference/data-types/fixedstring.md) ClickHouse data type will be displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). + +Has an effect only when the connection is made through the MySQL wire protocol. + +- 0 - Use `BLOB`. +- 1 - Use `TEXT`. + +## mysql_map_string_to_text_in_show_columns {#mysql_map_string_to_text_in_show_columns} + +Type: Bool + +Default value: 1 + +When enabled, [String](../../sql-reference/data-types/string.md) ClickHouse data type will be displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). + +Has an effect only when the connection is made through the MySQL wire protocol. + +- 0 - Use `BLOB`. +- 1 - Use `TEXT`. + +## mysql_max_rows_to_insert {#mysql_max_rows_to_insert} + +Type: UInt64 + +Default value: 65536 + +The maximum number of rows in MySQL batch insertion of the MySQL storage engine + +## network_compression_method {#network_compression_method} + +Type: String + +Default value: LZ4 + +Sets the method of data compression that is used for communication between servers and between server and [clickhouse-client](../../interfaces/cli.md). + +Possible values: + +- `LZ4` — sets LZ4 compression method. +- `ZSTD` — sets ZSTD compression method. + +**See Also** + +- [network_zstd_compression_level](#network_zstd_compression_level) + +## network_zstd_compression_level {#network_zstd_compression_level} + +Type: Int64 + +Default value: 1 + +Adjusts the level of ZSTD compression. Used only when [network_compression_method](#network_compression_method) is set to `ZSTD`. + +Possible values: + +- Positive integer from 1 to 15. + +## normalize_function_names {#normalize_function_names} + +Type: Bool + +Default value: 1 + +Normalize function names to their canonical names + +## number_of_mutations_to_delay {#number_of_mutations_to_delay} + +Type: UInt64 + +Default value: 0 + +If the mutated table contains at least that many unfinished mutations, artificially slow down mutations of table. 0 - disabled + +## number_of_mutations_to_throw {#number_of_mutations_to_throw} + +Type: UInt64 + +Default value: 0 + +If the mutated table contains at least that many unfinished mutations, throw 'Too many mutations ...' exception. 
0 - disabled + +## odbc_bridge_connection_pool_size {#odbc_bridge_connection_pool_size} + +Type: UInt64 + +Default value: 16 + +Connection pool size for each connection settings string in ODBC bridge. + +## odbc_bridge_use_connection_pooling {#odbc_bridge_use_connection_pooling} + +Type: Bool + +Default value: 1 + +Use connection pooling in ODBC bridge. If set to false, a new connection is created every time. + +## offset {#offset} + +Type: UInt64 + +Default value: 0 + +Sets the number of rows to skip before starting to return rows from the query. It adjusts the offset set by the [OFFSET](../../sql-reference/statements/select/offset.md/#offset-fetch) clause, so that these two values are summarized. + +Possible values: + +- 0 — No rows are skipped . - Positive integer. -- 0 — Asynchronous insertions are disabled. - -Default value: `10485760`. - -### async_insert_max_query_number {#async-insert-max-query-number} - -The maximum number of insert queries per block before being inserted. This setting takes effect only if [async_insert_deduplicate](#async-insert-deduplicate) is enabled. - -Possible values: - -- Positive integer. -- 0 — Asynchronous insertions are disabled. - -Default value: `450`. - -### async_insert_busy_timeout_max_ms {#async-insert-busy-timeout-max-ms} - -The maximum timeout in milliseconds since the first `INSERT` query before inserting collected data. - -Possible values: - -- Positive integer. -- 0 — Timeout disabled. - -Default value: `200`. - -Cloud default value: `1000`. - -### async_insert_poll_timeout_ms {#async-insert-poll-timeout-ms} - -Timeout in milliseconds for polling data from asynchronous insert queue. - -Possible values: - -- Positive integer. - -Default value: `10`. - -### async_insert_use_adaptive_busy_timeout {#allow-experimental-async-insert-adaptive-busy-timeout} - -Use adaptive asynchronous insert timeout. - -Possible values: - -- 0 - Disabled. -- 1 - Enabled. - -Default value: `1`. - -### async_insert_busy_timeout_min_ms {#async-insert-busy-timeout-min-ms} - -If adaptive asynchronous insert timeout is allowed through [async_insert_use_adaptive_busy_timeout](#allow-experimental-async-insert-adaptive-busy-timeout), the setting specifies the minimum value of the asynchronous insert timeout in milliseconds. It also serves as the initial value, which may be increased later by the adaptive algorithm, up to the [async_insert_busy_timeout_ms](#async_insert_busy_timeout_ms). - -Possible values: - -- Positive integer. - -Default value: `50`. - -### async_insert_busy_timeout_ms {#async-insert-busy-timeout-ms} - -Alias for [`async_insert_busy_timeout_max_ms`](#async_insert_busy_timeout_max_ms). - -### async_insert_busy_timeout_increase_rate {#async-insert-busy-timeout-increase-rate} - -If adaptive asynchronous insert timeout is allowed through [async_insert_use_adaptive_busy_timeout](#allow-experimental-async-insert-adaptive-busy-timeout), the setting specifies the exponential growth rate at which the adaptive asynchronous insert timeout increases. - -Possible values: - -- A positive floating-point number. - -Default value: `0.2`. - -### async_insert_busy_timeout_decrease_rate {#async-insert-busy-timeout-decrease-rate} - -If adaptive asynchronous insert timeout is allowed through [async_insert_use_adaptive_busy_timeout](#allow-experimental-async-insert-adaptive-busy-timeout), the setting specifies the exponential growth rate at which the adaptive asynchronous insert timeout decreases. - -Possible values: - -- A positive floating-point number. 
- -Default value: `0.2`. - -### async_insert_stale_timeout_ms {#async-insert-stale-timeout-ms} - -The maximum timeout in milliseconds since the last `INSERT` query before dumping collected data. If enabled, the settings prolongs the [async_insert_busy_timeout_ms](#async-insert-busy-timeout-ms) with every `INSERT` query as long as [async_insert_max_data_size](#async-insert-max-data-size) is not exceeded. - -Possible values: - -- Positive integer. -- 0 — Timeout disabled. - -Default value: `0`. - -### async_insert_deduplicate {#async-insert-deduplicate} - -Enables or disables insert deduplication of `ASYNC INSERT` (for Replicated\* tables). - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 0. - -By default, async inserts are inserted into replicated tables by the `INSERT` statement enabling [async_insert](#async-insert) are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). -For the replicated tables, by default, only 10000 of the most recent inserts for each partition are deduplicated (see [replicated_deduplication_window_for_async_inserts](merge-tree-settings.md/#replicated-deduplication-window-async-inserts), [replicated_deduplication_window_seconds_for_async_inserts](merge-tree-settings.md/#replicated-deduplication-window-seconds-async-inserts)). -We recommend enabling the [async_block_ids_cache](merge-tree-settings.md/#use-async-block-ids-cache) to increase the efficiency of deduplication. -This function does not work for non-replicated tables. - -## deduplicate_blocks_in_dependent_materialized_views {#deduplicate-blocks-in-dependent-materialized-views} - -Enables or disables the deduplication check for materialized views that receive data from Replicated\* tables. - -Possible values: - - 0 — Disabled. - 1 — Enabled. - -Default value: 0. - -Usage - -By default, deduplication is not performed for materialized views but is done upstream, in the source table. -If an INSERTed block is skipped due to deduplication in the source table, there will be no insertion into attached materialized views. This behaviour exists to enable the insertion of highly aggregated data into materialized views, for cases where inserted blocks are the same after materialized view aggregation but derived from different INSERTs into the source table. -At the same time, this behaviour “breaks” `INSERT` idempotency. If an `INSERT` into the main table was successful and `INSERT` into a materialized view failed (e.g. because of communication failure with ClickHouse Keeper) a client will get an error and can retry the operation. However, the materialized view won’t receive the second insert because it will be discarded by deduplication in the main (source) table. The setting `deduplicate_blocks_in_dependent_materialized_views` allows for changing this behaviour. On retry, a materialized view will receive the repeat insert and will perform a deduplication check by itself, -ignoring check result for the source table, and will insert rows lost because of the first failure. - -## insert_deduplication_token {#insert_deduplication_token} - -The setting allows a user to provide own deduplication semantic in MergeTree/ReplicatedMergeTree -For example, by providing a unique value for the setting in each INSERT statement, -user can avoid the same inserted data being deduplicated. - -Possible values: - -- Any string - -Default value: empty string (disabled) - -`insert_deduplication_token` is used for deduplication _only_ when not empty. 
- -For the replicated tables by default the only 100 of the most recent inserts for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)). -For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window). - -:::note -`insert_deduplication_token` works on a partition level (the same as `insert_deduplication` checksum). Multiple partitions can have the same `insert_deduplication_token`. -::: - -Example: - -```sql -CREATE TABLE test_table -( A Int64 ) -ENGINE = MergeTree -ORDER BY A -SETTINGS non_replicated_deduplication_window = 100; - -INSERT INTO test_table SETTINGS insert_deduplication_token = 'test' VALUES (1); - --- the next insert won't be deduplicated because insert_deduplication_token is different -INSERT INTO test_table SETTINGS insert_deduplication_token = 'test1' VALUES (1); - --- the next insert will be deduplicated because insert_deduplication_token --- is the same as one of the previous -INSERT INTO test_table SETTINGS insert_deduplication_token = 'test' VALUES (2); - -SELECT * FROM test_table - -┌─A─┐ -│ 1 │ -└───┘ -┌─A─┐ -│ 1 │ -└───┘ -``` - -## update_insert_deduplication_token_in_dependent_materialized_views {#update-insert-deduplication-token-in-dependent-materialized-views} - -Allows to update `insert_deduplication_token` with view identifier during insert in dependent materialized views, if setting `deduplicate_blocks_in_dependent_materialized_views` is enabled and `insert_deduplication_token` is set. - -Possible values: - - 0 — Disabled. - 1 — Enabled. - -Default value: 0. - -Usage: - -If setting `deduplicate_blocks_in_dependent_materialized_views` is enabled, `insert_deduplication_token` is passed to dependent materialized views. But in complex INSERT flows it is possible that we want to avoid deduplication for dependent materialized views. - -Example: -``` -landing -┬--> mv_1_1 ---> ds_1_1 ---> mv_2_1 --┬-> ds_2_1 ---> mv_3_1 ---> ds_3_1 - | | - └--> mv_1_2 ---> ds_1_2 ---> mv_2_2 --┘ -``` - -In this example we want to avoid deduplication for two different blocks generated from `mv_2_1` and `mv_2_2` that will be inserted into `ds_2_1`. Without `update_insert_deduplication_token_in_dependent_materialized_views` setting enabled, those two different blocks will be deduplicated, because different blocks from `mv_2_1` and `mv_2_2` will have the same `insert_deduplication_token`. - -If setting `update_insert_deduplication_token_in_dependent_materialized_views` is enabled, during each insert into dependent materialized views `insert_deduplication_token` is updated with table identifier, so block from `mv_2_1` and block from `mv_2_2` will have different `insert_deduplication_token` and will not be deduplicated. - -## insert_keeper_max_retries - -The setting sets the maximum number of retries for ClickHouse Keeper (or ZooKeeper) requests during insert into replicated MergeTree. Only Keeper requests which failed due to network error, Keeper session timeout, or request timeout are considered for retries. - -Possible values: - -- Positive integer. -- 0 — Retries are disabled - -Default value: 20 - -Cloud default value: `20`. - -Keeper request retries are done after some timeout. The timeout is controlled by the following settings: `insert_keeper_retry_initial_backoff_ms`, `insert_keeper_retry_max_backoff_ms`. 
-The first retry is done after `insert_keeper_retry_initial_backoff_ms` timeout. The consequent timeouts will be calculated as follows: -``` -timeout = min(insert_keeper_retry_max_backoff_ms, latest_timeout * 2) -``` - -For example, if `insert_keeper_retry_initial_backoff_ms=100`, `insert_keeper_retry_max_backoff_ms=10000` and `insert_keeper_max_retries=8` then timeouts will be `100, 200, 400, 800, 1600, 3200, 6400, 10000`. - -Apart from fault tolerance, the retries aim to provide a better user experience - they allow to avoid returning an error during INSERT execution if Keeper is restarted, for example, due to an upgrade. - -## insert_keeper_retry_initial_backoff_ms {#insert_keeper_retry_initial_backoff_ms} - -Initial timeout(in milliseconds) to retry a failed Keeper request during INSERT query execution - -Possible values: - -- Positive integer. -- 0 — No timeout - -Default value: 100 - -## insert_keeper_retry_max_backoff_ms {#insert_keeper_retry_max_backoff_ms} - -Maximum timeout (in milliseconds) to retry a failed Keeper request during INSERT query execution - -Possible values: - -- Positive integer. -- 0 — Maximum timeout is not limited - -Default value: 10000 - -## max_network_bytes {#max-network-bytes} - -Limits the data volume (in bytes) that is received or transmitted over the network when executing a query. This setting applies to every individual query. - -Possible values: - -- Positive integer. -- 0 — Data volume control is disabled. - -Default value: 0. - -## max_network_bandwidth {#max-network-bandwidth} - -Limits the speed of the data exchange over the network in bytes per second. This setting applies to every query. - -Possible values: - -- Positive integer. -- 0 — Bandwidth control is disabled. - -Default value: 0. - -## max_network_bandwidth_for_user {#max-network-bandwidth-for-user} - -Limits the speed of the data exchange over the network in bytes per second. This setting applies to all concurrently running queries performed by a single user. - -Possible values: - -- Positive integer. -- 0 — Control of the data speed is disabled. - -Default value: 0. - -## max_network_bandwidth_for_all_users {#max-network-bandwidth-for-all-users} - -Limits the speed that data is exchanged at over the network in bytes per second. This setting applies to all concurrently running queries on the server. - -Possible values: - -- Positive integer. -- 0 — Control of the data speed is disabled. - -Default value: 0. - -## count_distinct_implementation {#count_distinct_implementation} - -Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction. - -Possible values: - -- [uniq](../../sql-reference/aggregate-functions/reference/uniq.md/#agg_function-uniq) -- [uniqCombined](../../sql-reference/aggregate-functions/reference/uniqcombined.md/#agg_function-uniqcombined) -- [uniqCombined64](../../sql-reference/aggregate-functions/reference/uniqcombined64.md/#agg_function-uniqcombined64) -- [uniqHLL12](../../sql-reference/aggregate-functions/reference/uniqhll12.md/#agg_function-uniqhll12) -- [uniqExact](../../sql-reference/aggregate-functions/reference/uniqexact.md/#agg_function-uniqexact) - -Default value: `uniqExact`. - -## skip_unavailable_shards {#skip_unavailable_shards} - -Enables or disables silently skipping of unavailable shards. - -Shard is considered unavailable if all its replicas are unavailable. 
A replica is unavailable in the following cases: - -- ClickHouse can’t connect to replica for any reason. - - When connecting to a replica, ClickHouse performs several attempts. If all these attempts fail, the replica is considered unavailable. - -- Replica can’t be resolved through DNS. - - If replica’s hostname can’t be resolved through DNS, it can indicate the following situations: - - - Replica’s host has no DNS record. It can occur in systems with dynamic DNS, for example, [Kubernetes](https://kubernetes.io), where nodes can be unresolvable during downtime, and this is not an error. - - - Configuration error. ClickHouse configuration file contains a wrong hostname. - -Possible values: - -- 1 — skipping enabled. - - If a shard is unavailable, ClickHouse returns a result based on partial data and does not report node availability issues. - -- 0 — skipping disabled. - - If a shard is unavailable, ClickHouse throws an exception. - -Default value: 0. - -## distributed_group_by_no_merge {#distributed-group-by-no-merge} - -Do not merge aggregation states from different servers for distributed query processing, you can use this in case it is for certain that there are different keys on different shards - -Possible values: - -- `0` — Disabled (final query processing is done on the initiator node). -- `1` - Do not merge aggregation states from different servers for distributed query processing (query completely processed on the shard, initiator only proxy the data), can be used in case it is for certain that there are different keys on different shards. -- `2` - Same as `1` but applies `ORDER BY` and `LIMIT` (it is not possible when the query processed completely on the remote node, like for `distributed_group_by_no_merge=1`) on the initiator (can be used for queries with `ORDER BY` and/or `LIMIT`). - -Default value: `0` **Example** -```sql -SELECT * -FROM remote('127.0.0.{2,3}', system.one) -GROUP BY dummy -LIMIT 1 -SETTINGS distributed_group_by_no_merge = 1 -FORMAT PrettyCompactMonoBlock +Input table: -┌─dummy─┐ -│ 0 │ -│ 0 │ -└───────┘ +``` sql +CREATE TABLE test (i UInt64) ENGINE = MergeTree() ORDER BY i; +INSERT INTO test SELECT number FROM numbers(500); ``` -```sql -SELECT * -FROM remote('127.0.0.{2,3}', system.one) -GROUP BY dummy -LIMIT 1 -SETTINGS distributed_group_by_no_merge = 2 -FORMAT PrettyCompactMonoBlock +Query: -┌─dummy─┐ -│ 0 │ -└───────┘ +``` sql +SET limit = 5; +SET offset = 7; +SELECT * FROM test LIMIT 10 OFFSET 100; +``` +Result: + +``` text +┌───i─┐ +│ 107 │ +│ 108 │ +│ 109 │ +└─────┘ ``` -## distributed_push_down_limit {#distributed-push-down-limit} +## opentelemetry_start_trace_probability {#opentelemetry_start_trace_probability} -Enables or disables [LIMIT](#limit) applying on each shard separately. - -This will allow to avoid: -- Sending extra rows over network; -- Processing rows behind the limit on the initiator. - -Starting from 21.9 version you cannot get inaccurate results anymore, since `distributed_push_down_limit` changes query execution only if at least one of the conditions met: -- [distributed_group_by_no_merge](#distributed-group-by-no-merge) > 0. -- Query **does not have** `GROUP BY`/`DISTINCT`/`LIMIT BY`, but it has `ORDER BY`/`LIMIT`. -- Query **has** `GROUP BY`/`DISTINCT`/`LIMIT BY` with `ORDER BY`/`LIMIT` and: - - [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled. - - [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) is enabled. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. 
- -Default value: `1`. - -See also: - -- [distributed_group_by_no_merge](#distributed-group-by-no-merge) -- [optimize_skip_unused_shards](#optimize-skip-unused-shards) -- [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) - -## optimize_skip_unused_shards_limit {#optimize-skip-unused-shards-limit} - -Limit for number of sharding key values, turns off `optimize_skip_unused_shards` if the limit is reached. - -Too many values may require significant amount for processing, while the benefit is doubtful, since if you have huge number of values in `IN (...)`, then most likely the query will be sent to all shards anyway. - -Default value: 1000 - -## optimize_skip_unused_shards {#optimize-skip-unused-shards} - -Enables or disables skipping of unused shards for [SELECT](../../sql-reference/statements/select/index.md) queries that have sharding key condition in `WHERE/PREWHERE` (assuming that the data is distributed by sharding key, otherwise a query yields incorrect result). - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. +Type: Float Default value: 0 -## optimize_skip_unused_shards_rewrite_in {#optimize-skip-unused-shards-rewrite-in} - -Rewrite IN in query for remote shards to exclude values that does not belong to the shard (requires optimize_skip_unused_shards). +Sets the probability that the ClickHouse can start a trace for executed queries (if no parent [trace context](https://www.w3.org/TR/trace-context/) is supplied). Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — The trace for all executed queries is disabled (if no parent trace context is supplied). +- Positive floating-point number in the range [0..1]. For example, if the setting value is `0,5`, ClickHouse can start a trace on average for half of the queries. +- 1 — The trace for all executed queries is enabled. -Default value: 1 (since it requires `optimize_skip_unused_shards` anyway, which `0` by default) +## opentelemetry_trace_processors {#opentelemetry_trace_processors} -## allow_nondeterministic_optimize_skip_unused_shards {#allow-nondeterministic-optimize-skip-unused-shards} - -Allow nondeterministic (like `rand` or `dictGet`, since later has some caveats with updates) functions in sharding key. - -Possible values: - -- 0 — Disallowed. -- 1 — Allowed. +Type: Bool Default value: 0 -## optimize_skip_unused_shards_nesting {#optimize-skip-unused-shards-nesting} +Collect OpenTelemetry spans for processors. -Controls [`optimize_skip_unused_shards`](#optimize-skip-unused-shards) (hence still requires [`optimize_skip_unused_shards`](#optimize-skip-unused-shards)) depends on the nesting level of the distributed query (case when you have `Distributed` table that look into another `Distributed` table). +## optimize_aggregation_in_order {#optimize_aggregation_in_order} -Possible values: - -- 0 — Disabled, `optimize_skip_unused_shards` works always. -- 1 — Enables `optimize_skip_unused_shards` only for the first level. -- 2 — Enables `optimize_skip_unused_shards` up to the second level. +Type: Bool Default value: 0 -## force_optimize_skip_unused_shards {#force-optimize-skip-unused-shards} - -Enables or disables query execution if [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled and skipping of unused shards is not possible. If the skipping is not possible and the setting is enabled, an exception will be thrown. 
+Enables [GROUP BY](../../sql-reference/statements/select/group-by.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for aggregating data in corresponding order in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. Possible values: -- 0 — Disabled. ClickHouse does not throw an exception. -- 1 — Enabled. Query execution is disabled only if the table has a sharding key. -- 2 — Enabled. Query execution is disabled regardless of whether a sharding key is defined for the table. +- 0 — `GROUP BY` optimization is disabled. +- 1 — `GROUP BY` optimization is enabled. + +**See Also** + +- [GROUP BY optimization](../../sql-reference/statements/select/group-by.md/#aggregation-in-order) + +## optimize_aggregators_of_group_by_keys {#optimize_aggregators_of_group_by_keys} + +Type: Bool + +Default value: 1 + +Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section + +## optimize_append_index {#optimize_append_index} + +Type: Bool Default value: 0 -## force_optimize_skip_unused_shards_nesting {#force_optimize_skip_unused_shards_nesting} - -Controls [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards) (hence still requires [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards)) depends on the nesting level of the distributed query (case when you have `Distributed` table that look into another `Distributed` table). +Use [constraints](../../sql-reference/statements/create/table.md#constraints) in order to append index condition. The default is `false`. Possible values: -- 0 - Disabled, `force_optimize_skip_unused_shards` works always. -- 1 — Enables `force_optimize_skip_unused_shards` only for the first level. -- 2 — Enables `force_optimize_skip_unused_shards` up to the second level. +- true, false -Default value: 0 +## optimize_arithmetic_operations_in_aggregate_functions {#optimize_arithmetic_operations_in_aggregate_functions} -## optimize_distributed_group_by_sharding_key {#optimize-distributed-group-by-sharding-key} +Type: Bool + +Default value: 1 + +Move arithmetic operations out of aggregation functions + +## optimize_count_from_files {#optimize_count_from_files} + +Type: Bool + +Default value: 1 + +Enables or disables the optimization of counting number of rows from files in different input formats. It applies to table functions/engines `file`/`s3`/`url`/`hdfs`/`azureBlobStorage`. + +Possible values: + +- 0 — Optimization disabled. +- 1 — Optimization enabled. + +## optimize_distinct_in_order {#optimize_distinct_in_order} + +Type: Bool + +Default value: 1 + +Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement + +## optimize_distributed_group_by_sharding_key {#optimize_distributed_group_by_sharding_key} + +Type: Bool + +Default value: 1 Optimize `GROUP BY sharding_key` queries, by avoiding costly aggregation on the initiator server (which will reduce memory usage for the query on the initiator server). @@ -2504,8 +6653,6 @@ Possible values: - 0 — Disabled. - 1 — Enabled. -Default value: 0 - See also: - [distributed_group_by_no_merge](#distributed-group-by-no-merge) @@ -2516,35 +6663,11 @@ See also: Right now it requires `optimize_skip_unused_shards` (the reason behind this is that one day it may be enabled by default, and it will work correctly only if data was inserted via Distributed table, i.e. data is distributed according to sharding_key). 
::: -## optimize_throw_if_noop {#setting-optimize_throw_if_noop} +## optimize_functions_to_subcolumns {#optimize_functions_to_subcolumns} -Enables or disables throwing an exception if an [OPTIMIZE](../../sql-reference/statements/optimize.md) query didn’t perform a merge. +Type: Bool -By default, `OPTIMIZE` returns successfully even if it didn’t do anything. This setting lets you differentiate these situations and get the reason in an exception message. - -Possible values: - -- 1 — Throwing an exception is enabled. -- 0 — Throwing an exception is disabled. - -Default value: 0. - -## optimize_skip_merged_partitions {#optimize-skip-merged-partitions} - -Enables or disables optimization for [OPTIMIZE TABLE ... FINAL](../../sql-reference/statements/optimize.md) query if there is only one part with level > 0 and it doesn't have expired TTL. - -- `OPTIMIZE TABLE ... FINAL SETTINGS optimize_skip_merged_partitions=1` - -By default, `OPTIMIZE TABLE ... FINAL` query rewrites the one part even if there is only a single part. - -Possible values: - -- 1 - Enable optimization. -- 0 - Disable optimization. - -Default value: 0. - -## optimize_functions_to_subcolumns {#optimize-functions-to-subcolumns} +Default value: 1 Enables or disables optimization by transforming some functions to reading subcolumns. This reduces the amount of data to read. @@ -2564,1040 +6687,125 @@ Possible values: - 0 — Optimization disabled. - 1 — Optimization enabled. -Default value: `1`. +## optimize_group_by_constant_keys {#optimize_group_by_constant_keys} -## optimize_trivial_count_query {#optimize-trivial-count-query} +Type: Bool -Enables or disables the optimization to trivial query `SELECT count() FROM table` using metadata from MergeTree. If you need to use row-level security, disable this setting. +Default value: 1 + +Optimize GROUP BY when all keys in the block are constant + +## optimize_group_by_function_keys {#optimize_group_by_function_keys} + +Type: Bool + +Default value: 1 + +Eliminates functions of other keys in the GROUP BY section + +## optimize_if_chain_to_multiif {#optimize_if_chain_to_multiif} + +Type: Bool + +Default value: 0 + +Replace if(cond1, then1, if(cond2, ...)) chains with multiIf. Currently it's not beneficial for numeric types. + +## optimize_if_transform_strings_to_enum {#optimize_if_transform_strings_to_enum} + +Type: Bool + +Default value: 0 + +Replaces string-type arguments in If and Transform with enums. Disabled by default because it could make an inconsistent change in a distributed query that would cause it to fail. + +## optimize_injective_functions_in_group_by {#optimize_injective_functions_in_group_by} + +Type: Bool + +Default value: 1 + +Replaces injective functions with their arguments in the GROUP BY section + +## optimize_injective_functions_inside_uniq {#optimize_injective_functions_inside_uniq} + +Type: Bool + +Default value: 1 + +Delete injective functions of one argument inside uniq*() functions. + +## optimize_min_equality_disjunction_chain_length {#optimize_min_equality_disjunction_chain_length} + +Type: UInt64 + +Default value: 3 + +The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization + +## optimize_min_inequality_conjunction_chain_length {#optimize_min_inequality_conjunction_chain_length} + +Type: UInt64 + +Default value: 3 + +The minimum length of the expression `expr <> x1 AND ... 
expr <> xN` for optimization + +## optimize_move_to_prewhere {#optimize_move_to_prewhere} + +Type: Bool + +Default value: 1 + +Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries. + +Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. Possible values: - - 0 — Optimization disabled. - - 1 — Optimization enabled. +- 0 — Automatic `PREWHERE` optimization is disabled. +- 1 — Automatic `PREWHERE` optimization is enabled. -Default value: `1`. +## optimize_move_to_prewhere_if_final {#optimize_move_to_prewhere_if_final} -See also: +Type: Bool -- [optimize_functions_to_subcolumns](#optimize-functions-to-subcolumns) +Default value: 0 -## optimize_trivial_approximate_count_query {#optimize_trivial_approximate_count_query} +Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries with [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. -Use an approximate value for trivial count optimization of storages that support such estimation, for example, EmbeddedRocksDB. +Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. Possible values: - - 0 — Optimization disabled. - - 1 — Optimization enabled. - -Default value: `0`. - -## optimize_count_from_files {#optimize_count_from_files} - -Enables or disables the optimization of counting number of rows from files in different input formats. It applies to table functions/engines `file`/`s3`/`url`/`hdfs`/`azureBlobStorage`. - -Possible values: - -- 0 — Optimization disabled. -- 1 — Optimization enabled. - -Default value: `1`. - -## use_cache_for_count_from_files {#use_cache_for_count_from_files} - -Enables caching of rows number during count from files in table functions `file`/`s3`/`url`/`hdfs`/`azureBlobStorage`. - -Enabled by default. - -## distributed_replica_error_half_life {#distributed_replica_error_half_life} - -- Type: seconds -- Default value: 60 seconds - -Controls how fast errors in distributed tables are zeroed. If a replica is unavailable for some time, accumulates 5 errors, and distributed_replica_error_half_life is set to 1 second, then the replica is considered normal 3 seconds after the last error. - -See also: - -- [load_balancing](#load_balancing-round_robin) -- [Table engine Distributed](../../engines/table-engines/special/distributed.md) -- [distributed_replica_error_cap](#distributed_replica_error_cap) -- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors) - -## distributed_replica_error_cap {#distributed_replica_error_cap} - -- Type: unsigned int -- Default value: 1000 - -The error count of each replica is capped at this value, preventing a single replica from accumulating too many errors. - -See also: - -- [load_balancing](#load_balancing-round_robin) -- [Table engine Distributed](../../engines/table-engines/special/distributed.md) -- [distributed_replica_error_half_life](#distributed_replica_error_half_life) -- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors) - -## distributed_replica_max_ignored_errors {#distributed_replica_max_ignored_errors} - -- Type: unsigned int -- Default value: 0 - -The number of errors that will be ignored while choosing replicas (according to `load_balancing` algorithm). 
- -See also: - -- [load_balancing](#load_balancing-round_robin) -- [Table engine Distributed](../../engines/table-engines/special/distributed.md) -- [distributed_replica_error_cap](#distributed_replica_error_cap) -- [distributed_replica_error_half_life](#distributed_replica_error_half_life) - -## distributed_background_insert_sleep_time_ms {#distributed_background_insert_sleep_time_ms} - -Base interval for the [Distributed](../../engines/table-engines/special/distributed.md) table engine to send data. The actual interval grows exponentially in the event of errors. - -Possible values: - -- A positive integer number of milliseconds. - -Default value: 100 milliseconds. - -## distributed_background_insert_max_sleep_time_ms {#distributed_background_insert_max_sleep_time_ms} - -Maximum interval for the [Distributed](../../engines/table-engines/special/distributed.md) table engine to send data. Limits exponential growth of the interval set in the [distributed_background_insert_sleep_time_ms](#distributed_background_insert_sleep_time_ms) setting. - -Possible values: - -- A positive integer number of milliseconds. - -Default value: 30000 milliseconds (30 seconds). - -## distributed_background_insert_batch {#distributed_background_insert_batch} - -Enables/disables inserted data sending in batches. - -When batch sending is enabled, the [Distributed](../../engines/table-engines/special/distributed.md) table engine tries to send multiple files of inserted data in one operation instead of sending them separately. Batch sending improves cluster performance by better-utilizing server and network resources. - -Possible values: - -- 1 — Enabled. -- 0 — Disabled. - -Default value: 0. - -## distributed_background_insert_split_batch_on_failure {#distributed_background_insert_split_batch_on_failure} - -Enables/disables splitting batches on failures. - -Sometimes sending particular batch to the remote shard may fail, because of some complex pipeline after (i.e. `MATERIALIZED VIEW` with `GROUP BY`) due to `Memory limit exceeded` or similar errors. In this case, retrying will not help (and this will stuck distributed sends for the table) but sending files from that batch one by one may succeed INSERT. - -So installing this setting to `1` will disable batching for such batches (i.e. temporary disables `distributed_background_insert_batch` for failed batches). - -Possible values: - -- 1 — Enabled. -- 0 — Disabled. - -Default value: 0. - -:::note -This setting also affects broken batches (that may appears because of abnormal server (machine) termination and no `fsync_after_insert`/`fsync_directories` for [Distributed](../../engines/table-engines/special/distributed.md) table engine). -::: - -:::note -You should not rely on automatic batch splitting, since this may hurt performance. -::: - -## os_thread_priority {#setting-os-thread-priority} - -Sets the priority ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) for threads that execute queries. The OS scheduler considers this priority when choosing the next thread to run on each available CPU core. - -:::note -To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments do not allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start. -::: - -Possible values: - -- You can set values in the range `[-20, 19]`. - -Lower values mean higher priority. 
Threads with low `nice` priority values are executed more frequently than threads with high values. High values are preferable for long-running non-interactive queries because it allows them to quickly give up resources in favour of short interactive queries when they arrive. - -Default value: 0. - -## query_profiler_real_time_period_ns {#query_profiler_real_time_period_ns} - -Sets the period for a real clock timer of the [query profiler](../../operations/optimizing-performance/sampling-query-profiler.md). Real clock timer counts wall-clock time. - -Possible values: - -- Positive integer number, in nanoseconds. - - Recommended values: - - - 10000000 (100 times a second) nanoseconds and less for single queries. - - 1000000000 (once a second) for cluster-wide profiling. - -- 0 for turning off the timer. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -Default value: 1000000000 nanoseconds (once a second). - -**Temporarily disabled in ClickHouse Cloud.** - -See also: - -- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) - -## query_profiler_cpu_time_period_ns {#query_profiler_cpu_time_period_ns} - -Sets the period for a CPU clock timer of the [query profiler](../../operations/optimizing-performance/sampling-query-profiler.md). This timer counts only CPU time. - -Possible values: - -- A positive integer number of nanoseconds. - - Recommended values: - - - 10000000 (100 times a second) nanoseconds and more for single queries. - - 1000000000 (once a second) for cluster-wide profiling. - -- 0 for turning off the timer. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). - -Default value: 1000000000 nanoseconds. - -**Temporarily disabled in ClickHouse Cloud.** - -See also: - -- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) - -## memory_profiler_step {#memory_profiler_step} - -Sets the step of memory profiler. Whenever query memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stacktrace and will write it into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log). - -Possible values: - -- A positive integer number of bytes. - -- 0 for turning off the memory profiler. - -Default value: 4,194,304 bytes (4 MiB). - -## memory_profiler_sample_probability {#memory_profiler_sample_probability} - -Sets the probability of collecting stacktraces at random allocations and deallocations and writing them into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log). - -Possible values: - -- A positive floating-point number in the range [0..1]. - -- 0.0 for turning off the memory sampling. - -Default value: 0.0. - -## trace_profile_events {#trace_profile_events} - -Enables or disables collecting stacktraces on each update of profile events along with the name of profile event and the value of increment and sending them into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log). - -Possible values: - -- 1 — Tracing of profile events enabled. -- 0 — Tracing of profile events disabled. - -Default value: 0. - -## allow_introspection_functions {#allow_introspection_functions} - -Enables or disables [introspection functions](../../sql-reference/functions/introspection.md) for query profiling. - -Possible values: - -- 1 — Introspection functions enabled. -- 0 — Introspection functions disabled. - -Default value: 0. 
+- 0 — Automatic `PREWHERE` optimization in `SELECT` queries with `FINAL` modifier is disabled. +- 1 — Automatic `PREWHERE` optimization in `SELECT` queries with `FINAL` modifier is enabled. **See Also** -- [Sampling Query Profiler](../../operations/optimizing-performance/sampling-query-profiler.md) -- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) +- [optimize_move_to_prewhere](#optimize_move_to_prewhere) setting -## input_format_parallel_parsing {#input-format-parallel-parsing} +## optimize_multiif_to_if {#optimize_multiif_to_if} -Enables or disables order-preserving parallel parsing of data formats. Supported only for [TSV](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [CSV](../../interfaces/formats.md/#csv) and [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) formats. +Type: Bool -Possible values: +Default value: 1 -- 1 — Enabled. -- 0 — Disabled. +Replace 'multiIf' with only one condition to 'if'. -Default value: `1`. +## optimize_normalize_count_variants {#optimize_normalize_count_variants} -## output_format_parallel_formatting {#output-format-parallel-formatting} +Type: Bool -Enables or disables parallel formatting of data formats. Supported only for [TSV](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [CSV](../../interfaces/formats.md/#csv) and [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) formats. +Default value: 1 -Possible values: +Rewrite aggregate functions that semantically equals to count() as count(). -- 1 — Enabled. -- 0 — Disabled. +## optimize_on_insert {#optimize_on_insert} -Default value: `1`. +Type: Bool -## min_chunk_bytes_for_parallel_parsing {#min-chunk-bytes-for-parallel-parsing} - -- Type: unsigned int -- Default value: 1 MiB - -The minimum chunk size in bytes, which each thread will parse in parallel. - -## merge_selecting_sleep_ms {#merge_selecting_sleep_ms} - -Minimum time to wait before trying to select parts to merge again after no parts were selected. A lower setting triggers selecting tasks in `background_schedule_pool` frequently, which results in a large number of requests to ClickHouse Keeper in large-scale clusters. - -Possible values: - -- Any positive integer. - -Default value: `5000`. - -## max_merge_selecting_sleep_ms - -Maximum time to wait before trying to select parts to merge again after no parts were selected. A lower setting triggers selecting tasks in `background_schedule_pool` frequently, which results in a large number of requests to ClickHouse Keeper in large-scale clusters. - -Possible values: - -- Any positive integer. - -Default value: `60000`. - -## parallel_distributed_insert_select {#parallel_distributed_insert_select} - -Enables parallel distributed `INSERT ... SELECT` query. - -If we execute `INSERT INTO distributed_table_a SELECT ... FROM distributed_table_b` queries and both tables use the same cluster, and both tables are either [replicated](../../engines/table-engines/mergetree-family/replication.md) or non-replicated, then this query is processed locally on every shard. - -Possible values: - -- 0 — Disabled. -- 1 — `SELECT` will be executed on each shard from the underlying table of the distributed engine. -- 2 — `SELECT` and `INSERT` will be executed on each shard from/to the underlying table of the distributed engine. - -Default value: 0. 
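+A minimal usage sketch of `parallel_distributed_insert_select` (the table names `distributed_table_a` and `distributed_table_b` are hypothetical `Distributed` tables defined over the same cluster, as in the description above):
+
+```sql
+-- With the setting at 2, each shard runs the INSERT ... SELECT from and to
+-- its local underlying tables instead of routing all data through the initiator.
+SET parallel_distributed_insert_select = 2;
+
+INSERT INTO distributed_table_a SELECT * FROM distributed_table_b;
+```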
- -## distributed_insert_skip_read_only_replicas {#distributed_insert_skip_read_only_replicas} - -Enables skipping read-only replicas for INSERT queries into Distributed. - -Possible values: - -- 0 — INSERT was as usual, if it will go to read-only replica it will fail -- 1 — Initiator will skip read-only replicas before sending data to shards. - -Default value: `0` - -## distributed_foreground_insert {#distributed_foreground_insert} - -Enables or disables synchronous data insertion into a [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table. - -By default, when inserting data into a `Distributed` table, the ClickHouse server sends data to cluster nodes in background mode. When `distributed_foreground_insert=1`, the data is processed synchronously, and the `INSERT` operation succeeds only after all the data is saved on all shards (at least one replica for each shard if `internal_replication` is true). - -Possible values: - -- 0 — Data is inserted in background mode. -- 1 — Data is inserted in synchronous mode. - -Default value: `0`. - -Cloud default value: `1`. - -**See Also** - -- [Distributed Table Engine](../../engines/table-engines/special/distributed.md/#distributed) -- [Managing Distributed Tables](../../sql-reference/statements/system.md/#query-language-system-distributed) - -## insert_distributed_sync {#insert_distributed_sync} - -Alias for [`distributed_foreground_insert`](#distributed_foreground_insert). - -## insert_shard_id {#insert_shard_id} - -If not `0`, specifies the shard of [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table into which the data will be inserted synchronously. - -If `insert_shard_id` value is incorrect, the server will throw an exception. - -To get the number of shards on `requested_cluster`, you can check server config or use this query: - -``` sql -SELECT uniq(shard_num) FROM system.clusters WHERE cluster = 'requested_cluster'; -``` - -Possible values: - -- 0 — Disabled. -- Any number from `1` to `shards_num` of corresponding [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table. - -Default value: `0`. - -**Example** - -Query: - -```sql -CREATE TABLE x AS system.numbers ENGINE = MergeTree ORDER BY number; -CREATE TABLE x_dist AS x ENGINE = Distributed('test_cluster_two_shards_localhost', currentDatabase(), x); -INSERT INTO x_dist SELECT * FROM numbers(5) SETTINGS insert_shard_id = 1; -SELECT * FROM x_dist ORDER BY number ASC; -``` - -Result: - -``` text -┌─number─┐ -│ 0 │ -│ 0 │ -│ 1 │ -│ 1 │ -│ 2 │ -│ 2 │ -│ 3 │ -│ 3 │ -│ 4 │ -│ 4 │ -└────────┘ -``` - -## use_compact_format_in_distributed_parts_names {#use_compact_format_in_distributed_parts_names} - -Uses compact format for storing blocks for background (`distributed_foreground_insert`) INSERT into tables with `Distributed` engine. - -Possible values: - -- 0 — Uses `user[:password]@host:port#default_database` directory format. -- 1 — Uses `[shard{shard_index}[_replica{replica_index}]]` directory format. - -Default value: `1`. - -:::note -- with `use_compact_format_in_distributed_parts_names=0` changes from cluster definition will not be applied for background INSERT. -- with `use_compact_format_in_distributed_parts_names=1` changing the order of the nodes in the cluster definition, will change the `shard_index`/`replica_index` so be aware. 
-::: - -## background_buffer_flush_schedule_pool_size {#background_buffer_flush_schedule_pool_size} - -That setting was moved to the [server configuration parameters](../../operations/server-configuration-parameters/settings.md/#background_buffer_flush_schedule_pool_size). - -## background_move_pool_size {#background_move_pool_size} - -That setting was moved to the [server configuration parameters](../../operations/server-configuration-parameters/settings.md/#background_move_pool_size). - -## background_schedule_pool_size {#background_schedule_pool_size} - -That setting was moved to the [server configuration parameters](../../operations/server-configuration-parameters/settings.md/#background_schedule_pool_size). - -## background_fetches_pool_size {#background_fetches_pool_size} - -That setting was moved to the [server configuration parameters](../../operations/server-configuration-parameters/settings.md/#background_fetches_pool_size). - -## always_fetch_merged_part {#always_fetch_merged_part} - -Prohibits data parts merging in [Replicated\*MergeTree](../../engines/table-engines/mergetree-family/replication.md)-engine tables. - -When merging is prohibited, the replica never merges parts and always downloads merged parts from other replicas. If there is no required data yet, the replica waits for it. CPU and disk load on the replica server decreases, but the network load on the cluster increases. This setting can be useful on servers with relatively weak CPUs or slow disks, such as servers for backups storage. - -Possible values: - -- 0 — `Replicated*MergeTree`-engine tables merge data parts at the replica. -- 1 — `Replicated*MergeTree`-engine tables do not merge data parts at the replica. The tables download merged data parts from other replicas. - -Default value: 0. - -**See Also** - -- [Data Replication](../../engines/table-engines/mergetree-family/replication.md) - -## background_distributed_schedule_pool_size {#background_distributed_schedule_pool_size} - -That setting was moved to the [server configuration parameters](../../operations/server-configuration-parameters/settings.md/#background_distributed_schedule_pool_size). - -## background_message_broker_schedule_pool_size {#background_message_broker_schedule_pool_size} - -That setting was moved to the [server configuration parameters](../../operations/server-configuration-parameters/settings.md/#background_message_broker_schedule_pool_size). - -## validate_polygons {#validate_polygons} - -Enables or disables throwing an exception in the [pointInPolygon](../../sql-reference/functions/geo/index.md#pointinpolygon) function, if the polygon is self-intersecting or self-tangent. - -Possible values: - -- 0 — Throwing an exception is disabled. `pointInPolygon` accepts invalid polygons and returns possibly incorrect results for them. -- 1 — Throwing an exception is enabled. - -Default value: 1. - -## transform_null_in {#transform_null_in} - -Enables equality of [NULL](../../sql-reference/syntax.md/#null-literal) values for [IN](../../sql-reference/operators/in.md) operator. - -By default, `NULL` values can’t be compared because `NULL` means undefined value. Thus, comparison `expr = NULL` must always return `false`. With this setting `NULL = NULL` returns `true` for `IN` operator. - -Possible values: - -- 0 — Comparison of `NULL` values in `IN` operator returns `false`. -- 1 — Comparison of `NULL` values in `IN` operator returns `true`. - -Default value: 0. 
- -**Example** - -Consider the `null_in` table: - -``` text -┌──idx─┬─────i─┐ -│ 1 │ 1 │ -│ 2 │ NULL │ -│ 3 │ 3 │ -└──────┴───────┘ -``` - -Query: - -``` sql -SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 0; -``` - -Result: - -``` text -┌──idx─┬────i─┐ -│ 1 │ 1 │ -└──────┴──────┘ -``` - -Query: - -``` sql -SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1; -``` - -Result: - -``` text -┌──idx─┬─────i─┐ -│ 1 │ 1 │ -│ 2 │ NULL │ -└──────┴───────┘ -``` - -**See Also** - -- [NULL Processing in IN Operators](../../sql-reference/operators/in.md/#in-null-processing) - -## low_cardinality_max_dictionary_size {#low_cardinality_max_dictionary_size} - -Sets a maximum size in rows of a shared global dictionary for the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type that can be written to a storage file system. This setting prevents issues with RAM in case of unlimited dictionary growth. All the data that can’t be encoded due to maximum dictionary size limitation ClickHouse writes in an ordinary method. - -Possible values: - -- Any positive integer. - -Default value: 8192. - -## low_cardinality_use_single_dictionary_for_part {#low_cardinality_use_single_dictionary_for_part} - -Turns on or turns off using of single dictionary for the data part. - -By default, the ClickHouse server monitors the size of dictionaries and if a dictionary overflows then the server starts to write the next one. To prohibit creating several dictionaries set `low_cardinality_use_single_dictionary_for_part = 1`. - -Possible values: - -- 1 — Creating several dictionaries for the data part is prohibited. -- 0 — Creating several dictionaries for the data part is not prohibited. - -Default value: 0. - -## low_cardinality_allow_in_native_format {#low_cardinality_allow_in_native_format} - -Allows or restricts using the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type with the [Native](../../interfaces/formats.md/#native) format. - -If usage of `LowCardinality` is restricted, ClickHouse server converts `LowCardinality`-columns to ordinary ones for `SELECT` queries, and convert ordinary columns to `LowCardinality`-columns for `INSERT` queries. - -This setting is required mainly for third-party clients which do not support `LowCardinality` data type. - -Possible values: - -- 1 — Usage of `LowCardinality` is not restricted. -- 0 — Usage of `LowCardinality` is restricted. - -Default value: 1. - -## allow_suspicious_low_cardinality_types {#allow_suspicious_low_cardinality_types} - -Allows or restricts using [LowCardinality](../../sql-reference/data-types/lowcardinality.md) with data types with fixed size of 8 bytes or less: numeric data types and `FixedString(8_bytes_or_less)`. - -For small fixed values using of `LowCardinality` is usually inefficient, because ClickHouse stores a numeric index for each row. As a result: - -- Disk space usage can rise. -- RAM consumption can be higher, depending on a dictionary size. -- Some functions can work slower due to extra coding/encoding operations. - -Merge times in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine tables can grow due to all the reasons described above. - -Possible values: - -- 1 — Usage of `LowCardinality` is not restricted. -- 0 — Usage of `LowCardinality` is restricted. - -Default value: 0. 
- -## min_insert_block_size_rows_for_materialized_views {#min-insert-block-size-rows-for-materialized-views} - -Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. - -Possible values: - -- Any positive integer. -- 0 — Squashing disabled. - -Default value: 1048576. - -**See Also** - -- [min_insert_block_size_rows](#min-insert-block-size-rows) - -## min_insert_block_size_bytes_for_materialized_views {#min-insert-block-size-bytes-for-materialized-views} - -Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. - -Possible values: - -- Any positive integer. -- 0 — Squashing disabled. - -Default value: 268435456. - -**See also** - -- [min_insert_block_size_bytes](#min-insert-block-size-bytes) - -## optimize_read_in_order {#optimize_read_in_order} - -Enables [ORDER BY](../../sql-reference/statements/select/order-by.md/#optimize_read_in_order) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for reading data from [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. - -Possible values: - -- 0 — `ORDER BY` optimization is disabled. -- 1 — `ORDER BY` optimization is enabled. - -Default value: `1`. - -**See Also** - -- [ORDER BY Clause](../../sql-reference/statements/select/order-by.md/#optimize_read_in_order) - -## optimize_aggregation_in_order {#optimize_aggregation_in_order} - -Enables [GROUP BY](../../sql-reference/statements/select/group-by.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for aggregating data in corresponding order in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. - -Possible values: - -- 0 — `GROUP BY` optimization is disabled. -- 1 — `GROUP BY` optimization is enabled. - -Default value: `0`. - -**See Also** - -- [GROUP BY optimization](../../sql-reference/statements/select/group-by.md/#aggregation-in-order) - -## mutations_sync {#mutations_sync} - -Allows to execute `ALTER TABLE ... UPDATE|DELETE|MATERIALIZE INDEX|MATERIALIZE PROJECTION|MATERIALIZE COLUMN` queries ([mutations](../../sql-reference/statements/alter/index.md#mutations)) synchronously. - -Possible values: - -- 0 - Mutations execute asynchronously. -- 1 - The query waits for all mutations to complete on the current server. -- 2 - The query waits for all mutations to complete on all replicas (if they exist). - -Default value: `0`. - -## lightweight_deletes_sync {#lightweight_deletes_sync} - -The same as [`mutations_sync`](#mutations_sync), but controls only execution of lightweight deletes. - -Possible values: - -- 0 - Mutations execute asynchronously. -- 1 - The query waits for the lightweight deletes to complete on the current server. -- 2 - The query waits for the lightweight deletes to complete on all replicas (if they exist). - -Default value: `2`. 
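+A minimal usage sketch for `lightweight_deletes_sync` (the table name `orders` and the filter are hypothetical); with the value `2`, the statement returns only after the lightweight delete has completed on all replicas:
+
+```sql
+-- Wait for the lightweight delete to finish on every replica before returning.
+SET lightweight_deletes_sync = 2;
+
+DELETE FROM orders WHERE status = 'cancelled';
+```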
- -**See Also** - -- [Synchronicity of ALTER Queries](../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) -- [Mutations](../../sql-reference/statements/alter/index.md#mutations) - -## ttl_only_drop_parts {#ttl_only_drop_parts} - -Enables or disables complete dropping of data parts where all rows are expired in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. - -When `ttl_only_drop_parts` is disabled (by default), the ClickHouse server only deletes expired rows according to their TTL. - -When `ttl_only_drop_parts` is enabled, the ClickHouse server drops a whole part when all rows in it are expired. - -Dropping whole parts instead of partial cleaning TTL-d rows allows having shorter `merge_with_ttl_timeout` times and lower impact on system performance. - -Possible values: - -- 0 — The complete dropping of data parts is disabled. -- 1 — The complete dropping of data parts is enabled. - -Default value: `0`. - -**See Also** - -- [CREATE TABLE query clauses and settings](../../engines/table-engines/mergetree-family/mergetree.md/#mergetree-query-clauses) (`merge_with_ttl_timeout` setting) -- [Table TTL](../../engines/table-engines/mergetree-family/mergetree.md/#mergetree-table-ttl) - -## lock_acquire_timeout {#lock_acquire_timeout} - -Defines how many seconds a locking request waits before failing. - -Locking timeout is used to protect from deadlocks while executing read/write operations with tables. When the timeout expires and the locking request fails, the ClickHouse server throws an exception "Locking attempt timed out! Possible deadlock avoided. Client should retry." with error code `DEADLOCK_AVOIDED`. - -Possible values: - -- Positive integer (in seconds). -- 0 — No locking timeout. - -Default value: `120` seconds. - -## cast_keep_nullable {#cast_keep_nullable} - -Enables or disables keeping of the `Nullable` data type in [CAST](../../sql-reference/functions/type-conversion-functions.md/#castx-t) operations. - -When the setting is enabled and the argument of `CAST` function is `Nullable`, the result is also transformed to `Nullable` type. When the setting is disabled, the result always has the destination type exactly. - -Possible values: - -- 0 — The `CAST` result has exactly the destination type specified. -- 1 — If the argument type is `Nullable`, the `CAST` result is transformed to `Nullable(DestinationDataType)`. - -Default value: `0`. - -**Examples** - -The following query results in the destination data type exactly: - -```sql -SET cast_keep_nullable = 0; -SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); -``` - -Result: - -```text -┌─x─┬─toTypeName(CAST(toNullable(toInt32(0)), 'Int32'))─┐ -│ 0 │ Int32 │ -└───┴───────────────────────────────────────────────────┘ -``` - -The following query results in the `Nullable` modification on the destination data type: - -```sql -SET cast_keep_nullable = 1; -SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); -``` - -Result: - -```text -┌─x─┬─toTypeName(CAST(toNullable(toInt32(0)), 'Int32'))─┐ -│ 0 │ Nullable(Int32) │ -└───┴───────────────────────────────────────────────────┘ -``` - -**See Also** - -- [CAST](../../sql-reference/functions/type-conversion-functions.md/#type_conversion_function-cast) function - -## system_events_show_zero_values {#system_events_show_zero_values} - -Allows to select zero-valued events from [`system.events`](../../operations/system-tables/events.md). 
- -Some monitoring systems require passing all the metrics values to them for each checkpoint, even if the metric value is zero. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: `0`. - -**Examples** - -Query - -```sql -SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; -``` - -Result - -```text -Ok. -``` - -Query -```sql -SET system_events_show_zero_values = 1; -SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; -``` - -Result - -```text -┌─event────────────────────┬─value─┬─description───────────────────────────────────────────┐ -│ QueryMemoryLimitExceeded │ 0 │ Number of times when memory limit exceeded for query. │ -└──────────────────────────┴───────┴───────────────────────────────────────────────────────┘ -``` - -## allow_nullable_key {#allow-nullable-key} - -Allows using of the [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable)-typed values in a sorting and a primary key for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md/#table_engines-mergetree) tables. - -Possible values: - -- 1 — `Nullable`-type expressions are allowed in keys. -- 0 — `Nullable`-type expressions are not allowed in keys. - -Default value: `0`. - -:::note -Nullable primary key usually indicates bad design. It is forbidden in almost all main stream DBMS. The feature is mainly for [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) and is not heavily tested. Use with care. -::: - -:::note -Do not enable this feature in version `<= 21.8`. It's not properly implemented and may lead to server crash. -::: - -## aggregate_functions_null_for_empty {#aggregate_functions_null_for_empty} - -Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md/#agg-functions-combinator-ornull) suffix to them. Enable it for SQL standard compatibility. -It is implemented via query rewrite (similar to [count_distinct_implementation](#count_distinct_implementation) setting) to get consistent results for distributed queries. - -Possible values: - -- 0 — Disabled. -- 1 — Enabled. - -Default value: 0. - -**Example** - -Consider the following query with aggregate functions: -```sql -SELECT SUM(-1), MAX(0) FROM system.one WHERE 0; -``` - -With `aggregate_functions_null_for_empty = 0` it would produce: -```text -┌─SUM(-1)─┬─MAX(0)─┐ -│ 0 │ 0 │ -└─────────┴────────┘ -``` - -With `aggregate_functions_null_for_empty = 1` the result would be: -```text -┌─SUMOrNull(-1)─┬─MAXOrNull(0)─┐ -│ NULL │ NULL │ -└───────────────┴──────────────┘ -``` - -## union_default_mode {#union-default-mode} - -Sets a mode for combining `SELECT` query results. The setting is only used when shared with [UNION](../../sql-reference/statements/select/union.md) without explicitly specifying the `UNION ALL` or `UNION DISTINCT`. - -Possible values: - -- `'DISTINCT'` — ClickHouse outputs rows as a result of combining queries removing duplicate rows. -- `'ALL'` — ClickHouse outputs all rows as a result of combining queries including duplicate rows. -- `''` — ClickHouse generates an exception when used with `UNION`. - -Default value: `''`. - -See examples in [UNION](../../sql-reference/statements/select/union.md). - -## default_table_engine {#default_table_engine} - -Default table engine to use when `ENGINE` is not set in a `CREATE` statement. - -Possible values: - -- a string representing any valid table engine name - -Default value: `MergeTree`. 
- -Cloud default value: `SharedMergeTree`. - -**Example** - -Query: - -```sql -SET default_table_engine = 'Log'; - -SELECT name, value, changed FROM system.settings WHERE name = 'default_table_engine'; -``` - -Result: - -```response -┌─name─────────────────┬─value─┬─changed─┐ -│ default_table_engine │ Log │ 1 │ -└──────────────────────┴───────┴─────────┘ -``` - -In this example, any new table that does not specify an `Engine` will use the `Log` table engine: - -Query: - -```sql -CREATE TABLE my_table ( - x UInt32, - y UInt32 -); - -SHOW CREATE TABLE my_table; -``` - -Result: - -```response -┌─statement────────────────────────────────────────────────────────────────┐ -│ CREATE TABLE default.my_table -( - `x` UInt32, - `y` UInt32 -) -ENGINE = Log -└──────────────────────────────────────────────────────────────────────────┘ -``` - -## default_temporary_table_engine {#default_temporary_table_engine} - -Same as [default_table_engine](#default_table_engine) but for temporary tables. - -Default value: `Memory`. - -In this example, any new temporary table that does not specify an `Engine` will use the `Log` table engine: - -Query: - -```sql -SET default_temporary_table_engine = 'Log'; - -CREATE TEMPORARY TABLE my_table ( - x UInt32, - y UInt32 -); - -SHOW CREATE TEMPORARY TABLE my_table; -``` - -Result: - -```response -┌─statement────────────────────────────────────────────────────────────────┐ -│ CREATE TEMPORARY TABLE default.my_table -( - `x` UInt32, - `y` UInt32 -) -ENGINE = Log -└──────────────────────────────────────────────────────────────────────────┘ -``` - -## data_type_default_nullable {#data_type_default_nullable} - -Allows data types without explicit modifiers [NULL or NOT NULL](../../sql-reference/statements/create/table.md/#null-modifiers) in column definition will be [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable). - -Possible values: - -- 1 — The data types in column definitions are set to `Nullable` by default. -- 0 — The data types in column definitions are set to not `Nullable` by default. - -Default value: `0`. - -## mysql_map_string_to_text_in_show_columns {#mysql_map_string_to_text_in_show_columns} - -When enabled, [String](../../sql-reference/data-types/string.md) ClickHouse data type will be displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). - -Has an effect only when the connection is made through the MySQL wire protocol. - -- 0 - Use `BLOB`. -- 1 - Use `TEXT`. - -Default value: `1`. - -## mysql_map_fixed_string_to_text_in_show_columns {#mysql_map_fixed_string_to_text_in_show_columns} - -When enabled, [FixedString](../../sql-reference/data-types/fixedstring.md) ClickHouse data type will be displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). - -Has an effect only when the connection is made through the MySQL wire protocol. - -- 0 - Use `BLOB`. -- 1 - Use `TEXT`. - -Default value: `1`. - -## execute_merges_on_single_replica_time_threshold {#execute-merges-on-single-replica-time-threshold} - -Enables special logic to perform merges on replicas. - -Possible values: - -- Positive integer (in seconds). -- 0 — Special merges logic is not used. Merges happen in the usual way on all the replicas. - -Default value: `0`. - -**Usage** - -Selects one replica to perform the merge on. Sets the time threshold from the start of the merge. Other replicas wait for the merge to finish, then download the result. 
If the time threshold passes and the selected replica does not perform the merge, then the merge is performed on other replicas as usual. - -High values for that threshold may lead to replication delays. - -It can be useful when merges are CPU bounded not IO bounded (performing heavy data compression, calculating aggregate functions or default expressions that require a large amount of calculations, or just very high number of tiny merges). - -## max_final_threads {#max-final-threads} - -Sets the maximum number of parallel threads for the `SELECT` query data read phase with the [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. - -Possible values: - -- Positive integer. -- 0 or 1 — Disabled. `SELECT` queries are executed in a single thread. - -Default value: `max_threads`. - -## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability} - -Sets the probability that the ClickHouse can start a trace for executed queries (if no parent [trace context](https://www.w3.org/TR/trace-context/) is supplied). - -Possible values: - -- 0 — The trace for all executed queries is disabled (if no parent trace context is supplied). -- Positive floating-point number in the range [0..1]. For example, if the setting value is `0,5`, ClickHouse can start a trace on average for half of the queries. -- 1 — The trace for all executed queries is enabled. - -Default value: `0`. - -## optimize_on_insert {#optimize-on-insert} +Default value: 1 Enables or disables data transformation before the insertion, as if merge was done on this block (according to table engine). @@ -3606,8 +6814,6 @@ Possible values: - 0 — Disabled. - 1 — Enabled. -Default value: 1. - **Example** The difference between enabled and disabled: @@ -3651,298 +6857,636 @@ Result: Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md/#materialized) and [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md) behaviour. -## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists} +## optimize_or_like_chain {#optimize_or_like_chain} -Allows to select data from a file engine table without file. +Type: Bool -Possible values: -- 0 — `SELECT` throws exception. -- 1 — `SELECT` returns empty result. +Default value: 0 -Default value: `0`. +Optimize multiple OR LIKE into multiMatchAny. This optimization should not be enabled by default, because it defies index analysis in some cases. -## engine_file_truncate_on_insert {#engine-file-truncate-on-insert} +## optimize_read_in_order {#optimize_read_in_order} -Enables or disables truncate before insert in [File](../../engines/table-engines/special/file.md) engine tables. +Type: Bool -Possible values: -- 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query replaces existing content of the file with the new data. +Default value: 1 -Default value: `0`. - -## engine_file_allow_create_multiple_files {#engine_file_allow_create_multiple_files} - -Enables or disables creating a new file on each insert in file engine tables if the format has the suffix (`JSON`, `ORC`, `Parquet`, etc.). If enabled, on each insert a new file will be created with a name following this pattern: - -`data.Parquet` -> `data.1.Parquet` -> `data.2.Parquet`, etc. - -Possible values: -- 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query creates a new file. - -Default value: `0`. 
- -## engine_file_skip_empty_files {#engine_file_skip_empty_files} - -Enables or disables skipping empty files in [File](../../engines/table-engines/special/file.md) engine tables. - -Possible values: -- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. -- 1 — `SELECT` returns empty result for empty file. - -Default value: `0`. - -## storage_file_read_method {#storage_file_read_method} - -Method of reading data from storage file, one of: `read`, `pread`, `mmap`. The mmap method does not apply to clickhouse-server (it's intended for clickhouse-local). - -Default value: `pread` for clickhouse-server, `mmap` for clickhouse-local. - -## s3_truncate_on_insert {#s3_truncate_on_insert} - -Enables or disables truncate before inserts in s3 engine tables. If disabled, an exception will be thrown on insert attempts if an S3 object already exists. - -Possible values: -- 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query replaces existing content of the file with the new data. - -Default value: `0`. - -## s3_create_new_file_on_insert {#s3_create_new_file_on_insert} - -Enables or disables creating a new file on each insert in s3 engine tables. If enabled, on each insert a new S3 object will be created with the key, similar to this pattern: - -initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. - -Possible values: -- 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query creates a new file. - -Default value: `0`. - -## s3_skip_empty_files {#s3_skip_empty_files} - -Enables or disables skipping empty files in [S3](../../engines/table-engines/integrations/s3.md) engine tables. - -Possible values: -- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. -- 1 — `SELECT` returns empty result for empty file. - -Default value: `0`. - -## s3_ignore_file_doesnt_exist {#s3_ignore_file_doesnt_exist} - -Ignore absence of file if it does not exist when reading certain keys. - -Possible values: -- 1 — `SELECT` returns empty result. -- 0 — `SELECT` throws an exception. - -Default value: `0`. - -## s3_validate_request_settings {#s3_validate_request_settings} - -Enables s3 request settings validation. - -Possible values: -- 1 — validate settings. -- 0 — do not validate settings. - -Default value: `1`. - -## hdfs_truncate_on_insert {#hdfs_truncate_on_insert} - -Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. - -Possible values: -- 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query replaces existing content of the file with the new data. - -Default value: `0`. - -## hdfs_create_new_file_on_insert {#hdfs_create_new_file_on_insert - -Enables or disables creating a new file on each insert in HDFS engine tables. If enabled, on each insert a new HDFS file will be created with the name, similar to this pattern: - -initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. - -Possible values: -- 0 — `INSERT` query appends new data to the end of the file. -- 1 — `INSERT` query creates a new file. - -Default value: `0`. - -## hdfs_skip_empty_files {#hdfs_skip_empty_files} - -Enables or disables skipping empty files in [HDFS](../../engines/table-engines/integrations/hdfs.md) engine tables. - -Possible values: -- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. 
-- 1 — `SELECT` returns empty result for empty file. - -Default value: `0`. - -## hdfs_throw_on_zero_files_match {#hdfs_throw_on_zero_files_match} - -Throw an error if matched zero files according to glob expansion rules. - -Possible values: -- 1 — `SELECT` throws an exception. -- 0 — `SELECT` returns empty result. - -Default value: `0`. - -## hdfs_ignore_file_doesnt_exist {#hdfs_ignore_file_doesnt_exist} - -Ignore absence of file if it does not exist when reading certain keys. - -Possible values: -- 1 — `SELECT` returns empty result. -- 0 — `SELECT` throws an exception. - -Default value: `0`. - -## azure_throw_on_zero_files_match {#azure_throw_on_zero_files_match} - -Throw an error if matched zero files according to glob expansion rules. - -Possible values: -- 1 — `SELECT` throws an exception. -- 0 — `SELECT` returns empty result. - -Default value: `0`. - -## azure_ignore_file_doesnt_exist {#azure_ignore_file_doesnt_exist} - -Ignore absence of file if it does not exist when reading certain keys. - -Possible values: -- 1 — `SELECT` returns empty result. -- 0 — `SELECT` throws an exception. - -Default value: `0`. - -## azure_skip_empty_files {#azure_skip_empty_files} - -Enables or disables skipping empty files in S3 engine. - -Possible values: -- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. -- 1 — `SELECT` returns empty result for empty file. - -Default value: `0`. - -## engine_url_skip_empty_files {#engine_url_skip_empty_files} - -Enables or disables skipping empty files in [URL](../../engines/table-engines/special/url.md) engine tables. - -Possible values: -- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. -- 1 — `SELECT` returns empty result for empty file. - -Default value: `0`. - -## enable_url_encoding {#enable_url_encoding} - -Allows to enable/disable decoding/encoding path in uri in [URL](../../engines/table-engines/special/url.md) engine tables. - -Enabled by default. - -## database_atomic_wait_for_drop_and_detach_synchronously {#database_atomic_wait_for_drop_and_detach_synchronously} - -Adds a modifier `SYNC` to all `DROP` and `DETACH` queries. +Enables [ORDER BY](../../sql-reference/statements/select/order-by.md/#optimize_read_in_order) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for reading data from [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. Possible values: -- 0 — Queries will be executed with delay. -- 1 — Queries will be executed without delay. +- 0 — `ORDER BY` optimization is disabled. +- 1 — `ORDER BY` optimization is enabled. -Default value: `0`. +**See Also** -## show_table_uuid_in_table_create_query_if_not_nil {#show_table_uuid_in_table_create_query_if_not_nil} +- [ORDER BY Clause](../../sql-reference/statements/select/order-by.md/#optimize_read_in_order) -Sets the `SHOW TABLE` query display. +## optimize_read_in_window_order {#optimize_read_in_window_order} -Possible values: +Type: Bool -- 0 — The query will be displayed without table UUID. -- 1 — The query will be displayed with table UUID. +Default value: 1 -Default value: `0`. +Enable ORDER BY optimization in window clause for reading data in corresponding order in MergeTree tables. -## allow_experimental_live_view {#allow-experimental-live-view} +## optimize_redundant_functions_in_order_by {#optimize_redundant_functions_in_order_by} -Allows creation of a deprecated LIVE VIEW. 
+Type: Bool -Possible values: +Default value: 1 -- 0 — Working with live views is disabled. -- 1 — Working with live views is enabled. +Remove functions from ORDER BY if its argument is also in ORDER BY -Default value: `0`. +## optimize_respect_aliases {#optimize_respect_aliases} -## live_view_heartbeat_interval {#live-view-heartbeat-interval} +Type: Bool -Deprecated. +Default value: 1 -## max_live_view_insert_blocks_before_refresh {#max-live-view-insert-blocks-before-refresh} +If it is set to true, it will respect aliases in WHERE/GROUP BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count -Deprecated. +## optimize_rewrite_aggregate_function_with_if {#optimize_rewrite_aggregate_function_with_if} -## periodic_live_view_refresh {#periodic-live-view-refresh} +Type: Bool -Deprecated. +Default value: 1 -## http_connection_timeout {#http_connection_timeout} - -HTTP connection timeout (in seconds). - -Possible values: - -- Any positive integer. -- 0 - Disabled (infinite timeout). - -Default value: 1. - -## http_send_timeout {#http_send_timeout} - -HTTP send timeout (in seconds). - -Possible values: - -- Any positive integer. -- 0 - Disabled (infinite timeout). - -Default value: 30. +Rewrite aggregate functions with if expression as argument when logically equivalent. +For example, `avg(if(cond, col, null))` can be rewritten to `avgOrNullIf(cond, col)`. It may improve performance. :::note -It's applicable only to the default profile. A server reboot is required for the changes to take effect. +Supported only with experimental analyzer (`enable_analyzer = 1`). ::: -## http_receive_timeout {#http_receive_timeout} +## optimize_rewrite_array_exists_to_has {#optimize_rewrite_array_exists_to_has} -HTTP receive timeout (in seconds). +Type: Bool + +Default value: 0 + +Rewrite arrayExists() functions to has() when logically equivalent. For example, arrayExists(x -> x = 1, arr) can be rewritten to has(arr, 1) + +## optimize_rewrite_sum_if_to_count_if {#optimize_rewrite_sum_if_to_count_if} + +Type: Bool + +Default value: 1 + +Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent + +## optimize_skip_merged_partitions {#optimize_skip_merged_partitions} + +Type: Bool + +Default value: 0 + +Enables or disables optimization for [OPTIMIZE TABLE ... FINAL](../../sql-reference/statements/optimize.md) query if there is only one part with level > 0 and it doesn't have expired TTL. + +- `OPTIMIZE TABLE ... FINAL SETTINGS optimize_skip_merged_partitions=1` + +By default, `OPTIMIZE TABLE ... FINAL` query rewrites the one part even if there is only a single part. Possible values: -- Any positive integer. -- 0 - Disabled (infinite timeout). +- 1 - Enable optimization. +- 0 - Disable optimization. -Default value: 30. +## optimize_skip_unused_shards {#optimize_skip_unused_shards} -## check_query_single_value_result {#check_query_single_value_result} +Type: Bool -Defines the level of detail for the [CHECK TABLE](../../sql-reference/statements/check-table.md/#checking-mergetree-tables) query result for `MergeTree` family engines . +Default value: 0 + +Enables or disables skipping of unused shards for [SELECT](../../sql-reference/statements/select/index.md) queries that have sharding key condition in `WHERE/PREWHERE` (assuming that the data is distributed by sharding key, otherwise a query yields incorrect result). 
Possible values: -- 0 — the query shows a check status for every individual data part of a table. -- 1 — the query shows the general table check status. +- 0 — Disabled. +- 1 — Enabled. -Default value: `0`. +## optimize_skip_unused_shards_limit {#optimize_skip_unused_shards_limit} -## prefer_column_name_to_alias {#prefer-column-name-to-alias} +Type: UInt64 + +Default value: 1000 + +Limit for number of sharding key values, turns off `optimize_skip_unused_shards` if the limit is reached. + +Too many values may require significant amount for processing, while the benefit is doubtful, since if you have huge number of values in `IN (...)`, then most likely the query will be sent to all shards anyway. + +## optimize_skip_unused_shards_nesting {#optimize_skip_unused_shards_nesting} + +Type: UInt64 + +Default value: 0 + +Controls [`optimize_skip_unused_shards`](#optimize-skip-unused-shards) (hence still requires [`optimize_skip_unused_shards`](#optimize-skip-unused-shards)) depends on the nesting level of the distributed query (case when you have `Distributed` table that look into another `Distributed` table). + +Possible values: + +- 0 — Disabled, `optimize_skip_unused_shards` works always. +- 1 — Enables `optimize_skip_unused_shards` only for the first level. +- 2 — Enables `optimize_skip_unused_shards` up to the second level. + +## optimize_skip_unused_shards_rewrite_in {#optimize_skip_unused_shards_rewrite_in} + +Type: Bool + +Default value: 1 + +Rewrite IN in query for remote shards to exclude values that does not belong to the shard (requires optimize_skip_unused_shards). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +## optimize_sorting_by_input_stream_properties {#optimize_sorting_by_input_stream_properties} + +Type: Bool + +Default value: 1 + +Optimize sorting by sorting properties of input stream + +## optimize_substitute_columns {#optimize_substitute_columns} + +Type: Bool + +Default value: 0 + +Use [constraints](../../sql-reference/statements/create/table.md#constraints) for column substitution. The default is `false`. + +Possible values: + +- true, false + +## optimize_syntax_fuse_functions {#optimize_syntax_fuse_functions} + +Type: Bool + +Default value: 0 + +Enables to fuse aggregate functions with identical argument. It rewrites query contains at least two aggregate functions from [sum](../../sql-reference/aggregate-functions/reference/sum.md/#agg_function-sum), [count](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) or [avg](../../sql-reference/aggregate-functions/reference/avg.md/#agg_function-avg) with identical argument to [sumCount](../../sql-reference/aggregate-functions/reference/sumcount.md/#agg_function-sumCount). + +Possible values: + +- 0 — Functions with identical argument are not fused. +- 1 — Functions with identical argument are fused. + +**Example** + +Query: + +``` sql +CREATE TABLE fuse_tbl(a Int8, b Int8) Engine = Log; +SET optimize_syntax_fuse_functions = 1; +EXPLAIN SYNTAX SELECT sum(a), sum(b), count(b), avg(b) from fuse_tbl FORMAT TSV; +``` + +Result: + +``` text +SELECT + sum(a), + sumCount(b).1, + sumCount(b).2, + (sumCount(b).1) / (sumCount(b).2) +FROM fuse_tbl +``` + +## optimize_throw_if_noop {#optimize_throw_if_noop} + +Type: Bool + +Default value: 0 + +Enables or disables throwing an exception if an [OPTIMIZE](../../sql-reference/statements/optimize.md) query didn’t perform a merge. + +By default, `OPTIMIZE` returns successfully even if it didn’t do anything. 
This setting lets you differentiate these situations and get the reason in an exception message. + +Possible values: + +- 1 — Throwing an exception is enabled. +- 0 — Throwing an exception is disabled. + +## optimize_time_filter_with_preimage {#optimize_time_filter_with_preimage} + +Type: Bool + +Default value: 1 + +Optimize Date and DateTime predicates by converting functions into equivalent comparisons without conversions (e.g. toYear(col) = 2023 -> col >= '2023-01-01' AND col <= '2023-12-31') + +## optimize_trivial_approximate_count_query {#optimize_trivial_approximate_count_query} + +Type: Bool + +Default value: 0 + +Use an approximate value for trivial count optimization of storages that support such estimation, for example, EmbeddedRocksDB. + +Possible values: + + - 0 — Optimization disabled. + - 1 — Optimization enabled. + +## optimize_trivial_count_query {#optimize_trivial_count_query} + +Type: Bool + +Default value: 1 + +Enables or disables the optimization to trivial query `SELECT count() FROM table` using metadata from MergeTree. If you need to use row-level security, disable this setting. + +Possible values: + + - 0 — Optimization disabled. + - 1 — Optimization enabled. + +See also: + +- [optimize_functions_to_subcolumns](#optimize-functions-to-subcolumns) + +## optimize_trivial_insert_select {#optimize_trivial_insert_select} + +Type: Bool + +Default value: 0 + +Optimize trivial 'INSERT INTO table SELECT ... FROM TABLES' query + +## optimize_uniq_to_count {#optimize_uniq_to_count} + +Type: Bool + +Default value: 1 + +Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause. + +## optimize_use_implicit_projections {#optimize_use_implicit_projections} + +Type: Bool + +Default value: 1 + +Automatically choose implicit projections to perform SELECT query + +## optimize_use_projections {#optimize_use_projections} + +Type: Bool + +Default value: 1 + +Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md/#projections) optimization when processing `SELECT` queries. + +Possible values: + +- 0 — Projection optimization disabled. +- 1 — Projection optimization enabled. + +## optimize_using_constraints {#optimize_using_constraints} + +Type: Bool + +Default value: 0 + +Use [constraints](../../sql-reference/statements/create/table.md#constraints) for query optimization. The default is `false`. + +Possible values: + +- true, false + +## os_thread_priority {#os_thread_priority} + +Type: Int64 + +Default value: 0 + +Sets the priority ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) for threads that execute queries. The OS scheduler considers this priority when choosing the next thread to run on each available CPU core. + +:::note +To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments do not allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start. +::: + +Possible values: + +- You can set values in the range `[-20, 19]`. + +Lower values mean higher priority. Threads with low `nice` priority values are executed more frequently than threads with high values. High values are preferable for long-running non-interactive queries because it allows them to quickly give up resources in favour of short interactive queries when they arrive. 
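+
+As a rough illustration of the setting above (the query is only a stand-in for any long, non-interactive workload), a background job could lower its own priority like this:
+
+```sql
+-- Hypothetical example: give a long-running aggregation a low (nice = 10) priority
+-- so that short interactive queries scheduled on the same cores run first.
+SELECT count()
+FROM system.numbers_mt
+LIMIT 1000000000
+SETTINGS os_thread_priority = 10;
+```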
+ +## output_format_compression_level {#output_format_compression_level} + +Type: UInt64 + +Default value: 3 + +Default compression level if query output is compressed. The setting is applied when `SELECT` query has `INTO OUTFILE` or when writing to table functions `file`, `url`, `hdfs`, `s3`, or `azureBlobStorage`. + +Possible values: from `1` to `22` + +## output_format_compression_zstd_window_log {#output_format_compression_zstd_window_log} + +Type: UInt64 + +Default value: 0 + +Can be used when the output compression method is `zstd`. If greater than `0`, this setting explicitly sets compression window size (power of `2`) and enables a long-range mode for zstd compression. This can help to achieve a better compression ratio. + +Possible values: non-negative numbers. Note that if the value is too small or too big, `zstdlib` will throw an exception. Typical values are from `20` (window size = `1MB`) to `30` (window size = `1GB`). + +## output_format_parallel_formatting {#output_format_parallel_formatting} + +Type: Bool + +Default value: 1 + +Enables or disables parallel formatting of data formats. Supported only for [TSV](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [CSV](../../interfaces/formats.md/#csv) and [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) formats. + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. + +## page_cache_inject_eviction {#page_cache_inject_eviction} + +Type: Bool + +Default value: 0 + +Userspace page cache will sometimes invalidate some pages at random. Intended for testing. + +## parallel_distributed_insert_select {#parallel_distributed_insert_select} + +Type: UInt64 + +Default value: 0 + +Enables parallel distributed `INSERT ... SELECT` query. + +If we execute `INSERT INTO distributed_table_a SELECT ... FROM distributed_table_b` queries and both tables use the same cluster, and both tables are either [replicated](../../engines/table-engines/mergetree-family/replication.md) or non-replicated, then this query is processed locally on every shard. + +Possible values: + +- 0 — Disabled. +- 1 — `SELECT` will be executed on each shard from the underlying table of the distributed engine. +- 2 — `SELECT` and `INSERT` will be executed on each shard from/to the underlying table of the distributed engine. + +## parallel_replica_offset {#parallel_replica_offset} + +Type: UInt64 + +Default value: 0 + +This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the index of the replica participating in query processing among parallel replicas. + +## parallel_replicas_allow_in_with_subquery {#parallel_replicas_allow_in_with_subquery} + +Type: Bool + +Default value: 1 + +If true, subquery for IN will be executed on every follower replica. + +## parallel_replicas_count {#parallel_replicas_count} + +Type: UInt64 + +Default value: 0 + +This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the number of parallel replicas participating in query processing. + +## parallel_replicas_custom_key {#parallel_replicas_custom_key} + +Type: String + +Default value: + +An arbitrary integer expression that can be used to split work between replicas for a specific table. +The value can be any integer expression. 
+
+Simple expressions using primary keys are preferred.
+
+If the setting is used on a cluster that consists of a single shard with multiple replicas, those replicas will be converted into virtual shards.
+Otherwise, it will behave the same as for the `SAMPLE` key: it will use multiple replicas of each shard.
+
+## parallel_replicas_custom_key_range_lower {#parallel_replicas_custom_key_range_lower}
+
+Type: UInt64
+
+Default value: 0
+
+Allows the filter type `range` to split the work evenly between replicas based on the custom range `[parallel_replicas_custom_key_range_lower, INT_MAX]`.
+
+When used in conjunction with [parallel_replicas_custom_key_range_upper](#parallel_replicas_custom_key_range_upper), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`.
+
+Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing.
+
+## parallel_replicas_custom_key_range_upper {#parallel_replicas_custom_key_range_upper}
+
+Type: UInt64
+
+Default value: 0
+
+Allows the filter type `range` to split the work evenly between replicas based on the custom range `[0, parallel_replicas_custom_key_range_upper]`. A value of 0 disables the upper bound, setting it to the maximum value of the custom key expression.
+
+When used in conjunction with [parallel_replicas_custom_key_range_lower](#parallel_replicas_custom_key_range_lower), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`.
+
+Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing.
+
+## parallel_replicas_for_non_replicated_merge_tree {#parallel_replicas_for_non_replicated_merge_tree}
+
+Type: Bool
+
+Default value: 0
+
+If true, ClickHouse will also use the parallel replicas algorithm for non-replicated MergeTree tables.
+
+## parallel_replicas_local_plan {#parallel_replicas_local_plan}
+
+Type: Bool
+
+Default value: 0
+
+Build a local plan for the local replica.
+
+## parallel_replicas_mark_segment_size {#parallel_replicas_mark_segment_size}
+
+Type: UInt64
+
+Default value: 0
+
+Parts are virtually divided into segments to be distributed between replicas for parallel reading. This setting controls the size of these segments. Not recommended to change until you're absolutely sure of what you're doing. The value should be in the range [128; 16384].
+
+## parallel_replicas_min_number_of_rows_per_replica {#parallel_replicas_min_number_of_rows_per_replica}
+
+Type: UInt64
+
+Default value: 0
+
+Limits the number of replicas used in a query to (estimated rows to read / min_number_of_rows_per_replica). The maximum is still limited by 'max_parallel_replicas'.
+
+## parallel_replicas_mode {#parallel_replicas_mode}
+
+Type: ParallelReplicasMode
+
+Default value: read_tasks
+
+Type of filter to use with a custom key for parallel replicas. default - use a modulo operation on the custom key, range - use a range filter on the custom key using all possible values for the value type of the custom key.
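+
+A minimal sketch of how the custom-key settings above could be combined (the table `events` and its integer column `user_id` are assumptions made for illustration; they are not part of this document):
+
+```sql
+-- Hypothetical: on a single-shard cluster with several replicas, split the scan
+-- of `events` between up to 3 replicas by ranges of the integer key `user_id`.
+SELECT count()
+FROM events
+SETTINGS
+    max_parallel_replicas = 3,
+    parallel_replicas_custom_key = 'user_id',
+    parallel_replicas_custom_key_range_lower = 0,
+    parallel_replicas_custom_key_range_upper = 1000000;
+```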
+
+## parallel_replicas_prefer_local_join {#parallel_replicas_prefer_local_join}
+
+Type: Bool
+
+Default value: 1
+
+If true, and the JOIN can be executed with the parallel replicas algorithm, and all storages of the right JOIN part are *MergeTree, a local JOIN will be used instead of GLOBAL JOIN.
+
+## parallel_replicas_single_task_marks_count_multiplier {#parallel_replicas_single_task_marks_count_multiplier}
+
+Type: Float
+
+Default value: 2
+
+A multiplier applied when calculating the minimal number of marks to retrieve from the coordinator. This will be applied only for remote replicas.
+
+## parallel_view_processing {#parallel_view_processing}
+
+Type: Bool
+
+Default value: 0
+
+Enables pushing to attached views concurrently instead of sequentially.
+
+## parallelize_output_from_storages {#parallelize_output_from_storages}
+
+Type: Bool
+
+Default value: 1
+
+Parallelize output for the reading step from storage. It allows parallelization of query processing right after reading from storage, if possible.
+
+## parsedatetime_parse_without_leading_zeros {#parsedatetime_parse_without_leading_zeros}
+
+Type: Bool
+
+Default value: 1
+
+Formatters '%c', '%l' and '%k' in function 'parseDateTime()' parse months and hours without leading zeros.
+
+## partial_merge_join_left_table_buffer_bytes {#partial_merge_join_left_table_buffer_bytes}
+
+Type: UInt64
+
+Default value: 0
+
+If not 0, groups left table blocks into bigger ones for the left-side table in a partial merge join. It uses up to 2x of the specified memory per joining thread.
+
+## partial_merge_join_rows_in_right_blocks {#partial_merge_join_rows_in_right_blocks}
+
+Type: UInt64
+
+Default value: 65536
+
+Limits the sizes of right-hand join data blocks in the partial merge join algorithm for [JOIN](../../sql-reference/statements/select/join.md) queries.
+
+ClickHouse server:
+
+1. Splits right-hand join data into blocks with up to the specified number of rows.
+2. Indexes each block with its minimum and maximum values.
+3. Unloads prepared blocks to disk if possible.
+
+Possible values:
+
+- Any positive integer. Recommended range of values: \[1000, 100000\].
+
+## partial_result_on_first_cancel {#partial_result_on_first_cancel}
+
+Type: Bool
+
+Default value: 0
+
+Allows a query to return a partial result after cancellation.
+
+## parts_to_delay_insert {#parts_to_delay_insert}
+
+Type: UInt64
+
+Default value: 0
+
+If the destination table contains at least that many active parts in a single partition, artificially slows down inserts into the table.
+
+## parts_to_throw_insert {#parts_to_throw_insert}
+
+Type: UInt64
+
+Default value: 0
+
+If a single partition of the destination table contains more than this number of active parts, the 'Too many parts ...' exception is thrown.
+
+## periodic_live_view_refresh {#periodic_live_view_refresh}
+
+Type: Seconds
+
+Default value: 60
+
+Interval after which a periodically refreshed live view is forced to refresh.
+
+## poll_interval {#poll_interval}
+
+Type: UInt64
+
+Default value: 10
+
+Blocks at the query wait loop on the server for the specified number of seconds.
+
+## postgresql_connection_attempt_timeout {#postgresql_connection_attempt_timeout}
+
+Type: UInt64
+
+Default value: 2
+
+Connection timeout in seconds for a single attempt to connect to a PostgreSQL endpoint.
+The value is passed as a `connect_timeout` parameter of the connection URL.
+
+## postgresql_connection_pool_auto_close_connection {#postgresql_connection_pool_auto_close_connection}
+
+Type: Bool
+
+Default value: 0
+
+Closes the connection before returning it to the pool.
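+
+For instance, the PostgreSQL-related settings above can be applied per query when reading through the `postgresql` table function (the host, database, and credentials below are placeholders):
+
+```sql
+-- Hypothetical sketch: fail a connection attempt after 5 seconds and close the
+-- connection instead of returning it to the pool.
+SELECT count()
+FROM postgresql('pg-host:5432', 'db', 'table', 'user', 'password')
+SETTINGS
+    postgresql_connection_attempt_timeout = 5,
+    postgresql_connection_pool_auto_close_connection = 1;
+```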
+ +## postgresql_connection_pool_retries {#postgresql_connection_pool_retries} + +Type: UInt64 + +Default value: 2 + +Connection pool push/pop retries number for PostgreSQL table engine and database engine. + +## postgresql_connection_pool_size {#postgresql_connection_pool_size} + +Type: UInt64 + +Default value: 16 + +Connection pool size for PostgreSQL table engine and database engine. + +## postgresql_connection_pool_wait_timeout {#postgresql_connection_pool_wait_timeout} + +Type: UInt64 + +Default value: 5000 + +Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on empty pool. + +## prefer_column_name_to_alias {#prefer_column_name_to_alias} + +Type: Bool + +Default value: 0 Enables or disables using the original column names instead of aliases in query expressions and clauses. It especially matters when alias is the same as the column name, see [Expression Aliases](../../sql-reference/syntax.md/#notes-on-usage). Enable this setting to make aliases syntax rules in ClickHouse more compatible with most other database engines. @@ -3951,8 +7495,6 @@ Possible values: - 0 — The column name is substituted with the alias. - 1 — The column name is not substituted with the alias. -Default value: `0`. - **Example** The difference between enabled and disabled: @@ -3986,250 +7528,88 @@ Result: └────────┴─────────────┘ ``` -## limit {#limit} +## prefer_external_sort_block_bytes {#prefer_external_sort_block_bytes} -Sets the maximum number of rows to get from the query result. It adjusts the value set by the [LIMIT](../../sql-reference/statements/select/limit.md/#limit-clause) clause, so that the limit, specified in the query, cannot exceed the limit, set by this setting. +Type: UInt64 + +Default value: 16744704 + +Prefer maximum block bytes for external sort, reduce the memory usage during merging. + +## prefer_global_in_and_join {#prefer_global_in_and_join} + +Type: Bool + +Default value: 0 + +Enables the replacement of `IN`/`JOIN` operators with `GLOBAL IN`/`GLOBAL JOIN`. Possible values: -- 0 — The number of rows is not limited. -- Positive integer. +- 0 — Disabled. `IN`/`JOIN` operators are not replaced with `GLOBAL IN`/`GLOBAL JOIN`. +- 1 — Enabled. `IN`/`JOIN` operators are replaced with `GLOBAL IN`/`GLOBAL JOIN`. -Default value: `0`. +**Usage** -## offset {#offset} +Although `SET distributed_product_mode=global` can change the queries behavior for the distributed tables, it's not suitable for local tables or tables from external resources. Here is when the `prefer_global_in_and_join` setting comes into play. -Sets the number of rows to skip before starting to return rows from the query. It adjusts the offset set by the [OFFSET](../../sql-reference/statements/select/offset.md/#offset-fetch) clause, so that these two values are summarized. +For example, we have query serving nodes that contain local tables, which are not suitable for distribution. We need to scatter their data on the fly during distributed processing with the `GLOBAL` keyword — `GLOBAL IN`/`GLOBAL JOIN`. + +Another use case of `prefer_global_in_and_join` is accessing tables created by external engines. This setting helps to reduce the number of calls to external sources while joining such tables: only one call per query. 
+ +**See also:** + +- [Distributed subqueries](../../sql-reference/operators/in.md/#select-distributed-subqueries) for more information on how to use `GLOBAL IN`/`GLOBAL JOIN` + +## prefer_localhost_replica {#prefer_localhost_replica} + +Type: Bool + +Default value: 1 + +Enables/disables preferable using the localhost replica when processing distributed queries. Possible values: -- 0 — No rows are skipped . -- Positive integer. - -Default value: `0`. - -**Example** - -Input table: - -``` sql -CREATE TABLE test (i UInt64) ENGINE = MergeTree() ORDER BY i; -INSERT INTO test SELECT number FROM numbers(500); -``` - -Query: - -``` sql -SET limit = 5; -SET offset = 7; -SELECT * FROM test LIMIT 10 OFFSET 100; -``` -Result: - -``` text -┌───i─┐ -│ 107 │ -│ 108 │ -│ 109 │ -└─────┘ -``` - -## optimize_syntax_fuse_functions {#optimize_syntax_fuse_functions} - -Enables to fuse aggregate functions with identical argument. It rewrites query contains at least two aggregate functions from [sum](../../sql-reference/aggregate-functions/reference/sum.md/#agg_function-sum), [count](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) or [avg](../../sql-reference/aggregate-functions/reference/avg.md/#agg_function-avg) with identical argument to [sumCount](../../sql-reference/aggregate-functions/reference/sumcount.md/#agg_function-sumCount). - -Possible values: - -- 0 — Functions with identical argument are not fused. -- 1 — Functions with identical argument are fused. - -Default value: `0`. - -**Example** - -Query: - -``` sql -CREATE TABLE fuse_tbl(a Int8, b Int8) Engine = Log; -SET optimize_syntax_fuse_functions = 1; -EXPLAIN SYNTAX SELECT sum(a), sum(b), count(b), avg(b) from fuse_tbl FORMAT TSV; -``` - -Result: - -``` text -SELECT - sum(a), - sumCount(b).1, - sumCount(b).2, - (sumCount(b).1) / (sumCount(b).2) -FROM fuse_tbl -``` - -## optimize_rewrite_aggregate_function_with_if - -Rewrite aggregate functions with if expression as argument when logically equivalent. -For example, `avg(if(cond, col, null))` can be rewritten to `avgOrNullIf(cond, col)`. It may improve performance. +- 1 — ClickHouse always sends a query to the localhost replica if it exists. +- 0 — ClickHouse uses the balancing strategy specified by the [load_balancing](#load_balancing) setting. :::note -Supported only with experimental analyzer (`enable_analyzer = 1`). +Disable this setting if you use [max_parallel_replicas](#max_parallel_replicas) without [parallel_replicas_custom_key](#parallel_replicas_custom_key). +If [parallel_replicas_custom_key](#parallel_replicas_custom_key) is set, disable this setting only if it's used on a cluster with multiple shards containing multiple replicas. +If it's used on a cluster with a single shard and multiple replicas, disabling this setting will have negative effects. ::: -## database_replicated_initial_query_timeout_sec {#database_replicated_initial_query_timeout_sec} +## prefer_warmed_unmerged_parts_seconds {#prefer_warmed_unmerged_parts_seconds} -Sets how long initial DDL query should wait for Replicated database to process previous DDL queue entries in seconds. +Type: Int64 -Possible values: +Default value: 0 -- Positive integer. -- 0 — Unlimited. +Only available in ClickHouse Cloud. If a merged part is less than this many seconds old and is not pre-warmed (see cache_populated_by_fetch), but all its source parts are available and pre-warmed, SELECT queries will read from those parts instead. Only for ReplicatedMergeTree. 
Note that this only checks whether CacheWarmer processed the part; if the part was fetched into cache by something else, it'll still be considered cold until CacheWarmer gets to it; if it was warmed, then evicted from cache, it'll still be considered warm. -Default value: `300`. +## preferred_block_size_bytes {#preferred_block_size_bytes} -## distributed_ddl_task_timeout {#distributed_ddl_task_timeout} +Type: UInt64 -Sets timeout for DDL query responses from all hosts in cluster. If a DDL request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. +Default value: 1000000 -Possible values: +This setting adjusts the data block size for query processing and represents additional fine-tuning to the more rough 'max_block_size' setting. If the columns are large and with 'max_block_size' rows the block size is likely to be larger than the specified amount of bytes, its size will be lowered for better CPU cache locality. -- Positive integer. -- 0 — Async mode. -- Negative integer — infinite timeout. +## preferred_max_column_in_block_size_bytes {#preferred_max_column_in_block_size_bytes} -Default value: `180`. +Type: UInt64 -## distributed_ddl_output_mode {#distributed_ddl_output_mode} +Default value: 0 -Sets format of distributed DDL query result. - -Possible values: - -- `throw` — Returns result set with query execution status for all hosts where query is finished. If query has failed on some hosts, then it will rethrow the first exception. If query is not finished yet on some hosts and [distributed_ddl_task_timeout](#distributed_ddl_task_timeout) exceeded, then it throws `TIMEOUT_EXCEEDED` exception. -- `none` — Is similar to throw, but distributed DDL query returns no result set. -- `null_status_on_timeout` — Returns `NULL` as execution status in some rows of result set instead of throwing `TIMEOUT_EXCEEDED` if query is not finished on the corresponding hosts. -- `never_throw` — Do not throw `TIMEOUT_EXCEEDED` and do not rethrow exceptions if query has failed on some hosts. -- `none_only_active` - similar to `none`, but doesn't wait for inactive replicas of the `Replicated` database. Note: with this mode it's impossible to figure out that the query was not executed on some replica and will be executed in background. -- `null_status_on_timeout_only_active` — similar to `null_status_on_timeout`, but doesn't wait for inactive replicas of the `Replicated` database -- `throw_only_active` — similar to `throw`, but doesn't wait for inactive replicas of the `Replicated` database - -Default value: `throw`. - -Cloud default value: `none`. - -## flatten_nested {#flatten-nested} - -Sets the data format of a [nested](../../sql-reference/data-types/nested-data-structures/index.md) columns. - -Possible values: - -- 1 — Nested column is flattened to separate arrays. -- 0 — Nested column stays a single array of tuples. - -Default value: `1`. - -**Usage** - -If the setting is set to `0`, it is possible to use an arbitrary level of nesting. 
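+
+A small sketch of how this could look in practice (the table `hits` and the projection name `projection_by_hour` are assumptions made for illustration):
+
+```sql
+-- Hypothetical: nudge the planner towards a specific pre-aggregated projection.
+SELECT toStartOfHour(event_time) AS hour, count()
+FROM hits
+GROUP BY hour
+SETTINGS preferred_optimize_projection_name = 'projection_by_hour';
+```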
- -**Examples** - -Query: - -``` sql -SET flatten_nested = 1; -CREATE TABLE t_nest (`n` Nested(a UInt32, b UInt32)) ENGINE = MergeTree ORDER BY tuple(); - -SHOW CREATE TABLE t_nest; -``` - -Result: - -``` text -┌─statement───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ CREATE TABLE default.t_nest -( - `n.a` Array(UInt32), - `n.b` Array(UInt32) -) -ENGINE = MergeTree -ORDER BY tuple() -SETTINGS index_granularity = 8192 │ -└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -Query: - -``` sql -SET flatten_nested = 0; - -CREATE TABLE t_nest (`n` Nested(a UInt32, b UInt32)) ENGINE = MergeTree ORDER BY tuple(); - -SHOW CREATE TABLE t_nest; -``` - -Result: - -``` text -┌─statement──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ CREATE TABLE default.t_nest -( - `n` Nested(a UInt32, b UInt32) -) -ENGINE = MergeTree -ORDER BY tuple() -SETTINGS index_granularity = 8192 │ -└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -## external_table_functions_use_nulls {#external-table-functions-use-nulls} - -Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md) table functions use Nullable columns. - -Possible values: - -- 0 — The table function explicitly uses Nullable columns. -- 1 — The table function implicitly uses Nullable columns. - -Default value: `1`. - -**Usage** - -If the setting is set to `0`, the table function does not make Nullable columns and inserts default values instead of NULL. This is also applicable for NULL values inside arrays. - -## optimize_use_projections {#optimize_use_projections} - -Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md/#projections) optimization when processing `SELECT` queries. - -Possible values: - -- 0 — Projection optimization disabled. -- 1 — Projection optimization enabled. - -Default value: `1`. - -## force_optimize_projection {#force-optimize-projection} - -Enables or disables the obligatory use of [projections](../../engines/table-engines/mergetree-family/mergetree.md/#projections) in `SELECT` queries, when projection optimization is enabled (see [optimize_use_projections](#optimize_use_projections) setting). - -Possible values: - -- 0 — Projection optimization is not obligatory. -- 1 — Projection optimization is obligatory. - -Default value: `0`. - -## force_optimize_projection_name {#force-optimize-projection_name} - -If it is set to a non-empty string, check that this projection is used in the query at least once. - -Possible values: - -- string: name of projection that used in a query - -Default value: `''`. +Limit on max column size in block while reading. Helps to decrease cache misses count. Should be close to L2 cache size. ## preferred_optimize_projection_name {#preferred_optimize_projection_name} +Type: String + +Default value: + If it is set to a non-empty string, ClickHouse will try to apply specified projection in query. @@ -4237,27 +7617,802 @@ Possible values: - string: name of preferred projection -Default value: `''`. 
+## prefetch_buffer_size {#prefetch_buffer_size} -## alter_sync {#alter-sync} +Type: UInt64 -Allows to set up waiting for actions to be executed on replicas by [ALTER](../../sql-reference/statements/alter/index.md), [OPTIMIZE](../../sql-reference/statements/optimize.md) or [TRUNCATE](../../sql-reference/statements/truncate.md) queries. +Default value: 1048576 + +The maximum size of the prefetch buffer to read from the filesystem. + +## print_pretty_type_names {#print_pretty_type_names} + +Type: Bool + +Default value: 1 + +Allows to print deep-nested type names in a pretty way with indents in `DESCRIBE` query and in `toTypeName()` function. + +Example: + +```sql +CREATE TABLE test (a Tuple(b String, c Tuple(d Nullable(UInt64), e Array(UInt32), f Array(Tuple(g String, h Map(String, Array(Tuple(i String, j UInt64))))), k Date), l Nullable(String))) ENGINE=Memory; +DESCRIBE TABLE test FORMAT TSVRaw SETTINGS print_pretty_type_names=1; +``` + +``` +a Tuple( + b String, + c Tuple( + d Nullable(UInt64), + e Array(UInt32), + f Array(Tuple( + g String, + h Map( + String, + Array(Tuple( + i String, + j UInt64 + )) + ) + )), + k Date + ), + l Nullable(String) +) +``` + +## priority {#priority} + +Type: UInt64 + +Default value: 0 + +Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities. + +## query_cache_compress_entries {#query_cache_compress_entries} + +Type: Bool + +Default value: 1 + +Compress entries in the [query cache](../query-cache.md). Lessens the memory consumption of the query cache at the cost of slower inserts into / reads from it. Possible values: -- 0 — Do not wait. -- 1 — Wait for own execution. -- 2 — Wait for everyone. +- 0 - Disabled +- 1 - Enabled -Default value: `1`. +## query_cache_max_entries {#query_cache_max_entries} -Cloud default value: `0`. +Type: UInt64 + +Default value: 0 + +The maximum number of query results the current user may store in the [query cache](../query-cache.md). 0 means unlimited. + +Possible values: + +- Positive integer >= 0. + +## query_cache_max_size_in_bytes {#query_cache_max_size_in_bytes} + +Type: UInt64 + +Default value: 0 + +The maximum amount of memory (in bytes) the current user may allocate in the [query cache](../query-cache.md). 0 means unlimited. + +Possible values: + +- Positive integer >= 0. + +## query_cache_min_query_duration {#query_cache_min_query_duration} + +Type: Milliseconds + +Default value: 0 + +Minimum duration in milliseconds a query needs to run for its result to be stored in the [query cache](../query-cache.md). + +Possible values: + +- Positive integer >= 0. + +## query_cache_min_query_runs {#query_cache_min_query_runs} + +Type: UInt64 + +Default value: 0 + +Minimum number of times a `SELECT` query must run before its result is stored in the [query cache](../query-cache.md). + +Possible values: + +- Positive integer >= 0. + +## query_cache_nondeterministic_function_handling {#query_cache_nondeterministic_function_handling} + +Type: QueryCacheNondeterministicFunctionHandling + +Default value: throw + +Controls how the [query cache](../query-cache.md) handles `SELECT` queries with non-deterministic functions like `rand()` or `now()`. + +Possible values: + +- `'throw'` - Throw an exception and don't cache the query result. +- `'save'` - Cache the query result. +- `'ignore'` - Don't cache the query result and don't throw an exception. 
+ +## query_cache_share_between_users {#query_cache_share_between_users} + +Type: Bool + +Default value: 0 + +If turned on, the result of `SELECT` queries cached in the [query cache](../query-cache.md) can be read by other users. +It is not recommended to enable this setting due to security reasons. + +Possible values: + +- 0 - Disabled +- 1 - Enabled + +## query_cache_squash_partial_results {#query_cache_squash_partial_results} + +Type: Bool + +Default value: 1 + +Squash partial result blocks to blocks of size [max_block_size](#setting-max_block_size). Reduces performance of inserts into the [query cache](../query-cache.md) but improves the compressability of cache entries (see [query_cache_compress-entries](#query-cache-compress-entries)). + +Possible values: + +- 0 - Disabled +- 1 - Enabled + +## query_cache_system_table_handling {#query_cache_system_table_handling} + +Type: QueryCacheSystemTableHandling + +Default value: throw + +Controls how the [query cache](../query-cache.md) handles `SELECT` queries against system tables, i.e. tables in databases `system.*` and `information_schema.*`. + +Possible values: + +- `'throw'` - Throw an exception and don't cache the query result. +- `'save'` - Cache the query result. +- `'ignore'` - Don't cache the query result and don't throw an exception. + +## query_cache_tag {#query_cache_tag} + +Type: String + +Default value: + +A string which acts as a label for [query cache](../query-cache.md) entries. +The same queries with different tags are considered different by the query cache. + +Possible values: + +- Any string + +## query_cache_ttl {#query_cache_ttl} + +Type: Seconds + +Default value: 60 + +After this time in seconds entries in the [query cache](../query-cache.md) become stale. + +Possible values: + +- Positive integer >= 0. + +## query_plan_aggregation_in_order {#query_plan_aggregation_in_order} + +Type: Bool + +Default value: 1 + +Toggles the aggregation in-order query-plan-level optimization. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. :::note -`alter_sync` is applicable to `Replicated` tables only, it does nothing to alters of not `Replicated` tables. +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. ::: -## replication_wait_for_inactive_replica_timeout {#replication-wait-for-inactive-replica-timeout} +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_convert_outer_join_to_inner_join {#query_plan_convert_outer_join_to_inner_join} + +Type: Bool + +Default value: 1 + +Allow to convert OUTER JOIN to INNER JOIN if filter after JOIN always filters default values + +## query_plan_enable_multithreading_after_window_functions {#query_plan_enable_multithreading_after_window_functions} + +Type: Bool + +Default value: 1 + +Enable multithreading after evaluating window functions to allow parallel stream processing + +## query_plan_enable_optimizations {#query_plan_enable_optimizations} + +Type: Bool + +Default value: 1 + +Toggles query optimization at the query plan level. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. 
+::: + +Possible values: + +- 0 - Disable all optimizations at the query plan level +- 1 - Enable optimizations at the query plan level (but individual optimizations may still be disabled via their individual settings) + +## query_plan_execute_functions_after_sorting {#query_plan_execute_functions_after_sorting} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which moves expressions after sorting steps. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_filter_push_down {#query_plan_filter_push_down} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which moves filters down in the execution plan. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_lift_up_array_join {#query_plan_lift_up_array_join} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which moves ARRAY JOINs up in the execution plan. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_lift_up_union {#query_plan_lift_up_union} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which moves larger subtrees of the query plan into union to enable further optimizations. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_max_optimizations_to_apply {#query_plan_max_optimizations_to_apply} + +Type: UInt64 + +Default value: 10000 + +Limits the total number of optimizations applied to query plan, see setting [query_plan_enable_optimizations](#query_plan_enable_optimizations). +Useful to avoid long optimization times for complex queries. +If the actual number of optimizations exceeds this setting, an exception is thrown. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +## query_plan_merge_expressions {#query_plan_merge_expressions} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which merges consecutive filters. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. 
+::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_merge_filters {#query_plan_merge_filters} + +Type: Bool + +Default value: 0 + +Allow to merge filters in the query plan + +## query_plan_optimize_prewhere {#query_plan_optimize_prewhere} + +Type: Bool + +Default value: 1 + +Allow to push down filter to PREWHERE expression for supported storages + +## query_plan_push_down_limit {#query_plan_push_down_limit} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which moves LIMITs down in the execution plan. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_read_in_order {#query_plan_read_in_order} + +Type: Bool + +Default value: 1 + +Toggles the read in-order optimization query-plan-level optimization. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_remove_redundant_distinct {#query_plan_remove_redundant_distinct} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which removes redundant DISTINCT steps. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_remove_redundant_sorting {#query_plan_remove_redundant_sorting} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which removes redundant sorting steps, e.g. in subqueries. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_reuse_storage_ordering_for_window_functions {#query_plan_reuse_storage_ordering_for_window_functions} + +Type: Bool + +Default value: 1 + +Toggles a query-plan-level optimization which uses storage sorting when sorting for window functions. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_plan_split_filter {#query_plan_split_filter} + +Type: Bool + +Default value: 1 + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Toggles a query-plan-level optimization which splits filters into expressions. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. 
+ +Possible values: + +- 0 - Disable +- 1 - Enable + +## query_profiler_cpu_time_period_ns {#query_profiler_cpu_time_period_ns} + +Type: UInt64 + +Default value: 1000000000 + +Sets the period for a CPU clock timer of the [query profiler](../../operations/optimizing-performance/sampling-query-profiler.md). This timer counts only CPU time. + +Possible values: + +- A positive integer number of nanoseconds. + + Recommended values: + + - 10000000 (100 times a second) nanoseconds and more for single queries. + - 1000000000 (once a second) for cluster-wide profiling. + +- 0 for turning off the timer. + +**Temporarily disabled in ClickHouse Cloud.** + +See also: + +- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) + +## query_profiler_real_time_period_ns {#query_profiler_real_time_period_ns} + +Type: UInt64 + +Default value: 1000000000 + +Sets the period for a real clock timer of the [query profiler](../../operations/optimizing-performance/sampling-query-profiler.md). Real clock timer counts wall-clock time. + +Possible values: + +- Positive integer number, in nanoseconds. + + Recommended values: + + - 10000000 (100 times a second) nanoseconds and less for single queries. + - 1000000000 (once a second) for cluster-wide profiling. + +- 0 for turning off the timer. + +**Temporarily disabled in ClickHouse Cloud.** + +See also: + +- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) + +## queue_max_wait_ms {#queue_max_wait_ms} + +Type: Milliseconds + +Default value: 0 + +The wait time in the request queue, if the number of concurrent requests exceeds the maximum. + +## rabbitmq_max_wait_ms {#rabbitmq_max_wait_ms} + +Type: Milliseconds + +Default value: 5000 + +The wait time for reading from RabbitMQ before retry. + +## read_backoff_max_throughput {#read_backoff_max_throughput} + +Type: UInt64 + +Default value: 1048576 + +Settings to reduce the number of threads in case of slow reads. Count events when the read bandwidth is less than that many bytes per second. + +## read_backoff_min_concurrency {#read_backoff_min_concurrency} + +Type: UInt64 + +Default value: 1 + +Settings to try keeping the minimal number of threads in case of slow reads. + +## read_backoff_min_events {#read_backoff_min_events} + +Type: UInt64 + +Default value: 2 + +Settings to reduce the number of threads in case of slow reads. The number of events after which the number of threads will be reduced. + +## read_backoff_min_interval_between_events_ms {#read_backoff_min_interval_between_events_ms} + +Type: Milliseconds + +Default value: 1000 + +Settings to reduce the number of threads in case of slow reads. Do not pay attention to the event, if the previous one has passed less than a certain amount of time. + +## read_backoff_min_latency_ms {#read_backoff_min_latency_ms} + +Type: Milliseconds + +Default value: 1000 + +Setting to reduce the number of threads in case of slow reads. Pay attention only to reads that took at least that much time. + +## read_from_filesystem_cache_if_exists_otherwise_bypass_cache {#read_from_filesystem_cache_if_exists_otherwise_bypass_cache} + +Type: Bool + +Default value: 0 + +Allow to use the filesystem cache in passive mode - benefit from the existing cache entries, but don't put more entries into the cache. If you set this setting for heavy ad-hoc queries and leave it disabled for short real-time queries, this will allows to avoid cache threshing by too heavy queries and to improve the overall system efficiency. 
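+
+**Example**
+
+A minimal sketch for a heavy ad-hoc query; the table `big_table` and the column `event_date` are hypothetical:
+
+```sql
+-- Reuse whatever is already in the filesystem cache, but do not add new entries,
+-- so that short real-time queries keep their cached data.
+SELECT toYYYYMM(event_date) AS month, count()
+FROM big_table
+GROUP BY month
+SETTINGS read_from_filesystem_cache_if_exists_otherwise_bypass_cache = 1;
+```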
+ +## read_from_page_cache_if_exists_otherwise_bypass_cache {#read_from_page_cache_if_exists_otherwise_bypass_cache} + +Type: Bool + +Default value: 0 + +Use userspace page cache in passive mode, similar to read_from_filesystem_cache_if_exists_otherwise_bypass_cache. + +## read_in_order_two_level_merge_threshold {#read_in_order_two_level_merge_threshold} + +Type: UInt64 + +Default value: 100 + +Minimal number of parts to read to run preliminary merge step during multithread reading in order of primary key. + +## read_in_order_use_buffering {#read_in_order_use_buffering} + +Type: Bool + +Default value: 1 + +Use buffering before merging while reading in order of primary key. It increases the parallelism of query execution + +## read_overflow_mode {#read_overflow_mode} + +Type: OverflowMode + +Default value: throw + +What to do when the limit is exceeded. + +## read_overflow_mode_leaf {#read_overflow_mode_leaf} + +Type: OverflowMode + +Default value: throw + +What to do when the leaf limit is exceeded. + +## read_priority {#read_priority} + +Type: Int64 + +Default value: 0 + +Priority to read data from local filesystem or remote filesystem. Only supported for 'pread_threadpool' method for local filesystem and for `threadpool` method for remote filesystem. + +## read_through_distributed_cache {#read_through_distributed_cache} + +Type: Bool + +Default value: 0 + +Only in ClickHouse Cloud. Allow reading from distributed cache + +## readonly {#readonly} + +Type: UInt64 + +Default value: 0 + +0 - no read-only restrictions. 1 - only read requests, as well as changing explicitly allowed settings. 2 - only read requests, as well as changing settings, except for the 'readonly' setting. + +## receive_data_timeout_ms {#receive_data_timeout_ms} + +Type: Milliseconds + +Default value: 2000 + +Connection timeout for receiving first packet of data or packet with positive progress from replica + +## receive_timeout {#receive_timeout} + +Type: Seconds + +Default value: 300 + +Timeout for receiving data from the network, in seconds. If no bytes were received in this interval, the exception is thrown. If you set this setting on the client, the 'send_timeout' for the socket will also be set on the corresponding connection end on the server. + +## regexp_max_matches_per_row {#regexp_max_matches_per_row} + +Type: UInt64 + +Default value: 1000 + +Sets the maximum number of matches for a single regular expression per row. Use it to protect against memory overload when using greedy regular expression in the [extractAllGroupsHorizontal](../../sql-reference/functions/string-search-functions.md/#extractallgroups-horizontal) function. + +Possible values: + +- Positive integer. + +## reject_expensive_hyperscan_regexps {#reject_expensive_hyperscan_regexps} + +Type: Bool + +Default value: 1 + +Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion) + +## remerge_sort_lowered_memory_bytes_ratio {#remerge_sort_lowered_memory_bytes_ratio} + +Type: Float + +Default value: 2 + +If memory usage after remerge does not reduced by this ratio, remerge will be disabled. + +## remote_filesystem_read_method {#remote_filesystem_read_method} + +Type: String + +Default value: threadpool + +Method of reading data from remote filesystem, one of: read, threadpool. + +## remote_filesystem_read_prefetch {#remote_filesystem_read_prefetch} + +Type: Bool + +Default value: 1 + +Should use prefetching when reading data from remote filesystem. 
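+
+**Example**
+
+A hypothetical per-query override that switches to plain reads and disables prefetching, e.g. to compare against the `threadpool` default; the table `s3_table` is assumed to be backed by remote storage:
+
+```sql
+SELECT count()
+FROM s3_table
+SETTINGS remote_filesystem_read_method = 'read',
+         remote_filesystem_read_prefetch = 0;
+```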
+ +## remote_fs_read_backoff_max_tries {#remote_fs_read_backoff_max_tries} + +Type: UInt64 + +Default value: 5 + +Max attempts to read with backoff + +## remote_fs_read_max_backoff_ms {#remote_fs_read_max_backoff_ms} + +Type: UInt64 + +Default value: 10000 + +Max wait time when trying to read data for remote disk + +## remote_read_min_bytes_for_seek {#remote_read_min_bytes_for_seek} + +Type: UInt64 + +Default value: 4194304 + +Min bytes required for remote read (url, s3) to do seek, instead of read with ignore. + +## rename_files_after_processing {#rename_files_after_processing} + +Type: String + +Default value: + +This setting allows you to specify a renaming pattern for files processed by the `file` table function. When the option is set, all files read by the `file` table function are renamed according to the specified pattern with placeholders, but only if file processing was successful. + +### Placeholders + +- `%a` — Full original filename (e.g., "sample.csv"). +- `%f` — Original filename without extension (e.g., "sample"). +- `%e` — Original file extension with dot (e.g., ".csv"). +- `%t` — Timestamp (in microseconds). +- `%%` — Percentage sign ("%"). + +### Example +- Option: `--rename_files_after_processing="processed_%f_%t%e"` + +- Query: `SELECT * FROM file('sample.csv')` + + +If reading `sample.csv` is successful, the file will be renamed to `processed_sample_1683473210851438.csv` + +## replace_running_query {#replace_running_query} + +Type: Bool + +Default value: 0 + +When using the HTTP interface, the ‘query_id’ parameter can be passed. This is any string that serves as the query identifier. +If a query from the same user with the same ‘query_id’ already exists at this time, the behaviour depends on the ‘replace_running_query’ parameter. + +`0` (default) – Throw an exception (do not allow the query to run if a query with the same ‘query_id’ is already running). + +`1` – Cancel the old query and start running the new one. + +Set this parameter to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn’t finished yet, it should be cancelled. + +## replace_running_query_max_wait_ms {#replace_running_query_max_wait_ms} + +Type: Milliseconds + +Default value: 5000 + +The wait time for running the query with the same `query_id` to finish, when the [replace_running_query](#replace_running_query) setting is active. + +Possible values: + +- Positive integer. +- 0 — Throw an exception that does not allow running a new query if the server is already executing a query with the same `query_id`. + +## replication_wait_for_inactive_replica_timeout {#replication_wait_for_inactive_replica_timeout} + +Type: Int64 + +Default value: 120 Specifies how long (in seconds) to wait for inactive replicas to execute [ALTER](../../sql-reference/statements/alter/index.md), [OPTIMIZE](../../sql-reference/statements/optimize.md) or [TRUNCATE](../../sql-reference/statements/truncate.md) queries. @@ -4267,509 +8422,460 @@ Possible values: - Negative integer — Wait for unlimited time. - Positive integer — The number of seconds to wait. -Default value: `120` seconds. +## restore_replace_external_dictionary_source_to_null {#restore_replace_external_dictionary_source_to_null} -## regexp_max_matches_per_row {#regexp-max-matches-per-row} +Type: Bool -Sets the maximum number of matches for a single regular expression per row.
Use it to protect against memory overload when using greedy regular expression in the [extractAllGroupsHorizontal](../../sql-reference/functions/string-search-functions.md/#extractallgroups-horizontal) function. +Default value: 0 + +Replace external dictionary sources to Null on restore. Useful for testing purposes + +## restore_replace_external_engines_to_null {#restore_replace_external_engines_to_null} + +Type: Bool + +Default value: 0 + +For testing purposes. Replaces all external engines to Null to not initiate external connections. + +## restore_replace_external_table_functions_to_null {#restore_replace_external_table_functions_to_null} + +Type: Bool + +Default value: 0 + +For testing purposes. Replaces all external table functions to Null to not initiate external connections. + +## result_overflow_mode {#result_overflow_mode} + +Type: OverflowMode + +Default value: throw + +What to do when the limit is exceeded. + +## rewrite_count_distinct_if_with_count_distinct_implementation {#rewrite_count_distinct_if_with_count_distinct_implementation} + +Type: Bool + +Default value: 0 + +Allows you to rewrite `countDistcintIf` with [count_distinct_implementation](#count_distinct_implementation) setting. Possible values: -- Positive integer. +- true — Allow. +- false — Disallow. -Default value: `1000`. +## s3_allow_parallel_part_upload {#s3_allow_parallel_part_upload} -## http_max_single_read_retries {#http-max-single-read-retries} +Type: Bool -Sets the maximum number of retries during a single HTTP read. +Default value: 1 + +Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage + +## s3_check_objects_after_upload {#s3_check_objects_after_upload} + +Type: Bool + +Default value: 0 + +Check each uploaded object to s3 with head request to be sure that upload was successful + +## s3_connect_timeout_ms {#s3_connect_timeout_ms} + +Type: UInt64 + +Default value: 1000 + +Connection timeout for host from s3 disks. + +## s3_create_new_file_on_insert {#s3_create_new_file_on_insert} + +Type: Bool + +Default value: 0 + +Enables or disables creating a new file on each insert in s3 engine tables. If enabled, on each insert a new S3 object will be created with the key, similar to this pattern: + +initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query creates a new file. -- Positive integer. +## s3_disable_checksum {#s3_disable_checksum} -Default value: `1024`. +Type: Bool -## log_queries_probability {#log-queries-probability} +Default value: 0 -Allows a user to write to [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), and [query_views_log](../../operations/system-tables/query_views_log.md) system tables only a sample of queries selected randomly with the specified probability. It helps to reduce the load with a large volume of queries in a second. +Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth. 
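+
+**Example**
+
+A sketch of `s3_create_new_file_on_insert` (described above); the bucket URL is hypothetical:
+
+```sql
+-- Each INSERT creates a new object (data.csv, data.1.csv, data.2.csv, ...)
+-- instead of failing because the key already exists.
+INSERT INTO FUNCTION s3('https://my-bucket.s3.amazonaws.com/data.csv', 'CSV', 'x UInt64')
+SELECT number AS x FROM numbers(10)
+SETTINGS s3_create_new_file_on_insert = 1;
+```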
+ +## s3_ignore_file_doesnt_exist {#s3_ignore_file_doesnt_exist} + +Type: Bool + +Default value: 0 + +Ignore absence of file if it does not exist when reading certain keys. Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. -- 0 — Queries are not logged in the system tables. -- Positive floating-point number in the range [0..1]. For example, if the setting value is `0.5`, about half of the queries are logged in the system tables. -- 1 — All queries are logged in the system tables. +## s3_list_object_keys_size {#s3_list_object_keys_size} -Default value: `1`. +Type: UInt64 -## short_circuit_function_evaluation {#short-circuit-function-evaluation} +Default value: 1000 -Allows calculating the [if](../../sql-reference/functions/conditional-functions.md/#if), [multiIf](../../sql-reference/functions/conditional-functions.md/#multiif), [and](../../sql-reference/functions/logical-functions.md/#logical-and-function), and [or](../../sql-reference/functions/logical-functions.md/#logical-or-function) functions according to a [short scheme](https://en.wikipedia.org/wiki/Short-circuit_evaluation). This helps optimize the execution of complex expressions in these functions and prevent possible exceptions (such as division by zero when it is not expected). +Maximum number of files that could be returned in batch by ListObject request + +## s3_max_connections {#s3_max_connections} + +Type: UInt64 + +Default value: 1024 + +The maximum number of connections per server. + +## s3_max_get_burst {#s3_max_get_burst} + +Type: UInt64 + +Default value: 0 + +Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_get_rps` + +## s3_max_get_rps {#s3_max_get_rps} + +Type: UInt64 + +Default value: 0 + +Limit on S3 GET request per second rate before throttling. Zero means unlimited. + +## s3_max_inflight_parts_for_one_file {#s3_max_inflight_parts_for_one_file} + +Type: UInt64 + +Default value: 20 + +The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited. + +## s3_max_part_number {#s3_max_part_number} + +Type: UInt64 + +Default value: 10000 + +Maximum part number number for s3 upload part. + +## s3_max_put_burst {#s3_max_put_burst} + +Type: UInt64 + +Default value: 0 + +Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_put_rps` + +## s3_max_put_rps {#s3_max_put_rps} + +Type: UInt64 + +Default value: 0 + +Limit on S3 PUT request per second rate before throttling. Zero means unlimited. + +## s3_max_redirects {#s3_max_redirects} + +Type: UInt64 + +Default value: 10 + +Max number of S3 redirects hops allowed. + +## s3_max_single_operation_copy_size {#s3_max_single_operation_copy_size} + +Type: UInt64 + +Default value: 33554432 + +Maximum size for a single copy operation in s3 + +## s3_max_single_part_upload_size {#s3_max_single_part_upload_size} + +Type: UInt64 + +Default value: 33554432 + +The maximum size of object to upload using singlepart upload to S3. + +## s3_max_single_read_retries {#s3_max_single_read_retries} + +Type: UInt64 + +Default value: 4 + +The maximum number of retries during single S3 read. + +## s3_max_unexpected_write_error_retries {#s3_max_unexpected_write_error_retries} + +Type: UInt64 + +Default value: 4 + +The maximum number of retries in case of unexpected errors during S3 write. 
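+
+**Example**
+
+A hypothetical sketch that throttles the S3 GET requests issued by a single query; the bucket URL is made up:
+
+```sql
+-- Cap this query at roughly 100 GET requests per second, allowing bursts of up to 200.
+SELECT count()
+FROM s3('https://my-bucket.s3.amazonaws.com/data/*.parquet', 'Parquet')
+SETTINGS s3_max_get_rps = 100, s3_max_get_burst = 200;
+```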
+ +## s3_max_upload_part_size {#s3_max_upload_part_size} + +Type: UInt64 + +Default value: 5368709120 + +The maximum size of part to upload during multipart upload to S3. + +## s3_min_upload_part_size {#s3_min_upload_part_size} + +Type: UInt64 + +Default value: 16777216 + +The minimum size of part to upload during multipart upload to S3. + +## s3_request_timeout_ms {#s3_request_timeout_ms} + +Type: UInt64 + +Default value: 30000 + +Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long. + +## s3_retry_attempts {#s3_retry_attempts} + +Type: UInt64 + +Default value: 100 + +Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries + +## s3_skip_empty_files {#s3_skip_empty_files} + +Type: Bool + +Default value: 0 + +Enables or disables skipping empty files in [S3](../../engines/table-engines/integrations/s3.md) engine tables. Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. -- `enable` — Enables short-circuit function evaluation for functions that are suitable for it (can throw an exception or computationally heavy). -- `force_enable` — Enables short-circuit function evaluation for all functions. -- `disable` — Disables short-circuit function evaluation. +## s3_strict_upload_part_size {#s3_strict_upload_part_size} -Default value: `enable`. +Type: UInt64 -## max_hyperscan_regexp_length {#max-hyperscan-regexp-length} +Default value: 0 -Defines the maximum length for each regular expression in the [hyperscan multi-match functions](../../sql-reference/functions/string-search-functions.md/#multimatchanyhaystack-pattern1-pattern2-patternn). +The exact size of part to upload during multipart upload to S3 (some implementations does not supports variable size parts). + +## s3_throw_on_zero_files_match {#s3_throw_on_zero_files_match} + +Type: Bool + +Default value: 0 + +Throw an error, when ListObjects request cannot match any files + +## s3_truncate_on_insert {#s3_truncate_on_insert} + +Type: Bool + +Default value: 0 + +Enables or disables truncate before inserts in s3 engine tables. If disabled, an exception will be thrown on insert attempts if an S3 object already exists. Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query replaces existing content of the file with the new data. -- Positive integer. -- 0 - The length is not limited. +## s3_upload_part_size_multiply_factor {#s3_upload_part_size_multiply_factor} -Default value: `0`. +Type: UInt64 -**Example** +Default value: 2 -Query: +Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3. -```sql -SELECT multiMatchAny('abcd', ['ab','bcd','c','d']) SETTINGS max_hyperscan_regexp_length = 3; -``` +## s3_upload_part_size_multiply_parts_count_threshold {#s3_upload_part_size_multiply_parts_count_threshold} -Result: +Type: UInt64 -```text -┌─multiMatchAny('abcd', ['ab', 'bcd', 'c', 'd'])─┐ -│ 1 │ -└────────────────────────────────────────────────┘ -``` +Default value: 500 -Query: +Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor. 
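+
+**Example**
+
+With the defaults listed above (a 16 MiB minimum part size, a factor of 2 and a threshold of 500), parts 1–500 of a multipart upload are written with 16 MiB each, parts 501–1000 with 32 MiB, and so on. A hypothetical override for a very large object (the bucket URL is made up):
+
+```sql
+-- Start with 64 MiB parts and double the part size after every 100 uploaded parts.
+INSERT INTO FUNCTION s3('https://my-bucket.s3.amazonaws.com/big.parquet', 'Parquet', 'x UInt64')
+SELECT number AS x FROM numbers(1000000000)
+SETTINGS s3_min_upload_part_size = 67108864,
+         s3_upload_part_size_multiply_parts_count_threshold = 100;
+```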
-```sql -SELECT multiMatchAny('abcd', ['ab','bcd','c','d']) SETTINGS max_hyperscan_regexp_length = 2; -``` +## s3_use_adaptive_timeouts {#s3_use_adaptive_timeouts} -Result: +Type: Bool -```text -Exception: Regexp length too large. -``` +Default value: 1 -**See Also** +When set to `true` than for all s3 requests first two attempts are made with low send and receive timeouts. +When set to `false` than all attempts are made with identical timeouts. -- [max_hyperscan_regexp_total_length](#max-hyperscan-regexp-total-length) +## s3_validate_request_settings {#s3_validate_request_settings} -## max_hyperscan_regexp_total_length {#max-hyperscan-regexp-total-length} +Type: Bool -Sets the maximum length total of all regular expressions in each [hyperscan multi-match function](../../sql-reference/functions/string-search-functions.md/#multimatchanyhaystack-pattern1-pattern2-patternn). +Default value: 1 + +Enables s3 request settings validation. Possible values: +- 1 — validate settings. +- 0 — do not validate settings. -- Positive integer. -- 0 - The length is not limited. +## s3queue_default_zookeeper_path {#s3queue_default_zookeeper_path} -Default value: `0`. +Type: String -**Example** +Default value: /clickhouse/s3queue/ -Query: +Default zookeeper path prefix for S3Queue engine -```sql -SELECT multiMatchAny('abcd', ['a','b','c','d']) SETTINGS max_hyperscan_regexp_total_length = 5; -``` +## s3queue_enable_logging_to_s3queue_log {#s3queue_enable_logging_to_s3queue_log} -Result: +Type: Bool -```text -┌─multiMatchAny('abcd', ['a', 'b', 'c', 'd'])─┐ -│ 1 │ -└─────────────────────────────────────────────┘ -``` +Default value: 0 -Query: +Enable writing to system.s3queue_log. The value can be overwritten per table with table settings -```sql -SELECT multiMatchAny('abcd', ['ab','bc','c','d']) SETTINGS max_hyperscan_regexp_total_length = 5; -``` +## schema_inference_cache_require_modification_time_for_url {#schema_inference_cache_require_modification_time_for_url} -Result: +Type: Bool -```text -Exception: Total regexp lengths too large. -``` +Default value: 1 -**See Also** +Use schema from cache for URL with last modification time validation (for URLs with Last-Modified header) -- [max_hyperscan_regexp_length](#max-hyperscan-regexp-length) +## schema_inference_use_cache_for_azure {#schema_inference_use_cache_for_azure} -## enable_positional_arguments {#enable-positional-arguments} +Type: Bool -Enables or disables supporting positional arguments for [GROUP BY](../../sql-reference/statements/select/group-by.md), [LIMIT BY](../../sql-reference/statements/select/limit-by.md), [ORDER BY](../../sql-reference/statements/select/order-by.md) statements. +Default value: 1 -Possible values: +Use cache in schema inference while using azure table function -- 0 — Positional arguments aren't supported. -- 1 — Positional arguments are supported: column numbers can use instead of column names. +## schema_inference_use_cache_for_file {#schema_inference_use_cache_for_file} -Default value: `1`. 
+Type: Bool -**Example** +Default value: 1 -Query: +Use cache in schema inference while using file table function -```sql -CREATE TABLE positional_arguments(one Int, two Int, three Int) ENGINE=Memory(); +## schema_inference_use_cache_for_hdfs {#schema_inference_use_cache_for_hdfs} -INSERT INTO positional_arguments VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20); +Type: Bool -SELECT * FROM positional_arguments ORDER BY 2,3; -``` +Default value: 1 -Result: +Use cache in schema inference while using hdfs table function -```text -┌─one─┬─two─┬─three─┐ -│ 30 │ 10 │ 20 │ -│ 20 │ 20 │ 10 │ -│ 10 │ 20 │ 30 │ -└─────┴─────┴───────┘ -``` +## schema_inference_use_cache_for_s3 {#schema_inference_use_cache_for_s3} -## enable_order_by_all {#enable-order-by-all} +Type: Bool -Enables or disables sorting with `ORDER BY ALL` syntax, see [ORDER BY](../../sql-reference/statements/select/order-by.md). +Default value: 1 -Possible values: +Use cache in schema inference while using s3 table function -- 0 — Disable ORDER BY ALL. -- 1 — Enable ORDER BY ALL. +## schema_inference_use_cache_for_url {#schema_inference_use_cache_for_url} -Default value: `1`. +Type: Bool -**Example** +Default value: 1 -Query: +Use cache in schema inference while using url table function -```sql -CREATE TABLE TAB(C1 Int, C2 Int, ALL Int) ENGINE=Memory(); +## select_sequential_consistency {#select_sequential_consistency} -INSERT INTO TAB VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20); +Type: UInt64 -SELECT * FROM TAB ORDER BY ALL; -- returns an error that ALL is ambiguous +Default value: 0 -SELECT * FROM TAB ORDER BY ALL SETTINGS enable_order_by_all = 0; -``` +:::note +This setting differ in behavior between SharedMergeTree and ReplicatedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information about the behavior of `select_sequential_consistency` in SharedMergeTree. +::: -Result: - -```text -┌─C1─┬─C2─┬─ALL─┐ -│ 20 │ 20 │ 10 │ -│ 30 │ 10 │ 20 │ -│ 10 │ 20 │ 30 │ -└────┴────┴─────┘ -``` - -## splitby_max_substrings_includes_remaining_string {#splitby_max_substrings_includes_remaining_string} - -Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array. - -Possible values: - -- `0` - The remaining string will not be included in the last element of the result array. -- `1` - The remaining string will be included in the last element of the result array. This is the behavior of Spark's [`split()`](https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.split.html) function and Python's ['string.split()'](https://docs.python.org/3/library/stdtypes.html#str.split) method. 
- -Default value: `0` - -## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions} - -Enables or disables returning results of type: -- `Date32` with extended range (compared to type `Date`) for functions [toStartOfYear](../../sql-reference/functions/date-time-functions.md#tostartofyear), [toStartOfISOYear](../../sql-reference/functions/date-time-functions.md#tostartofisoyear), [toStartOfQuarter](../../sql-reference/functions/date-time-functions.md#tostartofquarter), [toStartOfMonth](../../sql-reference/functions/date-time-functions.md#tostartofmonth), [toLastDayOfMonth](../../sql-reference/functions/date-time-functions.md#tolastdayofmonth), [toStartOfWeek](../../sql-reference/functions/date-time-functions.md#tostartofweek), [toLastDayOfWeek](../../sql-reference/functions/date-time-functions.md#tolastdayofweek) and [toMonday](../../sql-reference/functions/date-time-functions.md#tomonday). -- `DateTime64` with extended range (compared to type `DateTime`) for functions [toStartOfDay](../../sql-reference/functions/date-time-functions.md#tostartofday), [toStartOfHour](../../sql-reference/functions/date-time-functions.md#tostartofhour), [toStartOfMinute](../../sql-reference/functions/date-time-functions.md#tostartofminute), [toStartOfFiveMinutes](../../sql-reference/functions/date-time-functions.md#tostartoffiveminutes), [toStartOfTenMinutes](../../sql-reference/functions/date-time-functions.md#tostartoftenminutes), [toStartOfFifteenMinutes](../../sql-reference/functions/date-time-functions.md#tostartoffifteenminutes) and [timeSlot](../../sql-reference/functions/date-time-functions.md#timeslot). - -Possible values: - -- 0 — Functions return `Date` or `DateTime` for all types of arguments. -- 1 — Functions return `Date32` or `DateTime64` for `Date32` or `DateTime64` arguments and `Date` or `DateTime` otherwise. - -Default value: `0`. - - -## function_locate_has_mysql_compatible_argument_order {#function-locate-has-mysql-compatible-argument-order} - -Controls the order of arguments in function [locate](../../sql-reference/functions/string-search-functions.md#locate). - -Possible values: - -- 0 — Function `locate` accepts arguments `(haystack, needle[, start_pos])`. -- 1 — Function `locate` accepts arguments `(needle, haystack, [, start_pos])` (MySQL-compatible behavior) - -Default value: `1`. - -## date_time_overflow_behavior {#date_time_overflow_behavior} - -Defines the behavior when [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md) or integers are converted into Date, Date32, DateTime or DateTime64 but the value cannot be represented in the result type. - -Possible values: - -- `ignore` — Silently ignore overflows. The result is random. -- `throw` — Throw an exception in case of conversion overflow. -- `saturate` — Silently saturate the result. If the value is smaller than the smallest value that can be represented by the target type, the result is chosen as the smallest representable value. If the value is bigger than the largest value that can be represented by the target type, the result is chosen as the largest representable value. - -Default value: `ignore`. 
- -## optimize_move_to_prewhere {#optimize_move_to_prewhere} - -Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries. - -Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. - -Possible values: - -- 0 — Automatic `PREWHERE` optimization is disabled. -- 1 — Automatic `PREWHERE` optimization is enabled. - -Default value: `1`. - -## optimize_move_to_prewhere_if_final {#optimize_move_to_prewhere_if_final} - -Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries with [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. - -Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. - -Possible values: - -- 0 — Automatic `PREWHERE` optimization in `SELECT` queries with `FINAL` modifier is disabled. -- 1 — Automatic `PREWHERE` optimization in `SELECT` queries with `FINAL` modifier is enabled. - -Default value: `0`. - -**See Also** - -- [optimize_move_to_prewhere](#optimize_move_to_prewhere) setting - -## optimize_using_constraints - -Use [constraints](../../sql-reference/statements/create/table.md#constraints) for query optimization. The default is `false`. - -Possible values: - -- true, false - -## optimize_append_index - -Use [constraints](../../sql-reference/statements/create/table.md#constraints) in order to append index condition. The default is `false`. - -Possible values: - -- true, false - -## optimize_substitute_columns - -Use [constraints](../../sql-reference/statements/create/table.md#constraints) for column substitution. The default is `false`. - -Possible values: - -- true, false - -## describe_include_subcolumns {#describe_include_subcolumns} - -Enables describing subcolumns for a [DESCRIBE](../../sql-reference/statements/describe-table.md) query. For example, members of a [Tuple](../../sql-reference/data-types/tuple.md) or subcolumns of a [Map](../../sql-reference/data-types/map.md/#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md/#finding-null) or an [Array](../../sql-reference/data-types/array.md/#array-size) data type. - -Possible values: - -- 0 — Subcolumns are not included in `DESCRIBE` queries. -- 1 — Subcolumns are included in `DESCRIBE` queries. - -Default value: `0`. - -**Example** - -See an example for the [DESCRIBE](../../sql-reference/statements/describe-table.md) statement. - - -## alter_partition_verbose_result {#alter-partition-verbose-result} - -Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied. -Applicable to [ATTACH PARTITION|PART](../../sql-reference/statements/alter/partition.md/#alter_attach-partition) and to [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md/#alter_freeze-partition). - -Possible values: - -- 0 — disable verbosity. -- 1 — enable verbosity. - -Default value: `0`. 
- -**Example** - -```sql -CREATE TABLE test(a Int64, d Date, s String) ENGINE = MergeTree PARTITION BY toYYYYMM(d) ORDER BY a; -INSERT INTO test VALUES(1, '2021-01-01', ''); -INSERT INTO test VALUES(1, '2021-01-01', ''); -ALTER TABLE test DETACH PARTITION ID '202101'; - -ALTER TABLE test ATTACH PARTITION ID '202101' SETTINGS alter_partition_verbose_result = 1; - -┌─command_type─────┬─partition_id─┬─part_name────┬─old_part_name─┐ -│ ATTACH PARTITION │ 202101 │ 202101_7_7_0 │ 202101_5_5_0 │ -│ ATTACH PARTITION │ 202101 │ 202101_8_8_0 │ 202101_6_6_0 │ -└──────────────────┴──────────────┴──────────────┴───────────────┘ - -ALTER TABLE test FREEZE SETTINGS alter_partition_verbose_result = 1; - -┌─command_type─┬─partition_id─┬─part_name────┬─backup_name─┬─backup_path───────────────────┬─part_backup_path────────────────────────────────────────────┐ -│ FREEZE ALL │ 202101 │ 202101_7_7_0 │ 8 │ /var/lib/clickhouse/shadow/8/ │ /var/lib/clickhouse/shadow/8/data/default/test/202101_7_7_0 │ -│ FREEZE ALL │ 202101 │ 202101_8_8_0 │ 8 │ /var/lib/clickhouse/shadow/8/ │ /var/lib/clickhouse/shadow/8/data/default/test/202101_8_8_0 │ -└──────────────┴──────────────┴──────────────┴─────────────┴───────────────────────────────┴─────────────────────────────────────────────────────────────┘ -``` - -## min_bytes_to_use_mmap_io {#min-bytes-to-use-mmap-io} - -This is an experimental setting. Sets the minimum amount of memory for reading large files without copying data from the kernel to userspace. Recommended threshold is about 64 MB, because [mmap/munmap](https://en.wikipedia.org/wiki/Mmap) is slow. It makes sense only for large files and helps only if data reside in the page cache. - -Possible values: - -- Positive integer. -- 0 — Big files read with only copying data from kernel to userspace. - -Default value: `0`. - -## shutdown_wait_unfinished_queries {#shutdown_wait_unfinished_queries} - -Enables or disables waiting unfinished queries when shutdown server. +Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default). Possible values: - 0 — Disabled. -- 1 — Enabled. The wait time equal shutdown_wait_unfinished config. +- 1 — Enabled. -Default value: 0. +Usage -## shutdown_wait_unfinished {#shutdown_wait_unfinished} +When sequential consistency is enabled, ClickHouse allows the client to execute the `SELECT` query only for those replicas that contain data from all previous `INSERT` queries executed with `insert_quorum`. If the client refers to a partial replica, ClickHouse will generate an exception. The SELECT query will not include data that has not yet been written to the quorum of replicas. -The waiting time in seconds for currently handled connections when shutdown server. +When `insert_quorum_parallel` is enabled (the default), then `select_sequential_consistency` does not work. This is because parallel `INSERT` queries can be written to different sets of quorum replicas so there is no guarantee a single replica will have received all writes. -Default Value: 5. +See also: -## memory_overcommit_ratio_denominator {#memory_overcommit_ratio_denominator} +- [insert_quorum](#insert_quorum) +- [insert_quorum_timeout](#insert_quorum_timeout) +- [insert_quorum_parallel](#insert_quorum_parallel) -It represents the soft memory limit when the hard limit is reached on the global level. -This value is used to compute the overcommit ratio for the query. -Zero means skip the query. -Read more about [memory overcommit](memory-overcommit.md). 
+## send_logs_level {#send_logs_level} -Default value: `1GiB`. +Type: LogsLevel -## memory_usage_overcommit_max_wait_microseconds {#memory_usage_overcommit_max_wait_microseconds} +Default value: fatal -Maximum time thread will wait for memory to be freed in the case of memory overcommit on a user level. -If the timeout is reached and memory is not freed, an exception is thrown. -Read more about [memory overcommit](memory-overcommit.md). +Send server text logs with specified minimum level to client. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none' -Default value: `5000000`. +## send_logs_source_regexp {#send_logs_source_regexp} -## memory_overcommit_ratio_denominator_for_user {#memory_overcommit_ratio_denominator_for_user} +Type: String -It represents the soft memory limit when the hard limit is reached on the user level. -This value is used to compute the overcommit ratio for the query. -Zero means skip the query. -Read more about [memory overcommit](memory-overcommit.md). +Default value: -Default value: `1GiB`. +Send server text logs with specified regexp to match log source name. Empty means all sources. -## Schema Inference settings +## send_progress_in_http_headers {#send_progress_in_http_headers} -See [schema inference](../../interfaces/schema-inference.md#schema-inference-modes) documentation for more details. +Type: Bool -### schema_inference_use_cache_for_file {schema_inference_use_cache_for_file} +Default value: 0 -Enable schemas cache for schema inference in `file` table function. +Enables or disables `X-ClickHouse-Progress` HTTP response headers in `clickhouse-server` responses. -Default value: `true`. - -### schema_inference_use_cache_for_s3 {schema_inference_use_cache_for_s3} - -Enable schemas cache for schema inference in `s3` table function. - -Default value: `true`. - -### schema_inference_use_cache_for_url {schema_inference_use_cache_for_url} - -Enable schemas cache for schema inference in `url` table function. - -Default value: `true`. - -### schema_inference_use_cache_for_hdfs {schema_inference_use_cache_for_hdfs} - -Enable schemas cache for schema inference in `hdfs` table function. - -Default value: `true`. - -### schema_inference_cache_require_modification_time_for_url {#schema_inference_cache_require_modification_time_for_url} - -Use schema from cache for URL with last modification time validation (for urls with Last-Modified header). If this setting is enabled and URL doesn't have Last-Modified header, schema from cache won't be used. - -Default value: `true`. - -### use_structure_from_insertion_table_in_table_functions {use_structure_from_insertion_table_in_table_functions} - -Use structure from insertion table instead of schema inference from data. - -Possible values: -- 0 - disabled -- 1 - enabled -- 2 - auto - -Default value: 2. - -### schema_inference_mode {schema_inference_mode} - -The mode of schema inference. Possible values: `default` and `union`. -See [schema inference modes](../../interfaces/schema-inference.md#schema-inference-modes) section for more details. - -Default value: `default`. - -## compatibility {#compatibility} - -The `compatibility` setting causes ClickHouse to use the default settings of a previous version of ClickHouse, where the previous version is provided as the setting. - -If settings are set to non-default values, then those settings are honored (only settings that have not been modified are affected by the `compatibility` setting). 
- -This setting takes a ClickHouse version number as a string, like `22.3`, `22.8`. An empty value means that this setting is disabled. - -Disabled by default. - -:::note -In ClickHouse Cloud the compatibility setting must be set by ClickHouse Cloud support. Please [open a case](https://clickhouse.cloud/support) to have it set. -::: - -## allow_settings_after_format_in_insert {#allow_settings_after_format_in_insert} - -Control whether `SETTINGS` after `FORMAT` in `INSERT` queries is allowed or not. It is not recommended to use this, since this may interpret part of `SETTINGS` as values. - -Example: - -```sql -INSERT INTO FUNCTION null('foo String') SETTINGS max_threads=1 VALUES ('bar'); -``` - -But the following query will work only with `allow_settings_after_format_in_insert`: - -```sql -SET allow_settings_after_format_in_insert=1; -INSERT INTO FUNCTION null('foo String') VALUES ('bar') SETTINGS max_threads=1; -``` +For more information, read the [HTTP interface description](../../interfaces/http.md). Possible values: -- 0 — Disallow. -- 1 — Allow. +- 0 — Disabled. +- 1 — Enabled. -Default value: `0`. +## send_timeout {#send_timeout} -:::note -Use this setting only for backward compatibility if your use cases depend on old syntax. -::: +Type: Seconds + +Default value: 300 + +Timeout for sending data to the network, in seconds. If a client needs to send some data but is not able to send any bytes in this interval, the exception is thrown. If you set this setting on the client, the 'receive_timeout' for the socket will also be set on the corresponding connection end on the server. ## session_timezone {#session_timezone} +Type: Timezone + +Default value: + Sets the implicit time zone of the current session or query. The implicit time zone is the time zone applied to values of type DateTime/DateTime64 which have no explicitly specified time zone. The setting takes precedence over the globally configured (server-level) implicit time zone. @@ -4781,8 +8887,6 @@ Possible values: - Any time zone name from `system.time_zones`, e.g. `Europe/Berlin`, `UTC` or `Zulu` -Default value: `''`. - Examples: ```sql @@ -4831,88 +8935,539 @@ This happens due to different parsing pipelines: - [timezone](../server-configuration-parameters/settings.md#timezone) -## final {#final} +## set_overflow_mode {#set_overflow_mode} -Automatically applies [FINAL](../../sql-reference/statements/select/from.md#final-modifier) modifier to all tables in a query, to tables where [FINAL](../../sql-reference/statements/select/from.md#final-modifier) is applicable, including joined tables and tables in sub-queries, and -distributed tables. +Type: OverflowMode + +Default value: throw + +What to do when the limit is exceeded. + +## short_circuit_function_evaluation {#short_circuit_function_evaluation} + +Type: ShortCircuitFunctionEvaluation + +Default value: enable + +Allows calculating the [if](../../sql-reference/functions/conditional-functions.md/#if), [multiIf](../../sql-reference/functions/conditional-functions.md/#multiif), [and](../../sql-reference/functions/logical-functions.md/#logical-and-function), and [or](../../sql-reference/functions/logical-functions.md/#logical-or-function) functions according to a [short scheme](https://en.wikipedia.org/wiki/Short-circuit_evaluation). This helps optimize the execution of complex expressions in these functions and prevent possible exceptions (such as division by zero when it is not expected). 
Possible values: -- 0 - disabled -- 1 - enabled +- `enable` — Enables short-circuit function evaluation for functions that are suitable for it (can throw an exception or computationally heavy). +- `force_enable` — Enables short-circuit function evaluation for all functions. +- `disable` — Disables short-circuit function evaluation. -Default value: `0`. +## show_table_uuid_in_table_create_query_if_not_nil {#show_table_uuid_in_table_create_query_if_not_nil} -Example: +Type: Bool + +Default value: 0 + +Sets the `SHOW TABLE` query display. + +Possible values: + +- 0 — The query will be displayed without table UUID. +- 1 — The query will be displayed with table UUID. + +## single_join_prefer_left_table {#single_join_prefer_left_table} + +Type: Bool + +Default value: 1 + +For single JOIN in case of identifier ambiguity prefer left table + +## skip_download_if_exceeds_query_cache {#skip_download_if_exceeds_query_cache} + +Type: Bool + +Default value: 1 + +Skip download from remote filesystem if exceeds query cache size + +## skip_unavailable_shards {#skip_unavailable_shards} + +Type: Bool + +Default value: 0 + +Enables or disables silently skipping of unavailable shards. + +Shard is considered unavailable if all its replicas are unavailable. A replica is unavailable in the following cases: + +- ClickHouse can’t connect to replica for any reason. + + When connecting to a replica, ClickHouse performs several attempts. If all these attempts fail, the replica is considered unavailable. + +- Replica can’t be resolved through DNS. + + If replica’s hostname can’t be resolved through DNS, it can indicate the following situations: + + - Replica’s host has no DNS record. It can occur in systems with dynamic DNS, for example, [Kubernetes](https://kubernetes.io), where nodes can be unresolvable during downtime, and this is not an error. + + - Configuration error. ClickHouse configuration file contains a wrong hostname. + +Possible values: + +- 1 — skipping enabled. + + If a shard is unavailable, ClickHouse returns a result based on partial data and does not report node availability issues. + +- 0 — skipping disabled. + + If a shard is unavailable, ClickHouse throws an exception. + +## sleep_after_receiving_query_ms {#sleep_after_receiving_query_ms} + +Type: Milliseconds + +Default value: 0 + +Time to sleep after receiving query in TCPHandler + +## sleep_in_send_data_ms {#sleep_in_send_data_ms} + +Type: Milliseconds + +Default value: 0 + +Time to sleep in sending data in TCPHandler + +## sleep_in_send_tables_status_ms {#sleep_in_send_tables_status_ms} + +Type: Milliseconds + +Default value: 0 + +Time to sleep in sending tables status response in TCPHandler + +## sort_overflow_mode {#sort_overflow_mode} + +Type: OverflowMode + +Default value: throw + +What to do when the limit is exceeded. 
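+
+**Example**
+
+`sort_overflow_mode` accepts `throw` (fail the query, the default) and `break` (stop early and return a partial result). A minimal sketch combining it with the `max_rows_to_sort` limit; the query itself is arbitrary:
+
+```sql
+-- Stop sorting after 1 million rows and return a partial result instead of failing.
+SELECT number
+FROM numbers(10000000)
+ORDER BY number DESC
+SETTINGS max_rows_to_sort = 1000000, sort_overflow_mode = 'break';
+```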
+ +## split_intersecting_parts_ranges_into_layers_final {#split_intersecting_parts_ranges_into_layers_final} + +Type: Bool + +Default value: 1 + +Split intersecting parts ranges into layers during FINAL optimization + +## split_parts_ranges_into_intersecting_and_non_intersecting_final {#split_parts_ranges_into_intersecting_and_non_intersecting_final} + +Type: Bool + +Default value: 1 + +Split parts ranges into intersecting and non intersecting during FINAL optimization + +## splitby_max_substrings_includes_remaining_string {#splitby_max_substrings_includes_remaining_string} + +Type: Bool + +Default value: 0 + +Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array. + +Possible values: + +- `0` - The remaining string will not be included in the last element of the result array. +- `1` - The remaining string will be included in the last element of the result array. This is the behavior of Spark's [`split()`](https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.split.html) function and Python's ['string.split()'](https://docs.python.org/3/library/stdtypes.html#str.split) method. + +## stop_refreshable_materialized_views_on_startup {#stop_refreshable_materialized_views_on_startup} + +Type: Bool + +Default value: 0 + +On server startup, prevent scheduling of refreshable materialized views, as if with SYSTEM STOP VIEWS. You can manually start them with SYSTEM START VIEWS or SYSTEM START VIEW \\ afterwards. Also applies to newly created views. Has no effect on non-refreshable materialized views. + +## storage_file_read_method {#storage_file_read_method} + +Type: LocalFSReadMethod + +Default value: pread + +Method of reading data from storage file, one of: `read`, `pread`, `mmap`. The mmap method does not apply to clickhouse-server (it's intended for clickhouse-local). + +## storage_system_stack_trace_pipe_read_timeout_ms {#storage_system_stack_trace_pipe_read_timeout_ms} + +Type: Milliseconds + +Default value: 100 + +Maximum time to read from a pipe for receiving information from the threads when querying the `system.stack_trace` table. This setting is used for testing purposes and not meant to be changed by users. + +## stream_flush_interval_ms {#stream_flush_interval_ms} + +Type: Milliseconds + +Default value: 7500 + +Works for tables with streaming in the case of a timeout, or when a thread generates [max_insert_block_size](#max_insert_block_size) rows. + +The default value is 7500. + +The smaller the value, the more often data is flushed into the table. Setting the value too low leads to poor performance. + +## stream_like_engine_allow_direct_select {#stream_like_engine_allow_direct_select} + +Type: Bool + +Default value: 0 + +Allow direct SELECT query for Kafka, RabbitMQ, FileLog, Redis Streams, and NATS engines. In case there are attached materialized views, SELECT query is not allowed even if this setting is enabled. + +## stream_like_engine_insert_queue {#stream_like_engine_insert_queue} + +Type: String + +Default value: + +When stream-like engine reads from multiple queues, the user will need to select one queue to insert into when writing. Used by Redis Streams and NATS. + +## stream_poll_timeout_ms {#stream_poll_timeout_ms} + +Type: Milliseconds + +Default value: 500 + +Timeout for polling data from/to streaming storages. 
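+
+**Example**
+
+A hypothetical sketch for `stream_like_engine_allow_direct_select` (described above); it assumes a Kafka engine table named `queue` with no attached materialized views:
+
+```sql
+SET stream_like_engine_allow_direct_select = 1;
+
+-- Peek at the next messages directly from the streaming engine.
+SELECT * FROM queue LIMIT 10;
+```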
+ +## system_events_show_zero_values {#system_events_show_zero_values} + +Type: Bool + +Default value: 0 + +Allows to select zero-valued events from [`system.events`](../../operations/system-tables/events.md). + +Some monitoring systems require passing all the metrics values to them for each checkpoint, even if the metric value is zero. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +**Examples** + +Query ```sql -CREATE TABLE test -( - key Int64, - some String -) -ENGINE = ReplacingMergeTree -ORDER BY key; - -INSERT INTO test FORMAT Values (1, 'first'); -INSERT INTO test FORMAT Values (1, 'second'); - -SELECT * FROM test; -┌─key─┬─some───┐ -│ 1 │ second │ -└─────┴────────┘ -┌─key─┬─some──┐ -│ 1 │ first │ -└─────┴───────┘ - -SELECT * FROM test SETTINGS final = 1; -┌─key─┬─some───┐ -│ 1 │ second │ -└─────┴────────┘ - -SET final = 1; -SELECT * FROM test; -┌─key─┬─some───┐ -│ 1 │ second │ -└─────┴────────┘ +SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; ``` -## asterisk_include_materialized_columns {#asterisk_include_materialized_columns} +Result -Include [MATERIALIZED](../../sql-reference/statements/create/table.md#materialized) columns for wildcard query (`SELECT *`). +```text +Ok. +``` + +Query +```sql +SET system_events_show_zero_values = 1; +SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; +``` + +Result + +```text +┌─event────────────────────┬─value─┬─description───────────────────────────────────────────┐ +│ QueryMemoryLimitExceeded │ 0 │ Number of times when memory limit exceeded for query. │ +└──────────────────────────┴───────┴───────────────────────────────────────────────────────┘ +``` + +## table_function_remote_max_addresses {#table_function_remote_max_addresses} + +Type: UInt64 + +Default value: 1000 + +Sets the maximum number of addresses generated from patterns for the [remote](../../sql-reference/table-functions/remote.md) function. Possible values: -- 0 - disabled -- 1 - enabled +- Positive integer. -Default value: `0`. +## tcp_keep_alive_timeout {#tcp_keep_alive_timeout} -## asterisk_include_alias_columns {#asterisk_include_alias_columns} +Type: Seconds -Include [ALIAS](../../sql-reference/statements/create/table.md#alias) columns for wildcard query (`SELECT *`). +Default value: 290 + +The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes + +## temporary_data_in_cache_reserve_space_wait_lock_timeout_milliseconds {#temporary_data_in_cache_reserve_space_wait_lock_timeout_milliseconds} + +Type: UInt64 + +Default value: 600000 + +Wait time to lock cache for space reservation for temporary data in filesystem cache + +## temporary_files_codec {#temporary_files_codec} + +Type: String + +Default value: LZ4 + +Sets compression codec for temporary files used in sorting and joining operations on disk. Possible values: -- 0 - disabled -- 1 - enabled +- LZ4 — [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression is applied. +- NONE — No compression is applied. -Default value: `0`. +## throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert {#throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert} -## async_socket_for_remote {#async_socket_for_remote} +Type: Bool -Enables asynchronous read from socket while executing remote query. +Default value: 1 + +Throw exception on INSERT query when the setting `deduplicate_blocks_in_dependent_materialized_views` is enabled along with `async_insert`. 
It guarantees correctness, because these features can't work together. + +## throw_if_no_data_to_insert {#throw_if_no_data_to_insert} + +Type: Bool + +Default value: 1 + +Allows or forbids empty INSERTs, enabled by default (throws an error on an empty insert) + +## throw_on_error_from_cache_on_write_operations {#throw_on_error_from_cache_on_write_operations} + +Type: Bool + +Default value: 0 + +Ignore error from cache when caching on write operations (INSERT, merges) + +## throw_on_max_partitions_per_insert_block {#throw_on_max_partitions_per_insert_block} + +Type: Bool + +Default value: 1 + +Used with max_partitions_per_insert_block. If true (default), an exception will be thrown when max_partitions_per_insert_block is reached. If false, details of the insert query reaching this limit with the number of partitions will be logged. This can be useful if you're trying to understand the impact on users when changing max_partitions_per_insert_block. + +## throw_on_unsupported_query_inside_transaction {#throw_on_unsupported_query_inside_transaction} + +Type: Bool + +Default value: 1 + +Throw exception if unsupported query is used inside transaction + +## timeout_before_checking_execution_speed {#timeout_before_checking_execution_speed} + +Type: Seconds + +Default value: 10 + +Check that the speed is not too low after the specified time has elapsed. + +## timeout_overflow_mode {#timeout_overflow_mode} + +Type: OverflowMode + +Default value: throw + +What to do when the limit is exceeded. + +## timeout_overflow_mode_leaf {#timeout_overflow_mode_leaf} + +Type: OverflowMode + +Default value: throw + +What to do when the leaf limit is exceeded. + +## totals_auto_threshold {#totals_auto_threshold} + +Type: Float + +Default value: 0.5 + +The threshold for `totals_mode = 'auto'`. +See the section “WITH TOTALS modifier”. + +## totals_mode {#totals_mode} + +Type: TotalsMode + +Default value: after_having_exclusive + +How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = ‘any’ are present. +See the section “WITH TOTALS modifier”. + +## trace_profile_events {#trace_profile_events} + +Type: Bool + +Default value: 0 + +Enables or disables collecting stacktraces on each update of profile events along with the name of profile event and the value of increment and sending them into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log). + +Possible values: + +- 1 — Tracing of profile events enabled. +- 0 — Tracing of profile events disabled. + +## transfer_overflow_mode {#transfer_overflow_mode} + +Type: OverflowMode + +Default value: throw + +What to do when the limit is exceeded. + +## transform_null_in {#transform_null_in} + +Type: Bool + +Default value: 0 + +Enables equality of [NULL](../../sql-reference/syntax.md/#null-literal) values for [IN](../../sql-reference/operators/in.md) operator. + +By default, `NULL` values can’t be compared because `NULL` means undefined value. Thus, comparison `expr = NULL` must always return `false`. With this setting `NULL = NULL` returns `true` for `IN` operator. + +Possible values: + +- 0 — Comparison of `NULL` values in `IN` operator returns `false`. +- 1 — Comparison of `NULL` values in `IN` operator returns `true`. 
+ +**Example** + +Consider the `null_in` table: + +``` text +┌──idx─┬─────i─┐ +│ 1 │ 1 │ +│ 2 │ NULL │ +│ 3 │ 3 │ +└──────┴───────┘ +``` + +Query: + +``` sql +SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 0; +``` + +Result: + +``` text +┌──idx─┬────i─┐ +│ 1 │ 1 │ +└──────┴──────┘ +``` + +Query: + +``` sql +SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1; +``` + +Result: + +``` text +┌──idx─┬─────i─┐ +│ 1 │ 1 │ +│ 2 │ NULL │ +└──────┴───────┘ +``` + +**See Also** + +- [NULL Processing in IN Operators](../../sql-reference/operators/in.md/#in-null-processing) + +## traverse_shadow_remote_data_paths {#traverse_shadow_remote_data_paths} + +Type: Bool + +Default value: 0 + +Traverse shadow directory when query system.remote_data_paths + +## union_default_mode {#union_default_mode} + +Type: SetOperationMode + +Default value: + +Sets a mode for combining `SELECT` query results. The setting is only used when shared with [UNION](../../sql-reference/statements/select/union.md) without explicitly specifying the `UNION ALL` or `UNION DISTINCT`. + +Possible values: + +- `'DISTINCT'` — ClickHouse outputs rows as a result of combining queries removing duplicate rows. +- `'ALL'` — ClickHouse outputs all rows as a result of combining queries including duplicate rows. +- `''` — ClickHouse generates an exception when used with `UNION`. + +See examples in [UNION](../../sql-reference/statements/select/union.md). + +## unknown_packet_in_send_data {#unknown_packet_in_send_data} + +Type: UInt64 + +Default value: 0 + +Send unknown packet instead of data Nth data packet + +## use_cache_for_count_from_files {#use_cache_for_count_from_files} + +Type: Bool + +Default value: 1 + +Enables caching of rows number during count from files in table functions `file`/`s3`/`url`/`hdfs`/`azureBlobStorage`. Enabled by default. -## async_query_sending_for_remote {#async_query_sending_for_remote} +## use_client_time_zone {#use_client_time_zone} -Enables asynchronous connection creation and query sending while executing remote query. +Type: Bool -Enabled by default. +Default value: 0 + +Use client timezone for interpreting DateTime string values, instead of adopting server timezone. + +## use_compact_format_in_distributed_parts_names {#use_compact_format_in_distributed_parts_names} + +Type: Bool + +Default value: 1 + +Uses compact format for storing blocks for background (`distributed_foreground_insert`) INSERT into tables with `Distributed` engine. + +Possible values: + +- 0 — Uses `user[:password]@host:port#default_database` directory format. +- 1 — Uses `[shard{shard_index}[_replica{replica_index}]]` directory format. + +:::note +- with `use_compact_format_in_distributed_parts_names=0` changes from cluster definition will not be applied for background INSERT. +- with `use_compact_format_in_distributed_parts_names=1` changing the order of the nodes in the cluster definition, will change the `shard_index`/`replica_index` so be aware. +::: + +## use_concurrency_control {#use_concurrency_control} + +Type: Bool + +Default value: 1 + +Respect the server's concurrency control (see the `concurrent_threads_soft_limit_num` and `concurrent_threads_soft_limit_ratio_to_cores` global server settings). If disabled, it allows using a larger number of threads even if the server is overloaded (not recommended for normal usage, and needed mostly for tests). ## use_hedged_requests {#use_hedged_requests} +Type: Bool + +Default value: 1 + Enables hedged requests logic for remote queries. 
It allows to establish many connections with different replicas for query. New connection is enabled in case existent connection(s) with replica(s) were not established within `hedged_connection_timeout` or no data was received within `receive_data_timeout`. Query uses the first connection which send non empty progress packet (or data packet, if `allow_changing_replica_until_first_data_packet`); @@ -4922,557 +9477,121 @@ Enabled by default. Disabled by default on Cloud. -## hedged_connection_timeout {#hedged_connection_timeout} +## use_hive_partitioning {#use_hive_partitioning} -If we can't establish connection with replica after this timeout in hedged requests, we start working with the next replica without cancelling connection to the previous. -Timeout value is in milliseconds. +Type: Bool -Default value: `50`. +Default value: 0 -## receive_data_timeout {#receive_data_timeout} +When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. -This timeout is set when the query is sent to the replica in hedged requests, if we don't receive first packet of data and we don't make any progress in query execution after this timeout, -we start working with the next replica, without cancelling connection to the previous. -Timeout value is in milliseconds. +## use_index_for_in_with_subqueries {#use_index_for_in_with_subqueries} -Default value: `2000` +Type: Bool -## allow_changing_replica_until_first_data_packet {#allow_changing_replica_until_first_data_packet} +Default value: 1 -If it's enabled, in hedged requests we can start new connection until receiving first data packet even if we have already made some progress -(but progress haven't updated for `receive_data_timeout` timeout), otherwise we disable changing replica after the first time we made progress. +Try using an index if there is a subquery or a table expression on the right side of the IN operator. -## parallel_view_processing +## use_index_for_in_with_subqueries_max_values {#use_index_for_in_with_subqueries_max_values} -Enables pushing to attached views concurrently instead of sequentially. +Type: UInt64 -Default value: `false`. +Default value: 0 -## partial_result_on_first_cancel {#partial_result_on_first_cancel} -When set to `true` and the user wants to interrupt a query (for example using `Ctrl+C` on the client), then the query continues execution only on data that was already read from the table. Afterwards, it will return a partial result of the query for the part of the table that was read. To fully stop the execution of a query without a partial result, the user should send 2 cancel requests. +The maximum size of the set in the right-hand side of the IN operator to use table index for filtering. It allows to avoid performance degradation and higher memory usage due to the preparation of additional data structures for large queries. Zero means no limit. 
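+
+For example, assuming hypothetical tables `hits` and `visits`, index-based filtering for the IN clause can be enabled and capped as follows:
+
+```sql
+SELECT count()
+FROM hits
+WHERE UserID IN (SELECT UserID FROM visits)
+SETTINGS use_index_for_in_with_subqueries = 1,
+    use_index_for_in_with_subqueries_max_values = 100000;
+```
+
+If the subquery produces more than 100000 values, the table index is not used for this filter and the condition is evaluated by regular filtering instead.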
-**Example without setting on Ctrl+C** -```sql -SELECT sum(number) FROM numbers(10000000000) +## use_json_alias_for_old_object_type {#use_json_alias_for_old_object_type} -Cancelling query. -Ok. -Query was cancelled. +Type: Bool -0 rows in set. Elapsed: 1.334 sec. Processed 52.65 million rows, 421.23 MB (39.48 million rows/s., 315.85 MB/s.) -``` +Default value: 0 -**Example with setting on Ctrl+C** -```sql -SELECT sum(number) FROM numbers(10000000000) SETTINGS partial_result_on_first_cancel=true +When enabled, `JSON` data type alias will be used to create an old [Object('json')](../../sql-reference/data-types/json.md) type instead of the new [JSON](../../sql-reference/data-types/newjson.md) type. -┌──────sum(number)─┐ -│ 1355411451286266 │ -└──────────────────┘ +## use_local_cache_for_remote_storage {#use_local_cache_for_remote_storage} -1 row in set. Elapsed: 1.331 sec. Processed 52.13 million rows, 417.05 MB (39.17 million rows/s., 313.33 MB/s.) -``` +Type: Bool -Possible values: `true`, `false` +Default value: 1 -Default value: `false` -## function_json_value_return_type_allow_nullable +Use local cache for remote storage like HDFS or S3, it's used for remote table engine only -Control whether allow to return `NULL` when value is not exist for JSON_VALUE function. +## use_page_cache_for_disks_without_file_cache {#use_page_cache_for_disks_without_file_cache} -```sql -SELECT JSON_VALUE('{"hello":"world"}', '$.b') settings function_json_value_return_type_allow_nullable=true; +Type: Bool -┌─JSON_VALUE('{"hello":"world"}', '$.b')─┐ -│ ᴺᵁᴸᴸ │ -└────────────────────────────────────────┘ +Default value: 0 -1 row in set. Elapsed: 0.001 sec. -``` +Use userspace page cache for remote disks that don't have filesystem cache enabled. -Possible values: +## use_query_cache {#use_query_cache} -- true — Allow. -- false — Disallow. +Type: Bool -Default value: `false`. +Default value: 0 -## rename_files_after_processing {#rename_files_after_processing} - -- **Type:** String - -- **Default value:** Empty string - -This setting allows to specify renaming pattern for files processed by `file` table function. When option is set, all files read by `file` table function will be renamed according to specified pattern with placeholders, only if files processing was successful. - -### Placeholders - -- `%a` — Full original filename (e.g., "sample.csv"). -- `%f` — Original filename without extension (e.g., "sample"). -- `%e` — Original file extension with dot (e.g., ".csv"). -- `%t` — Timestamp (in microseconds). -- `%%` — Percentage sign ("%"). - -### Example -- Option: `--rename_files_after_processing="processed_%f_%t%e"` - -- Query: `SELECT * FROM file('sample.csv')` - - -If reading `sample.csv` is successful, file will be renamed to `processed_sample_1683473210851438.csv` - - - - -## function_json_value_return_type_allow_complex - -Control whether allow to return complex type (such as: struct, array, map) for json_value function. - -```sql -SELECT JSON_VALUE('{"hello":{"world":"!"}}', '$.hello') settings function_json_value_return_type_allow_complex=true - -┌─JSON_VALUE('{"hello":{"world":"!"}}', '$.hello')─┐ -│ {"world":"!"} │ -└──────────────────────────────────────────────────┘ - -1 row in set. Elapsed: 0.001 sec. -``` - -Possible values: - -- true — Allow. -- false — Disallow. - -Default value: `false`. 
- -## zstd_window_log_max - -Allows you to select the max window log of ZSTD (it will not be used for MergeTree family) - -Type: Int64 - -Default: 0 - -## enable_deflate_qpl_codec {#enable_deflate_qpl_codec} - -If turned on, the DEFLATE_QPL codec may be used to compress columns. +If turned on, `SELECT` queries may utilize the [query cache](../query-cache.md). Parameters [enable_reads_from_query_cache](#enable-reads-from-query-cache) +and [enable_writes_to_query_cache](#enable-writes-to-query-cache) control in more detail how the cache is used. Possible values: - 0 - Disabled - 1 - Enabled -Type: Bool - -## enable_zstd_qat_codec {#enable_zstd_qat_codec} - -If turned on, the ZSTD_QAT codec may be used to compress columns. - -Possible values: - -- 0 - Disabled -- 1 - Enabled +## use_skip_indexes {#use_skip_indexes} Type: Bool -## output_format_compression_level +Default value: 1 -Default compression level if query output is compressed. The setting is applied when `SELECT` query has `INTO OUTFILE` or when writing to table functions `file`, `url`, `hdfs`, `s3`, or `azureBlobStorage`. - -Possible values: from `1` to `22` - -Default: `3` - - -## output_format_compression_zstd_window_log - -Can be used when the output compression method is `zstd`. If greater than `0`, this setting explicitly sets compression window size (power of `2`) and enables a long-range mode for zstd compression. This can help to achieve a better compression ratio. - -Possible values: non-negative numbers. Note that if the value is too small or too big, `zstdlib` will throw an exception. Typical values are from `20` (window size = `1MB`) to `30` (window size = `1GB`). - -Default: `0` - -## rewrite_count_distinct_if_with_count_distinct_implementation - -Allows you to rewrite `countDistcintIf` with [count_distinct_implementation](#count_distinct_implementation) setting. +Use data skipping indexes during query execution. Possible values: -- true — Allow. -- false — Disallow. +- 0 — Disabled. +- 1 — Enabled. -Default value: `false`. +## use_skip_indexes_if_final {#use_skip_indexes_if_final} -## precise_float_parsing {#precise_float_parsing} +Type: Bool -Switches [Float32/Float64](../../sql-reference/data-types/float.md) parsing algorithms: -* If the value is `1`, then precise method is used. It is slower than fast method, but it always returns a number that is the closest machine representable number to the input. -* Otherwise, fast method is used (default). It usually returns the same value as precise, but in rare cases result may differ by one or two least significant digits. +Default value: 0 -Possible values: `0`, `1`. +Controls whether skipping indexes are used when executing a query with the FINAL modifier. -Default value: `0`. - -Example: - -```sql -SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 0; - -┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐ -│ 1.7090999999999998 │ 15008753.000000002 │ -└─────────────────────┴──────────────────────────┘ - -SELECT toFloat64('1.7091'), toFloat64('1.5008753E7') SETTINGS precise_float_parsing = 1; - -┌─toFloat64('1.7091')─┬─toFloat64('1.5008753E7')─┐ -│ 1.7091 │ 15008753 │ -└─────────────────────┴──────────────────────────┘ -``` - -## validate_tcp_client_information {#validate-tcp-client-information} - -Determines whether validation of client information enabled when query packet is received from a client using a TCP connection. - -If `true`, an exception will be thrown on invalid client information from the TCP client. 
- -If `false`, the data will not be validated. The server will work with clients of all versions. - -The default value is `false`. - -**Example** - -``` xml -true -``` - -## print_pretty_type_names {#print_pretty_type_names} - -Allows to print deep-nested type names in a pretty way with indents in `DESCRIBE` query and in `toTypeName()` function. - -Example: - -```sql -CREATE TABLE test (a Tuple(b String, c Tuple(d Nullable(UInt64), e Array(UInt32), f Array(Tuple(g String, h Map(String, Array(Tuple(i String, j UInt64))))), k Date), l Nullable(String))) ENGINE=Memory; -DESCRIBE TABLE test FORMAT TSVRaw SETTINGS print_pretty_type_names=1; -``` - -``` -a Tuple( - b String, - c Tuple( - d Nullable(UInt64), - e Array(UInt32), - f Array(Tuple( - g String, - h Map( - String, - Array(Tuple( - i String, - j UInt64 - )) - ) - )), - k Date - ), - l Nullable(String) -) -``` - -## allow_experimental_statistics {#allow_experimental_statistics} - -Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulate statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics). - -## allow_statistic_optimize {#allow_statistic_optimize} - -Allows using statistic to optimize the order of [prewhere conditions](../../sql-reference/statements/select/prewhere.md). - -## analyze_index_with_space_filling_curves - -If a table has a space-filling curve in its index, e.g. `ORDER BY mortonEncode(x, y)` or `ORDER BY hilbertEncode(x, y)`, and the query has conditions on its arguments, e.g. `x >= 10 AND x <= 20 AND y >= 20 AND y <= 30`, use the space-filling curve for index analysis. - -## query_plan_enable_optimizations {#query_plan_enable_optimizations} - -Toggles query optimization at the query plan level. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: +By default, this setting is disabled because skip indexes may exclude rows (granules) containing the latest data, which could lead to incorrect results. When enabled, skipping indexes are applied even with the FINAL modifier, potentially improving performance but with the risk of missing recent updates. Possible values: -- 0 - Disable all optimizations at the query plan level -- 1 - Enable optimizations at the query plan level (but individual optimizations may still be disabled via their individual settings) +- 0 — Disabled. +- 1 — Enabled. -Default value: `1`. +## use_structure_from_insertion_table_in_table_functions {#use_structure_from_insertion_table_in_table_functions} -## query_plan_max_optimizations_to_apply +Type: UInt64 -Limits the total number of optimizations applied to query plan, see setting [query_plan_enable_optimizations](#query_plan_enable_optimizations). -Useful to avoid long optimization times for complex queries. -If the actual number of optimizations exceeds this setting, an exception is thrown. +Default value: 2 -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: +Use structure from insertion table instead of schema inference from data. Possible values: 0 - disabled, 1 - enabled, 2 - auto -Type: [UInt64](../../sql-reference/data-types/int-uint.md). 
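+
+For example, with a hypothetical destination table `dest` and an input file `data.csv`, the structure of the insertion table is reused instead of inferring a schema from the file:
+
+```sql
+CREATE TABLE dest (id UInt64, name String) ENGINE = MergeTree ORDER BY id;
+
+INSERT INTO dest
+SELECT * FROM file('data.csv')
+SETTINGS use_structure_from_insertion_table_in_table_functions = 1;
+```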
+## use_uncompressed_cache {#use_uncompressed_cache} -Default value: '10000' +Type: Bool -## query_plan_lift_up_array_join +Default value: 0 -Toggles a query-plan-level optimization which moves ARRAY JOINs up in the execution plan. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. +Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled). +Using the uncompressed cache (only for tables in the MergeTree family) can significantly reduce latency and increase throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_push_down_limit - -Toggles a query-plan-level optimization which moves LIMITs down in the execution plan. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_split_filter - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Toggles a query-plan-level optimization which splits filters into expressions. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_merge_expressions - -Toggles a query-plan-level optimization which merges consecutive filters. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_filter_push_down - -Toggles a query-plan-level optimization which moves filters down in the execution plan. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_execute_functions_after_sorting - -Toggles a query-plan-level optimization which moves expressions after sorting steps. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. 
The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_reuse_storage_ordering_for_window_functions - -Toggles a query-plan-level optimization which uses storage sorting when sorting for window functions. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_lift_up_union - -Toggles a query-plan-level optimization which moves larger subtrees of the query plan into union to enable further optimizations. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_distinct_in_order - -Toggles the distinct in-order optimization query-plan-level optimization. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_read_in_order - -Toggles the read in-order optimization query-plan-level optimization. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_aggregation_in_order - -Toggles the aggregation in-order query-plan-level optimization. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `0`. - -## query_plan_remove_redundant_sorting - -Toggles a query-plan-level optimization which removes redundant sorting steps, e.g. in subqueries. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. -::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## query_plan_remove_redundant_distinct - -Toggles a query-plan-level optimization which removes redundant DISTINCT steps. -Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. - -:::note -This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. 
-::: - -Possible values: - -- 0 - Disable -- 1 - Enable - -Default value: `1`. - -## dictionary_use_async_executor {#dictionary_use_async_executor} - -Execute a pipeline for reading dictionary source in several threads. It's supported only by dictionaries with local CLICKHOUSE source. - -You may specify it in `SETTINGS` section of dictionary definition: - -```sql -CREATE DICTIONARY t1_dict ( key String, attr UInt64 ) -PRIMARY KEY key -SOURCE(CLICKHOUSE(QUERY `SELECT key, attr FROM t1 GROUP BY key`)) -LIFETIME(MIN 0 MAX 3600) -LAYOUT(COMPLEX_KEY_HASHED_ARRAY()) -SETTINGS(dictionary_use_async_executor=1, max_threads=8); -``` - -## storage_metadata_write_full_object_key {#storage_metadata_write_full_object_key} - -When set to `true` the metadata files are written with `VERSION_FULL_OBJECT_KEY` format version. With that format full object storage key names are written to the metadata files. -When set to `false` the metadata files are written with the previous format version, `VERSION_INLINE_DATA`. With that format only suffixes of object storage key names are written to the metadata files. The prefix for all of object storage key names is set in configurations files at `storage_configuration.disks` section. - -Default value: `false`. - -## s3_use_adaptive_timeouts {#s3_use_adaptive_timeouts} - -When set to `true` than for all s3 requests first two attempts are made with low send and receive timeouts. -When set to `false` than all attempts are made with identical timeouts. - -Default value: `true`. - -## allow_deprecated_snowflake_conversion_functions {#allow_deprecated_snowflake_conversion_functions} - -Functions `snowflakeToDateTime`, `snowflakeToDateTime64`, `dateTimeToSnowflake`, and `dateTime64ToSnowflake` are deprecated and disabled by default. -Please use functions `snowflakeIDToDateTime`, `snowflakeIDToDateTime64`, `dateTimeToSnowflakeID`, and `dateTime64ToSnowflakeID` instead. - -To re-enable the deprecated functions (e.g., during a transition period), please set this setting to `true`. - -Default value: `false` - -## allow_experimental_variant_type {#allow_experimental_variant_type} - -Allows creation of experimental [Variant](../../sql-reference/data-types/variant.md). - -Default value: `false`. +For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically to save space for truly small queries. This means that you can keep the ‘use_uncompressed_cache’ setting always set to 1. ## use_variant_as_common_type {#use_variant_as_common_type} +Type: Bool + +Default value: 0 + Allows to use `Variant` type as a result type for [if](../../sql-reference/functions/conditional-functions.md/#if)/[multiIf](../../sql-reference/functions/conditional-functions.md/#multiif)/[array](../../sql-reference/functions/array-functions.md)/[map](../../sql-reference/functions/tuple-map-functions.md) functions when there is no common type for argument types. Example: @@ -5551,188 +9670,95 @@ SELECT map('a', range(number), 'b', number, 'c', 'str_' || toString(number)) as └───────────────────────────────┘ ``` +## use_with_fill_by_sorting_prefix {#use_with_fill_by_sorting_prefix} -Default value: `false`. +Type: Bool -## default_normal_view_sql_security {#default_normal_view_sql_security} +Default value: 1 -Allows to set default `SQL SECURITY` option while creating a normal view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). +Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. 
Rows with different values in sorting prefix are filled independently -The default value is `INVOKER`. +## validate_polygons {#validate_polygons} -## default_materialized_view_sql_security {#default_materialized_view_sql_security} +Type: Bool -Allows to set a default value for SQL SECURITY option when creating a materialized view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). +Default value: 1 -The default value is `DEFINER`. - -## default_view_definer {#default_view_definer} - -Allows to set default `DEFINER` option while creating a view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). - -The default value is `CURRENT_USER`. - -## max_partition_size_to_drop - -Restriction on dropping partitions in query time. The value 0 means that you can drop partitions without any restrictions. - -Default value: 50 GB. - -Cloud default value: 1 TB. - -:::note -This query setting overwrites its server setting equivalent, see [max_partition_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-partition-size-to-drop) -::: - -## max_table_size_to_drop - -Restriction on deleting tables in query time. The value 0 means that you can delete all tables without any restrictions. - -Default value: 50 GB. - -Cloud default value: 1 TB. - -:::note -This query setting overwrites its server setting equivalent, see [max_table_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-table-size-to-drop) -::: - -## iceberg_engine_ignore_schema_evolution {#iceberg_engine_ignore_schema_evolution} - -Allow to ignore schema evolution in Iceberg table engine and read all data using schema specified by the user on table creation or latest schema parsed from metadata on table creation. - -:::note -Enabling this setting can lead to incorrect result as in case of evolved schema all data files will be read using the same schema. -::: - -Default value: 'false'. - -## allow_suspicious_primary_key {#allow_suspicious_primary_key} - -Allow suspicious `PRIMARY KEY`/`ORDER BY` for MergeTree (i.e. SimpleAggregateFunction). - -## mysql_datatypes_support_level - -Defines how MySQL types are converted to corresponding ClickHouse types. A comma separated list in any combination of `decimal`, `datetime64`, `date2Date32` or `date2String`. -- `decimal`: convert `NUMERIC` and `DECIMAL` types to `Decimal` when precision allows it. -- `datetime64`: convert `DATETIME` and `TIMESTAMP` types to `DateTime64` instead of `DateTime` when precision is not `0`. -- `date2Date32`: convert `DATE` to `Date32` instead of `Date`. Takes precedence over `date2String`. -- `date2String`: convert `DATE` to `String` instead of `Date`. Overridden by `datetime64`. - -## cross_join_min_rows_to_compress - -Minimal count of rows to compress block in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. - -Default value: `10000000`. - -## cross_join_min_bytes_to_compress - -Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. - -Default value: `1GiB`. - -## use_json_alias_for_old_object_type - -When enabled, `JSON` data type alias will be used to create an old [Object('json')](../../sql-reference/data-types/json.md) type instead of the new [JSON](../../sql-reference/data-types/newjson.md) type. - -Default value: `false`. 
- -## type_json_skip_duplicated_paths - -When enabled, ClickHouse will skip duplicated paths during parsing of [JSON](../../sql-reference/data-types/newjson.md) object. Only the value of the first occurrence of each path will be inserted. - -Default value: `false` - -## restore_replace_external_engines_to_null - -For testing purposes. Replaces all external engines to Null to not initiate external connections. - -Default value: `False` - -## restore_replace_external_table_functions_to_null - -For testing purposes. Replaces all external table functions to Null to not initiate external connections. - -Default value: `False` - -## disable_insertion_and_mutation - -Disable all insert and mutations (alter table update / alter table delete / alter table drop partition). Set to true, can make this node focus on reading queries. - -Default value: `false`. - -## use_hive_partitioning - -When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. - -Default value: `false`. - -## allow_experimental_time_series_table {#allow-experimental-time-series-table} - -Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine. +Enables or disables throwing an exception in the [pointInPolygon](../../sql-reference/functions/geo/index.md#pointinpolygon) function, if the polygon is self-intersecting or self-tangent. Possible values: -- 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled. -- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled. +- 0 — Throwing an exception is disabled. `pointInPolygon` accepts invalid polygons and returns possibly incorrect results for them. +- 1 — Throwing an exception is enabled. -Default value: `0`. +## wait_changes_become_visible_after_commit_mode {#wait_changes_become_visible_after_commit_mode} -## create_if_not_exists +Type: TransactionsWaitCSNMode -Enable `IF NOT EXISTS` for `CREATE` statement by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown. +Default value: wait_unknown -Default value: `false`. +Wait for committed changes to become actually visible in the latest snapshot -## allow_suspicious_types_in_group_by {#allow_suspicious_types_in_group_by} +## wait_for_async_insert {#wait_for_async_insert} -Allows or restricts using [Variant](../../sql-reference/data-types/variant.md) and [Dynamic](../../sql-reference/data-types/dynamic.md) types in GROUP BY keys. +Type: Bool -Possible values: +Default value: 1 -- 0 — Usage of `Variant` and `Dynamic` types is restricted. -- 1 — Usage of `Variant` and `Dynamic` types is not restricted. +If true wait for processing of asynchronous insertion -Default value: 0. 
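+
+For example, with a hypothetical table `t`, the following makes the client wait until the asynchronously buffered data is actually flushed to the table:
+
+```sql
+SET async_insert = 1, wait_for_async_insert = 1;
+
+INSERT INTO t (id) VALUES (42);
+```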
+## wait_for_async_insert_timeout {#wait_for_async_insert_timeout} -## allow_suspicious_types_in_order_by {#allow_suspicious_types_in_order_by} +Type: Seconds -Allows or restricts using [Variant](../../sql-reference/data-types/variant.md) and [Dynamic](../../sql-reference/data-types/dynamic.md) types in ORDER BY keys. +Default value: 120 -Possible values: +Timeout for waiting for processing asynchronous insertion -- 0 — Usage of `Variant` and `Dynamic` types is restricted. -- 1 — Usage of `Variant` and `Dynamic` types is not restricted. +## wait_for_window_view_fire_signal_timeout {#wait_for_window_view_fire_signal_timeout} -Default value: 0. +Type: Seconds -## enable_secure_identifiers +Default value: 10 -If enabled, only allow secure identifiers which contain only underscore and alphanumeric characters +Timeout for waiting for window view fire signal in event time processing -Default value: `false`. +## window_view_clean_interval {#window_view_clean_interval} -## show_create_query_identifier_quoting_rule +Type: Seconds -Define identifier quoting behavior of the show create query result: -- `when_necessary`: When the identifiers is one of `{"distinct", "all", "table"}`, or it can cause ambiguity: column names, dictionary attribute names. -- `always`: Always quote identifiers. -- `user_display`: When the identifiers is a keyword. +Default value: 60 -Default value: `when_necessary`. +The clean interval of window view in seconds to free outdated data. -## show_create_query_identifier_quoting_style +## window_view_heartbeat_interval {#window_view_heartbeat_interval} -Define identifier quoting style of the show create query result: -- `Backticks`: \`clickhouse\` style. -- `DoubleQuotes`: "postgres" style -- `BackticksMySQL`: \`mysql\` style, most same as `Backticks`, but it uses '``' to escape '`' +Type: Seconds -Default value: `Backticks`. +Default value: 15 -## mongodb_throw_on_unsupported_query +The heartbeat interval in seconds to indicate watch query is alive. -If enabled, MongoDB tables will return an error when a MongoDB query can't be built. +## workload {#workload} -Not applied for the legacy implementation, or when 'allow_experimental_analyzer=0`. +Type: String -Default value: `true`. +Default value: default + +Name of workload to be used to access resources + +## write_through_distributed_cache {#write_through_distributed_cache} + +Type: Bool + +Default value: 0 + +Only in ClickHouse Cloud. Allow writing to distributed cache (writing to s3 will also be done by distributed cache) + +## zstd_window_log_max {#zstd_window_log_max} + +Type: Int64 + +Default value: 0 + +Allows you to select the max window log of ZSTD (it will not be used for MergeTree family) diff --git a/docs/en/operations/system-tables/view_refreshes.md b/docs/en/operations/system-tables/view_refreshes.md index e792e0d095d..9d3dafe686d 100644 --- a/docs/en/operations/system-tables/view_refreshes.md +++ b/docs/en/operations/system-tables/view_refreshes.md @@ -10,21 +10,21 @@ Columns: - `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database the table is in. - `view` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `uuid` ([UUID](../../sql-reference/data-types/uuid.md)) — Table uuid (Atomic database). - `status` ([String](../../sql-reference/data-types/string.md)) — Current state of the refresh. -- `last_refresh_result` ([String](../../sql-reference/data-types/string.md)) — Outcome of the latest refresh attempt. 
-- `last_refresh_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time of the last refresh attempt. `NULL` if no refresh attempts happened since server startup or table creation. -- `last_success_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time of the last successful refresh. `NULL` if no successful refreshes happened since server startup or table creation. -- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — How long the last refresh attempt took. -- `next_refresh_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time at which the next refresh is scheduled to start. -- `remaining_dependencies` ([Array(String)](../../sql-reference/data-types/array.md)) — If the view has [refresh dependencies](../../sql-reference/statements/create/view.md#refresh-dependencies), this array contains the subset of those dependencies that are not satisfied for the current refresh yet. If `status = 'WaitingForDependencies'`, a refresh is ready to start as soon as these dependencies are fulfilled. -- `exception` ([String](../../sql-reference/data-types/string.md)) — if `last_refresh_result = 'Error'`, i.e. the last refresh attempt failed, this column contains the corresponding error message and stack trace. -- `retry` ([UInt64](../../sql-reference/data-types/int-uint.md)) — If nonzero, the current or next refresh is a retry (see `refresh_retries` refresh setting), and `retry` is the 1-based index of that retry. -- `refresh_count` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of successful refreshes since last server restart or table creation. -- `progress` ([Float64](../../sql-reference/data-types/float.md)) — Progress of the current refresh, between 0 and 1. -- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of rows read by the current refresh so far. -- `total_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Estimated total number of rows that need to be read by the current refresh. - -(There are additional columns related to current refresh progress, but they are currently unreliable.) +- `last_success_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Time when the latest successful refresh started. NULL if no successful refreshes happened since server startup or table creation. +- `last_success_duration_ms` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — How long the latest refresh took. +- `last_refresh_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Time when the latest refresh attempt finished (if known) or started (if unknown or still running). NULL if no refresh attempts happened since server startup or table creation. +- `last_refresh_replica` ([String](../../sql-reference/data-types/string.md)) — If coordination is enabled, name of the replica that made the current (if running) or previous (if not running) refresh attempt. +- `next_refresh_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Time at which the next refresh is scheduled to start, if status = Scheduled. +- `exception` ([String](../../sql-reference/data-types/string.md)) — Error message from previous attempt if it failed. 
+- `retry` ([UInt64](../../sql-reference/data-types/int-uint.md)) — How many failed attempts there were so far, for the current refresh. +- `progress` ([Float64](../../sql-reference/data-types/float.md)) — Progress of the current refresh, between 0 and 1. Not available if status is `RunningOnAnotherReplica`. +- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of rows read by the current refresh so far. Not available if status is `RunningOnAnotherReplica`. +- `read_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of bytes read during the current refresh. Not available if status is `RunningOnAnotherReplica`. +- `total_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Estimated total number of rows that need to be read by the current refresh. Not available if status is `RunningOnAnotherReplica`. +- `written_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of rows written during the current refresh. Not available if status is `RunningOnAnotherReplica`. +- `written_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number rof bytes written during the current refresh. Not available if status is `RunningOnAnotherReplica`. **Example** diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 5ab7e07fcad..709f4ca95b2 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -226,9 +226,9 @@ Result: ## bitTestAll -Returns result of [logical conjuction](https://en.wikipedia.org/wiki/Logical_conjunction) (AND operator) of all bits at given positions. Counting is right-to-left, starting at 0. +Returns result of [logical conjunction](https://en.wikipedia.org/wiki/Logical_conjunction) (AND operator) of all bits at given positions. Counting is right-to-left, starting at 0. -The conjuction for bit-wise operations: +The conjunction for bit-wise operations: 0 AND 0 = 0 @@ -251,7 +251,7 @@ SELECT bitTestAll(number, index1, index2, index3, index4, ...) **Returned value** -- Result of the logical conjuction. [UInt8](../data-types/int-uint.md). +- Result of the logical conjunction. [UInt8](../data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 9416036aff1..0623e209852 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -316,6 +316,38 @@ Result: Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns null. +**Syntax** + +```sql +toIPv4OrNull(value) +``` + +**Arguments** + +- `value` — The value with IPv4 address. + +**Returned value** + +- `value` converted to the current IPv4 address. [String](../data-types/string.md). + +**Example** + +Query: + +```sql +SELECT + toIPv4OrNull('192.168.0.1') AS s1, + toIPv4OrNull('192.168.0') AS s2 +``` + +Result: + +```response +┌─s1──────────┬─s2───┐ +│ 192.168.0.1 │ ᴺᵁᴸᴸ │ +└─────────────┴──────┘ +``` + ## toIPv6OrDefault(string) Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns `::` (0 IPv6). 
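+
+**Example**
+
+Query (the second value is malformed, so the default `::` is expected):
+
+```sql
+SELECT
+    toIPv6OrDefault('2001:db8::1') AS s1,
+    toIPv6OrDefault('2001:db8::zzz') AS s2
+```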
diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 9d94f040648..11f43649e6e 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -135,15 +135,15 @@ To change SQL security for an existing view, use ALTER TABLE MODIFY SQL SECURITY { DEFINER | INVOKER | NONE } [DEFINER = { user | CURRENT_USER }] ``` -### Examples sql security +### Examples ```sql -CREATE test_view +CREATE VIEW test_view DEFINER = alice SQL SECURITY DEFINER AS SELECT ... ``` ```sql -CREATE test_view +CREATE VIEW test_view SQL SECURITY INVOKER AS SELECT ... ``` @@ -184,14 +184,6 @@ Differences from regular non-refreshable materialized views: The settings in the `REFRESH ... SETTINGS` part of the query are refresh settings (e.g. `refresh_retries`), distinct from regular settings (e.g. `max_threads`). Regular settings can be specified using `SETTINGS` at the end of the query. ::: -:::note -Refreshable materialized views are a work in progress. Setting `allow_experimental_refreshable_materialized_view = 1` is required for creating one. Current limitations: - * not compatible with Replicated database or table engines - * It is not supported in ClickHouse Cloud - * require [Atomic database engine](../../../engines/database-engines/atomic.md), - * no limit on number of concurrent refreshes. -::: - ### Refresh Schedule Example refresh schedules: @@ -203,6 +195,10 @@ REFRESH EVERY 2 WEEK OFFSET 5 DAY 15 HOUR 10 MINUTE -- every other Saturday, at REFRESH EVERY 30 MINUTE -- at 00:00, 00:30, 01:00, 01:30, etc REFRESH AFTER 30 MINUTE -- 30 minutes after the previous refresh completes, no alignment with time of day -- REFRESH AFTER 1 HOUR OFFSET 1 MINUTE -- syntax errror, OFFSET is not allowed with AFTER +REFRESH EVERY 1 WEEK 2 DAYS -- every 9 days, not on any particular day of the week or month; + -- specifically, when day number (since 1969-12-29) is divisible by 9 +REFRESH EVERY 5 MONTHS -- every 5 months, different months each year (as 12 is not divisible by 5); + -- specifically, when month number (since 1970-01) is divisible by 5 ``` `RANDOMIZE FOR` randomly adjusts the time of each refresh, e.g.: @@ -214,6 +210,16 @@ At most one refresh may be running at a time, for a given view. E.g. if a view w Additionally, a refresh is started immediately after the materialized view is created, unless `EMPTY` is specified in the `CREATE` query. If `EMPTY` is specified, the first refresh happens according to schedule. +### In Replicated DB + +If the refreshable materialized view is in a [Replicated database](../../../engines/database-engines/replicated.md), the replicas coordinate with each other such that only one replica performs the refresh at each scheduled time. [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) table engine is required, so that all replicas see the data produced by the refresh. + +In `APPEND` mode, coordination can be disabled using `SETTINGS all_replicas = 1`. This makes replicas do refreshes independently of each other. In this case ReplicatedMergeTree is not required. + +In non-`APPEND` mode, only coordinated refreshing is supported. For uncoordinated, use `Atomic` database and `CREATE ... ON CLUSTER` query to create refreshable materialized views on all replicas. + +The coordination is done through Keeper. 
The znode path is determined by [default_replica_path](../../../operations/server-configuration-parameters/settings.md#default_replica_path) server setting. + ### Dependencies {#refresh-dependencies} `DEPENDS ON` synchronizes refreshes of different tables. By way of example, suppose there's a chain of two refreshable materialized views: @@ -277,6 +283,8 @@ The status of all refreshable materialized views is available in table [`system. To manually stop, start, trigger, or cancel refreshes use [`SYSTEM STOP|START|REFRESH|CANCEL VIEW`](../system.md#refreshable-materialized-views). +To wait for a refresh to complete, use [`SYSTEM WAIT VIEW`](../system.md#refreshable-materialized-views). In particular, useful for waiting for initial refresh after creating a view. + :::note Fun fact: the refresh query is allowed to read from the view that's being refreshed, seeing pre-refresh version of the data. This means you can implement Conway's game of life: https://pastila.nl/?00021a4b/d6156ff819c83d490ad2dcec05676865#O0LGWTO7maUQIA4AcGUtlA== ::: diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index c8f3b987358..c11299baf38 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -233,15 +233,20 @@ Hierarchy of privileges: - `addressToSymbol` - `demangle` - [SOURCES](#sources) + - `AZURE` - `FILE` - - `URL` - - `REMOTE` - - `YSQL` - - `ODBC` - - `JDBC` - `HDFS` - - `S3` + - `HIVE` + - `JDBC` + - `MONGO` + - `MYSQL` + - `ODBC` - `POSTGRES` + - `REDIS` + - `REMOTE` + - `S3` + - `SQLITE` + - `URL` - [dictGet](#dictget) - [displaySecretsInShowAndSelect](#displaysecretsinshowandselect) - [NAMED COLLECTION ADMIN](#named-collection-admin) @@ -510,15 +515,20 @@ Allows using [introspection](../../operations/optimizing-performance/sampling-qu Allows using external data sources. Applies to [table engines](../../engines/table-engines/index.md) and [table functions](../../sql-reference/table-functions/index.md#table-functions). - `SOURCES`. Level: `GROUP` + - `AZURE`. Level: `GLOBAL` - `FILE`. Level: `GLOBAL` - - `URL`. Level: `GLOBAL` - - `REMOTE`. Level: `GLOBAL` - - `YSQL`. Level: `GLOBAL` - - `ODBC`. Level: `GLOBAL` - - `JDBC`. Level: `GLOBAL` - `HDFS`. Level: `GLOBAL` - - `S3`. Level: `GLOBAL` + - `HIVE`. Level: `GLOBAL` + - `JDBC`. Level: `GLOBAL` + - `MONGO`. Level: `GLOBAL` + - `MYSQL`. Level: `GLOBAL` + - `ODBC`. Level: `GLOBAL` - `POSTGRES`. Level: `GLOBAL` + - `REDIS`. Level: `GLOBAL` + - `REMOTE`. Level: `GLOBAL` + - `S3`. Level: `GLOBAL` + - `SQLITE`. Level: `GLOBAL` + - `URL`. Level: `GLOBAL` The `SOURCES` privilege enables use of all the sources. Also you can grant a privilege for each source individually. To use sources, you need additional privileges. diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index ea0735206a1..2b3ce53f1c7 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -565,3 +565,13 @@ If there's a refresh in progress for the given view, interrupt and cancel it. Ot ```sql SYSTEM CANCEL VIEW [db.]name ``` + +### SYSTEM WAIT VIEW + +Waits for the running refresh to complete. If no refresh is running, returns immediately. If the latest refresh attempt failed, reports an error. + +Can be used right after creating a new refreshable materialized view (without EMPTY keyword) to wait for the initial refresh to complete. 
+ +```sql +SYSTEM WAIT VIEW [db.]name +``` diff --git a/docs/zh/sql-reference/functions/bit-functions.md b/docs/zh/sql-reference/functions/bit-functions.md index c1e54892d9a..745758be30b 100644 --- a/docs/zh/sql-reference/functions/bit-functions.md +++ b/docs/zh/sql-reference/functions/bit-functions.md @@ -220,7 +220,7 @@ SELECT bitTest(43, 2); ## bitTestAll {#bittestall} -返回给定位置所有位的 [logical conjuction](https://en.wikipedia.org/wiki/Logical_conjunction) 进行与操作的结果。位值从右到左数,从0开始计数。 +返回给定位置所有位的 [logical conjunction](https://en.wikipedia.org/wiki/Logical_conjunction) 进行与操作的结果。位值从右到左数,从0开始计数。 与运算的结果: diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 494c761ecab..1f99134aa10 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -346,7 +346,9 @@ try processConfig(); adjustSettings(); - initTTYBuffer(toProgressOption(config().getString("progress", "default"))); + initTTYBuffer(toProgressOption(config().getString("progress", "default")), + toProgressOption(config().getString("progress-table", "default"))); + initKeystrokeInterceptor(); ASTAlterCommand::setFormatAlterCommandsWithParentheses(true); { @@ -772,7 +774,7 @@ bool Client::processWithFuzzing(const String & full_query) else this_query_runs = 1; } - else if (const auto * insert = orig_ast->as()) + else if (const auto * /*insert*/ _ = orig_ast->as()) { this_query_runs = 1; queries_for_fuzzed_tables = fuzzer.getInsertQueriesForFuzzedTables(full_query); diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 00d4ee1ca65..278eb7b9181 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -518,7 +518,9 @@ try SCOPE_EXIT({ cleanup(); }); - initTTYBuffer(toProgressOption(getClientConfiguration().getString("progress", "default"))); + initTTYBuffer(toProgressOption(getClientConfiguration().getString("progress", "default")), + toProgressOption(config().getString("progress-table", "default"))); + initKeystrokeInterceptor(); ASTAlterCommand::setFormatAlterCommandsWithParentheses(true); /// try to load user defined executable functions, throw on error and die diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 188dd2c019d..0dbc0c727ab 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -158,6 +158,11 @@ namespace Setting extern const SettingsSeconds send_timeout; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; +} + } namespace CurrentMetrics @@ -599,7 +604,7 @@ void sanityChecks(Server & server) { } - if (server.context()->getMergeTreeSettings().allow_remote_fs_zero_copy_replication) + if (server.context()->getMergeTreeSettings()[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { server.context()->addWarningMessage("The setting 'allow_remote_fs_zero_copy_replication' is enabled for MergeTree tables." " But the feature of 'zero-copy replication' is under development and is not ready for production." 
@@ -628,7 +633,9 @@ void loadStartupScripts(const Poco::Util::AbstractConfiguration & config, Contex auto condition_write_buffer = WriteBufferFromOwnString(); LOG_DEBUG(log, "Checking startup query condition `{}`", condition); - executeQuery(condition_read_buffer, condition_write_buffer, true, context, callback, QueryFlags{ .internal = true }, std::nullopt, {}); + auto startup_context = Context::createCopy(context); + startup_context->makeQueryContext(); + executeQuery(condition_read_buffer, condition_write_buffer, true, startup_context, callback, QueryFlags{ .internal = true }, std::nullopt, {}); auto result = condition_write_buffer.str(); @@ -648,7 +655,9 @@ void loadStartupScripts(const Poco::Util::AbstractConfiguration & config, Contex auto write_buffer = WriteBufferFromOwnString(); LOG_DEBUG(log, "Executing query `{}`", query); - executeQuery(read_buffer, write_buffer, true, context, callback, QueryFlags{ .internal = true }, std::nullopt, {}); + auto startup_context = Context::createCopy(context); + startup_context->makeQueryContext(); + executeQuery(read_buffer, write_buffer, true, startup_context, callback, QueryFlags{ .internal = true }, std::nullopt, {}); } } catch (...) @@ -1125,9 +1134,6 @@ try /// We need to reload server settings because config could be updated via zookeeper. server_settings.loadSettingsFromConfig(config()); - /// NOTE: Do sanity checks after we loaded all possible substitutions (for the configuration) from ZK - sanityChecks(*this); - #if defined(OS_LINUX) std::string executable_path = getExecutablePath(); @@ -2019,6 +2025,11 @@ try if (!filesystem_caches_path.empty()) global_context->setFilesystemCachesPath(filesystem_caches_path); + /// NOTE: Do sanity checks after we loaded all possible substitutions (for the configuration) from ZK + /// Additionally, making the check after the default profile is initialized. + /// It is important to initialize MergeTreeSettings after Settings, to support compatibility for MergeTreeSettings. 
+ sanityChecks(*this); + /// Check sanity of MergeTreeSettings on server startup { size_t background_pool_tasks = global_context->getMergeMutateExecutor()->getMaxTasksCount(); diff --git a/src/Access/AccessControl.cpp b/src/Access/AccessControl.cpp index 093f1a19618..86f814bf5c7 100644 --- a/src/Access/AccessControl.cpp +++ b/src/Access/AccessControl.cpp @@ -22,8 +22,10 @@ #include #include #include +#include #include #include + #include #include #include @@ -133,7 +135,7 @@ public: String{setting_name}, boost::algorithm::join(registered_prefixes, "' or '")); } else - BaseSettingsHelpers::throwSettingNotFound(setting_name); + throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting '{}'", String{setting_name}); } private: diff --git a/src/Access/ContextAccess.h b/src/Access/ContextAccess.h index 3a12a07426b..9f21f6438dd 100644 --- a/src/Access/ContextAccess.h +++ b/src/Access/ContextAccess.h @@ -1,18 +1,17 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include namespace Poco { class Logger; } diff --git a/src/Access/Credentials.h b/src/Access/Credentials.h index 5f6b0269eef..f220b8d2c48 100644 --- a/src/Access/Credentials.h +++ b/src/Access/Credentials.h @@ -1,9 +1,8 @@ #pragma once -#include -#include -#include #include +#include +#include #include "config.h" diff --git a/src/Access/RoleCache.h b/src/Access/RoleCache.h index b5712a24f46..75d1fd32685 100644 --- a/src/Access/RoleCache.h +++ b/src/Access/RoleCache.h @@ -1,10 +1,9 @@ #pragma once -#include -#include -#include #include #include +#include +#include namespace DB diff --git a/src/Access/SettingsConstraints.cpp b/src/Access/SettingsConstraints.cpp index 483dca7e13f..ecf7537f6a9 100644 --- a/src/Access/SettingsConstraints.cpp +++ b/src/Access/SettingsConstraints.cpp @@ -431,8 +431,8 @@ SettingsConstraints::Checker SettingsConstraints::getMergeTreeChecker(std::strin auto full_name = settingFullName(short_name); auto it = constraints.find(resolveSettingNameWithCache(full_name)); if (it == constraints.end()) - return Checker(MergeTreeSettings::Traits::resolveName); // Allowed - return Checker(it->second, MergeTreeSettings::Traits::resolveName); + return Checker(MergeTreeSettings::resolveName); // Allowed + return Checker(it->second, MergeTreeSettings::resolveName); } bool SettingsConstraints::Constraint::operator==(const Constraint & other) const diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index 8f6761766c8..a723511b800 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace DB diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.cpp b/src/AggregateFunctions/AggregateFunctionSumMap.cpp index 9a94c3dfe1a..55db0a43a6e 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.cpp +++ b/src/AggregateFunctions/AggregateFunctionSumMap.cpp @@ -298,12 +298,13 @@ public: Field value = values[col_idx]; /// Compatibility with previous versions. 
- if (value.getType() == Field::Types::Decimal32) + WhichDataType value_type(values_types[col_idx]); + if (value_type.isDecimal32()) { auto source = value.safeGet>(); value = DecimalField(source.getValue(), source.getScale()); } - else if (value.getType() == Field::Types::Decimal64) + else if (value_type.isDecimal64()) { auto source = value.safeGet>(); value = DecimalField(source.getValue(), source.getScale()); @@ -545,7 +546,28 @@ public: } } - bool keepKey(const Field & key) const { return keys_to_keep.contains(key); } + bool keepKey(const Field & key) const + { + if (keys_to_keep.contains(key)) + return true; + + // Determine whether the numerical value of the key can have both types (UInt or Int), + // and use the other type with the same numerical value for keepKey verification. + if (key.getType() == Field::Types::UInt64) + { + const auto & value = key.safeGet(); + if (value <= std::numeric_limits::max()) + return keys_to_keep.contains(Field(Int64(value))); + } + else if (key.getType() == Field::Types::Int64) + { + const auto & value = key.safeGet(); + if (value >= 0) + return keys_to_keep.contains(Field(UInt64(value))); + } + + return false; + } }; diff --git a/src/Analyzer/IQueryTreeNode.cpp b/src/Analyzer/IQueryTreeNode.cpp index cd085babf38..cfba4b41410 100644 --- a/src/Analyzer/IQueryTreeNode.cpp +++ b/src/Analyzer/IQueryTreeNode.cpp @@ -336,7 +336,7 @@ ASTPtr IQueryTreeNode::toAST(const ConvertToASTOptions & options) const { auto converted_node = toASTImpl(options); - if (auto * ast_with_alias = dynamic_cast(converted_node.get())) + if (auto * /*ast_with_alias*/ _ = dynamic_cast(converted_node.get())) converted_node->setAlias(alias); return converted_node; diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp index f78eb2a0b76..abd0a95c6f2 100644 --- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp +++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp @@ -285,7 +285,7 @@ public: return; } - if (const auto * join_node = node->as()) + if (const auto * /*join_node*/ _ = node->as()) { can_wrap_result_columns_with_nullable |= getContext()->getSettingsRef()[Setting::join_use_nulls]; return; diff --git a/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp index f11d5fa003f..e4490789cab 100644 --- a/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp +++ b/src/Analyzer/Passes/OptimizeDateOrDateTimeConverterWithPreimagePass.cpp @@ -77,7 +77,7 @@ public: for (size_t i = 0; i < function->getArguments().getNodes().size(); i++) { - if (const auto * func = function->getArguments().getNodes()[i]->as()) + if (const auto * /*func*/ _ = function->getArguments().getNodes()[i]->as()) { func_id = i; break; diff --git a/src/Analyzer/QueryTreeBuilder.cpp b/src/Analyzer/QueryTreeBuilder.cpp index 08cf0cc4397..39c59d27e2c 100644 --- a/src/Analyzer/QueryTreeBuilder.cpp +++ b/src/Analyzer/QueryTreeBuilder.cpp @@ -676,7 +676,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co result = std::move(query_node); } - else if (const auto * select_with_union_query = expression->as()) + else if (const auto * /*select_with_union_query*/ _ = expression->as()) { auto query_node = buildSelectWithUnionExpression(expression, false /*is_subquery*/, {} /*cte_name*/, context); result = std::move(query_node); diff --git a/src/Analyzer/Resolve/IdentifierResolver.cpp b/src/Analyzer/Resolve/IdentifierResolver.cpp index 
e46c301f9ea..000e7ff391a 100644 --- a/src/Analyzer/Resolve/IdentifierResolver.cpp +++ b/src/Analyzer/Resolve/IdentifierResolver.cpp @@ -7,6 +7,8 @@ #include #include +#include +#include #include #include @@ -417,9 +419,16 @@ QueryTreeNodePtr IdentifierResolver::tryResolveTableIdentifierFromDatabaseCatalo bool is_temporary_table = storage_id.getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE; StoragePtr storage; + TableLockHolder storage_lock; if (is_temporary_table) storage = DatabaseCatalog::instance().getTable(storage_id, context); + else if (auto refresh_task = context->getRefreshSet().tryGetTaskForInnerTable(storage_id)) + { + /// If table is the target of a refreshable materialized view, it needs additional + /// synchronization to make sure we see all of the data (e.g. if refresh happened on another replica). + std::tie(storage, storage_lock) = refresh_task->getAndLockTargetTable(storage_id, context); + } else storage = DatabaseCatalog::instance().tryGetTable(storage_id, context); @@ -434,7 +443,8 @@ QueryTreeNodePtr IdentifierResolver::tryResolveTableIdentifierFromDatabaseCatalo if (!storage) return {}; - auto storage_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef()[Setting::lock_acquire_timeout]); + if (!storage_lock) + storage_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef()[Setting::lock_acquire_timeout]); auto storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context); auto result = std::make_shared(std::move(storage), std::move(storage_lock), std::move(storage_snapshot)); if (is_temporary_table) diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index 7dc1d99efd0..2dba0c4d9ad 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -1951,7 +1951,7 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher( { bool table_expression_in_resolve_process = nearest_query_scope->table_expressions_in_resolve_process.contains(table_expression.get()); - if (auto * array_join_node = table_expression->as()) + if (auto * /*array_join_node*/ _ = table_expression->as()) { if (table_expressions_column_nodes_with_names_stack.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, @@ -4045,9 +4045,10 @@ ProjectionNames QueryAnalyzer::resolveSortNodeList(QueryTreeNodePtr & sort_node_ const auto * constant_node = sort_node.getFillTo()->as(); if (!constant_node || !isColumnedAsNumber(constant_node->getResultType())) - throw Exception(ErrorCodes::INVALID_WITH_FILL_EXPRESSION, + throw Exception( + ErrorCodes::INVALID_WITH_FILL_EXPRESSION, "Sort FILL TO expression must be constant with numeric type. Actual {}. 
In scope {}", - sort_node.getFillFrom()->formatASTForErrorMessage(), + sort_node.getFillTo()->formatASTForErrorMessage(), scope.scope_node->formatASTForErrorMessage()); size_t fill_to_expression_projection_names_size = fill_to_expression_projection_names.size(); diff --git a/src/Analyzer/Resolve/QueryExpressionsAliasVisitor.h b/src/Analyzer/Resolve/QueryExpressionsAliasVisitor.h index 45d081e34ea..31b2f07b97a 100644 --- a/src/Analyzer/Resolve/QueryExpressionsAliasVisitor.h +++ b/src/Analyzer/Resolve/QueryExpressionsAliasVisitor.h @@ -45,7 +45,7 @@ public: bool needChildVisit(const QueryTreeNodePtr &, const QueryTreeNodePtr & child) { - if (auto * lambda_node = child->as()) + if (auto * /*lambda_node*/ _ = child->as()) { updateAliasesIfNeeded(child, true /*is_lambda_node*/); return false; diff --git a/src/Client/ClientApplicationBase.cpp b/src/Client/ClientApplicationBase.cpp index df78b890c5e..d26641fe5f9 100644 --- a/src/Client/ClientApplicationBase.cpp +++ b/src/Client/ClientApplicationBase.cpp @@ -171,6 +171,8 @@ void ClientApplicationBase::init(int argc, char ** argv) ("stage", po::value()->default_value("complete"), "Request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation,with_mergeable_state_after_aggregation_and_limit") ("progress", po::value()->implicit_value(ProgressOption::TTY, "tty")->default_value(ProgressOption::DEFAULT, "default"), "Print progress of queries execution - to TTY: tty|on|1|true|yes; to STDERR non-interactive mode: err; OFF: off|0|false|no; DEFAULT - interactive to TTY, non-interactive is off") + ("progress-table", po::value()->implicit_value(ProgressOption::TTY, "tty")->default_value(ProgressOption::DEFAULT, "default"), "Print a progress table with changing metrics during query execution - to TTY: tty|on|1|true|yes; to STDERR non-interactive mode: err; OFF: off|0|false|no; DEFAULT - interactive to TTY, non-interactive is off.") + ("enable-progress-table-toggle", po::value()->default_value(true), "Enable toggling of the progress table by pressing the control key (Space). Only applicable in interactive mode with the progress table enabled.") ("disable_suggestion,A", "Disable loading suggestion data. Note that suggestion data is loaded asynchronously through a second connection to ClickHouse server. Also it is reasonable to disable suggestion if you want to paste a query with TAB characters. 
Shorthand option -A is for those who get used to mysql client.") ("wait_for_suggestions_to_load", "Load suggestion data synchonously.") @@ -316,6 +318,26 @@ void ClientApplicationBase::init(int argc, char ** argv) break; } } + if (options.count("progress-table")) + { + switch (options["progress-table"].as()) + { + case DEFAULT: + config().setString("progress-table", "default"); + break; + case OFF: + config().setString("progress-table", "off"); + break; + case TTY: + config().setString("progress-table", "tty"); + break; + case ERR: + config().setString("progress-table", "err"); + break; + } + } + if (options.count("enable-progress-table-toggle")) + getClientConfiguration().setBool("enable-progress-table-toggle", options["enable-progress-table-toggle"].as()); if (options.count("echo")) getClientConfiguration().setBool("echo", true); if (options.count("disable_suggestion")) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 155e65ff568..2377a013425 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1,8 +1,9 @@ #include -#include #include -#include #include +#include +#include +#include #include #include @@ -288,6 +289,7 @@ ClientBase::ClientBase( : std_in(in_fd_) , std_out(out_fd_) , progress_indication(output_stream_, in_fd_, err_fd_) + , progress_table(output_stream_, in_fd_, err_fd_) , in_fd(in_fd_) , out_fd(out_fd_) , err_fd(err_fd_) @@ -438,6 +440,8 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query) /// If results are written INTO OUTFILE, we can avoid clearing progress to avoid flicker. if (need_render_progress && tty_buf && (!select_into_file || select_into_file_and_stdout)) progress_indication.clearProgressOutput(*tty_buf); + if (need_render_progress_table && tty_buf && (!select_into_file || select_into_file_and_stdout)) + progress_table.clearTableOutput(*tty_buf); try { @@ -453,13 +457,20 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query) /// Received data block is immediately displayed to the user. output_format->flush(); - /// Restore progress bar after data block. + /// Restore progress bar and progress table after data block. 
if (need_render_progress && tty_buf) { if (select_into_file && !select_into_file_and_stdout) error_stream << "\r"; progress_indication.writeProgress(*tty_buf); } + if (need_render_progress_table && tty_buf) + { + if (!need_render_progress && select_into_file && !select_into_file_and_stdout) + error_stream << "\r"; + bool toggle_enabled = getClientConfiguration().getBool("enable-progress-table-toggle", true); + progress_table.writeTable(*tty_buf, show_progress_table.load(), toggle_enabled); + } } @@ -468,6 +479,8 @@ void ClientBase::onLogData(Block & block) initLogsOutputStream(); if (need_render_progress && tty_buf) progress_indication.clearProgressOutput(*tty_buf); + if (need_render_progress_table && tty_buf) + progress_table.clearTableOutput(*tty_buf); logs_out_stream->writeLogs(block); logs_out_stream->flush(); } @@ -796,16 +809,23 @@ void ClientBase::setDefaultFormatsAndCompressionFromConfiguration() } } -void ClientBase::initTTYBuffer(ProgressOption progress) +void ClientBase::initTTYBuffer(ProgressOption progress_option, ProgressOption progress_table_option) { if (tty_buf) return; - if (progress == ProgressOption::OFF || (!is_interactive && progress == ProgressOption::DEFAULT)) - { - need_render_progress = false; - return; - } + if (progress_option == ProgressOption::OFF || (!is_interactive && progress_option == ProgressOption::DEFAULT)) + need_render_progress = false; + + if (progress_table_option == ProgressOption::OFF || (!is_interactive && progress_table_option == ProgressOption::DEFAULT)) + need_render_progress_table = false; + + if (!need_render_progress && !need_render_progress_table) + return; + + /// If need_render_progress and need_render_progress_table are enabled, + /// use ProgressOption that was set for the progress bar for progress table as well. + ProgressOption progress = progress_option ? progress_option : progress_table_option; static constexpr auto tty_file_name = "/dev/tty"; @@ -851,7 +871,20 @@ void ClientBase::initTTYBuffer(ProgressOption progress) tty_buf = std::make_unique(STDERR_FILENO, buf_size); } else + { need_render_progress = false; + need_render_progress_table = false; + } +} + +void ClientBase::initKeystrokeInterceptor() +{ + if (is_interactive && need_render_progress_table && getClientConfiguration().getBool("enable-progress-table-toggle", true)) + { + keystroke_interceptor = std::make_unique(in_fd, error_stream); + keystroke_interceptor->registerCallback(' ', [this]() { show_progress_table = !show_progress_table; }); + + } } void ClientBase::updateSuggest(const ASTPtr & ast) @@ -1115,6 +1148,34 @@ void ClientBase::receiveResult(ASTPtr parsed_query, Int32 signals_before_stop, b std::exception_ptr local_format_error; + if (keystroke_interceptor) + { + try + { + keystroke_interceptor->startIntercept(); + } + catch (const DB::Exception &) + { + error_stream << getCurrentExceptionMessage(false); + keystroke_interceptor.reset(); + } + } + + SCOPE_EXIT({ + if (keystroke_interceptor) + { + try + { + keystroke_interceptor->stopIntercept(); + } + catch (...) 
+ { + error_stream << getCurrentExceptionMessage(false); + keystroke_interceptor.reset(); + } + } + }); + while (true) { Stopwatch receive_watch(CLOCK_MONOTONIC_COARSE); @@ -1266,6 +1327,8 @@ void ClientBase::onEndOfStream() { if (need_render_progress && tty_buf) progress_indication.clearProgressOutput(*tty_buf); + if (need_render_progress_table && tty_buf) + progress_table.clearTableOutput(*tty_buf); if (output_format) { @@ -1344,9 +1407,15 @@ void ClientBase::onProfileEvents(Block & block) thread_times[host_name].peak_memory_usage = value; } progress_indication.updateThreadEventData(thread_times); + progress_table.updateTable(block); if (need_render_progress && tty_buf) progress_indication.writeProgress(*tty_buf); + if (need_render_progress_table && tty_buf) + { + bool toggle_enabled = getClientConfiguration().getBool("enable-progress-table-toggle", true); + progress_table.writeTable(*tty_buf, show_progress_table.load(), toggle_enabled); + } if (profile_events.print) { @@ -1357,6 +1426,8 @@ void ClientBase::onProfileEvents(Block & block) initLogsOutputStream(); if (need_render_progress && tty_buf) progress_indication.clearProgressOutput(*tty_buf); + if (need_render_progress_table && tty_buf) + progress_table.clearTableOutput(*tty_buf); logs_out_stream->writeProfileEvents(block); logs_out_stream->flush(); @@ -1838,6 +1909,8 @@ void ClientBase::cancelQuery() connection->sendCancel(); if (need_render_progress && tty_buf) progress_indication.clearProgressOutput(*tty_buf); + if (need_render_progress_table && tty_buf) + progress_table.clearTableOutput(*tty_buf); if (is_interactive) output_stream << "Cancelling query." << std::endl; @@ -1904,6 +1977,7 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin processed_rows = 0; written_first_block = false; progress_indication.resetProgress(); + progress_table.resetTable(); profile_events.watch.restart(); { @@ -2030,6 +2104,8 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin initLogsOutputStream(); if (need_render_progress && tty_buf) progress_indication.clearProgressOutput(*tty_buf); + if (need_render_progress_table && tty_buf) + progress_table.clearTableOutput(*tty_buf); logs_out_stream->writeProfileEvents(profile_events.last_block); logs_out_stream->flush(); @@ -2043,6 +2119,8 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin output_stream << processed_rows << " row" << (processed_rows == 1 ? "" : "s") << " in set. "; output_stream << "Elapsed: " << progress_indication.elapsedSeconds() << " sec. 
"; progress_indication.writeFinalProgress(); + if (need_render_progress_table && show_progress_table) + progress_table.writeFinalTable(); output_stream << std::endl << std::endl; } else @@ -2498,7 +2576,7 @@ bool ClientBase::addMergeTreeSettings(ASTCreateQuery & ast_create) || ast_create.storage->engine->name.find("MergeTree") == std::string::npos) return false; - auto all_changed = cmd_merge_tree_settings.allChanged(); + auto all_changed = cmd_merge_tree_settings.changes(); if (all_changed.begin() == all_changed.end()) return false; @@ -2512,11 +2590,11 @@ bool ClientBase::addMergeTreeSettings(ASTCreateQuery & ast_create) auto & storage_settings = *ast_create.storage->settings; bool added_new_setting = false; - for (const auto & setting : all_changed) + for (const auto & change : all_changed) { - if (!storage_settings.changes.tryGet(setting.getName())) + if (!storage_settings.changes.tryGet(change.name)) { - storage_settings.changes.emplace_back(setting.getName(), setting.getValue()); + storage_settings.changes.emplace_back(change.name, change.value); added_new_setting = true; } } diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 45251aea28a..e0fcce7153d 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -1,21 +1,22 @@ #pragma once +#include #include -#include -#include -#include -#include -#include -#include #include #include +#include +#include #include #include #include -#include -#include #include +#include +#include +#include +#include +#include +#include #include #include @@ -68,6 +69,7 @@ ProgressOption toProgressOption(std::string progress); std::istream& operator>> (std::istream & in, ProgressOption & progress); class InternalTextLogs; +class TerminalKeystrokeInterceptor; class WriteBufferFromFileDescriptor; /** @@ -245,7 +247,8 @@ protected: void setDefaultFormatsAndCompressionFromConfiguration(); - void initTTYBuffer(ProgressOption progress); + void initTTYBuffer(ProgressOption progress_option, ProgressOption progress_table_option); + void initKeystrokeInterceptor(); /// Should be one of the first, to be destroyed the last, /// since other members can use them. @@ -255,6 +258,8 @@ protected: /// Client context is a context used only by the client to parse queries, process query parameters and to connect to clickhouse-server. ContextMutablePtr client_context; + std::unique_ptr keystroke_interceptor; + bool is_interactive = false; /// Use either interactive line editing interface or batch mode. bool delayed_interactive = false; @@ -332,7 +337,10 @@ protected: String server_display_name; ProgressIndication progress_indication; + ProgressTable progress_table; bool need_render_progress = true; + bool need_render_progress_table = true; + std::atomic_bool show_progress_table = false; bool need_render_profile_events = true; bool written_first_block = false; size_t processed_rows = 0; /// How many rows have been read or written. 
diff --git a/src/Client/ClientBaseOptimizedParts.cpp b/src/Client/ClientBaseOptimizedParts.cpp index 4222aab63b2..ac4d3417779 100644 --- a/src/Client/ClientBaseOptimizedParts.cpp +++ b/src/Client/ClientBaseOptimizedParts.cpp @@ -1,5 +1,4 @@ #include -#include namespace DB { @@ -19,17 +18,6 @@ namespace ErrorCodes namespace { -/// Define transparent hash to we can use -/// std::string_view with the containers -struct TransparentStringHash -{ - using is_transparent = void; - size_t operator()(std::string_view txt) const - { - return std::hash{}(txt); - } -}; - /* * This functor is used to parse command line arguments and replace dashes with underscores, * allowing options to be specified using either dashes or underscores. @@ -89,41 +77,8 @@ void ClientApplicationBase::parseAndCheckOptions(OptionsDescription & options_de if (allow_merge_tree_settings) { - /// Add merge tree settings manually, because names of some settings - /// may clash. Query settings have higher priority and we just - /// skip ambiguous merge tree settings. auto & main_options = options_description.main_description.value(); - - std::unordered_set> main_option_names; - for (const auto & option : main_options.options()) - main_option_names.insert(option->long_name()); - - for (const auto & setting : cmd_merge_tree_settings.all()) - { - const auto add_setting = [&](const std::string_view name) - { - if (auto it = main_option_names.find(name); it != main_option_names.end()) - return; - - if (allow_repeated_settings) - addProgramOptionAsMultitoken(cmd_merge_tree_settings, main_options, name, setting); - else - addProgramOption(cmd_merge_tree_settings, main_options, name, setting); - }; - - const auto & setting_name = setting.getName(); - - add_setting(setting_name); - - const auto & settings_to_aliases = MergeTreeSettings::Traits::settingsToAliases(); - if (auto it = settings_to_aliases.find(setting_name); it != settings_to_aliases.end()) - { - for (const auto alias : it->second) - { - add_setting(alias); - } - } - } + cmd_merge_tree_settings.addToProgramOptionsIfNotPresent(main_options, allow_repeated_settings); } /// Parse main commandline options. diff --git a/src/Client/ProgressTable.cpp b/src/Client/ProgressTable.cpp new file mode 100644 index 00000000000..6df30866e2e --- /dev/null +++ b/src/Client/ProgressTable.cpp @@ -0,0 +1,474 @@ +#include "ProgressTable.h" +#include "Common/AllocatorWithMemoryTracking.h" +#include "Common/ProfileEvents.h" +#include "base/defines.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} + +namespace +{ + +constexpr UInt64 THREAD_GROUP_ID = 0; + +constexpr std::string_view CLEAR_TO_END_OF_LINE = "\033[K"; +constexpr std::string_view CLEAR_TO_END_OF_SCREEN = "\033[0J"; +constexpr std::string_view RESET_COLOR = "\033[0m"; +constexpr std::string_view HIDE_CURSOR = "\033[?25l"; +constexpr std::string_view SHOW_CURSOR = "\033[?25h"; + +std::string moveUpNLines(size_t N) +{ + return std::format("\033[{}A", N); +} + +std::string formatReadableValue(ProfileEvents::ValueType value_type, double value) +{ + switch (value_type) + { + case ProfileEvents::ValueType::Number: + return formatReadableQuantity(value, /*precision*/ std::floor(value) == value && fabs(value) < 1000 ? 
0 : 2); + case ProfileEvents::ValueType::Bytes: + return formatReadableSizeWithDecimalSuffix(value); + case ProfileEvents::ValueType::Nanoseconds: + return formatReadableTime(value); + case ProfileEvents::ValueType::Microseconds: + return formatReadableTime(value * 1e3); + case ProfileEvents::ValueType::Milliseconds: + return formatReadableTime(value * 1e6); + } +} + +const std::unordered_map & getEventNameToEvent() +{ + /// TODO: MemoryTracker::USAGE_EVENT_NAME and PEAK_USAGE_EVENT_NAME + static std::unordered_map event_name_to_event; + + if (!event_name_to_event.empty()) + return event_name_to_event; + + for (ProfileEvents::Event event = ProfileEvents::Event(0); event < ProfileEvents::end(); ++event) + { + event_name_to_event.emplace(ProfileEvents::getName(event), event); + } + + return event_name_to_event; +} + + +std::string_view setColorForProgress(double progress, double max_progress) +{ + constexpr std::array colors = { + "\033[38;5;236m", /// Dark Grey + "\033[38;5;250m", /// Light Grey + "\033[38;5;34m", /// Green + "\033[38;5;226m", /// Yellow + "\033[1;33m", /// Bold + }; + + constexpr std::array fractions = { + 0.05, + 0.20, + 0.80, + 0.95, + }; + + if (max_progress == 0) + return colors.front(); + + auto fraction = progress / max_progress; + auto dist = std::upper_bound(fractions.begin(), fractions.end(), fraction) - fractions.begin(); + return colors[dist]; +} + +std::string_view setColorForBytesBasedMetricsProgress(double progress) +{ + constexpr std::array colors = { + "\033[38;5;236m", /// Dark Grey + "\033[38;5;250m", /// Light Grey + "\033[38;5;34m", /// Green + "\033[38;5;226m", /// Yellow + "\033[38;5;208m", /// Orange + "\033[1;33m", /// Bold + "\033[38;5;160m", /// Red: corresponds to >= 1T/s. Not a practical scenario. + }; + + /// Bytes. + constexpr std::array thresholds = { + 1LL << 20, + 100LL << 20, + 1'000LL << 20, + 10'000LL << 20, + 100'000LL << 20, + 1'000'000LL << 20, + }; + + auto dist = std::upper_bound(thresholds.begin(), thresholds.end(), progress) - thresholds.begin(); + return colors[dist]; +} + +std::string_view setColorForTimeBasedMetricsProgress(ProfileEvents::ValueType value_type, double progress) +{ + /// Time units in a second. 
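+    /// (1e3 for Milliseconds, 1e6 for Microseconds, 1e9 for Nanoseconds), so the color thresholds below can be expressed in seconds and scaled into the metric's own units.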
+ auto units = [](ProfileEvents::ValueType t) -> double + { + switch (t) + { + case ProfileEvents::ValueType::Milliseconds: + return 1e3; + case ProfileEvents::ValueType::Microseconds: + return 1e6; + case ProfileEvents::ValueType::Nanoseconds: + return 1e9; + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong value type, expecting time units"); + } + }(value_type); + + constexpr std::array colors = { + "\033[38;5;236m", /// Dark Grey + "\033[38;5;250m", /// Light Grey + "\033[38;5;34m", /// Green + "\033[38;5;226m", /// Yellow + "\033[1;33m" /// Bold + }; + + const std::array thresholds = {0.001 * units, 0.01 * units, 0.1 * units, 1.0 * units}; + + auto dist = std::upper_bound(thresholds.begin(), thresholds.end(), progress) - thresholds.begin(); + return colors[dist]; +} + +std::string_view setColorForStaleMetrics() +{ + return "\033[38;5;236m"; /// Dark Grey +} + +std::string_view setColorForDocumentation() +{ + return "\033[38;5;236m"; /// Dark Grey +} + +template +void writeWithWidth(Out & out, std::string_view s, size_t width) +{ + if (s.size() >= width) + out << s << " "; + else + out << s << std::string(width - s.size(), ' '); +} + +template +void writeWithWidthStrict(Out & out, std::string_view s, size_t width) +{ + chassert(width != 0); + if (s.size() > width) + out << s.substr(0, width - 1) << "…"; + else + out << s; +} + +} + +void ProgressTable::writeTable(WriteBufferFromFileDescriptor & message, bool show_table, bool toggle_enabled) +{ + std::lock_guard lock{mutex}; + if (!show_table) + { + if (written_first_block) + message << CLEAR_TO_END_OF_SCREEN; + + if (toggle_enabled) + { + message << HIDE_CURSOR; + message << "\n"; + message << "Press the space key to toggle the display of the progress table."; + message << moveUpNLines(1); + message.next(); + } + return; + } + + const auto & event_name_to_event = getEventNameToEvent(); + + size_t terminal_width = getTerminalWidth(in_fd, err_fd); + if (terminal_width < column_event_name_width + COLUMN_VALUE_WIDTH + COLUMN_PROGRESS_WIDTH) + return; + + if (metrics.empty()) + return; + + message << HIDE_CURSOR; + message << "\n"; + writeWithWidth(message, COLUMN_EVENT_NAME, column_event_name_width); + writeWithWidth(message, COLUMN_VALUE, COLUMN_VALUE_WIDTH); + writeWithWidth(message, COLUMN_PROGRESS, COLUMN_PROGRESS_WIDTH); + writeWithWidth(message, COLUMN_DOCUMENTATION_NAME, COLUMN_DOCUMENTATION_WIDTH); + message << CLEAR_TO_END_OF_LINE; + + double elapsed_sec = watch.elapsedSeconds(); + + for (auto & [name, per_host_info] : metrics) + { + message << "\n"; + if (per_host_info.isStale(elapsed_sec)) + message << setColorForStaleMetrics(); + writeWithWidth(message, name, column_event_name_width); + + auto value = per_host_info.getSummaryValue(); + auto value_type = getValueType(event_name_to_event.at(name)); + writeWithWidth(message, formatReadableValue(value_type, value), COLUMN_VALUE_WIDTH); + + /// Get the maximum progress before it is updated in getSummaryProgress. 
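+        /// For Number-type metrics, setColorForProgress() colors the current rate relative to this maximum.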
+ auto max_progress = per_host_info.getMaxProgress(); + auto progress = per_host_info.getSummaryProgress(elapsed_sec); + switch (value_type) + { + case ProfileEvents::ValueType::Number: + message << setColorForProgress(progress, max_progress); + break; + case ProfileEvents::ValueType::Bytes: + message << setColorForBytesBasedMetricsProgress(progress); + break; + case ProfileEvents::ValueType::Milliseconds: + [[fallthrough]]; + case ProfileEvents::ValueType::Microseconds: + [[fallthrough]]; + case ProfileEvents::ValueType::Nanoseconds: + message << setColorForTimeBasedMetricsProgress(value_type, progress); + break; + } + + writeWithWidth(message, formatReadableValue(value_type, progress) + "/s", COLUMN_PROGRESS_WIDTH); + + message << setColorForDocumentation(); + const auto * doc = getDocumentation(event_name_to_event.at(name)); + writeWithWidthStrict(message, doc, COLUMN_DOCUMENTATION_WIDTH); + + message << RESET_COLOR; + message << CLEAR_TO_END_OF_LINE; + } + + message << moveUpNLines(tableSize()); + message.next(); +} + +void ProgressTable::writeFinalTable() +{ + std::lock_guard lock{mutex}; + const auto & event_name_to_event = getEventNameToEvent(); + + size_t terminal_width = getTerminalWidth(in_fd, err_fd); + if (terminal_width < column_event_name_width + COLUMN_VALUE_WIDTH) + return; + + if (metrics.empty()) + return; + + output_stream << "\n"; + writeWithWidth(output_stream, COLUMN_EVENT_NAME, column_event_name_width); + writeWithWidth(output_stream, COLUMN_VALUE, COLUMN_VALUE_WIDTH); + + for (auto & [name, per_host_info] : metrics) + { + output_stream << "\n"; + writeWithWidth(output_stream, name, column_event_name_width); + + auto value = per_host_info.getSummaryValue(); + auto value_type = getValueType(event_name_to_event.at(name)); + writeWithWidth(output_stream, formatReadableValue(value_type, value), COLUMN_VALUE_WIDTH); + } +} + +void ProgressTable::updateTable(const Block & block) +{ + const auto & array_thread_id = typeid_cast(*block.getByName("thread_id").column).getData(); + const auto & names = typeid_cast(*block.getByName("name").column); + const auto & host_names = typeid_cast(*block.getByName("host_name").column); + const auto & array_values = typeid_cast(*block.getByName("value").column).getData(); + const auto & array_type = typeid_cast(*block.getByName("type").column).getData(); + + const double time_now = watch.elapsedSeconds(); + size_t max_event_name_width = COLUMN_EVENT_NAME.size(); + + std::lock_guard lock{mutex}; + const auto & event_name_to_event = getEventNameToEvent(); + for (size_t row_num = 0, rows = block.rows(); row_num < rows; ++row_num) + { + auto thread_id = array_thread_id[row_num]; + + /// In ProfileEvents packets thread id 0 specifies common profiling information + /// for all threads executing current query on specific host. So instead of summing per thread + /// consumption it's enough to look for data with thread id 0. + if (thread_id != THREAD_GROUP_ID) + continue; + + auto value = array_values[row_num]; + auto name = names.getDataAt(row_num).toString(); + auto host_name = host_names.getDataAt(row_num).toString(); + auto type = static_cast(array_type[row_num]); + + /// Got unexpected event name. + if (!event_name_to_event.contains(name)) + continue; + + /// Store non-zero values. + if (value == 0) + continue; + + auto it = metrics.find(name); + + /// If the table has already been written, then do not add new metrics to avoid jitter. 
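+        /// (a new row appearing mid-query would change the table height and make the redrawn output jump).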
+ if (it == metrics.end() && written_first_block) + continue; + + if (!written_first_block) + it = metrics.try_emplace(name).first; + + it->second.updateHostValue(host_name, type, value, time_now); + + max_event_name_width = std::max(max_event_name_width, name.size()); + } + + if (!written_first_block) + column_event_name_width = max_event_name_width + 1; + + written_first_block = true; +} + +void ProgressTable::clearTableOutput(WriteBufferFromFileDescriptor & message) +{ + message << CLEAR_TO_END_OF_SCREEN; + message << SHOW_CURSOR; + message.next(); +} + +void ProgressTable::resetTable() +{ + std::lock_guard lock{mutex}; + watch.restart(); + metrics.clear(); + written_first_block = false; +} + +size_t ProgressTable::tableSize() const +{ + /// Number of lines + header. + return metrics.empty() ? 0 : metrics.size() + 1; +} + +ProgressTable::MetricInfo::MetricInfo(ProfileEvents::Type t) : type(t) +{ +} + +void ProgressTable::MetricInfo::updateValue(Int64 new_value, double new_time) +{ + /// If the value has not been updated for a long time, + /// reset the time in snapshots to one second ago. + if (new_time - new_snapshot.time >= 0.5 || new_snapshot.time == 0) + { + prev_shapshot = {new_snapshot.value, new_time - 1.0}; + cur_shapshot = {new_snapshot.value, new_time - 1.0}; + } + + switch (type) + { + case ProfileEvents::Type::INCREMENT: + new_snapshot.value = new_snapshot.value + new_value; + break; + case ProfileEvents::Type::GAUGE: + new_snapshot.value = new_value; + break; + } + new_snapshot.time = new_time; + + if (new_snapshot.time - cur_shapshot.time >= 0.5) + prev_shapshot = std::exchange(cur_shapshot, new_snapshot); + + update_time = new_time; +} + +bool ProgressTable::MetricInfo::isStale(double now) const +{ + return update_time != 0 && now - update_time >= 5.0; +} + +double ProgressTable::MetricInfo::calculateProgress(double time_now) const +{ + /// If the value has not been updated for a long time, the progress is 0. 
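+    /// Otherwise the rate is the delta between the two lagged snapshots, e.g. prev = {40, t = 1.0} and cur = {100, t = 1.5} yield (100 - 40) / (1.5 - 1.0) = 120 units per second.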
+ if (time_now - new_snapshot.time >= 0.5) + return 0; + + return (cur_shapshot.value - prev_shapshot.value) / (cur_shapshot.time - prev_shapshot.time); +} + +double ProgressTable::MetricInfo::getValue() const +{ + return new_snapshot.value; +} + +void ProgressTable::MetricInfoPerHost::updateHostValue(const HostName & host, ProfileEvents::Type type, Int64 new_value, double new_time) +{ + auto it = host_to_metric.find(host); + if (it == host_to_metric.end()) + it = host_to_metric.emplace(host, type).first; + it->second.updateValue(new_value, new_time); +} + +double ProgressTable::MetricInfoPerHost::getSummaryValue() +{ + return std::accumulate( + host_to_metric.cbegin(), + host_to_metric.cend(), + 0.0, + [](double acc, const auto & host_data) + { + const MetricInfo & info = host_data.second; + return acc + info.getValue(); + }); +} + +double ProgressTable::MetricInfoPerHost::getSummaryProgress(double time_now) +{ + auto progress = std::accumulate( + host_to_metric.cbegin(), + host_to_metric.cend(), + 0.0, + [time_now](double acc, const auto & host_data) + { + const MetricInfo & info = host_data.second; + return acc + info.calculateProgress(time_now); + }); + max_progress = std::max(max_progress, progress); + return progress; +} + +double ProgressTable::MetricInfoPerHost::getMaxProgress() const +{ + return max_progress; +} + +bool ProgressTable::MetricInfoPerHost::isStale(double now) const +{ + return std::all_of(host_to_metric.cbegin(), host_to_metric.cend(), [&now](const auto & p) { return p.second.isStale(now); }); +} +} diff --git a/src/Client/ProgressTable.h b/src/Client/ProgressTable.h new file mode 100644 index 00000000000..a55326e8d3a --- /dev/null +++ b/src/Client/ProgressTable.h @@ -0,0 +1,120 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class WriteBufferFromFileDescriptor; +class Block; + +class ProgressTable +{ +public: + explicit ProgressTable(std::ostream & output_stream_, int in_fd_ = STDIN_FILENO, int err_fd_ = STDERR_FILENO) + : output_stream(output_stream_), in_fd(in_fd_), err_fd(err_fd_) + { + } + + /// Write progress table with metrics. + void writeTable(WriteBufferFromFileDescriptor & message, bool show_table, bool toggle_enabled); + void clearTableOutput(WriteBufferFromFileDescriptor & message); + void writeFinalTable(); + + /// Update the metric values. They can be updated from: + /// onProfileEvents in clickhouse-client; + void updateTable(const Block & block); + + /// Reset progress table values. + void resetTable(); + +private: + class MetricInfo + { + public: + explicit MetricInfo(ProfileEvents::Type t); + + void updateValue(Int64 new_value, double new_time); + double calculateProgress(double time_now) const; + double getValue() const; + bool isStale(double now) const; + + private: + const ProfileEvents::Type type; + + struct Snapshot + { + Int64 value = 0; + double time = 0; + }; + + /// The previous and current snapshots are used by `calculateProgress`. + /// They contain information that is outdated by about a second. + /// The new snapshot is used by `updateValue` and `getValue`. + /// We don't use a new snapshot in `calculateProgress` because the time elapsed since + /// the previous update may be very small, causing jitter. 
+ Snapshot prev_shapshot; + Snapshot cur_shapshot; + Snapshot new_snapshot; + + double update_time = 0.0; + }; + + class MetricInfoPerHost + { + public: + using HostName = String; + + void updateHostValue(const HostName & host, ProfileEvents::Type type, Int64 new_value, double new_time); + double getSummaryValue(); + double getSummaryProgress(double time_now); + double getMaxProgress() const; + bool isStale(double now) const; + + private: + std::unordered_map host_to_metric; + double max_progress = 0; + }; + + size_t tableSize() const; + + using MetricName = String; + + /// The server periodically sends Block with profile events. + /// This information is stored here. + std::map metrics; + + /// It is possible concurrent access to the metrics. + std::mutex mutex; + + /// Track query execution time on client. + Stopwatch watch; + + bool written_first_block = false; + + size_t column_event_name_width = 20; + + static constexpr std::string_view COLUMN_EVENT_NAME = "Event name"; + static constexpr std::string_view COLUMN_VALUE = "Value"; + static constexpr std::string_view COLUMN_PROGRESS = "Progress"; + static constexpr std::string_view COLUMN_DOCUMENTATION_NAME = "Documentation"; + static constexpr size_t COLUMN_VALUE_WIDTH = 20; + static constexpr size_t COLUMN_PROGRESS_WIDTH = 20; + static constexpr size_t COLUMN_DOCUMENTATION_WIDTH = 100; + + std::ostream & output_stream; + int in_fd; + int err_fd; +}; + +} diff --git a/src/Client/TerminalKeystrokeInterceptor.cpp b/src/Client/TerminalKeystrokeInterceptor.cpp new file mode 100644 index 00000000000..ed4db02f546 --- /dev/null +++ b/src/Client/TerminalKeystrokeInterceptor.cpp @@ -0,0 +1,118 @@ +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace DB::ErrorCodes +{ +extern const int SYSTEM_ERROR; +} + +namespace DB +{ + +TerminalKeystrokeInterceptor::TerminalKeystrokeInterceptor(int fd_, std::ostream & error_stream_) : fd(fd_), error_stream(error_stream_) +{ +} + +TerminalKeystrokeInterceptor::~TerminalKeystrokeInterceptor() +{ + try + { + stopIntercept(); + } + catch (...) + { + error_stream << getCurrentExceptionMessage(false); + } +} + +void TerminalKeystrokeInterceptor::registerCallback(char key, TerminalKeystrokeInterceptor::Callback cb) +{ + callbacks.emplace(key, cb); +} + +void TerminalKeystrokeInterceptor::startIntercept() +{ + std::lock_guard lock(mutex); + + if (intercept_thread && intercept_thread->joinable()) + return; + + chassert(!orig_termios); + + stop_requested = false; + + /// Save terminal state. + orig_termios = std::make_unique(); + if (tcgetattr(fd, orig_termios.get())) + throw DB::ErrnoException( + DB::ErrorCodes::SYSTEM_ERROR, "Cannot get the state of the terminal referred to by file descriptor '{}'", fd); + + /// Set terminal to the raw terminal mode. + struct termios raw = *orig_termios; + raw.c_lflag &= ~(ECHO | ICANON); + raw.c_cc[VMIN] = 0; + raw.c_cc[VTIME] = 1; + if (tcsetattr(fd, TCSAFLUSH, &raw)) + throw DB::ErrnoException( + DB::ErrorCodes::SYSTEM_ERROR, "Cannot set terminal to the raw mode for the terminal referred to by file descriptor '{}'", fd); + + intercept_thread = std::make_unique(&TerminalKeystrokeInterceptor::run, this, callbacks); +} + +void TerminalKeystrokeInterceptor::stopIntercept() +{ + stop_requested = true; + + std::lock_guard lock(mutex); + + if (intercept_thread && intercept_thread->joinable()) + { + intercept_thread->join(); + intercept_thread.reset(); + } + + /// Set to the original (canonical) terminal mode. 
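+    /// orig_termios holds the attributes saved by tcgetattr() in startIntercept(); restoring them re-enables ECHO and ICANON (echoing and line-buffered input).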
+ if (orig_termios) + { + if (tcsetattr(fd, TCSAFLUSH, orig_termios.get())) + throw DB::ErrnoException( + DB::ErrorCodes::SYSTEM_ERROR, + "Cannot set terminal to the original (canonical) mode for the terminal referred to by file descriptor '{}'", + fd); + + orig_termios.reset(); + } +} + +void TerminalKeystrokeInterceptor::run(TerminalKeystrokeInterceptor::CallbackMap map) +{ + while (!stop_requested) + { + runImpl(map); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } +} + +void TerminalKeystrokeInterceptor::runImpl(const DB::TerminalKeystrokeInterceptor::CallbackMap & map) const +{ + char ch; + if (read(fd, &ch, 1) > 0) + { + auto it = map.find(ch); + if (it != map.end()) + { + auto fn = it->second; + fn(); + } + } +} + +} diff --git a/src/Client/TerminalKeystrokeInterceptor.h b/src/Client/TerminalKeystrokeInterceptor.h new file mode 100644 index 00000000000..89937097e34 --- /dev/null +++ b/src/Client/TerminalKeystrokeInterceptor.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +struct termios; + +namespace DB +{ + +class TerminalKeystrokeInterceptor +{ + using Callback = std::function; + using CallbackMap = std::unordered_map; + +public: + explicit TerminalKeystrokeInterceptor(int fd_, std::ostream & error_stream_); + ~TerminalKeystrokeInterceptor(); + void registerCallback(char key, Callback cb); + + void startIntercept(); + void stopIntercept(); + +private: + void run(CallbackMap); + void runImpl(const CallbackMap &) const; + + const int fd; + std::ostream & error_stream; + + std::mutex mutex; + + CallbackMap callbacks; + std::unique_ptr intercept_thread; + std::unique_ptr orig_termios; + + std::atomic_bool stop_requested = false; +}; + +} diff --git a/src/Common/AsyncLoader.cpp b/src/Common/AsyncLoader.cpp index 9e928110ec9..cc6358adb46 100644 --- a/src/Common/AsyncLoader.cpp +++ b/src/Common/AsyncLoader.cpp @@ -54,7 +54,7 @@ AsyncLoader::Pool::Pool(const AsyncLoader::PoolInitializer & init) init.metric_threads, init.metric_active_threads, init.metric_scheduled_threads, - /* max_threads = */ std::numeric_limits::max(), // Unlimited number of threads, we do worker management ourselves + /* max_threads = */ ThreadPool::MAX_THEORETICAL_THREAD_COUNT, // Unlimited number of threads, we do worker management ourselves /* max_free_threads = */ 0, // We do not require free threads /* queue_size = */ 0)) // Unlimited queue to avoid blocking during worker spawning {} diff --git a/src/Common/CalendarTimeInterval.h b/src/Common/CalendarTimeInterval.h index d5acc6ee2f2..a2e8a71829c 100644 --- a/src/Common/CalendarTimeInterval.h +++ b/src/Common/CalendarTimeInterval.h @@ -40,6 +40,7 @@ struct CalendarTimeInterval /// Add this interval to the timestamp. First months, then seconds. /// Gets weird near month boundaries: October 31 + 1 month = December 1. + /// The returned timestamp is always 28-31 days greater than t. std::chrono::sys_seconds advance(std::chrono::system_clock::time_point t) const; /// Rounds the timestamp down to the nearest timestamp "aligned" with this interval. diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 79bf42ceabd..eff8206e676 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -8,319 +8,319 @@ /// If the event is generic (i.e. not server specific) /// it should be also added to src/Coordination/KeeperConstant.cpp #define APPLY_FOR_BUILTIN_EVENTS(M) \ - M(Query, "Number of queries to be interpreted and potentially executed. 
Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \ - M(SelectQuery, "Same as Query, but only for SELECT queries.") \ - M(InsertQuery, "Same as Query, but only for INSERT queries.") \ - M(InitialQuery, "Same as Query, but only counts initial queries (see is_initial_query).")\ - M(QueriesWithSubqueries, "Count queries with all subqueries") \ - M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \ - M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \ - M(SelectQueriesWithPrimaryKeyUsage, "Count SELECT queries which use the primary key to evaluate the WHERE condition") \ - M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \ - M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.") \ - M(AsyncInsertRows, "Number of rows inserted by asynchronous INSERT queries.") \ - M(AsyncInsertCacheHits, "Number of times a duplicate hash id has been found in asynchronous INSERT hash id cache.") \ - M(FailedQuery, "Number of failed queries.") \ - M(FailedSelectQuery, "Same as FailedQuery, but only for SELECT queries.") \ - M(FailedInsertQuery, "Same as FailedQuery, but only for INSERT queries.") \ - M(FailedAsyncInsertQuery, "Number of failed ASYNC INSERT queries.") \ - M(QueryTimeMicroseconds, "Total time of all queries.") \ - M(SelectQueryTimeMicroseconds, "Total time of SELECT queries.") \ - M(InsertQueryTimeMicroseconds, "Total time of INSERT queries.") \ - M(OtherQueryTimeMicroseconds, "Total time of queries that are not SELECT or INSERT.") \ - M(FileOpen, "Number of files opened.") \ - M(Seek, "Number of times the 'lseek' function was called.") \ - M(ReadBufferFromFileDescriptorRead, "Number of reads (read/pread) from a file descriptor. Does not include sockets.") \ - M(ReadBufferFromFileDescriptorReadFailed, "Number of times the read (read/pread) from a file descriptor have failed.") \ - M(ReadBufferFromFileDescriptorReadBytes, "Number of bytes read from file descriptors. If the file is compressed, this will show the compressed data size.") \ - M(WriteBufferFromFileDescriptorWrite, "Number of writes (write/pwrite) to a file descriptor. Does not include sockets.") \ - M(WriteBufferFromFileDescriptorWriteFailed, "Number of times the write (write/pwrite) to a file descriptor have failed.") \ - M(WriteBufferFromFileDescriptorWriteBytes, "Number of bytes written to file descriptors. 
If the file is compressed, this will show compressed data size.") \ - M(FileSync, "Number of times the F_FULLFSYNC/fsync/fdatasync function was called for files.") \ - M(DirectorySync, "Number of times the F_FULLFSYNC/fsync/fdatasync function was called for directories.") \ - M(FileSyncElapsedMicroseconds, "Total time spent waiting for F_FULLFSYNC/fsync/fdatasync syscall for files.") \ - M(DirectorySyncElapsedMicroseconds, "Total time spent waiting for F_FULLFSYNC/fsync/fdatasync syscall for directories.") \ - M(ReadCompressedBytes, "Number of bytes (the number of bytes before decompression) read from compressed sources (files, network).") \ - M(CompressedReadBufferBlocks, "Number of compressed blocks (the blocks of data that are compressed independent of each other) read from compressed sources (files, network).") \ - M(CompressedReadBufferBytes, "Number of uncompressed bytes (the number of bytes after decompression) read from compressed sources (files, network).") \ - M(UncompressedCacheHits, "Number of times a block of data has been found in the uncompressed cache (and decompression was avoided).") \ - M(UncompressedCacheMisses, "Number of times a block of data has not been found in the uncompressed cache (and required decompression).") \ - M(UncompressedCacheWeightLost, "Number of bytes evicted from the uncompressed cache.") \ - M(MMappedFileCacheHits, "Number of times a file has been found in the MMap cache (for the 'mmap' read_method), so we didn't have to mmap it again.") \ - M(MMappedFileCacheMisses, "Number of times a file has not been found in the MMap cache (for the 'mmap' read_method), so we had to mmap it again.") \ - M(OpenedFileCacheHits, "Number of times a file has been found in the opened file cache, so we didn't have to open it again.") \ - M(OpenedFileCacheMisses, "Number of times a file has been found in the opened file cache, so we had to open it again.") \ - M(OpenedFileCacheMicroseconds, "Amount of time spent executing OpenedFileCache methods.") \ - M(AIOWrite, "Number of writes with Linux or FreeBSD AIO interface") \ - M(AIOWriteBytes, "Number of bytes written with Linux or FreeBSD AIO interface") \ - M(AIORead, "Number of reads with Linux or FreeBSD AIO interface") \ - M(AIOReadBytes, "Number of bytes read with Linux or FreeBSD AIO interface") \ - M(IOBufferAllocs, "Number of allocations of IO buffers (for ReadBuffer/WriteBuffer).") \ - M(IOBufferAllocBytes, "Number of bytes allocated for IO buffers (for ReadBuffer/WriteBuffer).") \ - M(ArenaAllocChunks, "Number of chunks allocated for memory Arena (used for GROUP BY and similar operations)") \ - M(ArenaAllocBytes, "Number of bytes allocated for memory Arena (used for GROUP BY and similar operations)") \ - M(FunctionExecute, "Number of SQL ordinary function calls (SQL functions are called on per-block basis, so this number represents the number of blocks).") \ - M(TableFunctionExecute, "Number of table function calls.") \ - M(MarkCacheHits, "Number of times an entry has been found in the mark cache, so we didn't have to load a mark file.") \ - M(MarkCacheMisses, "Number of times an entry has not been found in the mark cache, so we had to load a mark file in memory, which is a costly operation, adding to query latency.") \ - M(QueryCacheHits, "Number of times a query result has been found in the query cache (and query computation was avoided). 
Only updated for SELECT queries with SETTING use_query_cache = 1.") \ - M(QueryCacheMisses, "Number of times a query result has not been found in the query cache (and required query computation). Only updated for SELECT queries with SETTING use_query_cache = 1.") \ + M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.", ValueType::Number) \ + M(SelectQuery, "Same as Query, but only for SELECT queries.", ValueType::Number) \ + M(InsertQuery, "Same as Query, but only for INSERT queries.", ValueType::Number) \ + M(InitialQuery, "Same as Query, but only counts initial queries (see is_initial_query).", ValueType::Number)\ + M(QueriesWithSubqueries, "Count queries with all subqueries", ValueType::Number) \ + M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries", ValueType::Number) \ + M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries", ValueType::Number) \ + M(SelectQueriesWithPrimaryKeyUsage, "Count SELECT queries which use the primary key to evaluate the WHERE condition", ValueType::Number) \ + M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.", ValueType::Number) \ + M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.", ValueType::Bytes) \ + M(AsyncInsertRows, "Number of rows inserted by asynchronous INSERT queries.", ValueType::Number) \ + M(AsyncInsertCacheHits, "Number of times a duplicate hash id has been found in asynchronous INSERT hash id cache.", ValueType::Number) \ + M(FailedQuery, "Number of failed queries.", ValueType::Number) \ + M(FailedSelectQuery, "Same as FailedQuery, but only for SELECT queries.", ValueType::Number) \ + M(FailedInsertQuery, "Same as FailedQuery, but only for INSERT queries.", ValueType::Number) \ + M(FailedAsyncInsertQuery, "Number of failed ASYNC INSERT queries.", ValueType::Number) \ + M(QueryTimeMicroseconds, "Total time of all queries.", ValueType::Microseconds) \ + M(SelectQueryTimeMicroseconds, "Total time of SELECT queries.", ValueType::Microseconds) \ + M(InsertQueryTimeMicroseconds, "Total time of INSERT queries.", ValueType::Microseconds) \ + M(OtherQueryTimeMicroseconds, "Total time of queries that are not SELECT or INSERT.", ValueType::Microseconds) \ + M(FileOpen, "Number of files opened.", ValueType::Number) \ + M(Seek, "Number of times the 'lseek' function was called.", ValueType::Number) \ + M(ReadBufferFromFileDescriptorRead, "Number of reads (read/pread) from a file descriptor. Does not include sockets.", ValueType::Number) \ + M(ReadBufferFromFileDescriptorReadFailed, "Number of times the read (read/pread) from a file descriptor have failed.", ValueType::Number) \ + M(ReadBufferFromFileDescriptorReadBytes, "Number of bytes read from file descriptors. If the file is compressed, this will show the compressed data size.", ValueType::Bytes) \ + M(WriteBufferFromFileDescriptorWrite, "Number of writes (write/pwrite) to a file descriptor. Does not include sockets.", ValueType::Number) \ + M(WriteBufferFromFileDescriptorWriteFailed, "Number of times the write (write/pwrite) to a file descriptor have failed.", ValueType::Number) \ + M(WriteBufferFromFileDescriptorWriteBytes, "Number of bytes written to file descriptors. 
If the file is compressed, this will show compressed data size.", ValueType::Bytes) \ + M(FileSync, "Number of times the F_FULLFSYNC/fsync/fdatasync function was called for files.", ValueType::Number) \ + M(DirectorySync, "Number of times the F_FULLFSYNC/fsync/fdatasync function was called for directories.", ValueType::Number) \ + M(FileSyncElapsedMicroseconds, "Total time spent waiting for F_FULLFSYNC/fsync/fdatasync syscall for files.", ValueType::Microseconds) \ + M(DirectorySyncElapsedMicroseconds, "Total time spent waiting for F_FULLFSYNC/fsync/fdatasync syscall for directories.", ValueType::Microseconds) \ + M(ReadCompressedBytes, "Number of bytes (the number of bytes before decompression) read from compressed sources (files, network).", ValueType::Bytes) \ + M(CompressedReadBufferBlocks, "Number of compressed blocks (the blocks of data that are compressed independent of each other) read from compressed sources (files, network).", ValueType::Number) \ + M(CompressedReadBufferBytes, "Number of uncompressed bytes (the number of bytes after decompression) read from compressed sources (files, network).", ValueType::Bytes) \ + M(UncompressedCacheHits, "Number of times a block of data has been found in the uncompressed cache (and decompression was avoided).", ValueType::Number) \ + M(UncompressedCacheMisses, "Number of times a block of data has not been found in the uncompressed cache (and required decompression).", ValueType::Number) \ + M(UncompressedCacheWeightLost, "Number of bytes evicted from the uncompressed cache.", ValueType::Bytes) \ + M(MMappedFileCacheHits, "Number of times a file has been found in the MMap cache (for the 'mmap' read_method), so we didn't have to mmap it again.", ValueType::Number) \ + M(MMappedFileCacheMisses, "Number of times a file has not been found in the MMap cache (for the 'mmap' read_method), so we had to mmap it again.", ValueType::Number) \ + M(OpenedFileCacheHits, "Number of times a file has been found in the opened file cache, so we didn't have to open it again.", ValueType::Number) \ + M(OpenedFileCacheMisses, "Number of times a file has been found in the opened file cache, so we had to open it again.", ValueType::Number) \ + M(OpenedFileCacheMicroseconds, "Amount of time spent executing OpenedFileCache methods.", ValueType::Microseconds) \ + M(AIOWrite, "Number of writes with Linux or FreeBSD AIO interface", ValueType::Number) \ + M(AIOWriteBytes, "Number of bytes written with Linux or FreeBSD AIO interface", ValueType::Bytes) \ + M(AIORead, "Number of reads with Linux or FreeBSD AIO interface", ValueType::Number) \ + M(AIOReadBytes, "Number of bytes read with Linux or FreeBSD AIO interface", ValueType::Bytes) \ + M(IOBufferAllocs, "Number of allocations of IO buffers (for ReadBuffer/WriteBuffer).", ValueType::Number) \ + M(IOBufferAllocBytes, "Number of bytes allocated for IO buffers (for ReadBuffer/WriteBuffer).", ValueType::Bytes) \ + M(ArenaAllocChunks, "Number of chunks allocated for memory Arena (used for GROUP BY and similar operations)", ValueType::Number) \ + M(ArenaAllocBytes, "Number of bytes allocated for memory Arena (used for GROUP BY and similar operations)", ValueType::Bytes) \ + M(FunctionExecute, "Number of SQL ordinary function calls (SQL functions are called on per-block basis, so this number represents the number of blocks).", ValueType::Number) \ + M(TableFunctionExecute, "Number of table function calls.", ValueType::Number) \ + M(MarkCacheHits, "Number of times an entry has been found in the mark cache, so we didn't have to 
load a mark file.", ValueType::Number) \ + M(MarkCacheMisses, "Number of times an entry has not been found in the mark cache, so we had to load a mark file in memory, which is a costly operation, adding to query latency.", ValueType::Number) \ + M(QueryCacheHits, "Number of times a query result has been found in the query cache (and query computation was avoided). Only updated for SELECT queries with SETTING use_query_cache = 1.", ValueType::Number) \ + M(QueryCacheMisses, "Number of times a query result has not been found in the query cache (and required query computation). Only updated for SELECT queries with SETTING use_query_cache = 1.", ValueType::Number) \ /* Each page cache chunk access increments exactly one of the following 5 PageCacheChunk* counters. */ \ /* Something like hit rate: (PageCacheChunkShared + PageCacheChunkDataHits) / [sum of all 5]. */ \ - M(PageCacheChunkMisses, "Number of times a chunk has not been found in the userspace page cache.") \ - M(PageCacheChunkShared, "Number of times a chunk has been found in the userspace page cache, already in use by another thread.") \ - M(PageCacheChunkDataHits, "Number of times a chunk has been found in the userspace page cache, not in use, with all pages intact.") \ - M(PageCacheChunkDataPartialHits, "Number of times a chunk has been found in the userspace page cache, not in use, but some of its pages were evicted by the OS.") \ - M(PageCacheChunkDataMisses, "Number of times a chunk has been found in the userspace page cache, not in use, but all its pages were evicted by the OS.") \ - M(PageCacheBytesUnpinnedRoundedToPages, "Total size of populated pages in chunks that became evictable in PageCache. Rounded up to whole pages.") \ - M(PageCacheBytesUnpinnedRoundedToHugePages, "See PageCacheBytesUnpinnedRoundedToPages, but rounded to huge pages. Use the ratio between the two as a measure of memory waste from using huge pages.") \ - M(CreatedReadBufferOrdinary, "Number of times ordinary read buffer was created for reading data (while choosing among other read methods).") \ - M(CreatedReadBufferDirectIO, "Number of times a read buffer with O_DIRECT was created for reading data (while choosing among other read methods).") \ - M(CreatedReadBufferDirectIOFailed, "Number of times a read buffer with O_DIRECT was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fallen back to the ordinary reading method.") \ - M(CreatedReadBufferMMap, "Number of times a read buffer using 'mmap' was created for reading data (while choosing among other read methods).") \ - M(CreatedReadBufferMMapFailed, "Number of times a read buffer with 'mmap' was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fallen back to the ordinary reading method.") \ - M(DiskReadElapsedMicroseconds, "Total time spent waiting for read syscall. This include reads from page cache.") \ - M(DiskWriteElapsedMicroseconds, "Total time spent waiting for write syscall. This include writes to page cache.") \ - M(NetworkReceiveElapsedMicroseconds, "Total time spent waiting for data to receive or receiving data from network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \ - M(NetworkSendElapsedMicroseconds, "Total time spent waiting for data to send to network or sending data to network. 
Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \ - M(NetworkReceiveBytes, "Total number of bytes received from network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \ - M(NetworkSendBytes, "Total number of bytes send to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \ + M(PageCacheChunkMisses, "Number of times a chunk has not been found in the userspace page cache.", ValueType::Number) \ + M(PageCacheChunkShared, "Number of times a chunk has been found in the userspace page cache, already in use by another thread.", ValueType::Number) \ + M(PageCacheChunkDataHits, "Number of times a chunk has been found in the userspace page cache, not in use, with all pages intact.", ValueType::Number) \ + M(PageCacheChunkDataPartialHits, "Number of times a chunk has been found in the userspace page cache, not in use, but some of its pages were evicted by the OS.", ValueType::Number) \ + M(PageCacheChunkDataMisses, "Number of times a chunk has been found in the userspace page cache, not in use, but all its pages were evicted by the OS.", ValueType::Number) \ + M(PageCacheBytesUnpinnedRoundedToPages, "Total size of populated pages in chunks that became evictable in PageCache. Rounded up to whole pages.", ValueType::Bytes) \ + M(PageCacheBytesUnpinnedRoundedToHugePages, "See PageCacheBytesUnpinnedRoundedToPages, but rounded to huge pages. Use the ratio between the two as a measure of memory waste from using huge pages.", ValueType::Bytes) \ + M(CreatedReadBufferOrdinary, "Number of times ordinary read buffer was created for reading data (while choosing among other read methods).", ValueType::Number) \ + M(CreatedReadBufferDirectIO, "Number of times a read buffer with O_DIRECT was created for reading data (while choosing among other read methods).", ValueType::Number) \ + M(CreatedReadBufferDirectIOFailed, "Number of times a read buffer with O_DIRECT was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fell back to the ordinary reading method.", ValueType::Number) \ + M(CreatedReadBufferMMap, "Number of times a read buffer using 'mmap' was created for reading data (while choosing among other read methods).", ValueType::Number) \ + M(CreatedReadBufferMMapFailed, "Number of times a read buffer with 'mmap' was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fell back to the ordinary reading method.", ValueType::Number) \ + M(DiskReadElapsedMicroseconds, "Total time spent waiting for read syscall. This includes reads from page cache.", ValueType::Microseconds) \ + M(DiskWriteElapsedMicroseconds, "Total time spent waiting for write syscall. This includes writes to page cache.", ValueType::Microseconds) \ + M(NetworkReceiveElapsedMicroseconds, "Total time spent waiting for data to receive or receiving data from network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.", ValueType::Microseconds) \ + M(NetworkSendElapsedMicroseconds, "Total time spent waiting for data to send to network or sending data to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.", ValueType::Microseconds) \ + M(NetworkReceiveBytes, "Total number of bytes received from network.
Only ClickHouse-related network interaction is included, not by 3rd party libraries.", ValueType::Bytes) \ + M(NetworkSendBytes, "Total number of bytes send to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.", ValueType::Bytes) \ \ - M(GlobalThreadPoolExpansions, "Counts the total number of times new threads have been added to the global thread pool. This metric indicates the frequency of expansions in the global thread pool to accommodate increased processing demands.") \ - M(GlobalThreadPoolShrinks, "Counts the total number of times the global thread pool has shrunk by removing threads. This occurs when the number of idle threads exceeds max_thread_pool_free_size, indicating adjustments in the global thread pool size in response to decreased thread utilization.") \ - M(GlobalThreadPoolThreadCreationMicroseconds, "Total time spent waiting for new threads to start.") \ - M(GlobalThreadPoolLockWaitMicroseconds, "Total time threads have spent waiting for locks in the global thread pool.") \ - M(GlobalThreadPoolJobs, "Counts the number of jobs that have been pushed to the global thread pool.") \ - M(GlobalThreadPoolJobWaitTimeMicroseconds, "Measures the elapsed time from when a job is scheduled in the thread pool to when it is picked up for execution by a worker thread. This metric helps identify delays in job processing, indicating the responsiveness of the thread pool to new tasks.") \ - M(LocalThreadPoolExpansions, "Counts the total number of times threads have been borrowed from the global thread pool to expand local thread pools.") \ - M(LocalThreadPoolShrinks, "Counts the total number of times threads have been returned to the global thread pool from local thread pools.") \ - M(LocalThreadPoolThreadCreationMicroseconds, "Total time local thread pools have spent waiting to borrow a thread from the global pool.") \ - M(LocalThreadPoolLockWaitMicroseconds, "Total time threads have spent waiting for locks in the local thread pools.") \ - M(LocalThreadPoolJobs, "Counts the number of jobs that have been pushed to the local thread pools.") \ - M(LocalThreadPoolBusyMicroseconds, "Total time threads have spent executing the actual work.") \ - M(LocalThreadPoolJobWaitTimeMicroseconds, "Measures the elapsed time from when a job is scheduled in the thread pool to when it is picked up for execution by a worker thread. This metric helps identify delays in job processing, indicating the responsiveness of the thread pool to new tasks.") \ + M(GlobalThreadPoolExpansions, "Counts the total number of times new threads have been added to the global thread pool. This metric indicates the frequency of expansions in the global thread pool to accommodate increased processing demands.", ValueType::Number) \ + M(GlobalThreadPoolShrinks, "Counts the total number of times the global thread pool has shrunk by removing threads. 
This occurs when the number of idle threads exceeds max_thread_pool_free_size, indicating adjustments in the global thread pool size in response to decreased thread utilization.", ValueType::Number) \ + M(GlobalThreadPoolThreadCreationMicroseconds, "Total time spent waiting for new threads to start.", ValueType::Microseconds) \ + M(GlobalThreadPoolLockWaitMicroseconds, "Total time threads have spent waiting for locks in the global thread pool.", ValueType::Microseconds) \ + M(GlobalThreadPoolJobs, "Counts the number of jobs that have been pushed to the global thread pool.", ValueType::Number) \ + M(GlobalThreadPoolJobWaitTimeMicroseconds, "Measures the elapsed time from when a job is scheduled in the thread pool to when it is picked up for execution by a worker thread. This metric helps identify delays in job processing, indicating the responsiveness of the thread pool to new tasks.", ValueType::Microseconds) \ + M(LocalThreadPoolExpansions, "Counts the total number of times threads have been borrowed from the global thread pool to expand local thread pools.", ValueType::Number) \ + M(LocalThreadPoolShrinks, "Counts the total number of times threads have been returned to the global thread pool from local thread pools.", ValueType::Number) \ + M(LocalThreadPoolThreadCreationMicroseconds, "Total time local thread pools have spent waiting to borrow a thread from the global pool.", ValueType::Microseconds) \ + M(LocalThreadPoolLockWaitMicroseconds, "Total time threads have spent waiting for locks in the local thread pools.", ValueType::Microseconds) \ + M(LocalThreadPoolJobs, "Counts the number of jobs that have been pushed to the local thread pools.", ValueType::Number) \ + M(LocalThreadPoolBusyMicroseconds, "Total time threads have spent executing the actual work.", ValueType::Microseconds) \ + M(LocalThreadPoolJobWaitTimeMicroseconds, "Measures the elapsed time from when a job is scheduled in the thread pool to when it is picked up for execution by a worker thread.
This metric helps identify delays in job processing, indicating the responsiveness of the thread pool to new tasks.", ValueType::Microseconds) \ \ - M(DiskS3GetRequestThrottlerCount, "Number of DiskS3 GET and SELECT requests passed through throttler.") \ - M(DiskS3GetRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform DiskS3 GET and SELECT request throttling.") \ - M(DiskS3PutRequestThrottlerCount, "Number of DiskS3 PUT, COPY, POST and LIST requests passed through throttler.") \ - M(DiskS3PutRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform DiskS3 PUT, COPY, POST and LIST request throttling.") \ - M(S3GetRequestThrottlerCount, "Number of S3 GET and SELECT requests passed through throttler.") \ - M(S3GetRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform S3 GET and SELECT request throttling.") \ - M(S3PutRequestThrottlerCount, "Number of S3 PUT, COPY, POST and LIST requests passed through throttler.") \ - M(S3PutRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform S3 PUT, COPY, POST and LIST request throttling.") \ - M(RemoteReadThrottlerBytes, "Bytes passed through 'max_remote_read_network_bandwidth_for_server'/'max_remote_read_network_bandwidth' throttler.") \ - M(RemoteReadThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_read_network_bandwidth_for_server'/'max_remote_read_network_bandwidth' throttling.") \ - M(RemoteWriteThrottlerBytes, "Bytes passed through 'max_remote_write_network_bandwidth_for_server'/'max_remote_write_network_bandwidth' throttler.") \ - M(RemoteWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_write_network_bandwidth_for_server'/'max_remote_write_network_bandwidth' throttling.") \ - M(LocalReadThrottlerBytes, "Bytes passed through 'max_local_read_bandwidth_for_server'/'max_local_read_bandwidth' throttler.") \ - M(LocalReadThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_read_bandwidth_for_server'/'max_local_read_bandwidth' throttling.") \ - M(LocalWriteThrottlerBytes, "Bytes passed through 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttler.") \ - M(LocalWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttling.") \ - M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform all throttling settings.") \ - M(PartsWithAppliedMutationsOnFly, "Total number of parts for which there was any mutation applied on fly") \ - M(MutationsAppliedOnFlyInAllParts, "The sum of number of applied mutations on-fly for part among all read parts") \ + M(DiskS3GetRequestThrottlerCount, "Number of DiskS3 GET and SELECT requests passed through throttler.", ValueType::Number) \ + M(DiskS3GetRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform DiskS3 GET and SELECT request throttling.", ValueType::Microseconds) \ + M(DiskS3PutRequestThrottlerCount, "Number of DiskS3 PUT, COPY, POST and LIST requests passed through throttler.", ValueType::Number) \ + M(DiskS3PutRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform DiskS3 PUT, COPY, POST and LIST request throttling.", ValueType::Microseconds) \ + M(S3GetRequestThrottlerCount, "Number of S3 GET and SELECT requests passed through throttler.", ValueType::Number) \ + M(S3GetRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to 
conform S3 GET and SELECT request throttling.", ValueType::Microseconds) \ + M(S3PutRequestThrottlerCount, "Number of S3 PUT, COPY, POST and LIST requests passed through throttler.", ValueType::Number) \ + M(S3PutRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform S3 PUT, COPY, POST and LIST request throttling.", ValueType::Microseconds) \ + M(RemoteReadThrottlerBytes, "Bytes passed through 'max_remote_read_network_bandwidth_for_server'/'max_remote_read_network_bandwidth' throttler.", ValueType::Bytes) \ + M(RemoteReadThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_read_network_bandwidth_for_server'/'max_remote_read_network_bandwidth' throttling.", ValueType::Microseconds) \ + M(RemoteWriteThrottlerBytes, "Bytes passed through 'max_remote_write_network_bandwidth_for_server'/'max_remote_write_network_bandwidth' throttler.", ValueType::Bytes) \ + M(RemoteWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_write_network_bandwidth_for_server'/'max_remote_write_network_bandwidth' throttling.", ValueType::Microseconds) \ + M(LocalReadThrottlerBytes, "Bytes passed through 'max_local_read_bandwidth_for_server'/'max_local_read_bandwidth' throttler.", ValueType::Bytes) \ + M(LocalReadThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_read_bandwidth_for_server'/'max_local_read_bandwidth' throttling.", ValueType::Microseconds) \ + M(LocalWriteThrottlerBytes, "Bytes passed through 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttler.", ValueType::Bytes) \ + M(LocalWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttling.", ValueType::Microseconds) \ + M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform all throttling settings.", ValueType::Microseconds) \ + M(PartsWithAppliedMutationsOnFly, "Total number of parts for which there was any mutation applied on fly", ValueType::Number) \ + M(MutationsAppliedOnFlyInAllParts, "The sum of number of applied mutations on-fly for part among all read parts", ValueType::Number) \ \ - M(SchedulerIOReadRequests, "Resource requests passed through scheduler for IO reads.") \ - M(SchedulerIOReadBytes, "Bytes passed through scheduler for IO reads.") \ - M(SchedulerIOReadWaitMicroseconds, "Total time a query was waiting on resource requests for IO reads.") \ - M(SchedulerIOWriteRequests, "Resource requests passed through scheduler for IO writes.") \ - M(SchedulerIOWriteBytes, "Bytes passed through scheduler for IO writes.") \ - M(SchedulerIOWriteWaitMicroseconds, "Total time a query was waiting on resource requests for IO writes.") \ + M(SchedulerIOReadRequests, "Resource requests passed through scheduler for IO reads.", ValueType::Number) \ + M(SchedulerIOReadBytes, "Bytes passed through scheduler for IO reads.", ValueType::Bytes) \ + M(SchedulerIOReadWaitMicroseconds, "Total time a query was waiting on resource requests for IO reads.", ValueType::Microseconds) \ + M(SchedulerIOWriteRequests, "Resource requests passed through scheduler for IO writes.", ValueType::Number) \ + M(SchedulerIOWriteBytes, "Bytes passed through scheduler for IO writes.", ValueType::Bytes) \ + M(SchedulerIOWriteWaitMicroseconds, "Total time a query was waiting on resource requests for IO writes.", ValueType::Microseconds) \ \ - M(QueryMaskingRulesMatch, "Number of times query masking rules was successfully matched.") \ + 
M(QueryMaskingRulesMatch, "Number of times query masking rules was successfully matched.", ValueType::Number) \ \ - M(ReplicatedPartFetches, "Number of times a data part was downloaded from replica of a ReplicatedMergeTree table.") \ - M(ReplicatedPartFailedFetches, "Number of times a data part was failed to download from replica of a ReplicatedMergeTree table.") \ - M(ObsoleteReplicatedParts, "Number of times a data part was covered by another data part that has been fetched from a replica (so, we have marked a covered data part as obsolete and no longer needed).") \ - M(ReplicatedPartMerges, "Number of times data parts of ReplicatedMergeTree tables were successfully merged.") \ - M(ReplicatedPartFetchesOfMerged, "Number of times we prefer to download already merged part from replica of ReplicatedMergeTree table instead of performing a merge ourself (usually we prefer doing a merge ourself to save network traffic). This happens when we have not all source parts to perform a merge or when the data part is old enough.") \ - M(ReplicatedPartMutations, "Number of times data parts of ReplicatedMergeTree tables were successfully mutated.") \ - M(ReplicatedPartChecks, "Number of times we had to perform advanced search for a data part on replicas or to clarify the need of an existing data part.") \ - M(ReplicatedPartChecksFailed, "Number of times the advanced search for a data part on replicas did not give result or when unexpected part has been found and moved away.") \ - M(ReplicatedDataLoss, "Number of times a data part that we wanted doesn't exist on any replica (even on replicas that are offline right now). That data parts are definitely lost. This is normal due to asynchronous replication (if quorum inserts were not enabled), when the replica on which the data part was written was failed and when it became online after fail it doesn't contain that data part.") \ - M(ReplicatedCoveredPartsInZooKeeperOnStart, "For debugging purposes. Number of parts in ZooKeeper that have a covering part, but doesn't exist on disk. Checked on server start.") \ + M(ReplicatedPartFetches, "Number of times a data part was downloaded from replica of a ReplicatedMergeTree table.", ValueType::Number) \ + M(ReplicatedPartFailedFetches, "Number of times a data part was failed to download from replica of a ReplicatedMergeTree table.", ValueType::Number) \ + M(ObsoleteReplicatedParts, "Number of times a data part was covered by another data part that has been fetched from a replica (so, we have marked a covered data part as obsolete and no longer needed).", ValueType::Number) \ + M(ReplicatedPartMerges, "Number of times data parts of ReplicatedMergeTree tables were successfully merged.", ValueType::Number) \ + M(ReplicatedPartFetchesOfMerged, "Number of times we prefer to download already merged part from replica of ReplicatedMergeTree table instead of performing a merge ourself (usually we prefer doing a merge ourself to save network traffic). 
This happens when we do not have all the source parts to perform a merge or when the data part is old enough.", ValueType::Number) \ + M(ReplicatedPartMutations, "Number of times data parts of ReplicatedMergeTree tables were successfully mutated.", ValueType::Number) \ + M(ReplicatedPartChecks, "Number of times we had to perform advanced search for a data part on replicas or to clarify the need of an existing data part.", ValueType::Number) \ + M(ReplicatedPartChecksFailed, "Number of times the advanced search for a data part on replicas did not give a result or when an unexpected part has been found and moved away.", ValueType::Number) \ + M(ReplicatedDataLoss, "Number of times a data part that we wanted doesn't exist on any replica (even on replicas that are offline right now). Those data parts are definitely lost. This is normal due to asynchronous replication (if quorum inserts were not enabled), when the replica on which the data part was written failed and, after it came back online, it does not contain that data part.", ValueType::Number) \ + M(ReplicatedCoveredPartsInZooKeeperOnStart, "For debugging purposes. Number of parts in ZooKeeper that have a covering part, but don't exist on disk. Checked on server start.", ValueType::Number) \ \ - M(InsertedRows, "Number of rows INSERTed to all tables.") \ - M(InsertedBytes, "Number of bytes (uncompressed; for columns as they stored in memory) INSERTed to all tables.") \ - M(DelayedInserts, "Number of times the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.") \ - M(RejectedInserts, "Number of times the INSERT of a block to a MergeTree table was rejected with 'Too many parts' exception due to high number of active data parts for partition.") \ - M(DelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.") \ - M(DelayedMutations, "Number of times the mutation of a MergeTree table was throttled due to high number of unfinished mutations for table.") \ - M(RejectedMutations, "Number of times the mutation of a MergeTree table was rejected with 'Too many mutations' exception due to high number of unfinished mutations for table.") \ - M(DelayedMutationsMilliseconds, "Total number of milliseconds spent while the mutation of a MergeTree table was throttled due to high number of unfinished mutations for table.") \ - M(DistributedDelayedInserts, "Number of times the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.") \ - M(DistributedRejectedInserts, "Number of times the INSERT of a block to a Distributed table was rejected with 'Too many bytes' exception due to high number of pending bytes.") \ - M(DistributedDelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.") \ - M(DuplicatedInsertedBlocks, "Number of times the INSERTed block to a ReplicatedMergeTree table was deduplicated.") \ + M(InsertedRows, "Number of rows INSERTed to all tables.", ValueType::Number) \ + M(InsertedBytes, "Number of bytes (uncompressed; for columns as they stored in memory) INSERTed to all tables.", ValueType::Bytes) \ + M(DelayedInserts, "Number of times the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.", ValueType::Number) \ + M(RejectedInserts, "Number of times the INSERT of a
block to a MergeTree table was rejected with 'Too many parts' exception due to high number of active data parts for partition.", ValueType::Number) \ + M(DelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.", ValueType::Milliseconds) \ + M(DelayedMutations, "Number of times the mutation of a MergeTree table was throttled due to high number of unfinished mutations for table.", ValueType::Number) \ + M(RejectedMutations, "Number of times the mutation of a MergeTree table was rejected with 'Too many mutations' exception due to high number of unfinished mutations for table.", ValueType::Number) \ + M(DelayedMutationsMilliseconds, "Total number of milliseconds spent while the mutation of a MergeTree table was throttled due to high number of unfinished mutations for table.", ValueType::Milliseconds) \ + M(DistributedDelayedInserts, "Number of times the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.", ValueType::Number) \ + M(DistributedRejectedInserts, "Number of times the INSERT of a block to a Distributed table was rejected with 'Too many bytes' exception due to high number of pending bytes.", ValueType::Number) \ + M(DistributedDelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.", ValueType::Milliseconds) \ + M(DuplicatedInsertedBlocks, "Number of times the INSERTed block to a ReplicatedMergeTree table was deduplicated.", ValueType::Number) \ \ - M(ZooKeeperInit, "Number of times connection with ZooKeeper has been established.") \ - M(ZooKeeperTransactions, "Number of ZooKeeper operations, which include both read and write operations as well as multi-transactions.") \ - M(ZooKeeperList, "Number of 'list' (getChildren) requests to ZooKeeper.") \ - M(ZooKeeperCreate, "Number of 'create' requests to ZooKeeper.") \ - M(ZooKeeperRemove, "Number of 'remove' requests to ZooKeeper.") \ - M(ZooKeeperExists, "Number of 'exists' requests to ZooKeeper.") \ - M(ZooKeeperGet, "Number of 'get' requests to ZooKeeper.") \ - M(ZooKeeperSet, "Number of 'set' requests to ZooKeeper.") \ - M(ZooKeeperMulti, "Number of 'multi' requests to ZooKeeper (compound transactions).") \ - M(ZooKeeperCheck, "Number of 'check' requests to ZooKeeper. Usually they don't make sense in isolation, only as part of a complex transaction.") \ - M(ZooKeeperSync, "Number of 'sync' requests to ZooKeeper. 
These requests are rarely needed or usable.") \ - M(ZooKeeperReconfig, "Number of 'reconfig' requests to ZooKeeper.") \ - M(ZooKeeperClose, "Number of times connection with ZooKeeper has been closed voluntary.") \ - M(ZooKeeperWatchResponse, "Number of times watch notification has been received from ZooKeeper.") \ - M(ZooKeeperUserExceptions, "Number of exceptions while working with ZooKeeper related to the data (no node, bad version or similar).") \ - M(ZooKeeperHardwareExceptions, "Number of exceptions while working with ZooKeeper related to network (connection loss or similar).") \ - M(ZooKeeperOtherExceptions, "Number of exceptions while working with ZooKeeper other than ZooKeeperUserExceptions and ZooKeeperHardwareExceptions.") \ - M(ZooKeeperWaitMicroseconds, "Number of microseconds spent waiting for responses from ZooKeeper after creating a request, summed across all the requesting threads.") \ - M(ZooKeeperBytesSent, "Number of bytes send over network while communicating with ZooKeeper.") \ - M(ZooKeeperBytesReceived, "Number of bytes received over network while communicating with ZooKeeper.") \ + M(ZooKeeperInit, "Number of times connection with ZooKeeper has been established.", ValueType::Number) \ + M(ZooKeeperTransactions, "Number of ZooKeeper operations, which include both read and write operations as well as multi-transactions.", ValueType::Number) \ + M(ZooKeeperList, "Number of 'list' (getChildren) requests to ZooKeeper.", ValueType::Number) \ + M(ZooKeeperCreate, "Number of 'create' requests to ZooKeeper.", ValueType::Number) \ + M(ZooKeeperRemove, "Number of 'remove' requests to ZooKeeper.", ValueType::Number) \ + M(ZooKeeperExists, "Number of 'exists' requests to ZooKeeper.", ValueType::Number) \ + M(ZooKeeperGet, "Number of 'get' requests to ZooKeeper.", ValueType::Number) \ + M(ZooKeeperSet, "Number of 'set' requests to ZooKeeper.", ValueType::Number) \ + M(ZooKeeperMulti, "Number of 'multi' requests to ZooKeeper (compound transactions).", ValueType::Number) \ + M(ZooKeeperCheck, "Number of 'check' requests to ZooKeeper. Usually they don't make sense in isolation, only as part of a complex transaction.", ValueType::Number) \ + M(ZooKeeperSync, "Number of 'sync' requests to ZooKeeper. 
These requests are rarely needed or usable.", ValueType::Number) \ + M(ZooKeeperReconfig, "Number of 'reconfig' requests to ZooKeeper.", ValueType::Number) \ + M(ZooKeeperClose, "Number of times connection with ZooKeeper has been closed voluntary.", ValueType::Number) \ + M(ZooKeeperWatchResponse, "Number of times watch notification has been received from ZooKeeper.", ValueType::Number) \ + M(ZooKeeperUserExceptions, "Number of exceptions while working with ZooKeeper related to the data (no node, bad version or similar).", ValueType::Number) \ + M(ZooKeeperHardwareExceptions, "Number of exceptions while working with ZooKeeper related to network (connection loss or similar).", ValueType::Number) \ + M(ZooKeeperOtherExceptions, "Number of exceptions while working with ZooKeeper other than ZooKeeperUserExceptions and ZooKeeperHardwareExceptions.", ValueType::Number) \ + M(ZooKeeperWaitMicroseconds, "Number of microseconds spent waiting for responses from ZooKeeper after creating a request, summed across all the requesting threads.", ValueType::Microseconds) \ + M(ZooKeeperBytesSent, "Number of bytes send over network while communicating with ZooKeeper.", ValueType::Bytes) \ + M(ZooKeeperBytesReceived, "Number of bytes received over network while communicating with ZooKeeper.", ValueType::Bytes) \ \ - M(DistributedConnectionTries, "Total count of distributed connection attempts.") \ - M(DistributedConnectionUsable, "Total count of successful distributed connections to a usable server (with required table, but maybe stale).") \ - M(DistributedConnectionFailTry, "Total count when distributed connection fails with retry.") \ - M(DistributedConnectionMissingTable, "Number of times we rejected a replica from a distributed query, because it did not contain a table needed for the query.") \ - M(DistributedConnectionStaleReplica, "Number of times we rejected a replica from a distributed query, because some table needed for a query had replication lag higher than the configured threshold.") \ - M(DistributedConnectionSkipReadOnlyReplica, "Number of replicas skipped during INSERT into Distributed table due to replicas being read-only") \ - M(DistributedConnectionFailAtAll, "Total count when distributed connection fails after all retries finished.") \ + M(DistributedConnectionTries, "Total count of distributed connection attempts.", ValueType::Number) \ + M(DistributedConnectionUsable, "Total count of successful distributed connections to a usable server (with required table, but maybe stale).", ValueType::Number) \ + M(DistributedConnectionFailTry, "Total count when distributed connection fails with retry.", ValueType::Number) \ + M(DistributedConnectionMissingTable, "Number of times we rejected a replica from a distributed query, because it did not contain a table needed for the query.", ValueType::Number) \ + M(DistributedConnectionStaleReplica, "Number of times we rejected a replica from a distributed query, because some table needed for a query had replication lag higher than the configured threshold.", ValueType::Number) \ + M(DistributedConnectionSkipReadOnlyReplica, "Number of replicas skipped during INSERT into Distributed table due to replicas being read-only", ValueType::Number) \ + M(DistributedConnectionFailAtAll, "Total count when distributed connection fails after all retries finished.", ValueType::Number) \ \ - M(HedgedRequestsChangeReplica, "Total count when timeout for changing replica expired in hedged requests.") \ - M(SuspendSendingQueryToShard, "Total count when sending query to 
shard was suspended when async_query_sending_for_remote is enabled.") \ + M(HedgedRequestsChangeReplica, "Total count when timeout for changing replica expired in hedged requests.", ValueType::Number) \ + M(SuspendSendingQueryToShard, "Total count when sending query to shard was suspended when async_query_sending_for_remote is enabled.", ValueType::Number) \ \ - M(CompileFunction, "Number of times a compilation of generated LLVM code (to create fused function for complex expressions) was initiated.") \ - M(CompiledFunctionExecute, "Number of times a compiled function was executed.") \ - M(CompileExpressionsMicroseconds, "Total time spent for compilation of expressions to LLVM code.") \ - M(CompileExpressionsBytes, "Number of bytes used for expressions compilation.") \ + M(CompileFunction, "Number of times a compilation of generated LLVM code (to create fused function for complex expressions) was initiated.", ValueType::Number) \ + M(CompiledFunctionExecute, "Number of times a compiled function was executed.", ValueType::Number) \ + M(CompileExpressionsMicroseconds, "Total time spent for compilation of expressions to LLVM code.", ValueType::Microseconds) \ + M(CompileExpressionsBytes, "Number of bytes used for expressions compilation.", ValueType::Bytes) \ \ - M(ExecuteShellCommand, "Number of shell command executions.") \ + M(ExecuteShellCommand, "Number of shell command executions.", ValueType::Number) \ \ - M(ExternalProcessingCompressedBytesTotal, "Number of compressed bytes written by external processing (sorting/aggragating/joining)") \ - M(ExternalProcessingUncompressedBytesTotal, "Amount of data (uncompressed, before compression) written by external processing (sorting/aggragating/joining)") \ - M(ExternalProcessingFilesTotal, "Number of files used by external processing (sorting/aggragating/joining)") \ - M(ExternalSortWritePart, "Number of times a temporary file was written to disk for sorting in external memory.") \ - M(ExternalSortMerge, "Number of times temporary files were merged for sorting in external memory.") \ - M(ExternalSortCompressedBytes, "Number of compressed bytes written for sorting in external memory.") \ - M(ExternalSortUncompressedBytes, "Amount of data (uncompressed, before compression) written for sorting in external memory.") \ - M(ExternalAggregationWritePart, "Number of times a temporary file was written to disk for aggregation in external memory.") \ - M(ExternalAggregationMerge, "Number of times temporary files were merged for aggregation in external memory.") \ - M(ExternalAggregationCompressedBytes, "Number of bytes written to disk for aggregation in external memory.") \ - M(ExternalAggregationUncompressedBytes, "Amount of data (uncompressed, before compression) written to disk for aggregation in external memory.") \ - M(ExternalJoinWritePart, "Number of times a temporary file was written to disk for JOIN in external memory.") \ - M(ExternalJoinMerge, "Number of times temporary files were merged for JOIN in external memory.") \ - M(ExternalJoinCompressedBytes, "Number of compressed bytes written for JOIN in external memory.") \ - M(ExternalJoinUncompressedBytes, "Amount of data (uncompressed, before compression) written for JOIN in external memory.") \ + M(ExternalProcessingCompressedBytesTotal, "Number of compressed bytes written by external processing (sorting/aggragating/joining)", ValueType::Bytes) \ + M(ExternalProcessingUncompressedBytesTotal, "Amount of data (uncompressed, before compression) written by external processing 
(sorting/aggregating/joining)", ValueType::Bytes) \ + M(ExternalProcessingFilesTotal, "Number of files used by external processing (sorting/aggregating/joining)", ValueType::Number) \ + M(ExternalSortWritePart, "Number of times a temporary file was written to disk for sorting in external memory.", ValueType::Number) \ + M(ExternalSortMerge, "Number of times temporary files were merged for sorting in external memory.", ValueType::Number) \ + M(ExternalSortCompressedBytes, "Number of compressed bytes written for sorting in external memory.", ValueType::Bytes) \ + M(ExternalSortUncompressedBytes, "Amount of data (uncompressed, before compression) written for sorting in external memory.", ValueType::Bytes) \ + M(ExternalAggregationWritePart, "Number of times a temporary file was written to disk for aggregation in external memory.", ValueType::Number) \ + M(ExternalAggregationMerge, "Number of times temporary files were merged for aggregation in external memory.", ValueType::Number) \ + M(ExternalAggregationCompressedBytes, "Number of bytes written to disk for aggregation in external memory.", ValueType::Bytes) \ + M(ExternalAggregationUncompressedBytes, "Amount of data (uncompressed, before compression) written to disk for aggregation in external memory.", ValueType::Bytes) \ + M(ExternalJoinWritePart, "Number of times a temporary file was written to disk for JOIN in external memory.", ValueType::Number) \ + M(ExternalJoinMerge, "Number of times temporary files were merged for JOIN in external memory.", ValueType::Number) \ + M(ExternalJoinCompressedBytes, "Number of compressed bytes written for JOIN in external memory.", ValueType::Bytes) \ + M(ExternalJoinUncompressedBytes, "Amount of data (uncompressed, before compression) written for JOIN in external memory.", ValueType::Bytes) \ \ - M(SlowRead, "Number of reads from a file that were slow. This indicate system overload. Thresholds are controlled by read_backoff_* settings.") \ - M(ReadBackoff, "Number of times the number of query processing threads was lowered due to slow reads.") \ + M(SlowRead, "Number of reads from a file that were slow. This indicates system overload. Thresholds are controlled by read_backoff_* settings.", ValueType::Number) \ + M(ReadBackoff, "Number of times the number of query processing threads was lowered due to slow reads.", ValueType::Number) \ \ - M(ReplicaPartialShutdown, "How many times Replicated table has to deinitialize its state due to session expiration in ZooKeeper. The state is reinitialized every time when ZooKeeper is available again.") \ + M(ReplicaPartialShutdown, "How many times Replicated table has to deinitialize its state due to session expiration in ZooKeeper.
The state is reinitialized every time when ZooKeeper is available again.", ValueType::Number) \ \ - M(SelectedParts, "Number of data parts selected to read from a MergeTree table.") \ - M(SelectedPartsTotal, "Number of total data parts before selecting which ones to read from a MergeTree table.") \ - M(SelectedRanges, "Number of (non-adjacent) ranges in all data parts selected to read from a MergeTree table.") \ - M(SelectedMarks, "Number of marks (index granules) selected to read from a MergeTree table.") \ - M(SelectedMarksTotal, "Number of total marks (index granules) before selecting which ones to read from a MergeTree table.") \ - M(SelectedRows, "Number of rows SELECTed from all tables.") \ - M(SelectedBytes, "Number of bytes (uncompressed; for columns as they stored in memory) SELECTed from all tables.") \ - M(RowsReadByMainReader, "Number of rows read from MergeTree tables by the main reader (after PREWHERE step).") \ - M(RowsReadByPrewhereReaders, "Number of rows read from MergeTree tables (in total) by prewhere readers.") \ + M(SelectedParts, "Number of data parts selected to read from a MergeTree table.", ValueType::Number) \ + M(SelectedPartsTotal, "Number of total data parts before selecting which ones to read from a MergeTree table.", ValueType::Number) \ + M(SelectedRanges, "Number of (non-adjacent) ranges in all data parts selected to read from a MergeTree table.", ValueType::Number) \ + M(SelectedMarks, "Number of marks (index granules) selected to read from a MergeTree table.", ValueType::Number) \ + M(SelectedMarksTotal, "Number of total marks (index granules) before selecting which ones to read from a MergeTree table.", ValueType::Number) \ + M(SelectedRows, "Number of rows SELECTed from all tables.", ValueType::Number) \ + M(SelectedBytes, "Number of bytes (uncompressed; for columns as they stored in memory) SELECTed from all tables.", ValueType::Bytes) \ + M(RowsReadByMainReader, "Number of rows read from MergeTree tables by the main reader (after PREWHERE step).", ValueType::Number) \ + M(RowsReadByPrewhereReaders, "Number of rows read from MergeTree tables (in total) by prewhere readers.", ValueType::Number) \ \ - M(WaitMarksLoadMicroseconds, "Time spent loading marks") \ - M(BackgroundLoadingMarksTasks, "Number of background tasks for loading marks") \ - M(LoadedMarksCount, "Number of marks loaded (total across columns).") \ - M(LoadedMarksMemoryBytes, "Size of in-memory representations of loaded marks.") \ + M(WaitMarksLoadMicroseconds, "Time spent loading marks", ValueType::Microseconds) \ + M(BackgroundLoadingMarksTasks, "Number of background tasks for loading marks", ValueType::Number) \ + M(LoadedMarksCount, "Number of marks loaded (total across columns).", ValueType::Number) \ + M(LoadedMarksMemoryBytes, "Size of in-memory representations of loaded marks.", ValueType::Bytes) \ \ - M(Merge, "Number of launched background merges.") \ - M(MergedRows, "Rows read for background merges. This is the number of rows before merge.") \ - M(MergedColumns, "Number of columns merged during the horizontal stage of merges.") \ - M(GatheredColumns, "Number of columns gathered during the vertical stage of merges.") \ - M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for background merges. 
This is the number before merge.") \ - M(MergeTotalMilliseconds, "Total time spent for background merges") \ - M(MergeExecuteMilliseconds, "Total busy time spent for execution of background merges") \ - M(MergeHorizontalStageTotalMilliseconds, "Total time spent for horizontal stage of background merges") \ - M(MergeHorizontalStageExecuteMilliseconds, "Total busy time spent for execution of horizontal stage of background merges") \ - M(MergeVerticalStageTotalMilliseconds, "Total time spent for vertical stage of background merges") \ - M(MergeVerticalStageExecuteMilliseconds, "Total busy time spent for execution of vertical stage of background merges") \ - M(MergeProjectionStageTotalMilliseconds, "Total time spent for projection stage of background merges") \ - M(MergeProjectionStageExecuteMilliseconds, "Total busy time spent for execution of projection stage of background merges") \ + M(Merge, "Number of launched background merges.", ValueType::Number) \ + M(MergedRows, "Rows read for background merges. This is the number of rows before merge.", ValueType::Number) \ + M(MergedColumns, "Number of columns merged during the horizontal stage of merges.", ValueType::Number) \ + M(GatheredColumns, "Number of columns gathered during the vertical stage of merges.", ValueType::Number) \ + M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for background merges. This is the number before merge.", ValueType::Bytes) \ + M(MergeTotalMilliseconds, "Total time spent for background merges", ValueType::Milliseconds) \ + M(MergeExecuteMilliseconds, "Total busy time spent for execution of background merges", ValueType::Milliseconds) \ + M(MergeHorizontalStageTotalMilliseconds, "Total time spent for horizontal stage of background merges", ValueType::Milliseconds) \ + M(MergeHorizontalStageExecuteMilliseconds, "Total busy time spent for execution of horizontal stage of background merges", ValueType::Milliseconds) \ + M(MergeVerticalStageTotalMilliseconds, "Total time spent for vertical stage of background merges", ValueType::Milliseconds) \ + M(MergeVerticalStageExecuteMilliseconds, "Total busy time spent for execution of vertical stage of background merges", ValueType::Milliseconds) \ + M(MergeProjectionStageTotalMilliseconds, "Total time spent for projection stage of background merges", ValueType::Milliseconds) \ + M(MergeProjectionStageExecuteMilliseconds, "Total busy time spent for execution of projection stage of background merges", ValueType::Milliseconds) \ \ - M(MergingSortedMilliseconds, "Total time spent while merging sorted columns") \ - M(AggregatingSortedMilliseconds, "Total time spent while aggregating sorted columns") \ - M(CollapsingSortedMilliseconds, "Total time spent while collapsing sorted columns") \ - M(ReplacingSortedMilliseconds, "Total time spent while replacing sorted columns") \ - M(SummingSortedMilliseconds, "Total time spent while summing sorted columns") \ - M(VersionedCollapsingSortedMilliseconds, "Total time spent while version collapsing sorted columns") \ - M(GatheringColumnMilliseconds, "Total time spent while gathering columns for vertical merge") \ + M(MergingSortedMilliseconds, "Total time spent while merging sorted columns", ValueType::Milliseconds) \ + M(AggregatingSortedMilliseconds, "Total time spent while aggregating sorted columns", ValueType::Milliseconds) \ + M(CollapsingSortedMilliseconds, "Total time spent while collapsing sorted columns", ValueType::Milliseconds) \ + M(ReplacingSortedMilliseconds, "Total time spent 
while replacing sorted columns", ValueType::Milliseconds) \ + M(SummingSortedMilliseconds, "Total time spent while summing sorted columns", ValueType::Milliseconds) \ + M(VersionedCollapsingSortedMilliseconds, "Total time spent while version collapsing sorted columns", ValueType::Milliseconds) \ + M(GatheringColumnMilliseconds, "Total time spent while gathering columns for vertical merge", ValueType::Milliseconds) \ \ - M(MutationTotalParts, "Number of total parts for which mutations tried to be applied") \ - M(MutationUntouchedParts, "Number of total parts for which mutations tried to be applied but which was completely skipped according to predicate") \ - M(MutatedRows, "Rows read for mutations. This is the number of rows before mutation") \ - M(MutatedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for mutations. This is the number before mutation.") \ - M(MutationTotalMilliseconds, "Total time spent for mutations.") \ - M(MutationExecuteMilliseconds, "Total busy time spent for execution of mutations.") \ - M(MutationAllPartColumns, "Number of times when task to mutate all columns in part was created") \ - M(MutationSomePartColumns, "Number of times when task to mutate some columns in part was created") \ - M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections in mutations.") \ + M(MutationTotalParts, "Number of total parts for which mutations tried to be applied", ValueType::Number) \ + M(MutationUntouchedParts, "Number of total parts for which mutations tried to be applied but which was completely skipped according to predicate", ValueType::Number) \ + M(MutatedRows, "Rows read for mutations. This is the number of rows before mutation", ValueType::Number) \ + M(MutatedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for mutations. This is the number before mutation.", ValueType::Bytes) \ + M(MutationTotalMilliseconds, "Total time spent for mutations.", ValueType::Milliseconds) \ + M(MutationExecuteMilliseconds, "Total busy time spent for execution of mutations.", ValueType::Milliseconds) \ + M(MutationAllPartColumns, "Number of times when task to mutate all columns in part was created", ValueType::Number) \ + M(MutationSomePartColumns, "Number of times when task to mutate some columns in part was created", ValueType::Number) \ + M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections in mutations", ValueType::Microseconds) \ \ - M(MergeTreeDataWriterRows, "Number of rows INSERTed to MergeTree tables.") \ - M(MergeTreeDataWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables.") \ - M(MergeTreeDataWriterCompressedBytes, "Bytes written to filesystem for data INSERTed to MergeTree tables.") \ - M(MergeTreeDataWriterBlocks, "Number of blocks INSERTed to MergeTree tables. 
Each block forms a data part of level zero.") \ - M(MergeTreeDataWriterBlocksAlreadySorted, "Number of blocks INSERTed to MergeTree tables that appeared to be already sorted.") \ + M(MergeTreeDataWriterRows, "Number of rows INSERTed to MergeTree tables.", ValueType::Number) \ + M(MergeTreeDataWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables.", ValueType::Bytes) \ + M(MergeTreeDataWriterCompressedBytes, "Bytes written to filesystem for data INSERTed to MergeTree tables.", ValueType::Bytes) \ + M(MergeTreeDataWriterBlocks, "Number of blocks INSERTed to MergeTree tables. Each block forms a data part of level zero.", ValueType::Number) \ + M(MergeTreeDataWriterBlocksAlreadySorted, "Number of blocks INSERTed to MergeTree tables that appeared to be already sorted.", ValueType::Number) \ \ - M(MergeTreeDataWriterSkipIndicesCalculationMicroseconds, "Time spent calculating skip indices") \ - M(MergeTreeDataWriterStatisticsCalculationMicroseconds, "Time spent calculating statistics") \ - M(MergeTreeDataWriterSortingBlocksMicroseconds, "Time spent sorting blocks") \ - M(MergeTreeDataWriterMergingBlocksMicroseconds, "Time spent merging input blocks (for special MergeTree engines)") \ - M(MergeTreeDataWriterProjectionsCalculationMicroseconds, "Time spent calculating projections") \ - M(MergeTreeDataProjectionWriterSortingBlocksMicroseconds, "Time spent sorting blocks (for projection it might be a key different from table's sorting key)") \ - M(MergeTreeDataProjectionWriterMergingBlocksMicroseconds, "Time spent merging blocks") \ + M(MergeTreeDataWriterSkipIndicesCalculationMicroseconds, "Time spent calculating skip indices", ValueType::Microseconds) \ + M(MergeTreeDataWriterStatisticsCalculationMicroseconds, "Time spent calculating statistics", ValueType::Microseconds) \ + M(MergeTreeDataWriterSortingBlocksMicroseconds, "Time spent sorting blocks", ValueType::Microseconds) \ + M(MergeTreeDataWriterMergingBlocksMicroseconds, "Time spent merging input blocks (for special MergeTree engines)", ValueType::Microseconds) \ + M(MergeTreeDataWriterProjectionsCalculationMicroseconds, "Time spent calculating projections", ValueType::Microseconds) \ + M(MergeTreeDataProjectionWriterSortingBlocksMicroseconds, "Time spent sorting blocks (for projection it might be a key different from table's sorting key)", ValueType::Microseconds) \ + M(MergeTreeDataProjectionWriterMergingBlocksMicroseconds, "Time spent merging blocks", ValueType::Microseconds) \ \ - M(InsertedWideParts, "Number of parts inserted in Wide format.") \ - M(InsertedCompactParts, "Number of parts inserted in Compact format.") \ - M(MergedIntoWideParts, "Number of parts merged into Wide format.") \ - M(MergedIntoCompactParts, "Number of parts merged into Compact format.") \ + M(InsertedWideParts, "Number of parts inserted in Wide format.", ValueType::Number) \ + M(InsertedCompactParts, "Number of parts inserted in Compact format.", ValueType::Number) \ + M(MergedIntoWideParts, "Number of parts merged into Wide format.", ValueType::Number) \ + M(MergedIntoCompactParts, "Number of parts merged into Compact format.", ValueType::Number) \ \ - M(MergeTreeDataProjectionWriterRows, "Number of rows INSERTed to MergeTree tables projection.") \ - M(MergeTreeDataProjectionWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables projection.") \ - M(MergeTreeDataProjectionWriterCompressedBytes, "Bytes written to filesystem for data INSERTed to 
MergeTree tables projection.") \ - M(MergeTreeDataProjectionWriterBlocks, "Number of blocks INSERTed to MergeTree tables projection. Each block forms a data part of level zero.") \ - M(MergeTreeDataProjectionWriterBlocksAlreadySorted, "Number of blocks INSERTed to MergeTree tables projection that appeared to be already sorted.") \ + M(MergeTreeDataProjectionWriterRows, "Number of rows INSERTed to MergeTree tables projection.", ValueType::Number) \ + M(MergeTreeDataProjectionWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables projection.", ValueType::Bytes) \ + M(MergeTreeDataProjectionWriterCompressedBytes, "Bytes written to filesystem for data INSERTed to MergeTree tables projection.", ValueType::Bytes) \ + M(MergeTreeDataProjectionWriterBlocks, "Number of blocks INSERTed to MergeTree tables projection. Each block forms a data part of level zero.", ValueType::Number) \ + M(MergeTreeDataProjectionWriterBlocksAlreadySorted, "Number of blocks INSERTed to MergeTree tables projection that appeared to be already sorted.", ValueType::Number) \ \ - M(CannotRemoveEphemeralNode, "Number of times an error happened while trying to remove ephemeral node. This is not an issue, because our implementation of ZooKeeper library guarantee that the session will expire and the node will be removed.") \ + M(CannotRemoveEphemeralNode, "Number of times an error happened while trying to remove ephemeral node. This is not an issue, because our implementation of ZooKeeper library guarantee that the session will expire and the node will be removed.", ValueType::Number) \ \ - M(RegexpWithMultipleNeedlesCreated, "Regular expressions with multiple needles (VectorScan library) compiled.") \ - M(RegexpWithMultipleNeedlesGlobalCacheHit, "Number of times we fetched compiled regular expression with multiple needles (VectorScan library) from the global cache.") \ - M(RegexpWithMultipleNeedlesGlobalCacheMiss, "Number of times we failed to fetch compiled regular expression with multiple needles (VectorScan library) from the global cache.") \ - M(RegexpLocalCacheHit, "Number of times we fetched compiled regular expression from a local cache.") \ - M(RegexpLocalCacheMiss, "Number of times we failed to fetch compiled regular expression from a local cache.") \ + M(RegexpWithMultipleNeedlesCreated, "Regular expressions with multiple needles (VectorScan library) compiled.", ValueType::Number) \ + M(RegexpWithMultipleNeedlesGlobalCacheHit, "Number of times we fetched compiled regular expression with multiple needles (VectorScan library) from the global cache.", ValueType::Number) \ + M(RegexpWithMultipleNeedlesGlobalCacheMiss, "Number of times we failed to fetch compiled regular expression with multiple needles (VectorScan library) from the global cache.", ValueType::Number) \ + M(RegexpLocalCacheHit, "Number of times we fetched compiled regular expression from a local cache.", ValueType::Number) \ + M(RegexpLocalCacheMiss, "Number of times we failed to fetch compiled regular expression from a local cache.", ValueType::Number) \ \ - M(ContextLock, "Number of times the lock of Context was acquired or tried to acquire. This is global lock.") \ - M(ContextLockWaitMicroseconds, "Context lock wait time in microseconds") \ + M(ContextLock, "Number of times the lock of Context was acquired or tried to acquire. 
This is global lock.", ValueType::Number) \ + M(ContextLockWaitMicroseconds, "Context lock wait time in microseconds", ValueType::Microseconds) \ \ - M(StorageBufferFlush, "Number of times a buffer in a 'Buffer' table was flushed.") \ - M(StorageBufferErrorOnFlush, "Number of times a buffer in the 'Buffer' table has not been able to flush due to error writing in the destination table.") \ - M(StorageBufferPassedAllMinThresholds, "Number of times a criteria on min thresholds has been reached to flush a buffer in a 'Buffer' table.") \ - M(StorageBufferPassedTimeMaxThreshold, "Number of times a criteria on max time threshold has been reached to flush a buffer in a 'Buffer' table.") \ - M(StorageBufferPassedRowsMaxThreshold, "Number of times a criteria on max rows threshold has been reached to flush a buffer in a 'Buffer' table.") \ - M(StorageBufferPassedBytesMaxThreshold, "Number of times a criteria on max bytes threshold has been reached to flush a buffer in a 'Buffer' table.") \ - M(StorageBufferPassedTimeFlushThreshold, "Number of times background-only flush threshold on time has been reached to flush a buffer in a 'Buffer' table. This is expert-only metric. If you read this and you are not an expert, stop reading.") \ - M(StorageBufferPassedRowsFlushThreshold, "Number of times background-only flush threshold on rows has been reached to flush a buffer in a 'Buffer' table. This is expert-only metric. If you read this and you are not an expert, stop reading.") \ - M(StorageBufferPassedBytesFlushThreshold, "Number of times background-only flush threshold on bytes has been reached to flush a buffer in a 'Buffer' table. This is expert-only metric. If you read this and you are not an expert, stop reading.") \ - M(StorageBufferLayerLockReadersWaitMilliseconds, "Time for waiting for Buffer layer during reading.") \ - M(StorageBufferLayerLockWritersWaitMilliseconds, "Time for waiting free Buffer layer to write to (can be used to tune Buffer layers).") \ + M(StorageBufferFlush, "Number of times a buffer in a 'Buffer' table was flushed.", ValueType::Number) \ + M(StorageBufferErrorOnFlush, "Number of times a buffer in the 'Buffer' table has not been able to flush due to error writing in the destination table.", ValueType::Number) \ + M(StorageBufferPassedAllMinThresholds, "Number of times a criteria on min thresholds has been reached to flush a buffer in a 'Buffer' table.", ValueType::Number) \ + M(StorageBufferPassedTimeMaxThreshold, "Number of times a criteria on max time threshold has been reached to flush a buffer in a 'Buffer' table.", ValueType::Number) \ + M(StorageBufferPassedRowsMaxThreshold, "Number of times a criteria on max rows threshold has been reached to flush a buffer in a 'Buffer' table.", ValueType::Number) \ + M(StorageBufferPassedBytesMaxThreshold, "Number of times a criteria on max bytes threshold has been reached to flush a buffer in a 'Buffer' table.", ValueType::Number) \ + M(StorageBufferPassedTimeFlushThreshold, "Number of times background-only flush threshold on time has been reached to flush a buffer in a 'Buffer' table. This is expert-only metric. If you read this and you are not an expert, stop reading.", ValueType::Number) \ + M(StorageBufferPassedRowsFlushThreshold, "Number of times background-only flush threshold on rows has been reached to flush a buffer in a 'Buffer' table. This is expert-only metric. 
If you read this and you are not an expert, stop reading.", ValueType::Number) \ + M(StorageBufferPassedBytesFlushThreshold, "Number of times background-only flush threshold on bytes has been reached to flush a buffer in a 'Buffer' table. This is expert-only metric. If you read this and you are not an expert, stop reading.", ValueType::Number) \ + M(StorageBufferLayerLockReadersWaitMilliseconds, "Time for waiting for Buffer layer during reading.", ValueType::Milliseconds) \ + M(StorageBufferLayerLockWritersWaitMilliseconds, "Time for waiting free Buffer layer to write to (can be used to tune Buffer layers).", ValueType::Milliseconds) \ \ - M(DictCacheKeysRequested, "Number of keys requested from the data source for the dictionaries of 'cache' types.") \ - M(DictCacheKeysRequestedMiss, "Number of keys requested from the data source for dictionaries of 'cache' types but not found in the data source.") \ - M(DictCacheKeysRequestedFound, "Number of keys requested from the data source for dictionaries of 'cache' types and found in the data source.") \ - M(DictCacheKeysExpired, "Number of keys looked up in the dictionaries of 'cache' types and found in the cache but they were obsolete.") \ - M(DictCacheKeysNotFound, "Number of keys looked up in the dictionaries of 'cache' types and not found.") \ - M(DictCacheKeysHit, "Number of keys looked up in the dictionaries of 'cache' types and found in the cache.") \ - M(DictCacheRequestTimeNs, "Number of nanoseconds spend in querying the external data sources for the dictionaries of 'cache' types.") \ - M(DictCacheRequests, "Number of bulk requests to the external data sources for the dictionaries of 'cache' types.") \ - M(DictCacheLockWriteNs, "Number of nanoseconds spend in waiting for write lock to update the data for the dictionaries of 'cache' types.") \ - M(DictCacheLockReadNs, "Number of nanoseconds spend in waiting for read lock to lookup the data for the dictionaries of 'cache' types.") \ + M(DictCacheKeysRequested, "Number of keys requested from the data source for the dictionaries of 'cache' types.", ValueType::Number) \ + M(DictCacheKeysRequestedMiss, "Number of keys requested from the data source for dictionaries of 'cache' types but not found in the data source.", ValueType::Number) \ + M(DictCacheKeysRequestedFound, "Number of keys requested from the data source for dictionaries of 'cache' types and found in the data source.", ValueType::Number) \ + M(DictCacheKeysExpired, "Number of keys looked up in the dictionaries of 'cache' types and found in the cache but they were obsolete.", ValueType::Number) \ + M(DictCacheKeysNotFound, "Number of keys looked up in the dictionaries of 'cache' types and not found.", ValueType::Number) \ + M(DictCacheKeysHit, "Number of keys looked up in the dictionaries of 'cache' types and found in the cache.", ValueType::Number) \ + M(DictCacheRequestTimeNs, "Number of nanoseconds spent in querying the external data sources for the dictionaries of 'cache' types.", ValueType::Nanoseconds) \ + M(DictCacheRequests, "Number of bulk requests to the external data sources for the dictionaries of 'cache' types.", ValueType::Number) \ + M(DictCacheLockWriteNs, "Number of nanoseconds spent in waiting for write lock to update the data for the dictionaries of 'cache' types.", ValueType::Nanoseconds) \ + M(DictCacheLockReadNs, "Number of nanoseconds spent in waiting for read lock to lookup the data for the dictionaries of 'cache' types.", ValueType::Nanoseconds) \ \ - M(DistributedSyncInsertionTimeoutExceeded, "A timeout has
exceeded while waiting for shards during synchronous insertion into a Distributed table (with 'distributed_foreground_insert' = 1)") \ - M(DistributedAsyncInsertionFailures, "Number of failures for asynchronous insertion into a Distributed table (with 'distributed_foreground_insert' = 0)") \ + M(DistributedSyncInsertionTimeoutExceeded, "A timeout has exceeded while waiting for shards during synchronous insertion into a Distributed table (with 'distributed_foreground_insert' = 1)", ValueType::Number) \ + M(DistributedAsyncInsertionFailures, "Number of failures for asynchronous insertion into a Distributed table (with 'distributed_foreground_insert' = 0)", ValueType::Number) \ M(DataAfterMergeDiffersFromReplica, R"( Number of times data after merge is not byte-identical to the data on another replicas. There could be several reasons: 1. Using newer version of compression library after server update. @@ -333,507 +333,507 @@ Number of times data after merge is not byte-identical to the data on another re 8. Manual modification of checksums stored in ZooKeeper. 9. Part format related settings like 'enable_mixed_granularity_parts' are different on different replicas. The server successfully detected this situation and will download merged part from the replica to force the byte-identical result. -)") \ - M(DataAfterMutationDiffersFromReplica, "Number of times data after mutation is not byte-identical to the data on other replicas. In addition to the reasons described in 'DataAfterMergeDiffersFromReplica', it is also possible due to non-deterministic mutation.") \ - M(PolygonsAddedToPool, "A polygon has been added to the cache (pool) for the 'pointInPolygon' function.") \ - M(PolygonsInPoolAllocatedBytes, "The number of bytes for polygons added to the cache (pool) for the 'pointInPolygon' function.") \ - \ - M(USearchAddCount, "Number of vectors added to usearch indexes.") \ - M(USearchAddVisitedMembers, "Number of nodes visited when adding vectors to usearch indexes.") \ - M(USearchAddComputedDistances, "Number of times distance was computed when adding vectors to usearch indexes.") \ - M(USearchSearchCount, "Number of search operations performed in usearch indexes.") \ - M(USearchSearchVisitedMembers, "Number of nodes visited when searching in usearch indexes.") \ - M(USearchSearchComputedDistances, "Number of times distance was computed when searching usearch indexes.") \ - \ - M(RWLockAcquiredReadLocks, "Number of times a read lock was acquired (in a heavy RWLock).") \ - M(RWLockAcquiredWriteLocks, "Number of times a write lock was acquired (in a heavy RWLock).") \ - M(RWLockReadersWaitMilliseconds, "Total time spent waiting for a read lock to be acquired (in a heavy RWLock).") \ - M(RWLockWritersWaitMilliseconds, "Total time spent waiting for a write lock to be acquired (in a heavy RWLock).") \ - M(DNSError, "Total count of errors in DNS resolution") \ - M(PartsLockHoldMicroseconds, "Total time spent holding data parts lock in MergeTree tables") \ - M(PartsLockWaitMicroseconds, "Total time spent waiting for data parts lock in MergeTree tables") \ - \ - M(RealTimeMicroseconds, "Total (wall clock) time spent in processing (queries and other tasks) threads (note that this is a sum).") \ - M(UserTimeMicroseconds, "Total time spent in processing (queries and other tasks) threads executing CPU instructions in user mode. 
This includes time CPU pipeline was stalled due to main memory access, cache misses, branch mispredictions, hyper-threading, etc.") \ - M(SystemTimeMicroseconds, "Total time spent in processing (queries and other tasks) threads executing CPU instructions in OS kernel mode. This is time spent in syscalls, excluding waiting time during blocking syscalls.") \ - M(MemoryOvercommitWaitTimeMicroseconds, "Total time spent in waiting for memory to be freed in OvercommitTracker.") \ - M(MemoryAllocatorPurge, "Total number of times memory allocator purge was requested") \ - M(MemoryAllocatorPurgeTimeMicroseconds, "Total number of times memory allocator purge was requested") \ - M(SoftPageFaults, "The number of soft page faults in query execution threads. Soft page fault usually means a miss in the memory allocator cache, which requires a new memory mapping from the OS and subsequent allocation of a page of physical memory.") \ - M(HardPageFaults, "The number of hard page faults in query execution threads. High values indicate either that you forgot to turn off swap on your server, or eviction of memory pages of the ClickHouse binary during very high memory pressure, or successful usage of the 'mmap' read method for the tables data.") \ - \ - M(OSIOWaitMicroseconds, "Total time a thread spent waiting for a result of IO operation, from the OS point of view. This is real IO that doesn't include page cache.") \ - M(OSCPUWaitMicroseconds, "Total time a thread was ready for execution but waiting to be scheduled by OS, from the OS point of view.") \ - M(OSCPUVirtualTimeMicroseconds, "CPU time spent seen by OS. Does not include involuntary waits due to virtualization.") \ - M(OSReadBytes, "Number of bytes read from disks or block devices. Doesn't include bytes read from page cache. May include excessive data due to block size, readahead, etc.") \ - M(OSWriteBytes, "Number of bytes written to disks or block devices. Doesn't include bytes that are in page cache dirty pages. 
May not include data that was written by OS asynchronously.") \ - M(OSReadChars, "Number of bytes read from filesystem, including page cache.") \ - M(OSWriteChars, "Number of bytes written to filesystem, including page cache.") \ - \ - M(ParallelReplicasHandleRequestMicroseconds, "Time spent processing requests for marks from replicas") \ - M(ParallelReplicasHandleAnnouncementMicroseconds, "Time spent processing replicas announcements") \ - M(ParallelReplicasAnnouncementMicroseconds, "Time spent to send an announcement") \ - M(ParallelReplicasReadRequestMicroseconds, "Time spent for read requests") \ - \ - M(ParallelReplicasReadAssignedMarks, "Sum across all replicas of how many of scheduled marks were assigned by consistent hash") \ - M(ParallelReplicasReadUnassignedMarks, "Sum across all replicas of how many unassigned marks were scheduled") \ - M(ParallelReplicasReadAssignedForStealingMarks, "Sum across all replicas of how many of scheduled marks were assigned for stealing by consistent hash") \ - M(ParallelReplicasReadMarks, "How many marks were read by the given replica") \ - \ - M(ParallelReplicasStealingByHashMicroseconds, "Time spent collecting segments meant for stealing by hash") \ - M(ParallelReplicasProcessingPartsMicroseconds, "Time spent processing data parts") \ - M(ParallelReplicasStealingLeftoversMicroseconds, "Time spent collecting orphaned segments") \ - M(ParallelReplicasCollectingOwnedSegmentsMicroseconds, "Time spent collecting segments meant by hash") \ - M(ParallelReplicasNumRequests, "Number of requests to the initiator.") \ - M(ParallelReplicasDeniedRequests, "Number of completely denied requests to the initiator") \ - M(CacheWarmerBytesDownloaded, "Amount of data fetched into filesystem cache by dedicated background threads.") \ - M(CacheWarmerDataPartsDownloaded, "Number of data parts that were fully fetched by CacheWarmer.") \ - M(IgnoredColdParts, "See setting ignore_cold_parts_seconds. Number of times read queries ignored very new parts that weren't pulled into cache by CacheWarmer yet.") \ - M(PreferredWarmedUnmergedParts, "See setting prefer_warmed_unmerged_parts_seconds. Number of times read queries used outdated pre-merge parts that are in cache instead of merged part that wasn't pulled into cache by CacheWarmer yet.") \ - \ - M(PerfCPUCycles, "Total cycles. Be wary of what happens during CPU frequency scaling.") \ - M(PerfInstructions, "Retired instructions. Be careful, these can be affected by various issues, most notably hardware interrupt counts.") \ - M(PerfCacheReferences, "Cache accesses. Usually, this indicates Last Level Cache accesses, but this may vary depending on your CPU. This may include prefetches and coherency messages; again this depends on the design of your CPU.") \ - M(PerfCacheMisses, "Cache misses. Usually this indicates Last Level Cache misses; this is intended to be used in conjunction with the PERFCOUNTHWCACHEREFERENCES event to calculate cache miss rates.") \ - M(PerfBranchInstructions, "Retired branch instructions. 
Prior to Linux 2.6.35, this used the wrong event on AMD processors.") \ - M(PerfBranchMisses, "Mispredicted branch instructions.") \ - M(PerfBusCycles, "Bus cycles, which can be different from total cycles.") \ - M(PerfStalledCyclesFrontend, "Stalled cycles during issue.") \ - M(PerfStalledCyclesBackend, "Stalled cycles during retirement.") \ - M(PerfRefCPUCycles, "Total cycles; not affected by CPU frequency scaling.") \ - \ - M(PerfCPUClock, "The CPU clock, a high-resolution per-CPU timer") \ - M(PerfTaskClock, "A clock count specific to the task that is running") \ - M(PerfContextSwitches, "Number of context switches") \ - M(PerfCPUMigrations, "Number of times the process has migrated to a new CPU") \ - M(PerfAlignmentFaults, "Number of alignment faults. These happen when unaligned memory accesses happen; the kernel can handle these but it reduces performance. This happens only on some architectures (never on x86).") \ - M(PerfEmulationFaults, "Number of emulation faults. The kernel sometimes traps on unimplemented instructions and emulates them for user space. This can negatively impact performance.") \ - M(PerfMinEnabledTime, "For all events, minimum time that an event was enabled. Used to track event multiplexing influence") \ - M(PerfMinEnabledRunningTime, "Running time for event with minimum enabled time. Used to track the amount of event multiplexing") \ - M(PerfDataTLBReferences, "Data TLB references") \ - M(PerfDataTLBMisses, "Data TLB misses") \ - M(PerfInstructionTLBReferences, "Instruction TLB references") \ - M(PerfInstructionTLBMisses, "Instruction TLB misses") \ - M(PerfLocalMemoryReferences, "Local NUMA node memory reads") \ - M(PerfLocalMemoryMisses, "Local NUMA node memory read misses") \ - \ - M(CannotWriteToWriteBufferDiscard, "Number of stack traces dropped by query profiler or signal handler because pipe is full or cannot write to pipe.") \ - M(QueryProfilerSignalOverruns, "Number of times we drop processing of a query profiler signal due to overrun plus the number of signals that OS has not delivered due to overrun.") \ - M(QueryProfilerConcurrencyOverruns, "Number of times we drop processing of a query profiler signal due to too many concurrent query profilers in other threads, which may indicate overload.") \ - M(QueryProfilerRuns, "Number of times QueryProfiler had been run.") \ - M(QueryProfilerErrors, "Invalid memory accesses during asynchronous stack unwinding.") \ - \ - M(CreatedLogEntryForMerge, "Successfully created log entry to merge parts in ReplicatedMergeTree.") \ - M(NotCreatedLogEntryForMerge, "Log entry to merge parts in ReplicatedMergeTree is not created due to concurrent log update by another replica.") \ - M(CreatedLogEntryForMutation, "Successfully created log entry to mutate parts in ReplicatedMergeTree.") \ - M(NotCreatedLogEntryForMutation, "Log entry to mutate parts in ReplicatedMergeTree is not created due to concurrent log update by another replica.") \ - \ - M(S3ReadMicroseconds, "Time of GET and HEAD requests to S3 storage.") \ - M(S3ReadRequestsCount, "Number of GET and HEAD requests to S3 storage.") \ - M(S3ReadRequestsErrors, "Number of non-throttling errors in GET and HEAD requests to S3 storage.") \ - M(S3ReadRequestsThrottling, "Number of 429 and 503 errors in GET and HEAD requests to S3 storage.") \ - M(S3ReadRequestsRedirects, "Number of redirects in GET and HEAD requests to S3 storage.") \ - \ - M(S3WriteMicroseconds, "Time of POST, DELETE, PUT and PATCH requests to S3 storage.") \ - M(S3WriteRequestsCount, "Number of POST, 
DELETE, PUT and PATCH requests to S3 storage.") \ - M(S3WriteRequestsErrors, "Number of non-throttling errors in POST, DELETE, PUT and PATCH requests to S3 storage.") \ - M(S3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to S3 storage.") \ - M(S3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to S3 storage.") \ - \ - M(DiskS3ReadMicroseconds, "Time of GET and HEAD requests to DiskS3 storage.") \ - M(DiskS3ReadRequestsCount, "Number of GET and HEAD requests to DiskS3 storage.") \ - M(DiskS3ReadRequestsErrors, "Number of non-throttling errors in GET and HEAD requests to DiskS3 storage.") \ - M(DiskS3ReadRequestsThrottling, "Number of 429 and 503 errors in GET and HEAD requests to DiskS3 storage.") \ - M(DiskS3ReadRequestsRedirects, "Number of redirects in GET and HEAD requests to DiskS3 storage.") \ - \ - M(DiskS3WriteMicroseconds, "Time of POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ - M(DiskS3WriteRequestsCount, "Number of POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ - M(DiskS3WriteRequestsErrors, "Number of non-throttling errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ - M(DiskS3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ - M(DiskS3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ - \ - M(S3DeleteObjects, "Number of S3 API DeleteObject(s) calls.") \ - M(S3CopyObject, "Number of S3 API CopyObject calls.") \ - M(S3ListObjects, "Number of S3 API ListObjects calls.") \ - M(S3HeadObject, "Number of S3 API HeadObject calls.") \ - M(S3GetObjectAttributes, "Number of S3 API GetObjectAttributes calls.") \ - M(S3CreateMultipartUpload, "Number of S3 API CreateMultipartUpload calls.") \ - M(S3UploadPartCopy, "Number of S3 API UploadPartCopy calls.") \ - M(S3UploadPart, "Number of S3 API UploadPart calls.") \ - M(S3AbortMultipartUpload, "Number of S3 API AbortMultipartUpload calls.") \ - M(S3CompleteMultipartUpload, "Number of S3 API CompleteMultipartUpload calls.") \ - M(S3PutObject, "Number of S3 API PutObject calls.") \ - M(S3GetObject, "Number of S3 API GetObject calls.") \ - \ - M(DiskS3DeleteObjects, "Number of DiskS3 API DeleteObject(s) calls.") \ - M(DiskS3CopyObject, "Number of DiskS3 API CopyObject calls.") \ - M(DiskS3ListObjects, "Number of DiskS3 API ListObjects calls.") \ - M(DiskS3HeadObject, "Number of DiskS3 API HeadObject calls.") \ - M(DiskS3GetObjectAttributes, "Number of DiskS3 API GetObjectAttributes calls.") \ - M(DiskS3CreateMultipartUpload, "Number of DiskS3 API CreateMultipartUpload calls.") \ - M(DiskS3UploadPartCopy, "Number of DiskS3 API UploadPartCopy calls.") \ - M(DiskS3UploadPart, "Number of DiskS3 API UploadPart calls.") \ - M(DiskS3AbortMultipartUpload, "Number of DiskS3 API AbortMultipartUpload calls.") \ - M(DiskS3CompleteMultipartUpload, "Number of DiskS3 API CompleteMultipartUpload calls.") \ - M(DiskS3PutObject, "Number of DiskS3 API PutObject calls.") \ - M(DiskS3GetObject, "Number of DiskS3 API GetObject calls.") \ - \ - M(DiskPlainRewritableAzureDirectoryCreated, "Number of directories created by the 'plain_rewritable' metadata storage for AzureObjectStorage.") \ - M(DiskPlainRewritableAzureDirectoryRemoved, "Number of directories removed by the 'plain_rewritable' metadata storage for AzureObjectStorage.") \ - M(DiskPlainRewritableLocalDirectoryCreated, "Number of directories created by the 
'plain_rewritable' metadata storage for LocalObjectStorage.") \ - M(DiskPlainRewritableLocalDirectoryRemoved, "Number of directories removed by the 'plain_rewritable' metadata storage for LocalObjectStorage.") \ - M(DiskPlainRewritableS3DirectoryCreated, "Number of directories created by the 'plain_rewritable' metadata storage for S3ObjectStorage.") \ - M(DiskPlainRewritableS3DirectoryRemoved, "Number of directories removed by the 'plain_rewritable' metadata storage for S3ObjectStorage.") \ - \ - M(S3Clients, "Number of created S3 clients.") \ - M(TinyS3Clients, "Number of S3 clients copies which reuse an existing auth provider from another client.") \ - \ - M(EngineFileLikeReadFiles, "Number of files read in table engines working with files (like File/S3/URL/HDFS).") \ - \ - M(ReadBufferFromS3Microseconds, "Time spent on reading from S3.") \ - M(ReadBufferFromS3InitMicroseconds, "Time spent initializing connection to S3.") \ - M(ReadBufferFromS3Bytes, "Bytes read from S3.") \ - M(ReadBufferFromS3RequestsErrors, "Number of exceptions while reading from S3.") \ - \ - M(WriteBufferFromS3Microseconds, "Time spent on writing to S3.") \ - M(WriteBufferFromS3Bytes, "Bytes written to S3.") \ - M(WriteBufferFromS3RequestsErrors, "Number of exceptions while writing to S3.") \ - M(WriteBufferFromS3WaitInflightLimitMicroseconds, "Time spent on waiting while some of the current requests are done when its number reached the limit defined by s3_max_inflight_parts_for_one_file.") \ - M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.") \ - \ - M(AzureGetObject, "Number of Azure API GetObject calls.") \ - M(AzureUpload, "Number of Azure blob storage API Upload calls") \ - M(AzureStageBlock, "Number of Azure blob storage API StageBlock calls") \ - M(AzureCommitBlockList, "Number of Azure blob storage API CommitBlockList calls") \ - M(AzureCopyObject, "Number of Azure blob storage API CopyObject calls") \ - M(AzureDeleteObjects, "Number of Azure blob storage API DeleteObject(s) calls.") \ - M(AzureListObjects, "Number of Azure blob storage API ListObjects calls.") \ - M(AzureGetProperties, "Number of Azure blob storage API GetProperties calls.") \ - M(AzureCreateContainer, "Number of Azure blob storage API CreateContainer calls.") \ - \ - M(DiskAzureGetObject, "Number of Disk Azure API GetObject calls.") \ - M(DiskAzureUpload, "Number of Disk Azure blob storage API Upload calls") \ - M(DiskAzureStageBlock, "Number of Disk Azure blob storage API StageBlock calls") \ - M(DiskAzureCommitBlockList, "Number of Disk Azure blob storage API CommitBlockList calls") \ - M(DiskAzureCopyObject, "Number of Disk Azure blob storage API CopyObject calls") \ - M(DiskAzureListObjects, "Number of Disk Azure blob storage API ListObjects calls.") \ - M(DiskAzureDeleteObjects, "Number of Disk Azure blob storage API DeleteObject(s) calls.") \ - M(DiskAzureGetProperties, "Number of Disk Azure blob storage API GetProperties calls.") \ - M(DiskAzureCreateContainer, "Number of Disk Azure blob storage API CreateContainer calls.") \ - \ - M(ReadBufferFromAzureMicroseconds, "Time spent on reading from Azure.") \ - M(ReadBufferFromAzureInitMicroseconds, "Time spent initializing connection to Azure.") \ - M(ReadBufferFromAzureBytes, "Bytes read from Azure.") \ - M(ReadBufferFromAzureRequestsErrors, "Number of exceptions while reading from Azure") \ - \ - M(CachedReadBufferReadFromCacheHits, "Number of times the read from filesystem cache hit the cache.") \ - M(CachedReadBufferReadFromCacheMisses, "Number of 
times the read from filesystem cache miss the cache.") \ - M(CachedReadBufferReadFromSourceMicroseconds, "Time reading from filesystem cache source (from remote filesystem, etc)") \ - M(CachedReadBufferReadFromCacheMicroseconds, "Time reading from filesystem cache") \ - M(CachedReadBufferReadFromSourceBytes, "Bytes read from filesystem cache source (from remote fs, etc)") \ - M(CachedReadBufferReadFromCacheBytes, "Bytes read from filesystem cache") \ - M(CachedReadBufferPredownloadedBytes, "Bytes read from filesystem cache source. Cache segments are read from left to right as a whole, it might be that we need to predownload some part of the segment irrelevant for the current task just to get to the needed data") \ - M(CachedReadBufferCacheWriteBytes, "Bytes written from source (remote fs, etc) to filesystem cache") \ - M(CachedReadBufferCacheWriteMicroseconds, "Time spent writing data into filesystem cache") \ - M(CachedReadBufferCreateBufferMicroseconds, "Prepare buffer time") \ - M(CachedWriteBufferCacheWriteBytes, "Bytes written from source (remote fs, etc) to filesystem cache") \ - M(CachedWriteBufferCacheWriteMicroseconds, "Time spent writing data into filesystem cache") \ - \ - M(FilesystemCacheLoadMetadataMicroseconds, "Time spent loading filesystem cache metadata") \ - M(FilesystemCacheEvictedBytes, "Number of bytes evicted from filesystem cache") \ - M(FilesystemCacheEvictedFileSegments, "Number of file segments evicted from filesystem cache") \ - M(FilesystemCacheEvictionSkippedFileSegments, "Number of file segments skipped for eviction because of being in unreleasable state") \ - M(FilesystemCacheEvictionSkippedEvictingFileSegments, "Number of file segments skipped for eviction because of being in evicting state") \ - M(FilesystemCacheEvictionTries, "Number of filesystem cache eviction attempts") \ - M(FilesystemCacheLockKeyMicroseconds, "Lock cache key time") \ - M(FilesystemCacheLockMetadataMicroseconds, "Lock filesystem cache metadata time") \ - M(FilesystemCacheLockCacheMicroseconds, "Lock filesystem cache time") \ - M(FilesystemCacheReserveMicroseconds, "Filesystem cache space reservation time") \ - M(FilesystemCacheEvictMicroseconds, "Filesystem cache eviction time") \ - M(FilesystemCacheGetOrSetMicroseconds, "Filesystem cache getOrSet() time") \ - M(FilesystemCacheGetMicroseconds, "Filesystem cache get() time") \ - M(FileSegmentWaitMicroseconds, "Wait on DOWNLOADING state") \ - M(FileSegmentCompleteMicroseconds, "Duration of FileSegment::complete() in filesystem cache") \ - M(FileSegmentLockMicroseconds, "Lock file segment time") \ - M(FileSegmentWriteMicroseconds, "File segment write() time") \ - M(FileSegmentUseMicroseconds, "File segment use() time") \ - M(FileSegmentRemoveMicroseconds, "File segment remove() time") \ - M(FileSegmentHolderCompleteMicroseconds, "File segments holder complete() time") \ - M(FileSegmentFailToIncreasePriority, "Number of times the priority was not increased due to a high contention on the cache lock") \ - M(FilesystemCacheFailToReserveSpaceBecauseOfLockContention, "Number of times space reservation was skipped due to a high contention on the cache lock") \ - M(FilesystemCacheFailToReserveSpaceBecauseOfCacheResize, "Number of times space reservation was skipped due to the cache is being resized") \ - M(FilesystemCacheHoldFileSegments, "Filesystem cache file segments count, which were hold") \ - M(FilesystemCacheUnusedHoldFileSegments, "Filesystem cache file segments count, which were hold, but not used (because of seek or LIMIT n, etc)") \ 
- M(FilesystemCacheFreeSpaceKeepingThreadRun, "Number of times background thread executed free space keeping job") \ - M(FilesystemCacheFreeSpaceKeepingThreadWorkMilliseconds, "Time for which background thread executed free space keeping job") \ - \ - M(RemoteFSSeeks, "Total number of seeks for async buffer") \ - M(RemoteFSPrefetches, "Number of prefetches made with asynchronous reading from remote filesystem") \ - M(RemoteFSCancelledPrefetches, "Number of cancelled prefecthes (because of seek)") \ - M(RemoteFSUnusedPrefetches, "Number of prefetches pending at buffer destruction") \ - M(RemoteFSPrefetchedReads, "Number of reads from prefecthed buffer") \ - M(RemoteFSPrefetchedBytes, "Number of bytes from prefecthed buffer") \ - M(RemoteFSUnprefetchedReads, "Number of reads from unprefetched buffer") \ - M(RemoteFSUnprefetchedBytes, "Number of bytes from unprefetched buffer") \ - M(RemoteFSLazySeeks, "Number of lazy seeks") \ - M(RemoteFSSeeksWithReset, "Number of seeks which lead to a new connection") \ - M(RemoteFSBuffers, "Number of buffers created for asynchronous reading from remote filesystem") \ - M(MergeTreePrefetchedReadPoolInit, "Time spent preparing tasks in MergeTreePrefetchedReadPool") \ - M(WaitPrefetchTaskMicroseconds, "Time spend waiting for prefetched reader") \ - \ - M(ThreadpoolReaderTaskMicroseconds, "Time spent getting the data in asynchronous reading") \ - M(ThreadpoolReaderPrepareMicroseconds, "Time spent on preparation (e.g. call to reader seek() method)") \ - M(ThreadpoolReaderReadBytes, "Bytes read from a threadpool task in asynchronous reading") \ - M(ThreadpoolReaderSubmit, "Bytes read from a threadpool task in asynchronous reading") \ - M(ThreadpoolReaderSubmitReadSynchronously, "How many times we haven't scheduled a task on the thread pool and read synchronously instead") \ - M(ThreadpoolReaderSubmitReadSynchronouslyBytes, "How many bytes were read synchronously") \ - M(ThreadpoolReaderSubmitReadSynchronouslyMicroseconds, "How much time we spent reading synchronously") \ - M(ThreadpoolReaderSubmitLookupInCacheMicroseconds, "How much time we spent checking if content is cached") \ - M(AsynchronousReaderIgnoredBytes, "Number of bytes ignored during asynchronous reading") \ - \ - M(FileSegmentWaitReadBufferMicroseconds, "Metric per file segment. Time spend waiting for internal read buffer (includes cache waiting)") \ - M(FileSegmentReadMicroseconds, "Metric per file segment. Time spend reading from file") \ - M(FileSegmentCacheWriteMicroseconds, "Metric per file segment. Time spend writing data to cache") \ - M(FileSegmentPredownloadMicroseconds, "Metric per file segment. Time spent pre-downloading data to cache (pre-downloading - finishing file segment download (after someone who failed to do that) up to the point current thread was requested to do)") \ - M(FileSegmentUsedBytes, "Metric per file segment. 
How many bytes were actually used from current file segment") \ - \ - M(ReadBufferSeekCancelConnection, "Number of seeks which lead to new connection (s3, http)") \ - \ - M(SleepFunctionCalls, "Number of times a sleep function (sleep, sleepEachRow) has been called.") \ - M(SleepFunctionMicroseconds, "Time set to sleep in a sleep function (sleep, sleepEachRow).") \ - M(SleepFunctionElapsedMicroseconds, "Time spent sleeping in a sleep function (sleep, sleepEachRow).") \ - \ - M(ThreadPoolReaderPageCacheHit, "Number of times the read inside ThreadPoolReader was done from the page cache.") \ - M(ThreadPoolReaderPageCacheHitBytes, "Number of bytes read inside ThreadPoolReader when it was done from the page cache.") \ - M(ThreadPoolReaderPageCacheHitElapsedMicroseconds, "Time spent reading data from page cache in ThreadPoolReader.") \ - M(ThreadPoolReaderPageCacheMiss, "Number of times the read inside ThreadPoolReader was not done from page cache and was hand off to thread pool.") \ - M(ThreadPoolReaderPageCacheMissBytes, "Number of bytes read inside ThreadPoolReader when read was not done from page cache and was hand off to thread pool.") \ - M(ThreadPoolReaderPageCacheMissElapsedMicroseconds, "Time spent reading data inside the asynchronous job in ThreadPoolReader - when read was not done from the page cache.") \ - \ - M(AsynchronousReadWaitMicroseconds, "Time spent in waiting for asynchronous reads in asynchronous local read.") \ - M(SynchronousReadWaitMicroseconds, "Time spent in waiting for synchronous reads in asynchronous local read.") \ - M(AsynchronousRemoteReadWaitMicroseconds, "Time spent in waiting for asynchronous remote reads.") \ - M(SynchronousRemoteReadWaitMicroseconds, "Time spent in waiting for synchronous remote reads.") \ - \ - M(ExternalDataSourceLocalCacheReadBytes, "Bytes read from local cache buffer in RemoteReadBufferCache")\ - \ - M(MainConfigLoads, "Number of times the main configuration was reloaded.") \ - \ - M(AggregationPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for aggregation.") \ - M(AggregationHashTablesInitializedAsTwoLevel, "How many hash tables were inited as two-level for aggregation.") \ - M(AggregationOptimizedEqualRangesOfKeys, "For how many blocks optimization of equal ranges of keys was applied") \ - M(HashJoinPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for hash join.") \ - \ - M(MetadataFromKeeperCacheHit, "Number of times an object storage metadata request was answered from cache without making request to Keeper") \ - M(MetadataFromKeeperCacheMiss, "Number of times an object storage metadata request had to be answered from Keeper") \ - M(MetadataFromKeeperCacheUpdateMicroseconds, "Total time spent in updating the cache including waiting for responses from Keeper") \ - M(MetadataFromKeeperUpdateCacheOneLevel, "Number of times a cache update for one level of directory tree was done") \ - M(MetadataFromKeeperTransactionCommit, "Number of times metadata transaction commit was attempted") \ - M(MetadataFromKeeperTransactionCommitRetry, "Number of times metadata transaction commit was retried") \ - M(MetadataFromKeeperCleanupTransactionCommit, "Number of times metadata transaction commit for deleted objects cleanup was attempted") \ - M(MetadataFromKeeperCleanupTransactionCommitRetry, "Number of times metadata transaction commit for deleted objects cleanup was retried") \ - M(MetadataFromKeeperOperations, "Number of times a request was made to Keeper") \ - 
M(MetadataFromKeeperIndividualOperations, "Number of paths read or written by single or multi requests to Keeper") \ - M(MetadataFromKeeperReconnects, "Number of times a reconnect to Keeper was done") \ - M(MetadataFromKeeperBackgroundCleanupObjects, "Number of times a old deleted object clean up was performed by background task") \ - M(MetadataFromKeeperBackgroundCleanupTransactions, "Number of times old transaction idempotency token was cleaned up by background task") \ - M(MetadataFromKeeperBackgroundCleanupErrors, "Number of times an error was encountered in background cleanup task") \ - \ - M(KafkaRebalanceRevocations, "Number of partition revocations (the first stage of consumer group rebalance)") \ - M(KafkaRebalanceAssignments, "Number of partition assignments (the final stage of consumer group rebalance)") \ - M(KafkaRebalanceErrors, "Number of failed consumer group rebalances") \ - M(KafkaMessagesPolled, "Number of Kafka messages polled from librdkafka to ClickHouse") \ - M(KafkaMessagesRead, "Number of Kafka messages already processed by ClickHouse") \ - M(KafkaMessagesFailed, "Number of Kafka messages ClickHouse failed to parse") \ - M(KafkaRowsRead, "Number of rows parsed from Kafka messages") \ - M(KafkaRowsRejected, "Number of parsed rows which were later rejected (due to rebalances / errors or similar reasons). Those rows will be consumed again after the rebalance.") \ - M(KafkaDirectReads, "Number of direct selects from Kafka tables since server start") \ - M(KafkaBackgroundReads, "Number of background reads populating materialized views from Kafka since server start") \ - M(KafkaCommits, "Number of successful commits of consumed offsets to Kafka (normally should be the same as KafkaBackgroundReads)") \ - M(KafkaCommitFailures, "Number of failed commits of consumed offsets to Kafka (usually is a sign of some data duplication)") \ - M(KafkaConsumerErrors, "Number of errors reported by librdkafka during polls") \ - M(KafkaWrites, "Number of writes (inserts) to Kafka tables ") \ - M(KafkaRowsWritten, "Number of rows inserted into Kafka tables") \ - M(KafkaProducerFlushes, "Number of explicit flushes to Kafka producer") \ - M(KafkaMessagesProduced, "Number of messages produced to Kafka") \ - M(KafkaProducerErrors, "Number of errors during producing the messages to Kafka") \ - \ - M(ScalarSubqueriesGlobalCacheHit, "Number of times a read from a scalar subquery was done using the global cache") \ - M(ScalarSubqueriesLocalCacheHit, "Number of times a read from a scalar subquery was done using the local cache") \ - M(ScalarSubqueriesCacheMiss, "Number of times a read from a scalar subquery was not cached and had to be calculated completely") \ - \ - M(SchemaInferenceCacheHits, "Number of times the requested source is found in schema cache") \ - M(SchemaInferenceCacheSchemaHits, "Number of times the schema is found in schema cache during schema inference") \ - M(SchemaInferenceCacheNumRowsHits, "Number of times the number of rows is found in schema cache during count from files") \ - M(SchemaInferenceCacheMisses, "Number of times the requested source is not in schema cache") \ - M(SchemaInferenceCacheSchemaMisses, "Number of times the requested source is in cache but the schema is not in cache during schema inference") \ - M(SchemaInferenceCacheNumRowsMisses, "Number of times the requested source is in cache but the number of rows is not in cache while count from files") \ - M(SchemaInferenceCacheEvictions, "Number of times a schema from cache was evicted due to overflow") \ - 
M(SchemaInferenceCacheInvalidations, "Number of times a schema in cache became invalid due to changes in data") \ - \ - M(KeeperPacketsSent, "Packets sent by keeper server") \ - M(KeeperPacketsReceived, "Packets received by keeper server") \ - M(KeeperRequestTotal, "Total requests number on keeper server") \ - M(KeeperLatency, "Keeper latency") \ - M(KeeperTotalElapsedMicroseconds, "Keeper total latency for a single request") \ - M(KeeperProcessElapsedMicroseconds, "Keeper commit latency for a single request") \ - M(KeeperPreprocessElapsedMicroseconds, "Keeper preprocessing latency for a single reuquest") \ - M(KeeperStorageLockWaitMicroseconds, "Time spent waiting for acquiring Keeper storage lock") \ - M(KeeperCommitWaitElapsedMicroseconds, "Time spent waiting for certain log to be committed") \ - M(KeeperBatchMaxCount, "Number of times the size of batch was limited by the amount") \ - M(KeeperBatchMaxTotalSize, "Number of times the size of batch was limited by the total bytes size") \ - M(KeeperCommits, "Number of successful commits") \ - M(KeeperCommitsFailed, "Number of failed commits") \ - M(KeeperSnapshotCreations, "Number of snapshots creations")\ - M(KeeperSnapshotCreationsFailed, "Number of failed snapshot creations")\ - M(KeeperSnapshotApplys, "Number of snapshot applying")\ - M(KeeperSnapshotApplysFailed, "Number of failed snapshot applying")\ - M(KeeperReadSnapshot, "Number of snapshot read(serialization)")\ - M(KeeperSaveSnapshot, "Number of snapshot save")\ - M(KeeperCreateRequest, "Number of create requests")\ - M(KeeperRemoveRequest, "Number of remove requests")\ - M(KeeperSetRequest, "Number of set requests")\ - M(KeeperReconfigRequest, "Number of reconfig requests")\ - M(KeeperCheckRequest, "Number of check requests")\ - M(KeeperMultiRequest, "Number of multi requests")\ - M(KeeperMultiReadRequest, "Number of multi read requests")\ - M(KeeperGetRequest, "Number of get requests")\ - M(KeeperListRequest, "Number of list requests")\ - M(KeeperExistsRequest, "Number of exists requests")\ - \ - M(OverflowBreak, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'break' and the result is incomplete.") \ - M(OverflowThrow, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'throw' and exception was thrown.") \ - M(OverflowAny, "Number of times approximate GROUP BY was in effect: when aggregation was performed only on top of first 'max_rows_to_group_by' unique keys and other keys were ignored due to 'group_by_overflow_mode' = 'any'.") \ - \ - M(S3QueueSetFileProcessingMicroseconds, "Time spent to set file as processing")\ - M(S3QueueSetFileProcessedMicroseconds, "Time spent to set file as processed")\ - M(S3QueueSetFileFailedMicroseconds, "Time spent to set file as failed")\ - M(ObjectStorageQueueFailedFiles, "Number of files which failed to be processed")\ - M(ObjectStorageQueueProcessedFiles, "Number of files which were processed")\ - M(ObjectStorageQueueCleanupMaxSetSizeOrTTLMicroseconds, "Time spent to set file as failed")\ - M(ObjectStorageQueuePullMicroseconds, "Time spent to read file data")\ - M(ObjectStorageQueueLockLocalFileStatusesMicroseconds, "Time spent to lock local file statuses")\ - \ - M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\ - M(IOUringSQEsSubmitted, "Total number of io_uring SQEs submitted") \ - M(IOUringSQEsResubmitsAsync, "Total number of asynchronous io_uring SQE 
resubmits performed") \ - M(IOUringSQEsResubmitsSync, "Total number of synchronous io_uring SQE resubmits performed") \ - M(IOUringCQEsCompleted, "Total number of successfully completed io_uring CQEs") \ - M(IOUringCQEsFailed, "Total number of completed io_uring CQEs with failures") \ - \ - M(BackupsOpenedForRead, "Number of backups opened for reading") \ - M(BackupsOpenedForWrite, "Number of backups opened for writing") \ - M(BackupReadMetadataMicroseconds, "Time spent reading backup metadata from .backup file") \ - M(BackupWriteMetadataMicroseconds, "Time spent writing backup metadata to .backup file") \ - M(BackupEntriesCollectorMicroseconds, "Time spent making backup entries") \ - M(BackupEntriesCollectorForTablesDataMicroseconds, "Time spent making backup entries for tables data") \ - M(BackupEntriesCollectorRunPostTasksMicroseconds, "Time spent running post tasks after making backup entries") \ - \ - M(ReadTaskRequestsReceived, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for s3Cluster table function and similar). Measured on the initiator server side.") \ - M(MergeTreeReadTaskRequestsReceived, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for MergeTree tables). Measured on the initiator server side.") \ - \ - M(ReadTaskRequestsSent, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for s3Cluster table function and similar). Measured on the remote server side.") \ - M(MergeTreeReadTaskRequestsSent, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for MergeTree tables). Measured on the remote server side.") \ - M(MergeTreeAllRangesAnnouncementsSent, "The number of announcements sent from the remote server to the initiator server about the set of data parts (for MergeTree tables). Measured on the remote server side.") \ - M(ReadTaskRequestsSentElapsedMicroseconds, "Time spent in callbacks requested from the remote server back to the initiator server to choose the read task (for s3Cluster table function and similar). Measured on the remote server side.") \ - M(MergeTreeReadTaskRequestsSentElapsedMicroseconds, "Time spent in callbacks requested from the remote server back to the initiator server to choose the read task (for MergeTree tables). Measured on the remote server side.") \ - M(MergeTreeAllRangesAnnouncementsSentElapsedMicroseconds, "Time spent in sending the announcement from the remote server to the initiator server about the set of data parts (for MergeTree tables). 
Measured on the remote server side.") \ - \ - M(ConnectionPoolIsFullMicroseconds, "Total time spent waiting for a slot in connection pool.") \ - M(AsyncLoaderWaitMicroseconds, "Total time a query was waiting for async loader jobs.") \ - \ - M(DistrCacheServerSwitches, "Number of server switches between distributed cache servers in read/write-through cache") \ - M(DistrCacheReadMicroseconds, "Time spent reading from distributed cache") \ - M(DistrCacheFallbackReadMicroseconds, "Time spend reading from fallback buffer instead of distribted cache") \ - M(DistrCachePrecomputeRangesMicroseconds, "Time spent to precompute read ranges") \ - M(DistrCacheNextImplMicroseconds, "Time spend in ReadBufferFromDistributedCache::nextImpl") \ - M(DistrCacheOpenedConnections, "The number of open connections to distributed cache") \ - M(DistrCacheReusedConnections, "The number of reused connections to distributed cache") \ - M(DistrCacheHoldConnections, "The number of used connections to distributed cache") \ - \ - M(DistrCacheGetResponseMicroseconds, "Time spend to wait for response from distributed cache") \ - M(DistrCacheStartRangeMicroseconds, "Time spent to start a new read range with distributed cache") \ - M(DistrCacheLockRegistryMicroseconds, "Time spent to take DistributedCacheRegistry lock") \ - M(DistrCacheUnusedPackets, "Number of skipped unused packets from distributed cache") \ - M(DistrCachePackets, "Total number of packets received from distributed cache") \ - M(DistrCacheUnusedPacketsBytes, "The number of bytes in Data packets which were ignored") \ - M(DistrCacheRegistryUpdateMicroseconds, "Time spent updating distributed cache registry") \ - M(DistrCacheRegistryUpdates, "Number of distributed cache registry updates") \ - \ - M(DistrCacheConnectMicroseconds, "The time spent to connect to distributed cache") \ - M(DistrCacheConnectAttempts, "The number of connection attempts to distributed cache") \ - M(DistrCacheGetClient, "Number of client access times") \ - \ - M(DistrCacheServerProcessRequestMicroseconds, "Time spent processing request on DistributedCache server side") \ - \ - M(LogTest, "Number of log messages with level Test") \ - M(LogTrace, "Number of log messages with level Trace") \ - M(LogDebug, "Number of log messages with level Debug") \ - M(LogInfo, "Number of log messages with level Info") \ - M(LogWarning, "Number of log messages with level Warning") \ - M(LogError, "Number of log messages with level Error") \ - M(LogFatal, "Number of log messages with level Fatal") \ - \ - M(InterfaceHTTPSendBytes, "Number of bytes sent through HTTP interfaces") \ - M(InterfaceHTTPReceiveBytes, "Number of bytes received through HTTP interfaces") \ - M(InterfaceNativeSendBytes, "Number of bytes sent through native interfaces") \ - M(InterfaceNativeReceiveBytes, "Number of bytes received through native interfaces") \ - M(InterfacePrometheusSendBytes, "Number of bytes sent through Prometheus interfaces") \ - M(InterfacePrometheusReceiveBytes, "Number of bytes received through Prometheus interfaces") \ - M(InterfaceInterserverSendBytes, "Number of bytes sent through interserver interfaces") \ - M(InterfaceInterserverReceiveBytes, "Number of bytes received through interserver interfaces") \ - M(InterfaceMySQLSendBytes, "Number of bytes sent through MySQL interfaces") \ - M(InterfaceMySQLReceiveBytes, "Number of bytes received through MySQL interfaces") \ - M(InterfacePostgreSQLSendBytes, "Number of bytes sent through PostgreSQL interfaces") \ - M(InterfacePostgreSQLReceiveBytes, "Number of bytes 
received through PostgreSQL interfaces") \ - \ - M(ParallelReplicasUsedCount, "Number of replicas used to execute a query with task-based parallel replicas") \ - \ - M(KeeperLogsEntryReadFromLatestCache, "Number of log entries in Keeper being read from latest logs cache") \ - M(KeeperLogsEntryReadFromCommitCache, "Number of log entries in Keeper being read from commit logs cache") \ - M(KeeperLogsEntryReadFromFile, "Number of log entries in Keeper being read directly from the changelog file") \ - M(KeeperLogsPrefetchedEntries, "Number of log entries in Keeper being prefetched from the changelog file") \ - \ - M(ParallelReplicasAvailableCount, "Number of replicas available to execute a query with task-based parallel replicas") \ - M(ParallelReplicasUnavailableCount, "Number of replicas which was chosen, but found to be unavailable during query execution with task-based parallel replicas") \ - \ - M(StorageConnectionsCreated, "Number of created connections for storages") \ - M(StorageConnectionsReused, "Number of reused connections for storages") \ - M(StorageConnectionsReset, "Number of reset connections for storages") \ - M(StorageConnectionsPreserved, "Number of preserved connections for storages") \ - M(StorageConnectionsExpired, "Number of expired connections for storages") \ - M(StorageConnectionsErrors, "Number of cases when creation of a connection for storage is failed") \ - M(StorageConnectionsElapsedMicroseconds, "Total time spend on creating connections for storages") \ - \ - M(DiskConnectionsCreated, "Number of created connections for disk") \ - M(DiskConnectionsReused, "Number of reused connections for disk") \ - M(DiskConnectionsReset, "Number of reset connections for disk") \ - M(DiskConnectionsPreserved, "Number of preserved connections for disk") \ - M(DiskConnectionsExpired, "Number of expired connections for disk") \ - M(DiskConnectionsErrors, "Number of cases when creation of a connection for disk is failed") \ - M(DiskConnectionsElapsedMicroseconds, "Total time spend on creating connections for disk") \ - \ - M(HTTPConnectionsCreated, "Number of created http connections") \ - M(HTTPConnectionsReused, "Number of reused http connections") \ - M(HTTPConnectionsReset, "Number of reset http connections") \ - M(HTTPConnectionsPreserved, "Number of preserved http connections") \ - M(HTTPConnectionsExpired, "Number of expired http connections") \ - M(HTTPConnectionsErrors, "Number of cases when creation of a http connection failed") \ - M(HTTPConnectionsElapsedMicroseconds, "Total time spend on creating http connections") \ - \ - M(AddressesDiscovered, "Total count of new addresses in dns resolve results for http connections") \ - M(AddressesExpired, "Total count of expired addresses which is no longer presented in dns resolve results for http connections") \ - M(AddressesMarkedAsFailed, "Total count of addresses which has been marked as faulty due to connection errors for http connections") \ - \ - M(ReadWriteBufferFromHTTPRequestsSent, "Number of HTTP requests sent by ReadWriteBufferFromHTTP") \ - M(ReadWriteBufferFromHTTPBytes, "Total size of payload bytes received and sent by ReadWriteBufferFromHTTP. Doesn't include HTTP headers.") \ - \ - M(GWPAsanAllocateSuccess, "Number of successful allocations done by GWPAsan") \ - M(GWPAsanAllocateFailed, "Number of failed allocations done by GWPAsan (i.e. 
filled pool)") \ - M(GWPAsanFree, "Number of free operations done by GWPAsan") \ - \ - M(MemoryWorkerRun, "Number of runs done by MemoryWorker in background") \ - M(MemoryWorkerRunElapsedMicroseconds, "Total time spent by MemoryWorker for background work") \ +)", ValueType::Number) \ + M(DataAfterMutationDiffersFromReplica, "Number of times data after mutation is not byte-identical to the data on other replicas. In addition to the reasons described in 'DataAfterMergeDiffersFromReplica', it is also possible due to non-deterministic mutation.", ValueType::Number) \ + M(PolygonsAddedToPool, "A polygon has been added to the cache (pool) for the 'pointInPolygon' function.", ValueType::Number) \ + M(PolygonsInPoolAllocatedBytes, "The number of bytes for polygons added to the cache (pool) for the 'pointInPolygon' function.", ValueType::Bytes) \ + \ + M(USearchAddCount, "Number of vectors added to usearch indexes.", ValueType::Number) \ + M(USearchAddVisitedMembers, "Number of nodes visited when adding vectors to usearch indexes.", ValueType::Number) \ + M(USearchAddComputedDistances, "Number of times distance was computed when adding vectors to usearch indexes.", ValueType::Number) \ + M(USearchSearchCount, "Number of search operations performed in usearch indexes.", ValueType::Number) \ + M(USearchSearchVisitedMembers, "Number of nodes visited when searching in usearch indexes.", ValueType::Number) \ + M(USearchSearchComputedDistances, "Number of times distance was computed when searching usearch indexes.", ValueType::Number) \ + \ + M(RWLockAcquiredReadLocks, "Number of times a read lock was acquired (in a heavy RWLock).", ValueType::Number) \ + M(RWLockAcquiredWriteLocks, "Number of times a write lock was acquired (in a heavy RWLock).", ValueType::Number) \ + M(RWLockReadersWaitMilliseconds, "Total time spent waiting for a read lock to be acquired (in a heavy RWLock).", ValueType::Milliseconds) \ + M(RWLockWritersWaitMilliseconds, "Total time spent waiting for a write lock to be acquired (in a heavy RWLock).", ValueType::Milliseconds) \ + M(DNSError, "Total count of errors in DNS resolution", ValueType::Number) \ + M(PartsLockHoldMicroseconds, "Total time spent holding data parts lock in MergeTree tables", ValueType::Microseconds) \ + M(PartsLockWaitMicroseconds, "Total time spent waiting for data parts lock in MergeTree tables", ValueType::Microseconds) \ + \ + M(RealTimeMicroseconds, "Total (wall clock) time spent in processing (queries and other tasks) threads (note that this is a sum).", ValueType::Microseconds) \ + M(UserTimeMicroseconds, "Total time spent in processing (queries and other tasks) threads executing CPU instructions in user mode. This includes time CPU pipeline was stalled due to main memory access, cache misses, branch mispredictions, hyper-threading, etc.", ValueType::Microseconds) \ + M(SystemTimeMicroseconds, "Total time spent in processing (queries and other tasks) threads executing CPU instructions in OS kernel mode. 
This is time spent in syscalls, excluding waiting time during blocking syscalls.", ValueType::Microseconds) \ + M(MemoryOvercommitWaitTimeMicroseconds, "Total time spent in waiting for memory to be freed in OvercommitTracker.", ValueType::Microseconds) \ + M(MemoryAllocatorPurge, "Total number of times memory allocator purge was requested", ValueType::Number) \ + M(MemoryAllocatorPurgeTimeMicroseconds, "Total number of times memory allocator purge was requested", ValueType::Microseconds) \ + M(SoftPageFaults, "The number of soft page faults in query execution threads. Soft page fault usually means a miss in the memory allocator cache, which requires a new memory mapping from the OS and subsequent allocation of a page of physical memory.", ValueType::Number) \ + M(HardPageFaults, "The number of hard page faults in query execution threads. High values indicate either that you forgot to turn off swap on your server, or eviction of memory pages of the ClickHouse binary during very high memory pressure, or successful usage of the 'mmap' read method for the tables data.", ValueType::Number) \ + \ + M(OSIOWaitMicroseconds, "Total time a thread spent waiting for a result of IO operation, from the OS point of view. This is real IO that doesn't include page cache.", ValueType::Microseconds) \ + M(OSCPUWaitMicroseconds, "Total time a thread was ready for execution but waiting to be scheduled by OS, from the OS point of view.", ValueType::Microseconds) \ + M(OSCPUVirtualTimeMicroseconds, "CPU time spent seen by OS. Does not include involuntary waits due to virtualization.", ValueType::Microseconds) \ + M(OSReadBytes, "Number of bytes read from disks or block devices. Doesn't include bytes read from page cache. May include excessive data due to block size, readahead, etc.", ValueType::Bytes) \ + M(OSWriteBytes, "Number of bytes written to disks or block devices. Doesn't include bytes that are in page cache dirty pages. 
May not include data that was written by OS asynchronously.", ValueType::Bytes) \ + M(OSReadChars, "Number of bytes read from filesystem, including page cache.", ValueType::Bytes) \ + M(OSWriteChars, "Number of bytes written to filesystem, including page cache.", ValueType::Bytes) \ + \ + M(ParallelReplicasHandleRequestMicroseconds, "Time spent processing requests for marks from replicas", ValueType::Microseconds) \ + M(ParallelReplicasHandleAnnouncementMicroseconds, "Time spent processing replicas announcements", ValueType::Microseconds) \ + M(ParallelReplicasAnnouncementMicroseconds, "Time spent to send an announcement", ValueType::Microseconds) \ + M(ParallelReplicasReadRequestMicroseconds, "Time spent for read requests", ValueType::Microseconds) \ + \ + M(ParallelReplicasReadAssignedMarks, "Sum across all replicas of how many of scheduled marks were assigned by consistent hash", ValueType::Number) \ + M(ParallelReplicasReadUnassignedMarks, "Sum across all replicas of how many unassigned marks were scheduled", ValueType::Number) \ + M(ParallelReplicasReadAssignedForStealingMarks, "Sum across all replicas of how many of scheduled marks were assigned for stealing by consistent hash", ValueType::Number) \ + M(ParallelReplicasReadMarks, "How many marks were read by the given replica", ValueType::Number) \ + \ + M(ParallelReplicasStealingByHashMicroseconds, "Time spent collecting segments meant for stealing by hash", ValueType::Microseconds) \ + M(ParallelReplicasProcessingPartsMicroseconds, "Time spent processing data parts", ValueType::Microseconds) \ + M(ParallelReplicasStealingLeftoversMicroseconds, "Time spent collecting orphaned segments", ValueType::Microseconds) \ + M(ParallelReplicasCollectingOwnedSegmentsMicroseconds, "Time spent collecting segments meant by hash", ValueType::Microseconds) \ + M(ParallelReplicasNumRequests, "Number of requests to the initiator.", ValueType::Number) \ + M(ParallelReplicasDeniedRequests, "Number of completely denied requests to the initiator", ValueType::Number) \ + M(CacheWarmerBytesDownloaded, "Amount of data fetched into filesystem cache by dedicated background threads.", ValueType::Bytes) \ + M(CacheWarmerDataPartsDownloaded, "Number of data parts that were fully fetched by CacheWarmer.", ValueType::Number) \ + M(IgnoredColdParts, "See setting ignore_cold_parts_seconds. Number of times read queries ignored very new parts that weren't pulled into cache by CacheWarmer yet.", ValueType::Number) \ + M(PreferredWarmedUnmergedParts, "See setting prefer_warmed_unmerged_parts_seconds. Number of times read queries used outdated pre-merge parts that are in cache instead of merged part that wasn't pulled into cache by CacheWarmer yet.", ValueType::Number) \ + \ + M(PerfCPUCycles, "Total cycles. Be wary of what happens during CPU frequency scaling.", ValueType::Number) \ + M(PerfInstructions, "Retired instructions. Be careful, these can be affected by various issues, most notably hardware interrupt counts.", ValueType::Number) \ + M(PerfCacheReferences, "Cache accesses. Usually, this indicates Last Level Cache accesses, but this may vary depending on your CPU. This may include prefetches and coherency messages; again this depends on the design of your CPU.", ValueType::Number) \ + M(PerfCacheMisses, "Cache misses. Usually this indicates Last Level Cache misses; this is intended to be used in conjunction with the PERFCOUNTHWCACHEREFERENCES event to calculate cache miss rates.", ValueType::Number) \ + M(PerfBranchInstructions, "Retired branch instructions. 
Prior to Linux 2.6.35, this used the wrong event on AMD processors.", ValueType::Number) \ + M(PerfBranchMisses, "Mispredicted branch instructions.", ValueType::Number) \ + M(PerfBusCycles, "Bus cycles, which can be different from total cycles.", ValueType::Number) \ + M(PerfStalledCyclesFrontend, "Stalled cycles during issue.", ValueType::Number) \ + M(PerfStalledCyclesBackend, "Stalled cycles during retirement.", ValueType::Number) \ + M(PerfRefCPUCycles, "Total cycles; not affected by CPU frequency scaling.", ValueType::Number) \ + \ + M(PerfCPUClock, "The CPU clock, a high-resolution per-CPU timer", ValueType::Number) \ + M(PerfTaskClock, "A clock count specific to the task that is running", ValueType::Number) \ + M(PerfContextSwitches, "Number of context switches", ValueType::Number) \ + M(PerfCPUMigrations, "Number of times the process has migrated to a new CPU", ValueType::Number) \ + M(PerfAlignmentFaults, "Number of alignment faults. These happen when unaligned memory accesses happen; the kernel can handle these but it reduces performance. This happens only on some architectures (never on x86).", ValueType::Number) \ + M(PerfEmulationFaults, "Number of emulation faults. The kernel sometimes traps on unimplemented instructions and emulates them for user space. This can negatively impact performance.", ValueType::Number) \ + M(PerfMinEnabledTime, "For all events, minimum time that an event was enabled. Used to track event multiplexing influence", ValueType::Number) \ + M(PerfMinEnabledRunningTime, "Running time for event with minimum enabled time. Used to track the amount of event multiplexing", ValueType::Number) \ + M(PerfDataTLBReferences, "Data TLB references", ValueType::Number) \ + M(PerfDataTLBMisses, "Data TLB misses", ValueType::Number) \ + M(PerfInstructionTLBReferences, "Instruction TLB references", ValueType::Number) \ + M(PerfInstructionTLBMisses, "Instruction TLB misses", ValueType::Number) \ + M(PerfLocalMemoryReferences, "Local NUMA node memory reads", ValueType::Number) \ + M(PerfLocalMemoryMisses, "Local NUMA node memory read misses", ValueType::Number) \ + \ + M(CannotWriteToWriteBufferDiscard, "Number of stack traces dropped by query profiler or signal handler because pipe is full or cannot write to pipe.", ValueType::Number) \ + M(QueryProfilerSignalOverruns, "Number of times we drop processing of a query profiler signal due to overrun plus the number of signals that OS has not delivered due to overrun.", ValueType::Number) \ + M(QueryProfilerConcurrencyOverruns, "Number of times we drop processing of a query profiler signal due to too many concurrent query profilers in other threads, which may indicate overload.", ValueType::Number) \ + M(QueryProfilerRuns, "Number of times QueryProfiler had been run.", ValueType::Number) \ + M(QueryProfilerErrors, "Invalid memory accesses during asynchronous stack unwinding.", ValueType::Number) \ + \ + M(CreatedLogEntryForMerge, "Successfully created log entry to merge parts in ReplicatedMergeTree.", ValueType::Number) \ + M(NotCreatedLogEntryForMerge, "Log entry to merge parts in ReplicatedMergeTree is not created due to concurrent log update by another replica.", ValueType::Number) \ + M(CreatedLogEntryForMutation, "Successfully created log entry to mutate parts in ReplicatedMergeTree.", ValueType::Number) \ + M(NotCreatedLogEntryForMutation, "Log entry to mutate parts in ReplicatedMergeTree is not created due to concurrent log update by another replica.", ValueType::Number) \ + \ + M(S3ReadMicroseconds, "Time of GET and 
HEAD requests to S3 storage.", ValueType::Microseconds) \ + M(S3ReadRequestsCount, "Number of GET and HEAD requests to S3 storage.", ValueType::Number) \ + M(S3ReadRequestsErrors, "Number of non-throttling errors in GET and HEAD requests to S3 storage.", ValueType::Number) \ + M(S3ReadRequestsThrottling, "Number of 429 and 503 errors in GET and HEAD requests to S3 storage.", ValueType::Number) \ + M(S3ReadRequestsRedirects, "Number of redirects in GET and HEAD requests to S3 storage.", ValueType::Number) \ + \ + M(S3WriteMicroseconds, "Time of POST, DELETE, PUT and PATCH requests to S3 storage.", ValueType::Microseconds) \ + M(S3WriteRequestsCount, "Number of POST, DELETE, PUT and PATCH requests to S3 storage.", ValueType::Number) \ + M(S3WriteRequestsErrors, "Number of non-throttling errors in POST, DELETE, PUT and PATCH requests to S3 storage.", ValueType::Number) \ + M(S3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to S3 storage.", ValueType::Number) \ + M(S3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to S3 storage.", ValueType::Number) \ + \ + M(DiskS3ReadMicroseconds, "Time of GET and HEAD requests to DiskS3 storage.", ValueType::Microseconds) \ + M(DiskS3ReadRequestsCount, "Number of GET and HEAD requests to DiskS3 storage.", ValueType::Number) \ + M(DiskS3ReadRequestsErrors, "Number of non-throttling errors in GET and HEAD requests to DiskS3 storage.", ValueType::Number) \ + M(DiskS3ReadRequestsThrottling, "Number of 429 and 503 errors in GET and HEAD requests to DiskS3 storage.", ValueType::Number) \ + M(DiskS3ReadRequestsRedirects, "Number of redirects in GET and HEAD requests to DiskS3 storage.", ValueType::Number) \ + \ + M(DiskS3WriteMicroseconds, "Time of POST, DELETE, PUT and PATCH requests to DiskS3 storage.", ValueType::Microseconds) \ + M(DiskS3WriteRequestsCount, "Number of POST, DELETE, PUT and PATCH requests to DiskS3 storage.", ValueType::Number) \ + M(DiskS3WriteRequestsErrors, "Number of non-throttling errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.", ValueType::Number) \ + M(DiskS3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.", ValueType::Number) \ + M(DiskS3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to DiskS3 storage.", ValueType::Number) \ + \ + M(S3DeleteObjects, "Number of S3 API DeleteObject(s) calls.", ValueType::Number) \ + M(S3CopyObject, "Number of S3 API CopyObject calls.", ValueType::Number) \ + M(S3ListObjects, "Number of S3 API ListObjects calls.", ValueType::Number) \ + M(S3HeadObject, "Number of S3 API HeadObject calls.", ValueType::Number) \ + M(S3GetObjectAttributes, "Number of S3 API GetObjectAttributes calls.", ValueType::Number) \ + M(S3CreateMultipartUpload, "Number of S3 API CreateMultipartUpload calls.", ValueType::Number) \ + M(S3UploadPartCopy, "Number of S3 API UploadPartCopy calls.", ValueType::Number) \ + M(S3UploadPart, "Number of S3 API UploadPart calls.", ValueType::Number) \ + M(S3AbortMultipartUpload, "Number of S3 API AbortMultipartUpload calls.", ValueType::Number) \ + M(S3CompleteMultipartUpload, "Number of S3 API CompleteMultipartUpload calls.", ValueType::Number) \ + M(S3PutObject, "Number of S3 API PutObject calls.", ValueType::Number) \ + M(S3GetObject, "Number of S3 API GetObject calls.", ValueType::Number) \ + \ + M(DiskS3DeleteObjects, "Number of DiskS3 API DeleteObject(s) calls.", ValueType::Number) \ + 
M(DiskS3CopyObject, "Number of DiskS3 API CopyObject calls.", ValueType::Number) \ + M(DiskS3ListObjects, "Number of DiskS3 API ListObjects calls.", ValueType::Number) \ + M(DiskS3HeadObject, "Number of DiskS3 API HeadObject calls.", ValueType::Number) \ + M(DiskS3GetObjectAttributes, "Number of DiskS3 API GetObjectAttributes calls.", ValueType::Number) \ + M(DiskS3CreateMultipartUpload, "Number of DiskS3 API CreateMultipartUpload calls.", ValueType::Number) \ + M(DiskS3UploadPartCopy, "Number of DiskS3 API UploadPartCopy calls.", ValueType::Number) \ + M(DiskS3UploadPart, "Number of DiskS3 API UploadPart calls.", ValueType::Number) \ + M(DiskS3AbortMultipartUpload, "Number of DiskS3 API AbortMultipartUpload calls.", ValueType::Number) \ + M(DiskS3CompleteMultipartUpload, "Number of DiskS3 API CompleteMultipartUpload calls.", ValueType::Number) \ + M(DiskS3PutObject, "Number of DiskS3 API PutObject calls.", ValueType::Number) \ + M(DiskS3GetObject, "Number of DiskS3 API GetObject calls.", ValueType::Number) \ + \ + M(DiskPlainRewritableAzureDirectoryCreated, "Number of directories created by the 'plain_rewritable' metadata storage for AzureObjectStorage.", ValueType::Number) \ + M(DiskPlainRewritableAzureDirectoryRemoved, "Number of directories removed by the 'plain_rewritable' metadata storage for AzureObjectStorage.", ValueType::Number) \ + M(DiskPlainRewritableLocalDirectoryCreated, "Number of directories created by the 'plain_rewritable' metadata storage for LocalObjectStorage.", ValueType::Number) \ + M(DiskPlainRewritableLocalDirectoryRemoved, "Number of directories removed by the 'plain_rewritable' metadata storage for LocalObjectStorage.", ValueType::Number) \ + M(DiskPlainRewritableS3DirectoryCreated, "Number of directories created by the 'plain_rewritable' metadata storage for S3ObjectStorage.", ValueType::Number) \ + M(DiskPlainRewritableS3DirectoryRemoved, "Number of directories removed by the 'plain_rewritable' metadata storage for S3ObjectStorage.", ValueType::Number) \ + \ + M(S3Clients, "Number of created S3 clients.", ValueType::Number) \ + M(TinyS3Clients, "Number of S3 clients copies which reuse an existing auth provider from another client.", ValueType::Number) \ + \ + M(EngineFileLikeReadFiles, "Number of files read in table engines working with files (like File/S3/URL/HDFS).", ValueType::Number) \ + \ + M(ReadBufferFromS3Microseconds, "Time spent on reading from S3.", ValueType::Microseconds) \ + M(ReadBufferFromS3InitMicroseconds, "Time spent initializing connection to S3.", ValueType::Microseconds) \ + M(ReadBufferFromS3Bytes, "Bytes read from S3.", ValueType::Bytes) \ + M(ReadBufferFromS3RequestsErrors, "Number of exceptions while reading from S3.", ValueType::Number) \ + \ + M(WriteBufferFromS3Microseconds, "Time spent on writing to S3.", ValueType::Microseconds) \ + M(WriteBufferFromS3Bytes, "Bytes written to S3.", ValueType::Bytes) \ + M(WriteBufferFromS3RequestsErrors, "Number of exceptions while writing to S3.", ValueType::Number) \ + M(WriteBufferFromS3WaitInflightLimitMicroseconds, "Time spent on waiting while some of the current requests are done when its number reached the limit defined by s3_max_inflight_parts_for_one_file.", ValueType::Microseconds) \ + M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.", ValueType::Number) \ + \ + M(AzureGetObject, "Number of Azure API GetObject calls.", ValueType::Number) \ + M(AzureUpload, "Number of Azure blob storage API Upload calls", ValueType::Number) \ + M(AzureStageBlock, "Number 
of Azure blob storage API StageBlock calls", ValueType::Number) \ + M(AzureCommitBlockList, "Number of Azure blob storage API CommitBlockList calls", ValueType::Number) \ + M(AzureCopyObject, "Number of Azure blob storage API CopyObject calls", ValueType::Number) \ + M(AzureDeleteObjects, "Number of Azure blob storage API DeleteObject(s) calls.", ValueType::Number) \ + M(AzureListObjects, "Number of Azure blob storage API ListObjects calls.", ValueType::Number) \ + M(AzureGetProperties, "Number of Azure blob storage API GetProperties calls.", ValueType::Number) \ + M(AzureCreateContainer, "Number of Azure blob storage API CreateContainer calls.", ValueType::Number) \ + \ + M(DiskAzureGetObject, "Number of Disk Azure API GetObject calls.", ValueType::Number) \ + M(DiskAzureUpload, "Number of Disk Azure blob storage API Upload calls", ValueType::Number) \ + M(DiskAzureStageBlock, "Number of Disk Azure blob storage API StageBlock calls", ValueType::Number) \ + M(DiskAzureCommitBlockList, "Number of Disk Azure blob storage API CommitBlockList calls", ValueType::Number) \ + M(DiskAzureCopyObject, "Number of Disk Azure blob storage API CopyObject calls", ValueType::Number) \ + M(DiskAzureListObjects, "Number of Disk Azure blob storage API ListObjects calls.", ValueType::Number) \ + M(DiskAzureDeleteObjects, "Number of Azure blob storage API DeleteObject(s) calls.", ValueType::Number) \ + M(DiskAzureGetProperties, "Number of Disk Azure blob storage API GetProperties calls.", ValueType::Number) \ + M(DiskAzureCreateContainer, "Number of Disk Azure blob storage API CreateContainer calls.", ValueType::Number) \ + \ + M(ReadBufferFromAzureMicroseconds, "Time spent on reading from Azure.", ValueType::Microseconds) \ + M(ReadBufferFromAzureInitMicroseconds, "Time spent initializing connection to Azure.", ValueType::Microseconds) \ + M(ReadBufferFromAzureBytes, "Bytes read from Azure.", ValueType::Bytes) \ + M(ReadBufferFromAzureRequestsErrors, "Number of exceptions while reading from Azure", ValueType::Number) \ + \ + M(CachedReadBufferReadFromCacheHits, "Number of times the read from filesystem cache hit the cache.", ValueType::Number) \ + M(CachedReadBufferReadFromCacheMisses, "Number of times the read from filesystem cache miss the cache.", ValueType::Number) \ + M(CachedReadBufferReadFromSourceMicroseconds, "Time reading from filesystem cache source (from remote filesystem, etc)", ValueType::Microseconds) \ + M(CachedReadBufferReadFromCacheMicroseconds, "Time reading from filesystem cache", ValueType::Microseconds) \ + M(CachedReadBufferReadFromSourceBytes, "Bytes read from filesystem cache source (from remote fs, etc)", ValueType::Bytes) \ + M(CachedReadBufferReadFromCacheBytes, "Bytes read from filesystem cache", ValueType::Bytes) \ + M(CachedReadBufferPredownloadedBytes, "Bytes read from filesystem cache source. 
Cache segments are read from left to right as a whole, it might be that we need to predownload some part of the segment irrelevant for the current task just to get to the needed data", ValueType::Bytes) \ + M(CachedReadBufferCacheWriteBytes, "Bytes written from source (remote fs, etc) to filesystem cache", ValueType::Bytes) \ + M(CachedReadBufferCacheWriteMicroseconds, "Time spent writing data into filesystem cache", ValueType::Microseconds) \ + M(CachedReadBufferCreateBufferMicroseconds, "Prepare buffer time", ValueType::Microseconds) \ + M(CachedWriteBufferCacheWriteBytes, "Bytes written from source (remote fs, etc) to filesystem cache", ValueType::Bytes) \ + M(CachedWriteBufferCacheWriteMicroseconds, "Time spent writing data into filesystem cache", ValueType::Microseconds) \ + \ + M(FilesystemCacheLoadMetadataMicroseconds, "Time spent loading filesystem cache metadata", ValueType::Microseconds) \ + M(FilesystemCacheEvictedBytes, "Number of bytes evicted from filesystem cache", ValueType::Bytes) \ + M(FilesystemCacheEvictedFileSegments, "Number of file segments evicted from filesystem cache", ValueType::Number) \ + M(FilesystemCacheEvictionSkippedFileSegments, "Number of file segments skipped for eviction because of being in unreleasable state", ValueType::Number) \ + M(FilesystemCacheEvictionSkippedEvictingFileSegments, "Number of file segments skipped for eviction because of being in evicting state", ValueType::Number) \ + M(FilesystemCacheEvictionTries, "Number of filesystem cache eviction attempts", ValueType::Number) \ + M(FilesystemCacheLockKeyMicroseconds, "Lock cache key time", ValueType::Microseconds) \ + M(FilesystemCacheLockMetadataMicroseconds, "Lock filesystem cache metadata time", ValueType::Microseconds) \ + M(FilesystemCacheLockCacheMicroseconds, "Lock filesystem cache time", ValueType::Microseconds) \ + M(FilesystemCacheReserveMicroseconds, "Filesystem cache space reservation time", ValueType::Microseconds) \ + M(FilesystemCacheEvictMicroseconds, "Filesystem cache eviction time", ValueType::Microseconds) \ + M(FilesystemCacheGetOrSetMicroseconds, "Filesystem cache getOrSet() time", ValueType::Microseconds) \ + M(FilesystemCacheGetMicroseconds, "Filesystem cache get() time", ValueType::Microseconds) \ + M(FileSegmentWaitMicroseconds, "Wait on DOWNLOADING state", ValueType::Microseconds) \ + M(FileSegmentCompleteMicroseconds, "Duration of FileSegment::complete() in filesystem cache", ValueType::Microseconds) \ + M(FileSegmentLockMicroseconds, "Lock file segment time", ValueType::Microseconds) \ + M(FileSegmentWriteMicroseconds, "File segment write() time", ValueType::Microseconds) \ + M(FileSegmentUseMicroseconds, "File segment use() time", ValueType::Microseconds) \ + M(FileSegmentRemoveMicroseconds, "File segment remove() time", ValueType::Microseconds) \ + M(FileSegmentHolderCompleteMicroseconds, "File segments holder complete() time", ValueType::Microseconds) \ + M(FileSegmentFailToIncreasePriority, "Number of times the priority was not increased due to a high contention on the cache lock", ValueType::Number) \ + M(FilesystemCacheFailToReserveSpaceBecauseOfLockContention, "Number of times space reservation was skipped due to a high contention on the cache lock", ValueType::Number) \ + M(FilesystemCacheFailToReserveSpaceBecauseOfCacheResize, "Number of times space reservation was skipped due to the cache is being resized", ValueType::Number) \ + M(FilesystemCacheHoldFileSegments, "Filesystem cache file segments count, which were hold", ValueType::Number) \ + 
M(FilesystemCacheUnusedHoldFileSegments, "Filesystem cache file segments count, which were held, but not used (because of seek or LIMIT n, etc)", ValueType::Number) \ + M(FilesystemCacheFreeSpaceKeepingThreadRun, "Number of times background thread executed free space keeping job", ValueType::Number) \ + M(FilesystemCacheFreeSpaceKeepingThreadWorkMilliseconds, "Time for which background thread executed free space keeping job", ValueType::Milliseconds) \ + \ + M(RemoteFSSeeks, "Total number of seeks for async buffer", ValueType::Number) \ + M(RemoteFSPrefetches, "Number of prefetches made with asynchronous reading from remote filesystem", ValueType::Number) \ + M(RemoteFSCancelledPrefetches, "Number of cancelled prefetches (because of seek)", ValueType::Number) \ + M(RemoteFSUnusedPrefetches, "Number of prefetches pending at buffer destruction", ValueType::Number) \ + M(RemoteFSPrefetchedReads, "Number of reads from prefetched buffer", ValueType::Number) \ + M(RemoteFSPrefetchedBytes, "Number of bytes from prefetched buffer", ValueType::Bytes) \ + M(RemoteFSUnprefetchedReads, "Number of reads from unprefetched buffer", ValueType::Number) \ + M(RemoteFSUnprefetchedBytes, "Number of bytes from unprefetched buffer", ValueType::Bytes) \ + M(RemoteFSLazySeeks, "Number of lazy seeks", ValueType::Number) \ + M(RemoteFSSeeksWithReset, "Number of seeks which lead to a new connection", ValueType::Number) \ + M(RemoteFSBuffers, "Number of buffers created for asynchronous reading from remote filesystem", ValueType::Number) \ + M(MergeTreePrefetchedReadPoolInit, "Time spent preparing tasks in MergeTreePrefetchedReadPool", ValueType::Microseconds) \ + M(WaitPrefetchTaskMicroseconds, "Time spent waiting for prefetched reader", ValueType::Microseconds) \ + \ + M(ThreadpoolReaderTaskMicroseconds, "Time spent getting the data in asynchronous reading", ValueType::Microseconds) \ + M(ThreadpoolReaderPrepareMicroseconds, "Time spent on preparation (e.g. call to reader seek() method)", ValueType::Microseconds) \ + M(ThreadpoolReaderReadBytes, "Bytes read from a threadpool task in asynchronous reading", ValueType::Bytes) \ + M(ThreadpoolReaderSubmit, "Bytes read from a threadpool task in asynchronous reading", ValueType::Bytes) \ + M(ThreadpoolReaderSubmitReadSynchronously, "How many times we haven't scheduled a task on the thread pool and read synchronously instead", ValueType::Number) \ + M(ThreadpoolReaderSubmitReadSynchronouslyBytes, "How many bytes were read synchronously", ValueType::Bytes) \ + M(ThreadpoolReaderSubmitReadSynchronouslyMicroseconds, "How much time we spent reading synchronously", ValueType::Microseconds) \ + M(ThreadpoolReaderSubmitLookupInCacheMicroseconds, "How much time we spent checking if content is cached", ValueType::Microseconds) \ + M(AsynchronousReaderIgnoredBytes, "Number of bytes ignored during asynchronous reading", ValueType::Bytes) \ + \ + M(FileSegmentWaitReadBufferMicroseconds, "Metric per file segment. Time spent waiting for internal read buffer (includes cache waiting)", ValueType::Microseconds) \ + M(FileSegmentReadMicroseconds, "Metric per file segment. Time spent reading from file", ValueType::Microseconds) \ + M(FileSegmentCacheWriteMicroseconds, "Metric per file segment. Time spent writing data to cache", ValueType::Microseconds) \ + M(FileSegmentPredownloadMicroseconds, "Metric per file segment.
Time spent pre-downloading data to cache (pre-downloading - finishing file segment download (after someone who failed to do that) up to the point current thread was requested to do)", ValueType::Microseconds) \ + M(FileSegmentUsedBytes, "Metric per file segment. How many bytes were actually used from current file segment", ValueType::Bytes) \ + \ + M(ReadBufferSeekCancelConnection, "Number of seeks which lead to new connection (s3, http)", ValueType::Number) \ + \ + M(SleepFunctionCalls, "Number of times a sleep function (sleep, sleepEachRow) has been called.", ValueType::Number) \ + M(SleepFunctionMicroseconds, "Time set to sleep in a sleep function (sleep, sleepEachRow).", ValueType::Microseconds) \ + M(SleepFunctionElapsedMicroseconds, "Time spent sleeping in a sleep function (sleep, sleepEachRow).", ValueType::Microseconds) \ + \ + M(ThreadPoolReaderPageCacheHit, "Number of times the read inside ThreadPoolReader was done from the page cache.", ValueType::Number) \ + M(ThreadPoolReaderPageCacheHitBytes, "Number of bytes read inside ThreadPoolReader when it was done from the page cache.", ValueType::Bytes) \ + M(ThreadPoolReaderPageCacheHitElapsedMicroseconds, "Time spent reading data from page cache in ThreadPoolReader.", ValueType::Microseconds) \ + M(ThreadPoolReaderPageCacheMiss, "Number of times the read inside ThreadPoolReader was not done from page cache and was hand off to thread pool.", ValueType::Number) \ + M(ThreadPoolReaderPageCacheMissBytes, "Number of bytes read inside ThreadPoolReader when read was not done from page cache and was hand off to thread pool.", ValueType::Bytes) \ + M(ThreadPoolReaderPageCacheMissElapsedMicroseconds, "Time spent reading data inside the asynchronous job in ThreadPoolReader - when read was not done from the page cache.", ValueType::Microseconds) \ + \ + M(AsynchronousReadWaitMicroseconds, "Time spent in waiting for asynchronous reads in asynchronous local read.", ValueType::Microseconds) \ + M(SynchronousReadWaitMicroseconds, "Time spent in waiting for synchronous reads in asynchronous local read.", ValueType::Microseconds) \ + M(AsynchronousRemoteReadWaitMicroseconds, "Time spent in waiting for asynchronous remote reads.", ValueType::Microseconds) \ + M(SynchronousRemoteReadWaitMicroseconds, "Time spent in waiting for synchronous remote reads.", ValueType::Microseconds) \ + \ + M(ExternalDataSourceLocalCacheReadBytes, "Bytes read from local cache buffer in RemoteReadBufferCache", ValueType::Bytes) \ + \ + M(MainConfigLoads, "Number of times the main configuration was reloaded.", ValueType::Number) \ + \ + M(AggregationPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for aggregation.", ValueType::Number) \ + M(AggregationHashTablesInitializedAsTwoLevel, "How many hash tables were inited as two-level for aggregation.", ValueType::Number) \ + M(AggregationOptimizedEqualRangesOfKeys, "For how many blocks optimization of equal ranges of keys was applied", ValueType::Number) \ + M(HashJoinPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for hash join.", ValueType::Number) \ + \ + M(MetadataFromKeeperCacheHit, "Number of times an object storage metadata request was answered from cache without making request to Keeper", ValueType::Number) \ + M(MetadataFromKeeperCacheMiss, "Number of times an object storage metadata request had to be answered from Keeper", ValueType::Number) \ + M(MetadataFromKeeperCacheUpdateMicroseconds, "Total time spent in updating the cache including 
waiting for responses from Keeper", ValueType::Microseconds) \ + M(MetadataFromKeeperUpdateCacheOneLevel, "Number of times a cache update for one level of directory tree was done", ValueType::Number) \ + M(MetadataFromKeeperTransactionCommit, "Number of times metadata transaction commit was attempted", ValueType::Number) \ + M(MetadataFromKeeperTransactionCommitRetry, "Number of times metadata transaction commit was retried", ValueType::Number) \ + M(MetadataFromKeeperCleanupTransactionCommit, "Number of times metadata transaction commit for deleted objects cleanup was attempted", ValueType::Number) \ + M(MetadataFromKeeperCleanupTransactionCommitRetry, "Number of times metadata transaction commit for deleted objects cleanup was retried", ValueType::Number) \ + M(MetadataFromKeeperOperations, "Number of times a request was made to Keeper", ValueType::Number) \ + M(MetadataFromKeeperIndividualOperations, "Number of paths read or written by single or multi requests to Keeper", ValueType::Number) \ + M(MetadataFromKeeperReconnects, "Number of times a reconnect to Keeper was done", ValueType::Number) \ + M(MetadataFromKeeperBackgroundCleanupObjects, "Number of times a old deleted object clean up was performed by background task", ValueType::Number) \ + M(MetadataFromKeeperBackgroundCleanupTransactions, "Number of times old transaction idempotency token was cleaned up by background task", ValueType::Number) \ + M(MetadataFromKeeperBackgroundCleanupErrors, "Number of times an error was encountered in background cleanup task", ValueType::Number) \ + \ + M(KafkaRebalanceRevocations, "Number of partition revocations (the first stage of consumer group rebalance)", ValueType::Number) \ + M(KafkaRebalanceAssignments, "Number of partition assignments (the final stage of consumer group rebalance)", ValueType::Number) \ + M(KafkaRebalanceErrors, "Number of failed consumer group rebalances", ValueType::Number) \ + M(KafkaMessagesPolled, "Number of Kafka messages polled from librdkafka to ClickHouse", ValueType::Number) \ + M(KafkaMessagesRead, "Number of Kafka messages already processed by ClickHouse", ValueType::Number) \ + M(KafkaMessagesFailed, "Number of Kafka messages ClickHouse failed to parse", ValueType::Number) \ + M(KafkaRowsRead, "Number of rows parsed from Kafka messages", ValueType::Number) \ + M(KafkaRowsRejected, "Number of parsed rows which were later rejected (due to rebalances / errors or similar reasons). 
Those rows will be consumed again after the rebalance.", ValueType::Number) \ + M(KafkaDirectReads, "Number of direct selects from Kafka tables since server start", ValueType::Number) \ + M(KafkaBackgroundReads, "Number of background reads populating materialized views from Kafka since server start", ValueType::Number) \ + M(KafkaCommits, "Number of successful commits of consumed offsets to Kafka (normally should be the same as KafkaBackgroundReads)", ValueType::Number) \ + M(KafkaCommitFailures, "Number of failed commits of consumed offsets to Kafka (usually is a sign of some data duplication)", ValueType::Number) \ + M(KafkaConsumerErrors, "Number of errors reported by librdkafka during polls", ValueType::Number) \ + M(KafkaWrites, "Number of writes (inserts) to Kafka tables ", ValueType::Number) \ + M(KafkaRowsWritten, "Number of rows inserted into Kafka tables", ValueType::Number) \ + M(KafkaProducerFlushes, "Number of explicit flushes to Kafka producer", ValueType::Number) \ + M(KafkaMessagesProduced, "Number of messages produced to Kafka", ValueType::Number) \ + M(KafkaProducerErrors, "Number of errors during producing the messages to Kafka", ValueType::Number) \ + \ + M(ScalarSubqueriesGlobalCacheHit, "Number of times a read from a scalar subquery was done using the global cache", ValueType::Number) \ + M(ScalarSubqueriesLocalCacheHit, "Number of times a read from a scalar subquery was done using the local cache", ValueType::Number) \ + M(ScalarSubqueriesCacheMiss, "Number of times a read from a scalar subquery was not cached and had to be calculated completely", ValueType::Number) \ + \ + M(SchemaInferenceCacheHits, "Number of times the requested source is found in schema cache", ValueType::Number) \ + M(SchemaInferenceCacheSchemaHits, "Number of times the schema is found in schema cache during schema inference", ValueType::Number) \ + M(SchemaInferenceCacheNumRowsHits, "Number of times the number of rows is found in schema cache during count from files", ValueType::Number) \ + M(SchemaInferenceCacheMisses, "Number of times the requested source is not in schema cache", ValueType::Number) \ + M(SchemaInferenceCacheSchemaMisses, "Number of times the requested source is in cache but the schema is not in cache during schema inference", ValueType::Number) \ + M(SchemaInferenceCacheNumRowsMisses, "Number of times the requested source is in cache but the number of rows is not in cache while count from files", ValueType::Number) \ + M(SchemaInferenceCacheEvictions, "Number of times a schema from cache was evicted due to overflow", ValueType::Number) \ + M(SchemaInferenceCacheInvalidations, "Number of times a schema in cache became invalid due to changes in data", ValueType::Number) \ + \ + M(KeeperPacketsSent, "Packets sent by keeper server", ValueType::Number) \ + M(KeeperPacketsReceived, "Packets received by keeper server", ValueType::Number) \ + M(KeeperRequestTotal, "Total requests number on keeper server", ValueType::Number) \ + M(KeeperLatency, "Keeper latency", ValueType::Milliseconds) \ + M(KeeperTotalElapsedMicroseconds, "Keeper total latency for a single request", ValueType::Microseconds) \ + M(KeeperProcessElapsedMicroseconds, "Keeper commit latency for a single request", ValueType::Microseconds) \ + M(KeeperPreprocessElapsedMicroseconds, "Keeper preprocessing latency for a single reuquest", ValueType::Microseconds)\ + M(KeeperStorageLockWaitMicroseconds, "Time spent waiting for acquiring Keeper storage lock", ValueType::Microseconds) \ + M(KeeperCommitWaitElapsedMicroseconds, 
"Time spent waiting for certain log to be committed", ValueType::Microseconds) \ + M(KeeperBatchMaxCount, "Number of times the size of batch was limited by the amount", ValueType::Number) \ + M(KeeperBatchMaxTotalSize, "Number of times the size of batch was limited by the total bytes size", ValueType::Number) \ + M(KeeperCommits, "Number of successful commits", ValueType::Number) \ + M(KeeperCommitsFailed, "Number of failed commits", ValueType::Number) \ + M(KeeperSnapshotCreations, "Number of snapshots creations", ValueType::Number) \ + M(KeeperSnapshotCreationsFailed, "Number of failed snapshot creations", ValueType::Number) \ + M(KeeperSnapshotApplys, "Number of snapshot applying", ValueType::Number) \ + M(KeeperSnapshotApplysFailed, "Number of failed snapshot applying", ValueType::Number) \ + M(KeeperReadSnapshot, "Number of snapshot read(serialization)", ValueType::Number) \ + M(KeeperSaveSnapshot, "Number of snapshot save", ValueType::Number) \ + M(KeeperCreateRequest, "Number of create requests", ValueType::Number) \ + M(KeeperRemoveRequest, "Number of remove requests", ValueType::Number) \ + M(KeeperSetRequest, "Number of set requests", ValueType::Number) \ + M(KeeperReconfigRequest, "Number of reconfig requests", ValueType::Number) \ + M(KeeperCheckRequest, "Number of check requests", ValueType::Number) \ + M(KeeperMultiRequest, "Number of multi requests", ValueType::Number) \ + M(KeeperMultiReadRequest, "Number of multi read requests", ValueType::Number) \ + M(KeeperGetRequest, "Number of get requests", ValueType::Number) \ + M(KeeperListRequest, "Number of list requests", ValueType::Number) \ + M(KeeperExistsRequest, "Number of exists requests", ValueType::Number) \ + \ + M(OverflowBreak, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'break' and the result is incomplete.", ValueType::Number) \ + M(OverflowThrow, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'throw' and exception was thrown.", ValueType::Number) \ + M(OverflowAny, "Number of times approximate GROUP BY was in effect: when aggregation was performed only on top of first 'max_rows_to_group_by' unique keys and other keys were ignored due to 'group_by_overflow_mode' = 'any'.", ValueType::Number) \ + \ + M(S3QueueSetFileProcessingMicroseconds, "Time spent to set file as processing", ValueType::Microseconds) \ + M(S3QueueSetFileProcessedMicroseconds, "Time spent to set file as processed", ValueType::Microseconds) \ + M(S3QueueSetFileFailedMicroseconds, "Time spent to set file as failed", ValueType::Microseconds) \ + M(ObjectStorageQueueFailedFiles, "Number of files which failed to be processed", ValueType::Number)\ + M(ObjectStorageQueueProcessedFiles, "Number of files which were processed", ValueType::Number)\ + M(ObjectStorageQueueCleanupMaxSetSizeOrTTLMicroseconds, "Time spent to set file as failed", ValueType::Microseconds) \ + M(ObjectStorageQueuePullMicroseconds, "Time spent to read file data", ValueType::Microseconds) \ + M(ObjectStorageQueueLockLocalFileStatusesMicroseconds, "Time spent to lock local file statuses", ValueType::Microseconds) \ + \ + M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds", ValueType::Milliseconds) \ + M(IOUringSQEsSubmitted, "Total number of io_uring SQEs submitted", ValueType::Number) \ + M(IOUringSQEsResubmitsAsync, "Total number of asynchronous io_uring SQE resubmits performed", ValueType::Number) \ + 
M(IOUringSQEsResubmitsSync, "Total number of synchronous io_uring SQE resubmits performed", ValueType::Number) \ + M(IOUringCQEsCompleted, "Total number of successfully completed io_uring CQEs", ValueType::Number) \ + M(IOUringCQEsFailed, "Total number of completed io_uring CQEs with failures", ValueType::Number) \ + \ + M(BackupsOpenedForRead, "Number of backups opened for reading", ValueType::Number) \ + M(BackupsOpenedForWrite, "Number of backups opened for writing", ValueType::Number) \ + M(BackupReadMetadataMicroseconds, "Time spent reading backup metadata from .backup file", ValueType::Microseconds) \ + M(BackupWriteMetadataMicroseconds, "Time spent writing backup metadata to .backup file", ValueType::Microseconds) \ + M(BackupEntriesCollectorMicroseconds, "Time spent making backup entries", ValueType::Microseconds) \ + M(BackupEntriesCollectorForTablesDataMicroseconds, "Time spent making backup entries for tables data", ValueType::Microseconds) \ + M(BackupEntriesCollectorRunPostTasksMicroseconds, "Time spent running post tasks after making backup entries", ValueType::Microseconds) \ + \ + M(ReadTaskRequestsReceived, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for s3Cluster table function and similar). Measured on the initiator server side.", ValueType::Number) \ + M(MergeTreeReadTaskRequestsReceived, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for MergeTree tables). Measured on the initiator server side.", ValueType::Number) \ + \ + M(ReadTaskRequestsSent, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for s3Cluster table function and similar). Measured on the remote server side.", ValueType::Number) \ + M(MergeTreeReadTaskRequestsSent, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for MergeTree tables). Measured on the remote server side.", ValueType::Number) \ + M(MergeTreeAllRangesAnnouncementsSent, "The number of announcements sent from the remote server to the initiator server about the set of data parts (for MergeTree tables). Measured on the remote server side.", ValueType::Number) \ + M(ReadTaskRequestsSentElapsedMicroseconds, "Time spent in callbacks requested from the remote server back to the initiator server to choose the read task (for s3Cluster table function and similar). Measured on the remote server side.", ValueType::Microseconds) \ + M(MergeTreeReadTaskRequestsSentElapsedMicroseconds, "Time spent in callbacks requested from the remote server back to the initiator server to choose the read task (for MergeTree tables). Measured on the remote server side.", ValueType::Microseconds) \ + M(MergeTreeAllRangesAnnouncementsSentElapsedMicroseconds, "Time spent in sending the announcement from the remote server to the initiator server about the set of data parts (for MergeTree tables). 
Measured on the remote server side.", ValueType::Microseconds) \ + \ + M(ConnectionPoolIsFullMicroseconds, "Total time spent waiting for a slot in connection pool.", ValueType::Microseconds) \ + M(AsyncLoaderWaitMicroseconds, "Total time a query was waiting for async loader jobs.", ValueType::Microseconds) \ + \ + M(DistrCacheServerSwitches, "Number of server switches between distributed cache servers in read/write-through cache", ValueType::Number) \ + M(DistrCacheReadMicroseconds, "Time spent reading from distributed cache", ValueType::Microseconds) \ + M(DistrCacheFallbackReadMicroseconds, "Time spent reading from fallback buffer instead of distributed cache", ValueType::Microseconds) \ + M(DistrCachePrecomputeRangesMicroseconds, "Time spent to precompute read ranges", ValueType::Microseconds) \ + M(DistrCacheNextImplMicroseconds, "Time spent in ReadBufferFromDistributedCache::nextImpl", ValueType::Microseconds) \ + M(DistrCacheOpenedConnections, "The number of open connections to distributed cache", ValueType::Number) \ + M(DistrCacheReusedConnections, "The number of reused connections to distributed cache", ValueType::Number) \ + M(DistrCacheHoldConnections, "The number of used connections to distributed cache", ValueType::Number) \ + \ + M(DistrCacheGetResponseMicroseconds, "Time spent waiting for a response from distributed cache", ValueType::Microseconds) \ + M(DistrCacheStartRangeMicroseconds, "Time spent to start a new read range with distributed cache", ValueType::Microseconds) \ + M(DistrCacheLockRegistryMicroseconds, "Time spent to take DistributedCacheRegistry lock", ValueType::Microseconds) \ + M(DistrCacheUnusedPackets, "Number of skipped unused packets from distributed cache", ValueType::Number) \ + M(DistrCachePackets, "Total number of packets received from distributed cache", ValueType::Number) \ + M(DistrCacheUnusedPacketsBytes, "The number of bytes in Data packets which were ignored", ValueType::Bytes) \ + M(DistrCacheRegistryUpdateMicroseconds, "Time spent updating distributed cache registry", ValueType::Microseconds) \ + M(DistrCacheRegistryUpdates, "Number of distributed cache registry updates", ValueType::Number) \ + \ + M(DistrCacheConnectMicroseconds, "The time spent to connect to distributed cache", ValueType::Microseconds) \ + M(DistrCacheConnectAttempts, "The number of connection attempts to distributed cache", ValueType::Number) \ + M(DistrCacheGetClient, "Number of client access times", ValueType::Number) \ + \ + M(DistrCacheServerProcessRequestMicroseconds, "Time spent processing request on DistributedCache server side", ValueType::Microseconds) \ + \ + M(LogTest, "Number of log messages with level Test", ValueType::Number) \ + M(LogTrace, "Number of log messages with level Trace", ValueType::Number) \ + M(LogDebug, "Number of log messages with level Debug", ValueType::Number) \ + M(LogInfo, "Number of log messages with level Info", ValueType::Number) \ + M(LogWarning, "Number of log messages with level Warning", ValueType::Number) \ + M(LogError, "Number of log messages with level Error", ValueType::Number) \ + M(LogFatal, "Number of log messages with level Fatal", ValueType::Number) \ + \ + M(InterfaceHTTPSendBytes, "Number of bytes sent through HTTP interfaces", ValueType::Bytes) \ + M(InterfaceHTTPReceiveBytes, "Number of bytes received through HTTP interfaces", ValueType::Bytes) \ + M(InterfaceNativeSendBytes, "Number of bytes sent through native interfaces", ValueType::Bytes) \ + M(InterfaceNativeReceiveBytes, "Number of bytes received through native
interfaces", ValueType::Bytes) \ + M(InterfacePrometheusSendBytes, "Number of bytes sent through Prometheus interfaces", ValueType::Bytes) \ + M(InterfacePrometheusReceiveBytes, "Number of bytes received through Prometheus interfaces", ValueType::Bytes) \ + M(InterfaceInterserverSendBytes, "Number of bytes sent through interserver interfaces", ValueType::Bytes) \ + M(InterfaceInterserverReceiveBytes, "Number of bytes received through interserver interfaces", ValueType::Bytes) \ + M(InterfaceMySQLSendBytes, "Number of bytes sent through MySQL interfaces", ValueType::Bytes) \ + M(InterfaceMySQLReceiveBytes, "Number of bytes received through MySQL interfaces", ValueType::Bytes) \ + M(InterfacePostgreSQLSendBytes, "Number of bytes sent through PostgreSQL interfaces", ValueType::Bytes) \ + M(InterfacePostgreSQLReceiveBytes, "Number of bytes received through PostgreSQL interfaces", ValueType::Bytes) \ + \ + M(ParallelReplicasUsedCount, "Number of replicas used to execute a query with task-based parallel replicas", ValueType::Number) \ + \ + M(KeeperLogsEntryReadFromLatestCache, "Number of log entries in Keeper being read from latest logs cache", ValueType::Number) \ + M(KeeperLogsEntryReadFromCommitCache, "Number of log entries in Keeper being read from commit logs cache", ValueType::Number) \ + M(KeeperLogsEntryReadFromFile, "Number of log entries in Keeper being read directly from the changelog file", ValueType::Number) \ + M(KeeperLogsPrefetchedEntries, "Number of log entries in Keeper being prefetched from the changelog file", ValueType::Number) \ + \ + M(ParallelReplicasAvailableCount, "Number of replicas available to execute a query with task-based parallel replicas", ValueType::Number) \ + M(ParallelReplicasUnavailableCount, "Number of replicas which was chosen, but found to be unavailable during query execution with task-based parallel replicas", ValueType::Number) \ + \ + M(StorageConnectionsCreated, "Number of created connections for storages", ValueType::Number) \ + M(StorageConnectionsReused, "Number of reused connections for storages", ValueType::Number) \ + M(StorageConnectionsReset, "Number of reset connections for storages", ValueType::Number) \ + M(StorageConnectionsPreserved, "Number of preserved connections for storages", ValueType::Number) \ + M(StorageConnectionsExpired, "Number of expired connections for storages", ValueType::Number) \ + M(StorageConnectionsErrors, "Number of cases when creation of a connection for storage is failed", ValueType::Number) \ + M(StorageConnectionsElapsedMicroseconds, "Total time spend on creating connections for storages", ValueType::Microseconds) \ + \ + M(DiskConnectionsCreated, "Number of created connections for disk", ValueType::Number) \ + M(DiskConnectionsReused, "Number of reused connections for disk", ValueType::Number) \ + M(DiskConnectionsReset, "Number of reset connections for disk", ValueType::Number) \ + M(DiskConnectionsPreserved, "Number of preserved connections for disk", ValueType::Number) \ + M(DiskConnectionsExpired, "Number of expired connections for disk", ValueType::Number) \ + M(DiskConnectionsErrors, "Number of cases when creation of a connection for disk is failed", ValueType::Number) \ + M(DiskConnectionsElapsedMicroseconds, "Total time spend on creating connections for disk", ValueType::Microseconds) \ + \ + M(HTTPConnectionsCreated, "Number of created http connections", ValueType::Number) \ + M(HTTPConnectionsReused, "Number of reused http connections", ValueType::Number) \ + M(HTTPConnectionsReset, "Number of reset 
http connections", ValueType::Number) \ + M(HTTPConnectionsPreserved, "Number of preserved http connections", ValueType::Number) \ + M(HTTPConnectionsExpired, "Number of expired http connections", ValueType::Number) \ + M(HTTPConnectionsErrors, "Number of cases when creation of a http connection failed", ValueType::Number) \ + M(HTTPConnectionsElapsedMicroseconds, "Total time spend on creating http connections", ValueType::Microseconds) \ + \ + M(AddressesDiscovered, "Total count of new addresses in dns resolve results for http connections", ValueType::Number) \ + M(AddressesExpired, "Total count of expired addresses which is no longer presented in dns resolve results for http connections", ValueType::Number) \ + M(AddressesMarkedAsFailed, "Total count of addresses which has been marked as faulty due to connection errors for http connections", ValueType::Number) \ + \ + M(ReadWriteBufferFromHTTPRequestsSent, "Number of HTTP requests sent by ReadWriteBufferFromHTTP", ValueType::Number) \ + M(ReadWriteBufferFromHTTPBytes, "Total size of payload bytes received and sent by ReadWriteBufferFromHTTP. Doesn't include HTTP headers.", ValueType::Bytes) \ + \ + M(GWPAsanAllocateSuccess, "Number of successful allocations done by GWPAsan", ValueType::Number) \ + M(GWPAsanAllocateFailed, "Number of failed allocations done by GWPAsan (i.e. filled pool)", ValueType::Number) \ + M(GWPAsanFree, "Number of free operations done by GWPAsan", ValueType::Number) \ + \ + M(MemoryWorkerRun, "Number of runs done by MemoryWorker in background", ValueType::Number) \ + M(MemoryWorkerRunElapsedMicroseconds, "Total time spent by MemoryWorker for background work", ValueType::Microseconds) \ #ifdef APPLY_FOR_EXTERNAL_EVENTS @@ -845,7 +845,7 @@ The server successfully detected this situation and will download merged part fr namespace ProfileEvents { -#define M(NAME, DOCUMENTATION) extern const Event NAME = Event(__COUNTER__); +#define M(NAME, DOCUMENTATION, VALUE_TYPE) extern const Event NAME = Event(__COUNTER__); APPLY_FOR_EVENTS(M) #undef M constexpr Event END = Event(__COUNTER__); @@ -919,7 +919,7 @@ const char * getName(Event event) { static const char * strings[] = { - #define M(NAME, DOCUMENTATION) #NAME, + #define M(NAME, DOCUMENTATION, VALUE_TYPE) #NAME, APPLY_FOR_EVENTS(M) #undef M }; @@ -931,7 +931,19 @@ const char * getDocumentation(Event event) { static const char * strings[] = { - #define M(NAME, DOCUMENTATION) DOCUMENTATION, + #define M(NAME, DOCUMENTATION, VALUE_TYPE) DOCUMENTATION, + APPLY_FOR_EVENTS(M) + #undef M + }; + + return strings[event]; +} + +ValueType getValueType(Event event) +{ + static ValueType strings[] = + { + #define M(NAME, DOCUMENTATION, VALUE_TYPE) VALUE_TYPE, APPLY_FOR_EVENTS(M) #undef M }; diff --git a/src/Common/ProfileEvents.h b/src/Common/ProfileEvents.h index f196ed5a04c..50d6a5e1a18 100644 --- a/src/Common/ProfileEvents.h +++ b/src/Common/ProfileEvents.h @@ -149,6 +149,15 @@ namespace ProfileEvents static const Event num_counters; }; + enum class ValueType : uint8_t + { + Number, + Bytes, + Milliseconds, + Microseconds, + Nanoseconds, + }; + /// Increment a counter for event. Thread-safe. void increment(Event event, Count amount = 1); @@ -165,6 +174,9 @@ namespace ProfileEvents /// Get description of event by identifier. Returns statically allocated string. const char * getDocumentation(Event event); + /// Get value type of event by identifier. Returns enum value. + ValueType getValueType(Event event); + /// Get index just after last event identifier. 
Event end(); diff --git a/src/Common/QueryFuzzer.cpp b/src/Common/QueryFuzzer.cpp index 0b2f6c09b45..2122f7355f4 100644 --- a/src/Common/QueryFuzzer.cpp +++ b/src/Common/QueryFuzzer.cpp @@ -1005,7 +1005,7 @@ void QueryFuzzer::fuzzExpressionList(ASTExpressionList & expr_list) { for (auto & child : expr_list.children) { - if (auto * literal = typeid_cast(child.get())) + if (auto * /*literal*/ _ = typeid_cast(child.get())) { if (fuzz_rand() % 13 == 0) child = fuzzLiteralUnderExpressionList(child); diff --git a/src/Common/QueryFuzzer.h b/src/Common/QueryFuzzer.h index 35d088809f2..29dca5ef2d4 100644 --- a/src/Common/QueryFuzzer.h +++ b/src/Common/QueryFuzzer.h @@ -9,10 +9,9 @@ #include #include -#include +#include #include #include -#include "Parsers/IAST_fwd.h" namespace DB diff --git a/src/Common/Scheduler/Nodes/FairPolicy.h b/src/Common/Scheduler/Nodes/FairPolicy.h index fba637e979e..1a4c7b94b28 100644 --- a/src/Common/Scheduler/Nodes/FairPolicy.h +++ b/src/Common/Scheduler/Nodes/FairPolicy.h @@ -52,7 +52,7 @@ public: { if (!ISchedulerNode::equals(other)) return false; - if (auto * o = dynamic_cast(other)) + if (auto * _ = dynamic_cast(other)) return true; return false; } diff --git a/src/Common/Scheduler/Nodes/FifoQueue.h b/src/Common/Scheduler/Nodes/FifoQueue.h index 9fbc6d1ae65..90f8fffe665 100644 --- a/src/Common/Scheduler/Nodes/FifoQueue.h +++ b/src/Common/Scheduler/Nodes/FifoQueue.h @@ -34,7 +34,7 @@ public: { if (!ISchedulerNode::equals(other)) return false; - if (auto * o = dynamic_cast(other)) + if (auto * _ = dynamic_cast(other)) return true; return false; } diff --git a/src/Common/Scheduler/Nodes/PriorityPolicy.h b/src/Common/Scheduler/Nodes/PriorityPolicy.h index 91dc95600d5..88fa8c240ca 100644 --- a/src/Common/Scheduler/Nodes/PriorityPolicy.h +++ b/src/Common/Scheduler/Nodes/PriorityPolicy.h @@ -43,7 +43,7 @@ public: { if (!ISchedulerNode::equals(other)) return false; - if (auto * o = dynamic_cast(other)) + if (auto * _ = dynamic_cast(other)) return true; return false; } diff --git a/src/Common/Scheduler/SchedulerRoot.h b/src/Common/Scheduler/SchedulerRoot.h index 5307aadc3cc..4ba6bc65e1c 100644 --- a/src/Common/Scheduler/SchedulerRoot.h +++ b/src/Common/Scheduler/SchedulerRoot.h @@ -99,7 +99,7 @@ public: { if (!ISchedulerNode::equals(other)) return false; - if (auto * o = dynamic_cast(other)) + if (auto * _ = dynamic_cast(other)) return true; return false; } diff --git a/src/Common/StopToken.cpp b/src/Common/StopToken.cpp new file mode 100644 index 00000000000..65676b0de06 --- /dev/null +++ b/src/Common/StopToken.cpp @@ -0,0 +1,97 @@ +#include + +#include +#include +#include +#include + +struct StopState +{ + /// A pretty inefficient implementation (mutex instead of spinlock, std::list instead of intrusive list, + /// shared_ptr instead of custom refcounting), but this currently doesn't matter. If you want to use this in some + /// performance-sensitive code, feel free to reimplement, probably similar to folly::CancellationToken implementation + /// (but if it's actually performance-sensitive then maybe try to avoid using this at all: all this pointer chasing, + /// reference conting, and callbacks can't be very fast.) 
+ + std::mutex mutex; + std::atomic stopped {false}; + std::list callbacks; +}; + +bool StopToken::stop_requested() const +{ + return state && state->stopped.load(); +} + +StopSource::StopSource() : state(std::make_shared()) {} + +bool StopSource::request_stop() +{ + std::list callbacks; + { + std::lock_guard lock(state->mutex); + if (state->stopped.exchange(true)) + { + chassert(state->callbacks.empty()); + return false; + } + callbacks = std::move(state->callbacks); + } + std::exception_ptr exception; + for (StopCallback * cb : callbacks) + { + /// If one StopCallback's destroys another StopCallback, this may deadlock because the second + /// StopCallback's destructor will wait for both callbacks to return (if it's later in the `callbacks` list). + /// This can be prevented by allowing ~StopCallback() to set some cancellation flag that we'd check here, + /// but this doesn't seem worth the trouble. Just don't have such complicated callbacks. + + try + { + cb->callback(); + } + catch (...) + { + if (!exception) + exception = std::current_exception(); + } + cb->returned.store(true); + } + if (exception) + std::rethrow_exception(exception); + return true; +} + +StopCallback::StopCallback(const StopToken & token, Callback cb) : state(token.state), callback(std::move(cb)) +{ + if (state == nullptr) + return; + std::unique_lock lock(state->mutex); + if (state->stopped.load()) + { + lock.unlock(); + state = nullptr; + callback(); + } + else + { + state->callbacks.push_back(this); + it = std::prev(state->callbacks.end()); + } +} + +StopCallback::~StopCallback() +{ + if (state == nullptr) + return; + std::unique_lock lock(state->mutex); + if (state->stopped.load()) + { + lock.unlock(); + while (!returned.load()) + std::this_thread::yield(); + } + else + { + state->callbacks.erase(it); + } +} diff --git a/src/Common/StopToken.h b/src/Common/StopToken.h new file mode 100644 index 00000000000..6e93f34fa5f --- /dev/null +++ b/src/Common/StopToken.h @@ -0,0 +1,71 @@ +#pragma once +#include +#include +#include +#include + +/// Just like std::stop_token, which isn't available yet. A.k.a. folly::CancellationToken. +/// When we switch to C++20, delete this and use std::stop_token instead. + +struct StopState; +using StopStatePtr = std::shared_ptr; + +class StopToken +{ +public: + StopToken() = default; + + StopToken(const StopToken &) = default; + StopToken(StopToken &&) = default; + StopToken & operator=(const StopToken &) = default; + StopToken & operator=(StopToken &&) = default; + + bool stop_requested() const; + bool stop_possible() const { return state != nullptr; } + +private: + friend class StopSource; + friend class StopCallback; + + StopStatePtr state; + + explicit StopToken(StopStatePtr s) : state(std::move(s)) {} +}; + +class StopSource +{ +public: + StopSource(); + + StopSource(const StopSource &) = default; + StopSource(StopSource &&) = default; + StopSource & operator=(const StopSource &) = default; + StopSource & operator=(StopSource &&) = default; + + StopToken get_token() const { return StopToken(state); } + bool request_stop(); + +private: + StopStatePtr state; +}; + +class StopCallback +{ +public: + using Callback = std::function; + + StopCallback(const StopToken & token, Callback cb); + /// If the callback is already running, waits for it to return. 
+ ~StopCallback(); + + StopCallback(const StopCallback &) = delete; + StopCallback & operator=(const StopCallback &) = delete; + +private: + friend class StopSource; + + StopStatePtr state; + std::list::iterator it; + Callback callback; + std::atomic_bool returned {false}; +}; diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp index 92ad6c3466f..fb70ec4b447 100644 --- a/src/Common/ThreadPool.cpp +++ b/src/Common/ThreadPool.cpp @@ -47,6 +47,47 @@ namespace ProfileEvents } +namespace +{ + struct ScopedDecrement + { + std::optional>> atomic_var; + + // Deleted copy constructor and copy assignment operator + ScopedDecrement(const ScopedDecrement&) = delete; + ScopedDecrement& operator=(const ScopedDecrement&) = delete; + + // Move constructor + ScopedDecrement(ScopedDecrement&& other) noexcept + : atomic_var(std::move(other.atomic_var)) + { + other.atomic_var.reset(); + } + + // Move assignment operator + ScopedDecrement& operator=(ScopedDecrement&& other) noexcept + { + if (this != &other) + { + atomic_var.swap(other.atomic_var); + } + return *this; + } + + explicit ScopedDecrement(std::atomic& var) + : atomic_var(var) + { + atomic_var->get().fetch_sub(1, std::memory_order_relaxed); + } + + ~ScopedDecrement() + { + if (atomic_var) + atomic_var->get().fetch_add(1, std::memory_order_relaxed); + } + }; +} + class JobWithPriority { public: @@ -55,6 +96,8 @@ public: Job job; Priority priority; CurrentMetrics::Increment metric_increment; + ScopedDecrement available_threads_decrement; + DB::OpenTelemetry::TracingContextOnThread thread_trace_context; /// Call stacks of all jobs' schedulings leading to this one @@ -62,11 +105,20 @@ public: bool enable_job_stack_trace = false; Stopwatch job_create_time; + // Deleted copy constructor and copy assignment operator + JobWithPriority(const JobWithPriority&) = delete; + JobWithPriority& operator=(const JobWithPriority&) = delete; + + // Move constructor and move assignment operator + JobWithPriority(JobWithPriority&&) noexcept = default; + JobWithPriority& operator=(JobWithPriority&&) noexcept = default; + JobWithPriority( Job job_, Priority priority_, CurrentMetrics::Metric metric, const DB::OpenTelemetry::TracingContextOnThread & thread_trace_context_, - bool capture_frame_pointers) + bool capture_frame_pointers, ScopedDecrement available_threads_decrement_) : job(job_), priority(priority_), metric_increment(metric), + available_threads_decrement(std::move(available_threads_decrement_)), thread_trace_context(thread_trace_context_), enable_job_stack_trace(capture_frame_pointers) { if (!capture_frame_pointers) @@ -85,8 +137,6 @@ public: { return job_create_time.elapsedMicroseconds(); } - - }; static constexpr auto DEFAULT_THREAD_NAME = "ThreadPool"; @@ -125,12 +175,19 @@ ThreadPoolImpl::ThreadPoolImpl( , queue_size(queue_size_ ? 
std::max(queue_size_, max_threads) : 0 /* zero means the queue is unlimited */) , shutdown_on_exception(shutdown_on_exception_) { + max_threads = std::min(max_threads, static_cast(MAX_THEORETICAL_THREAD_COUNT)); + max_free_threads = std::min(max_free_threads, static_cast(MAX_THEORETICAL_THREAD_COUNT)); + remaining_pool_capacity.store(max_threads, std::memory_order_relaxed); + available_threads.store(0, std::memory_order_relaxed); } template void ThreadPoolImpl::setMaxThreads(size_t value) { + value = std::min(value, static_cast(MAX_THEORETICAL_THREAD_COUNT)); std::lock_guard lock(mutex); + remaining_pool_capacity.fetch_add(value - max_threads, std::memory_order_relaxed); + bool need_start_threads = (value > max_threads); bool need_finish_free_threads = (value < max_free_threads); @@ -163,6 +220,7 @@ size_t ThreadPoolImpl::getMaxThreads() const template void ThreadPoolImpl::setMaxFreeThreads(size_t value) { + value = std::min(value, static_cast(MAX_THEORETICAL_THREAD_COUNT)); std::lock_guard lock(mutex); bool need_finish_free_threads = (value < max_free_threads); @@ -184,7 +242,6 @@ void ThreadPoolImpl::setQueueSize(size_t value) jobs.reserve(queue_size); } - template template ReturnType ThreadPoolImpl::scheduleImpl(Job job, Priority priority, std::optional wait_microseconds, bool propagate_opentelemetry_tracing_context) @@ -207,6 +264,38 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, Priority priority, std: return false; }; + // Decrement available_threads, scoped to the job lifecycle. + // This ensures that available_threads decreases when a new job starts + // and automatically increments when the job completes or goes out of scope. + ScopedDecrement available_threads_decrement(available_threads); + + std::unique_ptr new_thread; + + // Load the current capacity + int64_t capacity = remaining_pool_capacity.load(std::memory_order_relaxed); + int64_t currently_available_threads = available_threads.load(std::memory_order_relaxed); + + while (currently_available_threads <= 0 && capacity > 0) + { + if (remaining_pool_capacity.compare_exchange_weak(capacity, capacity - 1, std::memory_order_relaxed)) + { + try + { + new_thread = std::make_unique(*this); + break; // Exit the loop once a thread is successfully created. + } + catch (...) + { + // Failed to create the thread, restore capacity + remaining_pool_capacity.fetch_add(1, std::memory_order_relaxed); + std::lock_guard lock(mutex); // needed to change first_exception. + return on_error("failed to start the thread"); + } + } + // capacity gets reloaded by (unsuccessful) compare_exchange_weak + currently_available_threads = available_threads.load(std::memory_order_relaxed); + } + { Stopwatch watch; std::unique_lock lock(mutex); @@ -219,6 +308,7 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, Priority priority, std: auto pred = [this] { return !queue_size || scheduled_jobs < queue_size || shutdown; }; + /// Wait for available threads or timeout if (wait_microseconds) /// Check for optional. Condition is true if the optional is set. Even if the value is zero. { if (!job_finished.wait_for(lock, std::chrono::microseconds(*wait_microseconds), pred)) @@ -230,48 +320,90 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, Priority priority, std: if (shutdown) return on_error("shutdown"); - /// We must not to allocate any memory after we emplaced a job in a queue. - /// Because if an exception would be thrown, we won't notify a thread about job occurrence. 
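[Editor's illustration, not part of the patch] The reservation loop in scheduleImpl above follows a common optimistic pattern: claim one unit of capacity with compare_exchange_weak, and give it back if the expensive step afterwards fails. A standalone sketch of just that pattern, with placeholder names and a fixed capacity:

#include <atomic>
#include <cstdint>

/// Try to claim one slot of a shared capacity counter without holding a lock.
std::atomic<int64_t> remaining_capacity{16};

bool tryReserveOneSlot()
{
    int64_t capacity = remaining_capacity.load(std::memory_order_relaxed);
    while (capacity > 0)
    {
        /// On failure compare_exchange_weak reloads `capacity`, so the loop retries
        /// with the freshest value; spurious failures are retried the same way.
        if (remaining_capacity.compare_exchange_weak(capacity, capacity - 1, std::memory_order_relaxed))
            return true;
    }
    return false;   /// no capacity left (or the counter was pushed negative during shutdown)
}

void releaseOneSlot()
{
    /// Called when the work the slot was reserved for could not be started.
    remaining_capacity.fetch_add(1, std::memory_order_relaxed);
}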
+ /// We must not allocate memory or perform operations that could throw exceptions after adding a job to the queue, + /// because if an exception occurs, it may leave the job in the queue without notifying any threads. + typename ThreadFromThreadPool::ThreadList::iterator thread_slot; - /// Check if there are enough threads to process job. - if (threads.size() < std::min(max_threads, scheduled_jobs + 1)) + /// The decision to start a new thread is made outside the locked section. + /// However, thread load and demand can change dynamically, and decisions based on + /// atomic variables outside the critical section might become outdated by the time we acquire the lock. + /// This can lead to two possible scenarios: + /// + /// 1) Relatively common: A new thread was started outside the lock, but by the time we acquire the lock, + /// demand for threads has decreased (e.g., other threads have finished their jobs and are now idle). + /// In this case, even though there are now enough threads, we still attempt to add the new thread + /// to the pool, provided it does not exceed the `max_threads` or `max_free_threads` limits. Keeping + /// an extra thread in the pool may help accommodate a sudden increase in demand without the need + /// to wait for thread creation. + /// + /// 2) Very unlikely (but possible): Outside the lock, it appeared there were enough threads + /// to handle the workload. However, after acquiring the lock, it turns out the new thread + /// is needed (possibly because one of the existing threads was removed or became unavailable). + /// In this case, we create the thread inside the critical section, even though this may introduce + /// a small delay. + + /// Check if we can add the thread created outside the critical section to the pool. + bool adding_new_thread = new_thread && threads.size() < std::min(max_threads, 1 /* current job */ + scheduled_jobs + max_free_threads); + + // If we didn't create a new thread initially but realize we actually need one (unlikely scenario). + if (unlikely(!adding_new_thread && threads.size() < std::min(max_threads, scheduled_jobs + 1))) { try { - threads.emplace_front(); + remaining_pool_capacity.fetch_sub(1, std::memory_order_relaxed); + new_thread = std::make_unique(*this); + } + catch (...) + { + // If thread creation fails, restore the pool capacity and return an error. + remaining_pool_capacity.fetch_add(1, std::memory_order_relaxed); + return on_error("failed to start the thread"); + } + adding_new_thread = true; + } + + if (adding_new_thread) + { + try + { + threads.emplace_front(std::move(new_thread)); + thread_slot = threads.begin(); } catch (...) { /// Most likely this is a std::bad_alloc exception - return on_error("cannot allocate thread slot"); - } - - try - { - Stopwatch watch2; - threads.front() = Thread([this, it = threads.begin()] { worker(it); }); - ProfileEvents::increment( - std::is_same_v ? ProfileEvents::GlobalThreadPoolThreadCreationMicroseconds : ProfileEvents::LocalThreadPoolThreadCreationMicroseconds, - watch2.elapsedMicroseconds()); - ProfileEvents::increment( - std::is_same_v ? ProfileEvents::GlobalThreadPoolExpansions : ProfileEvents::LocalThreadPoolExpansions); - } - catch (...) - { - threads.pop_front(); - return on_error("cannot allocate thread"); + return on_error("cannot emplace the thread in the pool"); } } + else // we have a thread but there is no space for that in the pool. 
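[Editor's illustration, not part of the patch] The two scenarios described in the comment above boil down to "create the thread optimistically outside the lock, then decide under the lock whether to keep it". A hedged standalone sketch of that shape, with a placeholder Worker type and limits:

#include <memory>
#include <mutex>
#include <vector>

struct Worker {};   /// placeholder for the expensive-to-create object

std::mutex pool_mutex;
std::vector<std::unique_ptr<Worker>> workers;
const size_t max_workers = 16;

void scheduleSketch(bool looks_like_we_need_a_worker)
{
    std::unique_ptr<Worker> candidate;
    if (looks_like_we_need_a_worker)
        candidate = std::make_unique<Worker>();   /// expensive part happens without the lock

    std::lock_guard lock(pool_mutex);
    /// Re-check under the lock: demand may have changed since the decision above.
    if (candidate && workers.size() < max_workers)
        workers.push_back(std::move(candidate));  /// keep it, even if strictly no longer needed
    else
        candidate.reset();                        /// no room for it: discard the surplus worker
}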
+ { + new_thread.reset(); + } - jobs.emplace(std::move(job), - priority, - metric_scheduled_jobs, - /// Tracing context on this thread is used as parent context for the sub-thread that runs the job - propagate_opentelemetry_tracing_context ? DB::OpenTelemetry::CurrentContext() : DB::OpenTelemetry::TracingContextOnThread(), - /// capture_frame_pointers - DB::Exception::enable_job_stack_trace); + try + { + jobs.emplace(std::move(job), + priority, + metric_scheduled_jobs, + /// Tracing context on this thread is used as parent context for the sub-thread that runs the job + propagate_opentelemetry_tracing_context ? DB::OpenTelemetry::CurrentContext() : DB::OpenTelemetry::TracingContextOnThread(), + /// capture_frame_pointers + DB::Exception::enable_job_stack_trace, + std::move(available_threads_decrement)); - ++scheduled_jobs; + ++scheduled_jobs; + + if (adding_new_thread) + (*thread_slot)->start(thread_slot); + + } + catch (...) + { + if (adding_new_thread) + threads.pop_front(); + + return on_error("cannot start the job or thread"); + } } /// Wake up a free thread to run the new job. @@ -291,30 +423,51 @@ void ThreadPoolImpl::startNewThreadsNoLock() /// Start new threads while there are more scheduled jobs in the queue and the limit `max_threads` is not reached. while (threads.size() < std::min(scheduled_jobs, max_threads)) { + std::unique_ptr new_thread; + + int64_t capacity = remaining_pool_capacity.load(std::memory_order_relaxed); + + while (capacity > 0) + { + if (remaining_pool_capacity.compare_exchange_weak(capacity, capacity - 1, std::memory_order_relaxed)) + { + try + { + // Successfully decremented, attempt to create a new thread + new_thread = std::make_unique(*this); + } + catch (...) + { + // Failed to create the thread, restore capacity + remaining_pool_capacity.fetch_add(1, std::memory_order_relaxed); + } + break; // Exit loop whether thread creation succeeded or not + } + } + + if (!new_thread) + break; /// failed to start more threads + + typename ThreadFromThreadPool::ThreadList::iterator thread_slot; + try { - threads.emplace_front(); + threads.emplace_front(std::move(new_thread)); + thread_slot = threads.begin(); } catch (...) { - break; /// failed to start more threads + break; } try { - Stopwatch watch; - threads.front() = Thread([this, it = threads.begin()] { worker(it); }); - ProfileEvents::increment( - std::is_same_v ? ProfileEvents::GlobalThreadPoolThreadCreationMicroseconds : ProfileEvents::LocalThreadPoolThreadCreationMicroseconds, - watch.elapsedMicroseconds()); - ProfileEvents::increment( - std::is_same_v ? ProfileEvents::GlobalThreadPoolExpansions : ProfileEvents::LocalThreadPoolExpansions); - + (*thread_slot)->start(thread_slot); } catch (...) { threads.pop_front(); - break; /// failed to start more threads + break; } } } @@ -376,21 +529,29 @@ void ThreadPoolImpl::finalize() { std::lock_guard lock(mutex); shutdown = true; - /// We don't want threads to remove themselves from `threads` anymore, otherwise `thread.join()` will go wrong below in this function. + + /// scheduleImpl doesn't check for shutdown outside the critical section, + /// so we set remaining_pool_capacity to a large negative value + /// (e.g., -MAX_THEORETICAL_THREAD_COUNT) to signal that no new threads are needed. + /// This effectively prevents any new threads from being started during shutdown. + remaining_pool_capacity.store(-MAX_THEORETICAL_THREAD_COUNT, std::memory_order_relaxed); + + /// Disable thread self-removal from `threads`. 
Otherwise, if threads remove themselves, + /// the thread.join() operation will fail later in this function. threads_remove_themselves = false; } - /// Wake up threads so they can finish themselves. + /// Notify all threads to wake them up, so they can complete their work and exit gracefully. new_job_or_shutdown.notify_all(); - /// Wait for all currently running jobs to finish (we don't wait for all scheduled jobs here like the function wait() does). - for (auto & thread : threads) + /// Join all threads before clearing the list + for (auto& thread_ptr : threads) { - thread.join(); - ProfileEvents::increment( - std::is_same_v ? ProfileEvents::GlobalThreadPoolShrinks : ProfileEvents::LocalThreadPoolShrinks); + if (thread_ptr) + thread_ptr->join(); } + // now it's safe to clear the threads threads.clear(); } @@ -426,11 +587,88 @@ bool ThreadPoolImpl::finished() const return shutdown; } + template -void ThreadPoolImpl::worker(typename std::list::iterator thread_it) +ThreadPoolImpl::ThreadFromThreadPool::ThreadFromThreadPool(ThreadPoolImpl& parent_pool_) + : parent_pool(parent_pool_) + , thread_state(ThreadState::Preparing) // Initial state is Preparing +{ + Stopwatch watch2; + + thread = Thread(&ThreadFromThreadPool::worker, this); + + ProfileEvents::increment( + std::is_same_v ? ProfileEvents::GlobalThreadPoolThreadCreationMicroseconds : ProfileEvents::LocalThreadPoolThreadCreationMicroseconds, + watch2.elapsedMicroseconds()); + ProfileEvents::increment( + std::is_same_v ? ProfileEvents::GlobalThreadPoolExpansions : ProfileEvents::LocalThreadPoolExpansions); + + parent_pool.available_threads.fetch_add(1, std::memory_order_relaxed); +} + + +template +void ThreadPoolImpl::ThreadFromThreadPool::start(typename ThreadList::iterator & it) +{ + /// the thread which created ThreadFromThreadPool should start it after adding it to the pool, or destroy it. + /// no parallelism is expected here. So the only valid transition for the start method is Preparing to Running. + chassert(thread_state.load(std::memory_order_relaxed) == ThreadState::Preparing); + thread_it = it; + thread_state.store(ThreadState::Running, std::memory_order_relaxed); /// now worker can start executing the main loop +} + +template +void ThreadPoolImpl::ThreadFromThreadPool::join() +{ + // Ensure the thread is joined before destruction if still joinable + if (thread.joinable()) + thread.join(); +} + +template +void ThreadPoolImpl::ThreadFromThreadPool::removeSelfFromPoolNoPoolLock() +{ + if (thread.joinable()) + thread.detach(); + + parent_pool.threads.erase(thread_it); +} + +template +ThreadPoolImpl::ThreadFromThreadPool::~ThreadFromThreadPool() +{ + parent_pool.available_threads.fetch_sub(1, std::memory_order_relaxed); + + // The thread is being destructed, so the remaining pool capacity increases + parent_pool.remaining_pool_capacity.fetch_add(1, std::memory_order_relaxed); + + // If the worker was still waiting in the loop for thread initialization, + // signal it to terminate and be destroyed now. + thread_state.store(ThreadState::Destructing, std::memory_order_relaxed); + + join(); + + ProfileEvents::increment( + std::is_same_v ? 
ProfileEvents::GlobalThreadPoolShrinks : ProfileEvents::LocalThreadPoolShrinks); +} + + +template +void ThreadPoolImpl::ThreadFromThreadPool::worker() { DENY_ALLOCATIONS_IN_SCOPE; - CurrentMetrics::Increment metric_pool_threads(metric_threads); + + // wait until the thread will be started + while (thread_state.load(std::memory_order_relaxed) == ThreadState::Preparing) + { + std::this_thread::yield(); // let's try to yield to avoid consuming too much CPU in the busy-loop + } + + // If the thread transitions to Destructing, exit + if (thread_state.load(std::memory_order_relaxed) == ThreadState::Destructing) + return; + + CurrentMetrics::Increment metric_pool_threads(parent_pool.metric_threads); bool job_is_done = false; std::exception_ptr exception_from_job; @@ -447,7 +685,7 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ { Stopwatch watch; - std::unique_lock lock(mutex); + std::unique_lock lock(parent_pool.mutex); ProfileEvents::increment( std::is_same_v ? ProfileEvents::GlobalThreadPoolLockWaitMicroseconds : ProfileEvents::LocalThreadPoolLockWaitMicroseconds, watch.elapsedMicroseconds()); @@ -458,48 +696,55 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ job_is_done = false; if (exception_from_job) { - if (!first_exception) - first_exception = exception_from_job; - if (shutdown_on_exception) - shutdown = true; + if (!parent_pool.first_exception) + parent_pool.first_exception = exception_from_job; + if (parent_pool.shutdown_on_exception) + { + parent_pool.shutdown = true; + + // Prevent new thread creation, as explained in finalize. + parent_pool.remaining_pool_capacity.store(-MAX_THEORETICAL_THREAD_COUNT, std::memory_order_relaxed); + } exception_from_job = {}; } - --scheduled_jobs; + --parent_pool.scheduled_jobs; - job_finished.notify_all(); - if (shutdown) - new_job_or_shutdown.notify_all(); /// `shutdown` was set, wake up other threads so they can finish themselves. + parent_pool.job_finished.notify_all(); + if (parent_pool.shutdown) + parent_pool.new_job_or_shutdown.notify_all(); /// `shutdown` was set, wake up other threads so they can finish themselves. } - new_job_or_shutdown.wait(lock, [&] { return !jobs.empty() || shutdown || threads.size() > std::min(max_threads, scheduled_jobs + max_free_threads); }); + parent_pool.new_job_or_shutdown.wait(lock, [this] { + return !parent_pool.jobs.empty() + || parent_pool.shutdown + || parent_pool.threads.size() > std::min(parent_pool.max_threads, parent_pool.scheduled_jobs + parent_pool.max_free_threads); + }); - if (jobs.empty() || threads.size() > std::min(max_threads, scheduled_jobs + max_free_threads)) + + if (parent_pool.jobs.empty() || parent_pool.threads.size() > std::min(parent_pool.max_threads, parent_pool.scheduled_jobs + parent_pool.max_free_threads)) { // We enter here if: // - either this thread is not needed anymore due to max_free_threads excess; // - or shutdown happened AND all jobs are already handled. - if (threads_remove_themselves) - { - thread_it->detach(); - threads.erase(thread_it); - ProfileEvents::increment( - std::is_same_v ? ProfileEvents::GlobalThreadPoolShrinks : ProfileEvents::LocalThreadPoolShrinks); - } + + if (parent_pool.threads_remove_themselves) + removeSelfFromPoolNoPoolLock(); // Detach and remove itself from the pool + return; } /// boost::priority_queue does not provide interface for getting non-const reference to an element /// to prevent us from modifying its priority. We have to use const_cast to force move semantics on JobWithPriority. 
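[Editor's illustration, not part of the patch] ThreadFromThreadPool above uses a small three-state startup handshake: the worker thread spins in Preparing until the creator either publishes it to the pool (start) or abandons it (destructor). A self-contained sketch of that handshake, with made-up names and the pool logic omitted:

#include <atomic>
#include <thread>

class HandshakeThread
{
public:
    HandshakeThread() : state(State::Preparing), thread(&HandshakeThread::worker, this) {}

    void start() { state.store(State::Running, std::memory_order_relaxed); }

    ~HandshakeThread()
    {
        /// If start() was never called, this unblocks the spinning worker so it can exit.
        state.store(State::Destructing, std::memory_order_relaxed);
        if (thread.joinable())
            thread.join();
    }

private:
    enum class State { Preparing, Running, Destructing };

    std::atomic<State> state;
    std::thread thread;

    void worker()
    {
        while (state.load(std::memory_order_relaxed) == State::Preparing)
            std::this_thread::yield();   /// busy-wait, but yield to avoid burning a core
        if (state.load(std::memory_order_relaxed) == State::Destructing)
            return;                      /// abandoned before being added to the pool
        /// ... main working loop would go here ...
    }
};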
- job_data = std::move(const_cast(jobs.top())); - jobs.pop(); + job_data = std::move(const_cast(parent_pool.jobs.top())); + parent_pool.jobs.pop(); ProfileEvents::increment( std::is_same_v ? ProfileEvents::GlobalThreadPoolJobWaitTimeMicroseconds : ProfileEvents::LocalThreadPoolJobWaitTimeMicroseconds, job_data->elapsedMicroseconds()); /// We don't run jobs after `shutdown` is set, but we have to properly dequeue all jobs and finish them. - if (shutdown) + if (parent_pool.shutdown) { { ALLOW_ALLOCATIONS_IN_SCOPE; @@ -522,7 +767,7 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ if (DB::Exception::enable_job_stack_trace) DB::Exception::setThreadFramePointers(std::move(job_data->frame_pointers)); - CurrentMetrics::Increment metric_active_pool_threads(metric_active_threads); + CurrentMetrics::Increment metric_active_pool_threads(parent_pool.metric_active_threads); if constexpr (!std::is_same_v) { @@ -575,7 +820,6 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ } } - template class ThreadPoolImpl; template class ThreadPoolImpl>; template class ThreadPoolImpl>; diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index fd9149bda04..7e497245acc 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -32,7 +32,7 @@ class JobWithPriority; * * This thread pool can be used as a task queue. * For example, you can create a thread pool with 10 threads (and queue of size 10) and schedule 1000 tasks - * - in this case you will be blocked to keep 10 tasks in fly. + * - in this case you will be blocked to keep 10 tasks in flight. * * Thread: std::thread or something with identical interface. */ @@ -40,9 +40,57 @@ template class ThreadPoolImpl { public: + // used as 'unlimited' thread pool size + // on linux you can not have more threads even if the RAM is unlimited + // see https://docs.kernel.org/admin-guide/sysctl/kernel.html#threads-max + static constexpr int MAX_THEORETICAL_THREAD_COUNT = 0x3fffffff; // ~1 billion + using Job = std::function; using Metric = CurrentMetrics::Metric; + // Subclass that encapsulates the thread and has the ability to remove itself from the pool. + class ThreadFromThreadPool + { + public: + using ThreadList = std::list>; + + /// Constructor to initialize and start the thread (but not associate it with the pool) + explicit ThreadFromThreadPool(ThreadPoolImpl& parent_pool); + + // Shift the thread state from Preparing to Running to allow the worker to start. + void start(ThreadList::iterator& it); + + void join(); + + // Destructor to join the thread if needed (shift the state to Destructing if it was not running) + ~ThreadFromThreadPool(); + + private: + ThreadPoolImpl& parent_pool; + Thread thread; + + enum class ThreadState + { + Preparing, + Running, + Destructing + }; + + // Atomic state to track the thread's state + std::atomic thread_state; + + // Stores the position of the thread in the parent thread pool list + typename std::list>::iterator thread_it; + + // Remove itself from the parent pool + void removeSelfFromPoolNoPoolLock(); + + // Worker does a busy loop (with yield) while the state is Preparing. + // After that, immediately returns if the state changed to Destructing, + // or starts the main working loop if the state is Running. + void worker(); + }; + /// Maximum number of threads is based on the number of physical cores. 
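[Editor's illustration, not part of the patch] A sketch of the "task queue" behaviour described in the header comment above: with max_threads and queue_size both set to 10, scheduling 1000 jobs keeps roughly 10 in flight because the scheduler blocks once the queue is full. The ThreadPool constructor arguments and the work function are assumptions for the example, not exact API guidance.

#include <Common/CurrentMetrics.h>
#include <Common/ThreadPool.h>

void doSomeWork(size_t task_index);   /// hypothetical work function

void runManyJobs(CurrentMetrics::Metric threads, CurrentMetrics::Metric active, CurrentMetrics::Metric scheduled)
{
    ThreadPool pool(threads, active, scheduled, /*max_threads*/ 10, /*max_free_threads*/ 10, /*queue_size*/ 10);

    for (size_t i = 0; i < 1000; ++i)
        pool.scheduleOrThrowOnError([i] { doSomeWork(i); });   /// blocks while 10 jobs are already pending

    pool.wait();   /// rethrows the first exception thrown by any job, if there was one
}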
ThreadPoolImpl(Metric metric_threads_, Metric metric_active_threads_, Metric metric_scheduled_jobs_); @@ -63,14 +111,14 @@ public: size_t queue_size_, bool shutdown_on_exception_ = true); - /// Add new job. Locks until number of scheduled jobs is less than maximum or exception in one of threads was thrown. - /// If any thread was throw an exception, first exception will be rethrown from this method, - /// and exception will be cleared. + /// Add new job. Locks until the number of scheduled jobs is less than the maximum or an exception in one of the threads was thrown. + /// If any thread has thrown an exception, the first exception will be rethrown from this method, + /// and the exception will be cleared. /// Also throws an exception if cannot create thread. /// Priority: lower is higher. - /// NOTE: Probably you should call wait() if exception was thrown. If some previously scheduled jobs are using some objects, - /// located on stack of current thread, the stack must not be unwinded until all jobs finished. However, - /// if ThreadPool is a local object, it will wait for all scheduled jobs in own destructor. + /// NOTE: Probably you should call wait() if an exception was thrown. If some previously scheduled jobs are using some objects, + /// located on the stack of the current thread, the stack must not be unwound until all jobs are finished. However, + /// if ThreadPool is a local object, it will wait for all scheduled jobs in its own destructor. void scheduleOrThrowOnError(Job job, Priority priority = {}); /// Similar to scheduleOrThrowOnError(...). Wait for specified amount of time and schedule a job or return false. @@ -81,12 +129,12 @@ public: /// Wait for all currently active jobs to be done. /// You may call schedule and wait many times in arbitrary order. - /// If any thread was throw an exception, first exception will be rethrown from this method, - /// and exception will be cleared. + /// If any thread has thrown an exception, the first exception will be rethrown from this method, + /// and the exception will be cleared. void wait(); /// Waits for all threads. Doesn't rethrow exceptions (use 'wait' method to rethrow exceptions). - /// You should not destroy object while calling schedule or wait methods from another threads. + /// You should not destroy the object while calling schedule or wait methods from other threads. ~ThreadPoolImpl(); /// Returns number of running and scheduled jobs. @@ -127,28 +175,40 @@ private: size_t queue_size; size_t scheduled_jobs = 0; + + // Originally equals to max_threads, but changes dynamically. + // Decrements with every new thread started, increments when it finishes. + // If positive, then more threads can be started. + // When it comes to zero, it means that max_threads threads have already been started. + // it can be below zero when the threadpool is shutting down + std::atomic remaining_pool_capacity; + + // Increments every time a new thread joins the thread pool or a job finishes. + // Decrements every time a task is scheduled. + // If positive, it means that there are more threads than jobs (and some are idle). + // If zero, it means that every thread has a job. + // If negative, it means that we have more jobs than threads. 
+ std::atomic available_threads; + bool shutdown = false; bool threads_remove_themselves = true; const bool shutdown_on_exception = true; boost::heap::priority_queue> jobs; - std::list threads; + ThreadFromThreadPool::ThreadList threads; std::exception_ptr first_exception; std::stack on_destroy_callbacks; template ReturnType scheduleImpl(Job job, Priority priority, std::optional wait_microseconds, bool propagate_opentelemetry_tracing_context = true); - void worker(typename std::list::iterator thread_it); - - /// Tries to start new threads if there are scheduled jobs and the limit `max_threads` is not reached. Must be called with `mutex` locked. + /// Tries to start new threads if there are scheduled jobs and the limit `max_threads` is not reached. Must be called with the mutex locked. void startNewThreadsNoLock(); void finalize(); void onDestroy(); }; - /// ThreadPool with std::thread for threads. using FreeThreadPool = ThreadPoolImpl; diff --git a/src/Common/formatReadable.cpp b/src/Common/formatReadable.cpp index 7e126a72c33..0ea2cb4adca 100644 --- a/src/Common/formatReadable.cpp +++ b/src/Common/formatReadable.cpp @@ -82,3 +82,16 @@ std::string formatReadableQuantity(double value, int precision) formatReadableQuantity(value, out, precision); return out.str(); } + +void formatReadableTime(double ns, DB::WriteBuffer & out, int precision) +{ + const char * units[] = {" ns", " us", " ms", " s"}; + formatReadable(ns, out, precision, units, sizeof(units) / sizeof(units[0]), 1000); +} + +std::string formatReadableTime(double ns, int precision) +{ + DB::WriteBufferFromOwnString out; + formatReadableTime(ns, out, precision); + return out.str(); +} diff --git a/src/Common/formatReadable.h b/src/Common/formatReadable.h index 0d7a437219a..2c320989ca9 100644 --- a/src/Common/formatReadable.h +++ b/src/Common/formatReadable.h @@ -23,6 +23,9 @@ std::string formatReadableSizeWithDecimalSuffix(double value, int precision = 2) void formatReadableQuantity(double value, DB::WriteBuffer & out, int precision = 2); std::string formatReadableQuantity(double value, int precision = 2); +/// Prints the passed time in nanoseconds as 123.45 ms. +void formatReadableTime(double ns, DB::WriteBuffer & out, int precision = 2); +std::string formatReadableTime(double ns, int precision = 2); /// Wrapper around value. If used with fmt library (e.g. for log messages), /// value is automatically formatted as size with binary suffix. diff --git a/src/Common/getNumberOfCPUCoresToUse.cpp b/src/Common/getNumberOfCPUCoresToUse.cpp index 28e1e3598ea..1a81e20d3b3 100644 --- a/src/Common/getNumberOfCPUCoresToUse.cpp +++ b/src/Common/getNumberOfCPUCoresToUse.cpp @@ -176,7 +176,7 @@ unsigned getNumberOfCPUCoresToUseImpl() /// /// On really big machines, SMT is detrimental to performance (+ ~5% overhead in ClickBench). On such machines, we limit ourself to the physical cores. /// Few cores indicate it is a small machine, runs in a VM or is a limited cloud instance --> it is reasonable to use all the cores. 
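[Editor's illustration, not part of the patch] The new formatReadableTime helper above walks a nanosecond value through the units " ns", " us", " ms", " s" with a factor of 1000 between them. A small usage sketch; the outputs in the comments are approximate, inferred from that unit progression rather than taken from the patch.

#include <Common/formatReadable.h>

#include <iostream>

void printDurations()
{
    std::cout << formatReadableTime(123.0) << '\n';          /// roughly "123.00 ns"
    std::cout << formatReadableTime(123456.0) << '\n';        /// roughly "123.46 us"
    std::cout << formatReadableTime(123456789.0) << '\n';     /// roughly "123.46 ms"
    std::cout << formatReadableTime(1.5e9, 1) << '\n';        /// roughly "1.5 s" (precision 1)
}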
- if (cores >= 32) + if (cores >= 64) cores = physical_concurrency(); #endif diff --git a/src/Compression/CompressionCodecDeflateQpl.cpp b/src/Compression/CompressionCodecDeflateQpl.cpp index c82ee861a6f..a9654e60006 100644 --- a/src/Compression/CompressionCodecDeflateQpl.cpp +++ b/src/Compression/CompressionCodecDeflateQpl.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Compression/CompressionCodecEncrypted.cpp b/src/Compression/CompressionCodecEncrypted.cpp index 7cda6b95fe5..d9721d90655 100644 --- a/src/Compression/CompressionCodecEncrypted.cpp +++ b/src/Compression/CompressionCodecEncrypted.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Compression/CompressionCodecNone.cpp b/src/Compression/CompressionCodecNone.cpp index 53d62e51920..e52a601700e 100644 --- a/src/Compression/CompressionCodecNone.cpp +++ b/src/Compression/CompressionCodecNone.cpp @@ -1,7 +1,7 @@ #include #include #include - +#include namespace DB { diff --git a/src/Compression/ICompressionCodec.h b/src/Compression/ICompressionCodec.h index 0e4b67798ac..f77b1323d2e 100644 --- a/src/Compression/ICompressionCodec.h +++ b/src/Compression/ICompressionCodec.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include diff --git a/src/Core/BaseSettingsProgramOptions.h b/src/Core/BaseSettingsProgramOptions.h index 81f6c59a5e5..6e1ecce6a34 100644 --- a/src/Core/BaseSettingsProgramOptions.h +++ b/src/Core/BaseSettingsProgramOptions.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include diff --git a/src/Core/DistributedCacheProtocol.h b/src/Core/DistributedCacheProtocol.h new file mode 100644 index 00000000000..c2a8f7552dd --- /dev/null +++ b/src/Core/DistributedCacheProtocol.h @@ -0,0 +1,156 @@ +#pragma once + + +#include +#include + +namespace DistributedCache +{ + +static constexpr auto SERVER_CONFIG_PREFIX = "distributed_cache_server"; +static constexpr auto CLIENT_CONFIG_PREFIX = "distributed_cache_client"; +static constexpr auto REGISTERED_SERVERS_PATH = "registry"; +static constexpr auto OFFSET_ALIGNMENT_PATH = "offset_alignment"; +static constexpr auto DEFAULT_ZOOKEEPER_PATH = "/distributed_cache/"; +static constexpr auto MAX_VIRTUAL_NODES = 100; +static constexpr auto DEFAULT_OFFSET_ALIGNMENT = 16 * 1024 * 1024; +static constexpr auto DEFAULT_MAX_PACKET_SIZE = DB::DBMS_DEFAULT_BUFFER_SIZE; +static constexpr auto MAX_UNACKED_INFLIGHT_PACKETS = 10; +static constexpr auto ACK_DATA_PACKET_WINDOW = 5; +static constexpr auto DEFAULT_CONNECTION_POOL_SIZE = 15000; +static constexpr auto DEFAULT_CONNECTION_TTL_SEC = 200; + +static constexpr auto INITIAL_PROTOCOL_VERSION = 0; +static constexpr auto PROTOCOL_VERSION_WITH_QUERY_ID = 1; +static constexpr auto PROTOCOL_VERSION_WITH_MAX_INFLIGHT_PACKETS = 2; +static constexpr auto PROTOCOL_VERSION_WITH_GCS_TOKEN = 3; +static constexpr UInt32 PROTOCOL_VERSION_WITH_AZURE_AUTH = 4; +static constexpr UInt32 PROTOCOL_VERSION_WITH_TEMPORATY_DATA = 5; + +static constexpr UInt32 CURRENT_PROTOCOL_VERSION = PROTOCOL_VERSION_WITH_TEMPORATY_DATA; + +namespace Protocol +{ + +static constexpr auto MIN_VERSION_WITH_QUERY_ID_IN_REQUEST = 1; + +/** + * Distributed cache protocol. + * + * Read request: + * Step1: (Client) calculate aligned_offset = aligned(file_offset) - alignment to file_offset. 
+ * The alignment is equal to `offset_alignment` + * (stored on zookeeper for shared access from server and client), + * which allows to guarantee if the client needs offset x, + * then it will go to the server which contains a covering + * file segment for this offset. + * Step2: (Client) calculate hash(x, remote_path, aligned_file_offset) -> h, + * Step3: (Client) find distributed cache server: hash_ring(h) -> s + * Step4: (Client) connect to s: + * Client: `Hello` packet (protocol_version, request_type) + * Server: `Hello` packet (mutual_protocol_version) + * Step5: send general info: + * Client: `ReadInfo` packet (object storage connection info, remote paths, start offset, end offset) + * Step6: + * Server: `ReadRange` packet (includes read range), and send the data. + * Client: `Ok` packet + * in case of error (Client): `EndRequest` packet. + * Step7: + * Client: do Step1 from current file offset and get aligned_offset'. + * If aligned_offset' == aligned_offset, do Step6 again. + * else: go to Step2 + * + * Write request: + * Step1: (Client) calculate hash(x, remote_path, file_offset) -> h, + * Step2: (Client) find distributed cache server: hash_ring(h) -> s + * Step3: (Client) connect to s: + * Client: `Hello` packet (protocol_version, request_type) + * Server: `Hello` packet (mutual_protocol_version) + * Step4: send general info: + * Client: `WriteInfo` packet (object storage connection info, remote_path, write range) + * Step5: write one file_segment's range + * Client: `WriteRange` packet (file_segment_start_offset), then process the write. + * Server: `Ok` (after each `Data` packet) + * or `Stop` packet (on error). + * Step6: + * if eof: do Step8 + * else: do Step7 + * Step7: + * do step1: h' = hash(x, remote_path, file_offset'), where file_offset' - start of the next file segment + * do step2: s' = hash_ring(h') + * if s' == s: do Step5 + * else: do Step8 and go to Step3 + * Step8: + * Client: `EndRequest` packet + * Server: `Ok` packet + */ + +enum RequestType +{ + Min = 0, + Read = 1, /// read-through cache + Write = 2, /// write-through cache + Remove = 3, /// drop cache + Show = 4, /// get current cache state + CurrentMetrics = 5, /// get CurrentMetrics + ProfileEvents = 6, /// get ProfileEvents + Max = 8, +}; + +namespace Client +{ + enum Enum + { + Min = 0, + + /// A hello packet for handshake between client and server. + Hello = 1, + /// A packet to start a new request: Read, Write, Remove, Show, etc + StartRequest = 2, + /// A packet to identify that the request is finished. + /// E.g. for read request we no longer need receiving data (even if requested read range is not finished); + /// for write request no data will no longer be sent. + EndRequest = 3, + /// A request to continue already started request but with a new information. + /// E.g. for read request - a new read range is needed; + /// for write request - a new write range will be sent. + ContinueRequest = 4, + /// Acknowledgement of `data_packet_ack_window` processed `DataPacket` packets. + AckRequest = 5, + + Max = 6, + }; +} + +namespace Server +{ + enum Enum + { + Min = 0, + + /// A hello packet for handshake between client and server. + Hello = 1, + /// Identifies that a request was successfully executed. + Ok = 2, + /// Identifies a packet containing an exception message happened on server's size. + Error = 3, + /// Identifies a packet for a Read request. + ReadResult = 4, + /// Identifies a packet for incremental ProfileEvents during Read or Write request. 
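[Editor's illustration, not part of the patch] Steps 1-3 of the read protocol above amount to: align the offset down to an offset_alignment bucket, hash the (path, aligned offset) pair, and pick a server from the hash ring, so every offset inside the same bucket lands on the same server. A standalone sketch of that routing decision; the hash and the modulo "ring" below are placeholders, not the real implementation.

#include <cstdint>
#include <functional>
#include <string>

static constexpr uint64_t offset_alignment = 16 * 1024 * 1024;   /// cf. DEFAULT_OFFSET_ALIGNMENT above

uint64_t alignOffset(uint64_t file_offset)
{
    return file_offset / offset_alignment * offset_alignment;
}

size_t pickServer(const std::string & remote_path, uint64_t file_offset, size_t num_servers)
{
    uint64_t aligned_offset = alignOffset(file_offset);
    /// Placeholder hash combining path and aligned offset (Step 2).
    size_t h = std::hash<std::string>{}(remote_path) ^ std::hash<uint64_t>{}(aligned_offset);
    return h % num_servers;   /// stands in for hash_ring(h) -> s (Step 3)
}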
+ ProfileCounters = 5, + /// Identifies a packet for a Show request. + ShowResult = 6, + /// Identifies a packet for a ProfileEvents request. + ProfileEvents = 7, + /// Identifies a packet for a Metrics request. + Metrics = 8, + /// Identifies that this server cannot receive any more data for Write request + /// (cache is full or errors during insertion). + Stop = 9, + + Max = 11 + }; +} + +} +} diff --git a/src/Core/Field.h b/src/Core/Field.h index ba8c66580ad..e327aa22cf7 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Core/FormatFactorySettingsDeclaration.h b/src/Core/FormatFactorySettingsDeclaration.h index 8aa2997a5bd..8190b3c8c31 100644 --- a/src/Core/FormatFactorySettingsDeclaration.h +++ b/src/Core/FormatFactorySettingsDeclaration.h @@ -12,253 +12,1216 @@ #else #define FORMAT_FACTORY_SETTINGS(M, ALIAS) \ - M(Char, format_csv_delimiter, ',', "The character to be considered as a delimiter in CSV data. If setting with a string, a string has to have a length of 1.", 0) \ - M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \ - M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ - M(Bool, output_format_csv_serialize_tuple_into_separate_columns, true, "If it set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost)", 0) \ - M(Bool, input_format_csv_deserialize_separate_columns_into_tuple, true, "If it set to true, then separate columns written in CSV format can be deserialized to Tuple column.", 0) \ - M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ - M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \ - M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \ - M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". 
Braces around array can be omitted.)", 0) \ - M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \ - M(Bool, input_format_with_names_use_header, true, "For -WithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \ - M(Bool, input_format_with_types_use_header, true, "For -WithNamesAndTypes input formats this controls whether format parser should check if data types from the input match data types from the header.", 0) \ - M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \ - M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, -WithNames, -WithNamesAndTypes formats).", IMPORTANT) \ - M(Bool, input_format_csv_empty_as_default, true, "Treat empty fields in CSV input as default values.", 0) \ - M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \ - M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices.", 0) \ - M(Bool, input_format_null_as_default, true, "Initialize null fields with default values if the data type of this field is not nullable and it is supported by the input format", 0) \ - M(Bool, input_format_force_null_for_omitted_fields, false, "Force initialize omitted fields with null values", 0) \ - M(Bool, input_format_arrow_case_insensitive_column_matching, false, "Ignore case when matching Arrow columns with CH columns.", 0) \ - M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ - M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \ - M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \ - M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. 
Usually makes it much slower.", 0) \ - M(Bool, input_format_parquet_filter_push_down, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.", 0) \ - M(Bool, input_format_parquet_use_native_reader, false, "When reading Parquet files, to use native reader instead of arrow reader.", 0) \ - M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ - M(Bool, input_format_orc_allow_missing_columns, true, "Allow missing columns while reading ORC input formats", 0) \ - M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \ - M(Bool, input_format_orc_filter_push_down, true, "When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata.", 0) \ - M(String, input_format_orc_reader_time_zone_name, "GMT", "The time zone name for ORC row reader, the default ORC row reader's time zone is GMT.", 0) \ - M(Bool, input_format_parquet_allow_missing_columns, true, "Allow missing columns while reading Parquet input formats", 0) \ - M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \ - M(Bool, input_format_arrow_allow_missing_columns, true, "Allow missing columns while reading Arrow input formats", 0) \ - M(Char, input_format_hive_text_fields_delimiter, '\x01', "Delimiter between fields in Hive Text File", 0) \ - M(Char, input_format_hive_text_collection_items_delimiter, '\x02', "Delimiter between collection(array or map) items in Hive Text File", 0) \ - M(Char, input_format_hive_text_map_keys_delimiter, '\x03', "Delimiter between a pair of map key/values in Hive Text File", 0) \ - M(Bool, input_format_hive_text_allow_variable_number_of_columns, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values", 0) \ - M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. 
Used for automatic schema inference from data.", 0) \ - M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \ - M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \ - M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \ - M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ - M(Bool, input_format_csv_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference in CSV format", 0) \ - M(Bool, input_format_csv_try_infer_strings_from_quoted_tuples, true, "Interpret quoted tuples in the input data as a value of type String.", 0) \ - M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ - M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \ - M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \ - M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \ - M(Bool, input_format_csv_use_default_on_bad_values, false, "Allow to set default value to column when CSV field deserialization failed on bad value", 0) \ - M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ - M(Bool, input_format_tsv_allow_variable_number_of_columns, false, "Ignore extra columns in TSV input (if file has more columns than expected) and treat missing fields in TSV input as default values", 0) \ - M(Bool, input_format_custom_allow_variable_number_of_columns, false, "Ignore extra columns in CustomSeparated input (if file has more columns than expected) and treat missing fields in CustomSeparated input as default values", 0) \ - M(Bool, input_format_json_compact_allow_variable_number_of_columns, false, "Ignore extra columns in JSONCompact(EachRow) input (if file has more columns than expected) and treat missing fields in JSONCompact(EachRow) input as default values", 0) \ - M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \ - M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \ - M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \ - M(UInt64, input_format_parquet_max_block_size, DEFAULT_BLOCK_SIZE, "Max block size for parquet reader.", 0) \ - M(UInt64, input_format_parquet_prefer_block_bytes, DEFAULT_BLOCK_SIZE * 256, "Average block bytes output by parquet reader", 0) \ - M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format Protobuf", 0) \ - M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip columns with 
unsupported types while schema inference for format CapnProto", 0) \ - M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \ - M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \ - M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \ - M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \ - M(SchemaInferenceMode, schema_inference_mode, "default", "Mode of schema inference. 'default' - assume that all files have the same schema and schema can be inferred from any file, 'union' - files can have different schemas and the resulting schema should be the a union of schemas of all files", 0) \ - M(UInt64Auto, schema_inference_make_columns_nullable, 1, "If set to true, all inferred types will be Nullable in schema inference. When set to false, no columns will be converted to Nullable. When set to 'auto', ClickHouse will use information about nullability from the data.", 0) \ - M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \ - M(Bool, input_format_json_read_bools_as_strings, true, "Allow to parse bools as strings in JSON input formats", 0) \ - M(Bool, input_format_json_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference", 0) \ - M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \ - M(Bool, input_format_json_read_numbers_as_strings, true, "Allow to parse numbers as strings in JSON input formats", 0) \ - M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \ - M(Bool, input_format_json_read_arrays_as_strings, true, "Allow to parse JSON arrays as strings in JSON input formats", 0) \ - M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, "Try to infer named tuples from JSON objects in JSON input formats", 0) \ - M(Bool, input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects, false, "Use String type instead of an exception in case of ambiguous paths in JSON objects during named tuples inference", 0) \ - M(Bool, input_format_json_infer_incomplete_types_as_strings, true, "Use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats", 0) \ - M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \ - M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \ - M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \ - M(Bool, input_format_json_throw_on_bad_escape_sequence, true, "Throw an exception if JSON string contains bad escape sequence in 
JSON input formats. If disabled, bad escape sequences will remain as is in the data", 0) \ - M(Bool, input_format_json_ignore_unnecessary_fields, true, "Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields", 0) \ - M(Bool, input_format_try_infer_variants, false, "Try to infer the Variant type in text formats when there is more than one possible type for column/array elements", 0) \ - M(Bool, type_json_skip_duplicated_paths, false, "When enabled, during parsing JSON object into JSON type duplicated paths will be ignored and only the first one will be inserted instead of an exception", 0) \ - M(UInt64, input_format_json_max_depth, 1000, "Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely.", 0) \ - M(Bool, input_format_json_empty_as_default, false, "Treat empty fields in JSON input as default values.", 0) \ - M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \ - M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ - M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ - M(Bool, input_format_try_infer_datetimes_only_datetime64, false, "When input_format_try_infer_datetimes is enabled, infer only DateTime64 but not DateTime types", 0) \ - M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \ - M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \ - M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \ - M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \ - M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in CSV format", 0) \ - M(UInt64, input_format_tsv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in TSV format", 0) \ - M(Bool, input_format_csv_skip_trailing_empty_lines, false, "Skip trailing empty lines in CSV format", 0) \ - M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \ - M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \ - M(Bool, input_format_tsv_crlf_end_of_line, false, "If it is set true, file function will read TSV format with \\r\\n instead of \\n.", 0) \ + M(Char, format_csv_delimiter, ',', R"( +The character to be considered as a delimiter in CSV data. If setting with a string, a string has to have a length of 1. +)", 0) \ + M(Bool, format_csv_allow_single_quotes, false, R"( +If it is set to true, allow strings in single quotes. +)", 0) \ + M(Bool, format_csv_allow_double_quotes, true, R"( +If it is set to true, allow strings in double quotes. 
+)", 0) \ + M(Bool, output_format_csv_serialize_tuple_into_separate_columns, true, R"( +If it set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost) +)", 0) \ + M(Bool, input_format_csv_deserialize_separate_columns_into_tuple, true, R"( +If it set to true, then separate columns written in CSV format can be deserialized to Tuple column. +)", 0) \ + M(Bool, output_format_csv_crlf_end_of_line, false, R"( +If it is set true, end of line in CSV format will be \\r\\n instead of \\n. +)", 0) \ + M(Bool, input_format_csv_allow_cr_end_of_line, false, R"( +If it is set true, \\r will be allowed at end of line not followed by \\n +)", 0) \ + M(Bool, input_format_csv_enum_as_number, false, R"( +Treat inserted enum values in CSV formats as enum indices +)", 0) \ + M(Bool, input_format_csv_arrays_as_nested_csv, false, R"( +When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: \"[\"\"Hello\"\", \"\"world\"\", \"\"42\"\"\"\" TV\"\"]\". Braces around array can be omitted. +)", 0) \ + M(Bool, input_format_skip_unknown_fields, true, R"( +Enables or disables skipping insertion of extra data. + +When writing data, ClickHouse throws an exception if input data contain columns that do not exist in the target table. If skipping is enabled, ClickHouse does not insert extra data and does not throw an exception. + +Supported formats: + +- [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) (and other JSON formats) +- [BSONEachRow](../../interfaces/formats.md/#bsoneachrow) (and other JSON formats) +- [TSKV](../../interfaces/formats.md/#tskv) +- All formats with suffixes WithNames/WithNamesAndTypes +- [MySQLDump](../../interfaces/formats.md/#mysqldump) +- [Native](../../interfaces/formats.md/#native) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Bool, input_format_with_names_use_header, true, R"( +Enables or disables checking the column order when inserting data. + +To improve insert performance, we recommend disabling this check if you are sure that the column order of the input data is the same as in the target table. + +Supported formats: + +- [CSVWithNames](../../interfaces/formats.md/#csvwithnames) +- [CSVWithNamesAndTypes](../../interfaces/formats.md/#csvwithnamesandtypes) +- [TabSeparatedWithNames](../../interfaces/formats.md/#tabseparatedwithnames) +- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md/#tabseparatedwithnamesandtypes) +- [JSONCompactEachRowWithNames](../../interfaces/formats.md/#jsoncompacteachrowwithnames) +- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompacteachrowwithnamesandtypes) +- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md/#jsoncompactstringseachrowwithnames) +- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompactstringseachrowwithnamesandtypes) +- [RowBinaryWithNames](../../interfaces/formats.md/#rowbinarywithnames) +- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md/#rowbinarywithnamesandtypes) +- [CustomSeparatedWithNames](../../interfaces/formats.md/#customseparatedwithnames) +- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md/#customseparatedwithnamesandtypes) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Bool, input_format_with_types_use_header, true, R"( +Controls whether format parser should check if data types from the input data match data types from the target table. 
+ +Supported formats: + +- [CSVWithNamesAndTypes](../../interfaces/formats.md/#csvwithnamesandtypes) +- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md/#tabseparatedwithnamesandtypes) +- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompacteachrowwithnamesandtypes) +- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md/#jsoncompactstringseachrowwithnamesandtypes) +- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md/#rowbinarywithnamesandtypes-rowbinarywithnamesandtypes) +- [CustomSeparatedWithNamesAndTypes](../../interfaces/formats.md/#customseparatedwithnamesandtypes) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Bool, input_format_import_nested_json, false, R"( +Enables or disables the insertion of JSON data with nested objects. + +Supported formats: + +- [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +See also: + +- [Usage of Nested Structures](../../interfaces/formats.md/#jsoneachrow-nested) with the `JSONEachRow` format. +)", 0) \ + M(Bool, input_format_defaults_for_omitted_fields, true, R"( +When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option applies to [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) (and other JSON formats), [CSV](../../interfaces/formats.md/#csv), [TabSeparated](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [Parquet](../../interfaces/formats.md/#parquet), [Arrow](../../interfaces/formats.md/#arrow), [Avro](../../interfaces/formats.md/#avro), [ORC](../../interfaces/formats.md/#orc), [Native](../../interfaces/formats.md/#native) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes. + +:::note +When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. +::: + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", IMPORTANT) \ + M(Bool, input_format_csv_empty_as_default, true, R"( +Treat empty fields in CSV input as default values. +)", 0) \ + M(Bool, input_format_tsv_empty_as_default, false, R"( +Treat empty fields in TSV input as default values. +)", 0) \ + M(Bool, input_format_tsv_enum_as_number, false, R"( +Treat inserted enum values in TSV formats as enum indices. +)", 0) \ + M(Bool, input_format_null_as_default, true, R"( +Enables or disables the initialization of [NULL](../../sql-reference/syntax.md/#null-literal) fields with [default values](../../sql-reference/statements/create/table.md/#create-default-values), if data type of these fields is not [nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable). +If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. + +This setting is applicable for most input formats. + +For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. + +Possible values: + +- 0 — Inserting `NULL` into a not nullable column causes an exception. +- 1 — `NULL` fields are initialized with default column values. 
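+ +A minimal sketch of the behavior described above (the inline data is arbitrary): + +```sql +SET input_format_null_as_default = 1; +SELECT * FROM format(JSONEachRow, 'x UInt8', '{"x" : null}'); -- x is read as 0, the default value for UInt8 +``` + +With the setting disabled, the same query throws an exception because `x` is not Nullable.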
+)", 0) \ + M(Bool, input_format_force_null_for_omitted_fields, false, R"( +Force initialize omitted fields with null values +)", 0) \ + M(Bool, input_format_arrow_case_insensitive_column_matching, false, R"( +Ignore case when matching Arrow columns with CH columns. +)", 0) \ + M(Int64, input_format_orc_row_batch_size, 100'000, R"( +Batch size when reading ORC stripes. +)", 0) \ + M(Bool, input_format_orc_case_insensitive_column_matching, false, R"( +Ignore case when matching ORC columns with CH columns. +)", 0) \ + M(Bool, input_format_parquet_case_insensitive_column_matching, false, R"( +Ignore case when matching Parquet columns with CH columns. +)", 0) \ + M(Bool, input_format_parquet_preserve_order, false, R"( +Avoid reordering rows when reading from Parquet files. Usually makes it much slower. +)", 0) \ + M(Bool, input_format_parquet_filter_push_down, true, R"( +When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata. +)", 0) \ + M(Bool, input_format_parquet_use_native_reader, false, R"( +When reading Parquet files, to use native reader instead of arrow reader. +)", 0) \ + M(Bool, input_format_allow_seeks, true, R"( +Allow seeks while reading in ORC/Parquet/Arrow input formats. + +Enabled by default. +)", 0) \ + M(Bool, input_format_orc_allow_missing_columns, true, R"( +Allow missing columns while reading ORC input formats +)", 0) \ + M(Bool, input_format_orc_use_fast_decoder, true, R"( +Use a faster ORC decoder implementation. +)", 0) \ + M(Bool, input_format_orc_filter_push_down, true, R"( +When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata. +)", 0) \ + M(String, input_format_orc_reader_time_zone_name, "GMT", R"( +The time zone name for ORC row reader, the default ORC row reader's time zone is GMT. +)", 0) \ + M(Bool, input_format_parquet_allow_missing_columns, true, R"( +Allow missing columns while reading Parquet input formats +)", 0) \ + M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, R"( +Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format +)", 0) \ + M(Bool, input_format_arrow_allow_missing_columns, true, R"( +Allow missing columns while reading Arrow input formats +)", 0) \ + M(Char, input_format_hive_text_fields_delimiter, '\x01', R"( +Delimiter between fields in Hive Text File +)", 0) \ + M(Char, input_format_hive_text_collection_items_delimiter, '\x02', R"( +Delimiter between collection(array or map) items in Hive Text File +)", 0) \ + M(Char, input_format_hive_text_map_keys_delimiter, '\x03', R"( +Delimiter between a pair of map key/values in Hive Text File +)", 0) \ + M(Bool, input_format_hive_text_allow_variable_number_of_columns, true, R"( +Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values +)", 0) \ + M(UInt64, input_format_msgpack_number_of_columns, 0, R"( +The number of columns in inserted MsgPack data. Used for automatic schema inference from data. +)", 0) \ + M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, R"( +The way how to output UUID in MsgPack format. +)", 0) \ + M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, R"( +The maximum rows of data to read for automatic schema inference. 
+)", 0) \ + M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, R"( +The maximum amount of data in bytes to read for automatic schema inference. +)", 0) \ + M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, R"( +Use some tweaks and heuristics to infer schema in CSV format +)", 0) \ + M(Bool, input_format_csv_try_infer_numbers_from_strings, false, R"( +If enabled, during schema inference ClickHouse will try to infer numbers from string fields. +It can be useful if CSV data contains quoted UInt64 numbers. + +Disabled by default. +)", 0) \ + M(Bool, input_format_csv_try_infer_strings_from_quoted_tuples, true, R"( +Interpret quoted tuples in the input data as a value of type String. +)", 0) \ + M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, R"( +Use some tweaks and heuristics to infer schema in TSV format +)", 0) \ + M(Bool, input_format_csv_detect_header, true, R"( +Automatically detect header with names and types in CSV format +)", 0) \ + M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, R"( +Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings +)", 0) \ + M(Bool, input_format_csv_trim_whitespaces, true, R"( +Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings +)", 0) \ + M(Bool, input_format_csv_use_default_on_bad_values, false, R"( +Allow to set default value to column when CSV field deserialization failed on bad value +)", 0) \ + M(Bool, input_format_csv_allow_variable_number_of_columns, false, R"( +Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values +)", 0) \ + M(Bool, input_format_tsv_allow_variable_number_of_columns, false, R"( +Ignore extra columns in TSV input (if file has more columns than expected) and treat missing fields in TSV input as default values +)", 0) \ + M(Bool, input_format_custom_allow_variable_number_of_columns, false, R"( +Ignore extra columns in CustomSeparated input (if file has more columns than expected) and treat missing fields in CustomSeparated input as default values +)", 0) \ + M(Bool, input_format_json_compact_allow_variable_number_of_columns, false, R"( +Ignore extra columns in JSONCompact(EachRow) input (if file has more columns than expected) and treat missing fields in JSONCompact(EachRow) input as default values +)", 0) \ + M(Bool, input_format_tsv_detect_header, true, R"( +Automatically detect header with names and types in TSV format +)", 0) \ + M(Bool, input_format_custom_detect_header, true, R"( +Automatically detect header with names and types in CustomSeparated format +)", 0) \ + M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, R"( +Skip columns with unsupported types while schema inference for format Parquet +)", 0) \ + M(UInt64, input_format_parquet_max_block_size, DEFAULT_BLOCK_SIZE, R"( +Max block size for parquet reader. 
+)", 0) \ + M(UInt64, input_format_parquet_prefer_block_bytes, DEFAULT_BLOCK_SIZE * 256, R"( +Average block bytes output by parquet reader +)", 0) \ + M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, R"( +Skip fields with unsupported types while schema inference for format Protobuf +)", 0) \ + M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, R"( +Skip columns with unsupported types while schema inference for format CapnProto +)", 0) \ + M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, R"( +Skip columns with unsupported types while schema inference for format ORC +)", 0) \ + M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, R"( +Skip columns with unsupported types while schema inference for format Arrow +)", 0) \ + M(String, column_names_for_schema_inference, "", R"( +The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...' +)", 0) \ + M(String, schema_inference_hints, "", R"( +The list of column names and types to use as hints in schema inference for formats without schema. + +Example: + +Query: +```sql +desc format(JSONEachRow, '{"x" : 1, "y" : "String", "z" : "0.0.0.0" }') settings schema_inference_hints='x UInt8, z IPv4'; +``` + +Result: +```sql +x UInt8 +y Nullable(String) +z IPv4 +``` + +:::note +If the `schema_inference_hints` is not formatted properly, or if there is a typo or a wrong datatype, etc... the whole schema_inference_hints will be ignored. +::: +)", 0) \ + M(SchemaInferenceMode, schema_inference_mode, "default", R"( +Mode of schema inference. 'default' - assume that all files have the same schema and schema can be inferred from any file, 'union' - files can have different schemas and the resulting schema should be the a union of schemas of all files +)", 0) \ + M(UInt64Auto, schema_inference_make_columns_nullable, 1, R"( +Controls making inferred types `Nullable` in schema inference. +If the setting is enabled, all inferred type will be `Nullable`, if disabled, the inferred type will never be `Nullable`, if set to `auto`, the inferred type will be `Nullable` only if the column contains `NULL` in a sample that is parsed during schema inference or file metadata contains information about column nullability. +)", 0) \ + M(Bool, input_format_json_read_bools_as_numbers, true, R"( +Allow parsing bools as numbers in JSON input formats. + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_read_bools_as_strings, true, R"( +Allow parsing bools as strings in JSON input formats. + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_try_infer_numbers_from_strings, false, R"( +If enabled, during schema inference ClickHouse will try to infer numbers from string fields. +It can be useful if JSON data contains quoted UInt64 numbers. + +Disabled by default. +)", 0) \ + M(Bool, input_format_json_validate_types_from_metadata, true, R"( +For JSON/JSONCompact/JSONColumnsWithMetadata input formats, if this setting is set to 1, +the types from metadata in input data will be compared with the types of the corresponding columns from the table. + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_read_numbers_as_strings, true, R"( +Allow parsing numbers as strings in JSON input formats. + +Enabled by default. 
+)", 0) \ + M(Bool, input_format_json_read_objects_as_strings, true, R"( +Allow parsing JSON objects as strings in JSON input formats. + +Example: + +```sql +SET input_format_json_read_objects_as_strings = 1; +CREATE TABLE test (id UInt64, obj String, date Date) ENGINE=Memory(); +INSERT INTO test FORMAT JSONEachRow {"id" : 1, "obj" : {"a" : 1, "b" : "Hello"}, "date" : "2020-01-01"}; +SELECT * FROM test; +``` + +Result: + +``` +┌─id─┬─obj──────────────────────┬───────date─┐ +│ 1 │ {"a" : 1, "b" : "Hello"} │ 2020-01-01 │ +└────┴──────────────────────────┴────────────┘ +``` + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_read_arrays_as_strings, true, R"( +Allow parsing JSON arrays as strings in JSON input formats. + +Example: + +```sql +SET input_format_json_read_arrays_as_strings = 1; +SELECT arr, toTypeName(arr), JSONExtractArrayRaw(arr)[3] from format(JSONEachRow, 'arr String', '{"arr" : [1, "Hello", [1,2,3]]}'); +``` + +Result: +``` +┌─arr───────────────────┬─toTypeName(arr)─┬─arrayElement(JSONExtractArrayRaw(arr), 3)─┐ +│ [1, "Hello", [1,2,3]] │ String │ [1,2,3] │ +└───────────────────────┴─────────────────┴───────────────────────────────────────────┘ +``` + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, R"( +If enabled, during schema inference ClickHouse will try to infer named Tuple from JSON objects. +The resulting named Tuple will contain all elements from all corresponding JSON objects from sample data. + +Example: + +```sql +SET input_format_json_try_infer_named_tuples_from_objects = 1; +DESC format(JSONEachRow, '{"obj" : {"a" : 42, "b" : "Hello"}}, {"obj" : {"a" : 43, "c" : [1, 2, 3]}}, {"obj" : {"d" : {"e" : 42}}}') +``` + +Result: + +``` +┌─name─┬─type───────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ +│ obj │ Tuple(a Nullable(Int64), b Nullable(String), c Array(Nullable(Int64)), d Tuple(e Nullable(Int64))) │ │ │ │ │ │ +└──────┴────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ +``` + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects, false, R"( +Use String type instead of an exception in case of ambiguous paths in JSON objects during named tuples inference +)", 0) \ + M(Bool, input_format_json_infer_incomplete_types_as_strings, true, R"( +Allow to use String type for JSON keys that contain only `Null`/`{}`/`[]` in data sample during schema inference. +In JSON formats any value can be read as String, and we can avoid errors like `Cannot determine type for column 'column_name' by first 25000 rows of data, most likely this column contains only Nulls or empty Arrays/Maps` during schema inference +by using String type for keys with unknown types. 
+ +Example: + +```sql +SET input_format_json_infer_incomplete_types_as_strings = 1, input_format_json_try_infer_named_tuples_from_objects = 1; +DESCRIBE format(JSONEachRow, '{"obj" : {"a" : [1,2,3], "b" : "hello", "c" : null, "d" : {}, "e" : []}}'); +SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : [1,2,3], "b" : "hello", "c" : null, "d" : {}, "e" : []}}'); +``` + +Result: +``` +┌─name─┬─type───────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐ +│ obj │ Tuple(a Array(Nullable(Int64)), b Nullable(String), c Nullable(String), d Nullable(String), e Array(Nullable(String))) │ │ │ │ │ │ +└──────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ + +┌─obj────────────────────────────┐ +│ ([1,2,3],'hello',NULL,'{}',[]) │ +└────────────────────────────────┘ +``` + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_named_tuples_as_objects, true, R"( +Parse named tuple columns as JSON objects. + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, R"( +Ignore unknown keys in json object for named tuples. + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, R"( +Insert default values for missing elements in JSON object while parsing named tuple. +This setting works only when setting `input_format_json_named_tuples_as_objects` is enabled. + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_throw_on_bad_escape_sequence, true, R"( +Throw an exception if JSON string contains bad escape sequence in JSON input formats. If disabled, bad escape sequences will remain as is in the data. + +Enabled by default. +)", 0) \ + M(Bool, input_format_json_ignore_unnecessary_fields, true, R"( +Ignore unnecessary fields and not parse them. Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields +)", 0) \ + M(Bool, input_format_try_infer_variants, false, R"( +If enabled, ClickHouse will try to infer type [`Variant`](../../sql-reference/data-types/variant.md) in schema inference for text formats when there is more than one possible type for column/array elements. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Bool, type_json_skip_duplicated_paths, false, R"( +When enabled, during parsing JSON object into JSON type duplicated paths will be ignored and only the first one will be inserted instead of an exception +)", 0) \ + M(UInt64, input_format_json_max_depth, 1000, R"( +Maximum depth of a field in JSON. This is not a strict limit, it does not have to be applied precisely. +)", 0) \ + M(Bool, input_format_json_empty_as_default, false, R"( +Treat empty fields in JSON input as default values. +)", 0) \ + M(Bool, input_format_try_infer_integers, true, R"( +If enabled, ClickHouse will try to infer integers instead of floats in schema inference for text formats. If all numbers in the column from input data are integers, the result type will be `Int64`, if at least one number is float, the result type will be `Float64`. + +Enabled by default. +)", 0) \ + M(Bool, input_format_try_infer_dates, true, R"( +If enabled, ClickHouse will try to infer type `Date` from string fields in schema inference for text formats. 
If all fields from a column in input data were successfully parsed as dates, the result type will be `Date`, if at least one field was not parsed as date, the result type will be `String`. + +Enabled by default. +)", 0) \ + M(Bool, input_format_try_infer_datetimes, true, R"( +If enabled, ClickHouse will try to infer type `DateTime64` from string fields in schema inference for text formats. If all fields from a column in input data were successfully parsed as datetimes, the result type will be `DateTime64`, if at least one field was not parsed as datetime, the result type will be `String`. + +Enabled by default. +)", 0) \ + M(Bool, input_format_try_infer_datetimes_only_datetime64, false, R"( +When input_format_try_infer_datetimes is enabled, infer only DateTime64 but not DateTime types +)", 0) \ + M(Bool, input_format_try_infer_exponent_floats, false, R"( +Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred) +)", 0) \ + M(Bool, output_format_markdown_escape_special_characters, false, R"( +Escape special characters in Markdown +)", 0) \ + M(Bool, input_format_protobuf_flatten_google_wrappers, false, R"( +Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls +)", 0) \ + M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, R"( +When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized +)", 0) \ + M(UInt64, input_format_csv_skip_first_lines, 0, R"( +Skip specified number of lines at the beginning of data in CSV format +)", 0) \ + M(UInt64, input_format_tsv_skip_first_lines, 0, R"( +Skip specified number of lines at the beginning of data in TSV format +)", 0) \ + M(Bool, input_format_csv_skip_trailing_empty_lines, false, R"( +Skip trailing empty lines in CSV format +)", 0) \ + M(Bool, input_format_tsv_skip_trailing_empty_lines, false, R"( +Skip trailing empty lines in TSV format +)", 0) \ + M(Bool, input_format_custom_skip_trailing_empty_lines, false, R"( +Skip trailing empty lines in CustomSeparated format +)", 0) \ + M(Bool, input_format_tsv_crlf_end_of_line, false, R"( +If it is set true, file function will read TSV format with \\r\\n instead of \\n. +)", 0) \ \ - M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \ - M(Bool, input_format_native_decode_types_in_binary_format, false, "Read data types in binary format instead of type names in Native input format", 0) \ - M(Bool, output_format_native_encode_types_in_binary_format, false, "Write data types in binary format instead of type names in Native output format", 0) \ + M(Bool, input_format_native_allow_types_conversion, true, R"( +Allow data types conversion in Native input format +)", 0) \ + M(Bool, input_format_native_decode_types_in_binary_format, false, R"( +Read data types in binary format instead of type names in Native input format +)", 0) \ + M(Bool, output_format_native_encode_types_in_binary_format, false, R"( +Write data types in binary format instead of type names in Native output format +)", 0) \ \ - M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. 
Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \ - M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \ - M(IntervalOutputFormat, interval_output_format, FormatSettings::IntervalOutputFormat::Numeric, "Textual representation of Interval. Possible values: 'kusto', 'numeric'.", 0) \ + M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, R"( +Allows choosing a parser of the text representation of date and time. + +The setting does not apply to [date and time functions](../../sql-reference/functions/date-time-functions.md). + +Possible values: + +- `'best_effort'` — Enables extended parsing. + + ClickHouse can parse the basic `YYYY-MM-DD HH:MM:SS` format and all [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) date and time formats. For example, `'2018-06-08T01:02:03.000Z'`. + +- `'basic'` — Use basic parser. + + ClickHouse can parse only the basic `YYYY-MM-DD HH:MM:SS` or `YYYY-MM-DD` format. For example, `2019-08-20 10:18:56` or `2019-08-20`. + +Cloud default value: `'best_effort'`. + +See also: + +- [DateTime data type.](../../sql-reference/data-types/datetime.md) +- [Functions for working with dates and times.](../../sql-reference/functions/date-time-functions.md) +)", 0) \ + M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, R"( +Allows choosing different output formats of the text representation of date and time. + +Possible values: + +- `simple` - Simple output format. + + ClickHouse outputs date and time in the `YYYY-MM-DD hh:mm:ss` format. For example, `2019-08-20 10:18:56`. The calculation is performed according to the data type's time zone (if present) or server time zone. + +- `iso` - ISO output format. + + ClickHouse outputs date and time in [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `YYYY-MM-DDThh:mm:ssZ` format. For example, `2019-08-20T10:18:56Z`. Note that output is in UTC (`Z` means UTC). + +- `unix_timestamp` - Unix timestamp output format. + + ClickHouse outputs date and time in [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time) format. For example, `1566285536`. + +See also: + +- [DateTime data type.](../../sql-reference/data-types/datetime.md) +- [Functions for working with dates and times.](../../sql-reference/functions/date-time-functions.md) +)", 0) \ + M(IntervalOutputFormat, interval_output_format, FormatSettings::IntervalOutputFormat::Numeric, R"( +Allows choosing different output formats of the text representation of interval types. + +Possible values: + +- `kusto` - KQL-style output format. + + ClickHouse outputs intervals in [KQL format](https://learn.microsoft.com/en-us/dotnet/standard/base-types/standard-timespan-format-strings#the-constant-c-format-specifier). For example, `toIntervalDay(2)` would be formatted as `2.00:00:00`. Please note that for interval types of varying length (i.e. `IntervalMonth` and `IntervalYear`) the average number of seconds per interval is taken into account. + +- `numeric` - Numeric output format. + + ClickHouse outputs intervals as their underlying numeric representation. For example, `toIntervalDay(2)` would be formatted as `2`, as illustrated below.
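+ +A short sketch of the difference (expected outputs per the descriptions above): + +```sql +SET interval_output_format = 'kusto'; +SELECT toIntervalDay(2); -- 2.00:00:00 +SET interval_output_format = 'numeric'; +SELECT toIntervalDay(2); -- 2 +```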
+ +See also: + +- [Interval](../../sql-reference/data-types/special-data-types/interval.md) +)", 0) \ \ - M(Bool, input_format_ipv4_default_on_conversion_error, false, "Deserialization of IPv4 will use default values instead of throwing exception on conversion error.", 0) \ - M(Bool, input_format_ipv6_default_on_conversion_error, false, "Deserialization of IPV6 will use default values instead of throwing exception on conversion error.", 0) \ - M(String, bool_true_representation, "true", "Text to represent bool value in TSV/CSV formats.", 0) \ - M(String, bool_false_representation, "false", "Text to represent bool value in TSV/CSV formats.", 0) \ + M(Bool, input_format_ipv4_default_on_conversion_error, false, R"( +Deserialization of IPv4 will use default values instead of throwing exception on conversion error. + +Disabled by default. +)", 0) \ + M(Bool, input_format_ipv6_default_on_conversion_error, false, R"( +Deserialization of IPV6 will use default values instead of throwing exception on conversion error. + +Disabled by default. +)", 0) \ + M(String, bool_true_representation, "true", R"( +Text to represent true bool value in TSV/CSV/Vertical/Pretty formats. +)", 0) \ + M(String, bool_false_representation, "false", R"( +Text to represent false bool value in TSV/CSV/Vertical/Pretty formats. +)", 0) \ \ - M(Bool, input_format_values_interpret_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \ - M(Bool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \ - M(Bool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \ - M(Bool, input_format_avro_allow_missing_fields, false, "For Avro/AvroConfluent format: when field is not found in schema use default value instead of error", 0) \ + M(Bool, input_format_values_interpret_expressions, true, R"( +For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression. +)", 0) \ + M(Bool, input_format_values_deduce_templates_of_expressions, true, R"( +For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows. +)", 0) \ + M(Bool, input_format_values_accurate_types_of_literals, true, R"( +For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues. +)", 0) \ + M(Bool, input_format_avro_allow_missing_fields, false, R"( +For Avro/AvroConfluent format: when field is not found in schema use default value instead of error +)", 0) \ /** This setting is obsolete and do nothing, left for compatibility reasons. */ \ - M(Bool, input_format_avro_null_as_default, false, "For Avro/AvroConfluent format: insert default in case of null and non Nullable column", 0) \ - M(UInt64, format_binary_max_string_size, 1_GiB, "The maximum allowed size for String in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 
0 means there is no limit", 0) \ - M(UInt64, format_binary_max_array_size, 1_GiB, "The maximum allowed size for Array in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 0 means there is no limit", 0) \ - M(Bool, input_format_binary_decode_types_in_binary_format, false, "Read data types in binary format instead of type names in RowBinaryWithNamesAndTypes input format", 0) \ - M(Bool, output_format_binary_encode_types_in_binary_format, false, "Write data types in binary format instead of type names in RowBinaryWithNamesAndTypes output format ", 0) \ - M(URI, format_avro_schema_registry_url, "", "For AvroConfluent format: Confluent Schema Registry URL.", 0) \ + M(Bool, input_format_avro_null_as_default, false, R"( +For Avro/AvroConfluent format: insert default in case of null and non Nullable column +)", 0) \ + M(UInt64, format_binary_max_string_size, 1_GiB, R"( +The maximum allowed size for String in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 0 means there is no limit +)", 0) \ + M(UInt64, format_binary_max_array_size, 1_GiB, R"( +The maximum allowed size for Array in RowBinary format. It prevents allocating large amount of memory in case of corrupted data. 0 means there is no limit +)", 0) \ + M(Bool, input_format_binary_decode_types_in_binary_format, false, R"( +Read data types in binary format instead of type names in RowBinaryWithNamesAndTypes input format +)", 0) \ + M(Bool, output_format_binary_encode_types_in_binary_format, false, R"( +Write data types in binary format instead of type names in RowBinaryWithNamesAndTypes output format +)", 0) \ + M(URI, format_avro_schema_registry_url, "", R"( +For AvroConfluent format: Confluent Schema Registry URL. +)", 0) \ \ - M(Bool, output_format_json_quote_64bit_integers, true, "Controls quoting of 64-bit integers in JSON output format.", 0) \ - M(Bool, output_format_json_quote_denormals, false, "Enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format.", 0) \ - M(Bool, output_format_json_quote_decimals, false, "Controls quoting of decimals in JSON output format.", 0) \ - M(Bool, output_format_json_quote_64bit_floats, false, "Controls quoting of 64-bit float numbers in JSON output format.", 0) \ + M(Bool, output_format_json_quote_64bit_integers, true, R"( +Controls quoting of 64-bit or bigger [integers](../../sql-reference/data-types/int-uint.md) (like `UInt64` or `Int128`) when they are output in a [JSON](../../interfaces/formats.md/#json) format. +Such integers are enclosed in quotes by default. This behavior is compatible with most JavaScript implementations. + +Possible values: + +- 0 — Integers are output without quotes. +- 1 — Integers are enclosed in quotes. +)", 0) \ + M(Bool, output_format_json_quote_denormals, false, R"str( +Enables `+nan`, `-nan`, `+inf`, `-inf` outputs in [JSON](../../interfaces/formats.md/#json) output format. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. 
+ +**Example** + +Consider the following table `account_orders`: + +```text +┌─id─┬─name───┬─duration─┬─period─┬─area─┐ +│ 1 │ Andrew │ 20 │ 0 │ 400 │ +│ 2 │ John │ 40 │ 0 │ 0 │ +│ 3 │ Bob │ 15 │ 0 │ -100 │ +└────┴────────┴──────────┴────────┴──────┘ +``` + +When `output_format_json_quote_denormals = 0`, the query returns `null` values in output: + +```sql +SELECT area/period FROM account_orders FORMAT JSON; +``` + +```json +{ + "meta": + [ + { + "name": "divide(area, period)", + "type": "Float64" + } + ], + + "data": + [ + { + "divide(area, period)": null + }, + { + "divide(area, period)": null + }, + { + "divide(area, period)": null + } + ], + + "rows": 3, + + "statistics": + { + "elapsed": 0.003648093, + "rows_read": 3, + "bytes_read": 24 + } +} +``` + +When `output_format_json_quote_denormals = 1`, the query returns: + +```json +{ + "meta": + [ + { + "name": "divide(area, period)", + "type": "Float64" + } + ], + + "data": + [ + { + "divide(area, period)": "inf" + }, + { + "divide(area, period)": "-nan" + }, + { + "divide(area, period)": "-inf" + } + ], + + "rows": 3, + + "statistics": + { + "elapsed": 0.000070241, + "rows_read": 3, + "bytes_read": 24 + } +} +``` +)str", 0) \ + M(Bool, output_format_json_quote_decimals, false, R"( +Controls quoting of decimals in JSON output formats. + +Disabled by default. +)", 0) \ + M(Bool, output_format_json_quote_64bit_floats, false, R"( +Controls quoting of 64-bit [floats](../../sql-reference/data-types/float.md) when they are output in JSON* formats. + +Disabled by default. +)", 0) \ \ - M(Bool, output_format_json_escape_forward_slashes, true, "Controls escaping forward slashes for string outputs in JSON output format. This is intended for compatibility with JavaScript. Don't confuse with backslashes that are always escaped.", 0) \ - M(Bool, output_format_json_named_tuples_as_objects, true, "Serialize named tuple columns as JSON objects.", 0) \ - M(Bool, output_format_json_skip_null_value_in_named_tuples, false, "Skip key value pairs with null value when serialize named tuple columns as JSON objects. It is only valid when output_format_json_named_tuples_as_objects is true.", 0) \ - M(Bool, output_format_json_array_of_rows, false, "Output a JSON array of all rows in JSONEachRow(Compact) format.", 0) \ - M(Bool, output_format_json_validate_utf8, false, "Validate UTF-8 sequences in JSON output formats, doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8", 0) \ + M(Bool, output_format_json_escape_forward_slashes, true, R"( +Controls escaping forward slashes for string outputs in JSON output format. This is intended for compatibility with JavaScript. Don't confuse with backslashes that are always escaped. + +Enabled by default. +)", 0) \ + M(Bool, output_format_json_named_tuples_as_objects, true, R"( +Serialize named tuple columns as JSON objects. + +Enabled by default. +)", 0) \ + M(Bool, output_format_json_skip_null_value_in_named_tuples, false, R"( +Skip key value pairs with null value when serialize named tuple columns as JSON objects. It is only valid when output_format_json_named_tuples_as_objects is true. +)", 0) \ + M(Bool, output_format_json_array_of_rows, false, R"( +Enables the ability to output all rows as a JSON array in the [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) format. + +Possible values: + +- 1 — ClickHouse outputs all rows as an array, each row in the `JSONEachRow` format. +- 0 — ClickHouse outputs each row separately in the `JSONEachRow` format. 
+ +**Example of a query with the enabled setting** + +Query: + +```sql +SET output_format_json_array_of_rows = 1; +SELECT number FROM numbers(3) FORMAT JSONEachRow; +``` + +Result: + +```text +[ +{"number":"0"}, +{"number":"1"}, +{"number":"2"} +] +``` + +**Example of a query with the disabled setting** + +Query: + +```sql +SET output_format_json_array_of_rows = 0; +SELECT number FROM numbers(3) FORMAT JSONEachRow; +``` + +Result: + +```text +{"number":"0"} +{"number":"1"} +{"number":"2"} +``` +)", 0) \ + M(Bool, output_format_json_validate_utf8, false, R"( +Controls validation of UTF-8 sequences in JSON output formats, doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate UTF-8. + +Disabled by default. +)", 0) \ \ - M(String, format_json_object_each_row_column_for_object_name, "", "The name of column that will be used as object names in JSONObjectEachRow format. Column type should be String", 0) \ + M(String, format_json_object_each_row_column_for_object_name, "", R"( +The name of column that will be used for storing/writing object names in [JSONObjectEachRow](../../interfaces/formats.md/#jsonobjecteachrow) format. +Column type should be String. If value is empty, default names `row_{i}`will be used for object names. + +### input_format_json_compact_allow_variable_number_of_columns {#input_format_json_compact_allow_variable_number_of_columns} + +Allow variable number of columns in rows in JSONCompact/JSONCompactEachRow input formats. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. + +Disabled by default. + +### output_format_markdown_escape_special_characters {#output_format_markdown_escape_special_characters} + +When enabled, escape special characters in Markdown. + +[Common Mark](https://spec.commonmark.org/0.30/#example-12) defines the following special characters that can be escaped by \: + +``` +! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ +``` + +Possible values: + ++ 0 — Disable. ++ 1 — Enable. + +### input_format_json_empty_as_default {#input_format_json_empty_as_default} + +When enabled, replace empty input fields in JSON with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. + +Possible values: + ++ 0 — Disable. ++ 1 — Enable. +)", 0) \ \ - M(UInt64, output_format_pretty_max_rows, 10000, "Rows limit for Pretty formats.", 0) \ - M(UInt64, output_format_pretty_max_column_pad_width, 250, "Maximum width to pad all values in a column in Pretty formats.", 0) \ - M(UInt64, output_format_pretty_max_value_width, 10000, "Maximum width of value to display in Pretty formats. If greater - it will be cut.", 0) \ - M(UInt64, output_format_pretty_max_value_width_apply_for_single_value, false, "Only cut values (see the `output_format_pretty_max_value_width` setting) when it is not a single value in a block. Otherwise output it entirely, which is useful for the `SHOW CREATE TABLE` query.", 0) \ - M(UInt64Auto, output_format_pretty_color, "auto", "Use ANSI escape sequences in Pretty formats. 0 - disabled, 1 - enabled, 'auto' - enabled if a terminal.", 0) \ - M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. 
Available charsets: ASCII, UTF-8 (default one).", 0) \ - M(UInt64, output_format_pretty_display_footer_column_names, true, "Display column names in the footer if there are 999 or more rows.", 0) \ - M(UInt64, output_format_pretty_display_footer_column_names_min_rows, 50, "Sets the minimum threshold value of rows for which to enable displaying column names in the footer. 50 (default)", 0) \ - M(UInt64, output_format_parquet_row_group_size, 1000000, "Target row group size in rows.", 0) \ - M(UInt64, output_format_parquet_row_group_size_bytes, 512 * 1024 * 1024, "Target row group size in bytes, before compression.", 0) \ - M(Bool, output_format_parquet_string_as_string, true, "Use Parquet String type instead of Binary for String columns.", 0) \ - M(Bool, output_format_parquet_fixed_string_as_fixed_byte_array, true, "Use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary for FixedString columns.", 0) \ - M(ParquetVersion, output_format_parquet_version, "2.latest", "Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default)", 0) \ - M(ParquetCompression, output_format_parquet_compression_method, "zstd", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \ - M(Bool, output_format_parquet_compliant_nested_types, true, "In parquet file schema, use name 'element' instead of 'item' for list elements. This is a historical artifact of Arrow library implementation. Generally increases compatibility, except perhaps with some old versions of Arrow.", 0) \ - M(Bool, output_format_parquet_use_custom_encoder, true, "Use a faster Parquet encoder implementation.", 0) \ - M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \ - M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \ - M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs.", 0) \ - M(Bool, output_format_parquet_write_page_index, true, "Add a possibility to write page index into parquet files.", 0) \ - M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'.", 0) \ - M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ - M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ - M(UInt64, output_format_avro_rows_in_file, 1, "Max rows in a file (if permitted by storage)", 0) \ - M(Bool, output_format_tsv_crlf_end_of_line, false, "If it is set true, end of line in TSV format will be \\r\\n instead of \\n.", 0) \ - M(String, format_csv_null_representation, "\\N", "Custom NULL representation in CSV format", 0) \ - M(String, format_tsv_null_representation, "\\N", "Custom NULL representation in TSV format", 0) \ - M(Bool, output_format_decimal_trailing_zeros, false, "Output trailing zeros when printing Decimal values. E.g. 1.230000 instead of 1.23.", 0) \ + M(UInt64, output_format_pretty_max_rows, 10000, R"( +Rows limit for Pretty formats. +)", 0) \ + M(UInt64, output_format_pretty_max_column_pad_width, 250, R"( +Maximum width to pad all values in a column in Pretty formats. 
+)", 0) \ + M(UInt64, output_format_pretty_max_value_width, 10000, R"( +Maximum width of value to display in Pretty formats. If greater - it will be cut. +)", 0) \ + M(UInt64, output_format_pretty_max_value_width_apply_for_single_value, false, R"( +Only cut values (see the `output_format_pretty_max_value_width` setting) when it is not a single value in a block. Otherwise output it entirely, which is useful for the `SHOW CREATE TABLE` query. +)", 0) \ + M(UInt64Auto, output_format_pretty_color, "auto", R"( +Use ANSI escape sequences in Pretty formats. 0 - disabled, 1 - enabled, 'auto' - enabled if a terminal. +)", 0) \ + M(String, output_format_pretty_grid_charset, "UTF-8", R"( +Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one). +)", 0) \ + M(UInt64, output_format_pretty_display_footer_column_names, true, R"( +Display column names in the footer if there are many table rows. + +Possible values: + +- 0 — No column names are displayed in the footer. +- 1 — Column names are displayed in the footer if row count is greater than or equal to the threshold value set by [output_format_pretty_display_footer_column_names_min_rows](#output_format_pretty_display_footer_column_names_min_rows) (50 by default). + +**Example** + +Query: + +```sql +SELECT *, toTypeName(*) FROM (SELECT * FROM system.numbers LIMIT 1000); +``` + +Result: + +```response + ┌─number─┬─toTypeName(number)─┐ + 1. │ 0 │ UInt64 │ + 2. │ 1 │ UInt64 │ + 3. │ 2 │ UInt64 │ + ... + 999. │ 998 │ UInt64 │ +1000. │ 999 │ UInt64 │ + └─number─┴─toTypeName(number)─┘ +``` +)", 0) \ + M(UInt64, output_format_pretty_display_footer_column_names_min_rows, 50, R"( +Sets the minimum number of rows for which a footer with column names will be displayed if setting [output_format_pretty_display_footer_column_names](#output_format_pretty_display_footer_column_names) is enabled. +)", 0) \ + M(UInt64, output_format_parquet_row_group_size, 1000000, R"( +Target row group size in rows. +)", 0) \ + M(UInt64, output_format_parquet_row_group_size_bytes, 512 * 1024 * 1024, R"( +Target row group size in bytes, before compression. +)", 0) \ + M(Bool, output_format_parquet_string_as_string, true, R"( +Use Parquet String type instead of Binary for String columns. +)", 0) \ + M(Bool, output_format_parquet_fixed_string_as_fixed_byte_array, true, R"( +Use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary for FixedString columns. +)", 0) \ + M(ParquetVersion, output_format_parquet_version, "2.latest", R"( +Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default) +)", 0) \ + M(ParquetCompression, output_format_parquet_compression_method, "zstd", R"( +Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed) +)", 0) \ + M(Bool, output_format_parquet_compliant_nested_types, true, R"( +In parquet file schema, use name 'element' instead of 'item' for list elements. This is a historical artifact of Arrow library implementation. Generally increases compatibility, except perhaps with some old versions of Arrow. +)", 0) \ + M(Bool, output_format_parquet_use_custom_encoder, true, R"( +Use a faster Parquet encoder implementation. +)", 0) \ + M(Bool, output_format_parquet_parallel_encoding, true, R"( +Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder. +)", 0) \ + M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, R"( +Target page size in bytes, before compression. 
+)", 0) \ + M(UInt64, output_format_parquet_batch_size, 1024, R"( +Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs. +)", 0) \ + M(Bool, output_format_parquet_write_page_index, true, R"( +Add a possibility to write page index into parquet files. +)", 0) \ + M(String, output_format_avro_codec, "", R"( +Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'. +)", 0) \ + M(UInt64, output_format_avro_sync_interval, 16 * 1024, R"( +Sync interval in bytes. +)", 0) \ + M(String, output_format_avro_string_column_pattern, "", R"( +For Avro format: regexp of String columns to select as AVRO string. +)", 0) \ + M(UInt64, output_format_avro_rows_in_file, 1, R"( +Max rows in a file (if permitted by storage) +)", 0) \ + M(Bool, output_format_tsv_crlf_end_of_line, false, R"( +If it is set true, end of line in TSV format will be \\r\\n instead of \\n. +)", 0) \ + M(String, format_csv_null_representation, "\\N", R"( +Custom NULL representation in CSV format +)", 0) \ + M(String, format_tsv_null_representation, "\\N", R"( +Custom NULL representation in TSV format +)", 0) \ + M(Bool, output_format_decimal_trailing_zeros, false, R"( +Output trailing zeros when printing Decimal values. E.g. 1.230000 instead of 1.23. + +Disabled by default. +)", 0) \ \ - M(UInt64, input_format_allow_errors_num, 0, "Maximum absolute amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ - M(Float, input_format_allow_errors_ratio, 0, "Maximum relative amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ - M(String, input_format_record_errors_file_path, "", "Path of the file used to record errors while reading text formats (CSV, TSV).", 0) \ - M(String, errors_output_format, "CSV", "Method to write Errors to text output.", 0) \ + M(UInt64, input_format_allow_errors_num, 0, R"( +Sets the maximum number of acceptable errors when reading from text formats (CSV, TSV, etc.). + +The default value is 0. + +Always pair it with `input_format_allow_errors_ratio`. + +If an error occurred while reading rows but the error counter is still less than `input_format_allow_errors_num`, ClickHouse ignores the row and moves on to the next one. + +If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` are exceeded, ClickHouse throws an exception. +)", 0) \ + M(Float, input_format_allow_errors_ratio, 0, R"( +Sets the maximum percentage of errors allowed when reading from text formats (CSV, TSV, etc.). +The percentage of errors is set as a floating-point number between 0 and 1. + +The default value is 0. + +Always pair it with `input_format_allow_errors_num`. + +If an error occurred while reading rows but the error counter is still less than `input_format_allow_errors_ratio`, ClickHouse ignores the row and moves on to the next one. + +If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` are exceeded, ClickHouse throws an exception. +)", 0) \ + M(String, input_format_record_errors_file_path, "", R"( +Path of the file used to record errors while reading text formats (CSV, TSV). +)", 0) \ + M(String, errors_output_format, "CSV", R"( +Method to write Errors to text output. 
+)", 0) \ \ - M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ - M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ - M(String, format_template_row, "", "Path to file which contains format string for rows (for Template format)", 0) \ - M(String, format_template_row_format, "", "Format string for rows (for Template format)", 0) \ - M(String, format_template_resultset_format, "", "Format string for result set (for Template format)", 0) \ - M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ + M(String, format_schema, "", R"( +This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format. +)", 0) \ + M(String, format_template_resultset, "", R"( +Path to file which contains format string for result set (for Template format) +)", 0) \ + M(String, format_template_row, "", R"( +Path to file which contains format string for rows (for Template format) +)", 0) \ + M(String, format_template_row_format, "", R"( +Format string for rows (for Template format) +)", 0) \ + M(String, format_template_resultset_format, "", R"( +Format string for result set (for Template format) +)", 0) \ + M(String, format_template_rows_between_delimiter, "\n", R"( +Delimiter between rows (for Template format) +)", 0) \ \ - M(EscapingRule, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)", 0) \ - M(String, format_custom_field_delimiter, "\t", "Delimiter between fields (for CustomSeparated format)", 0) \ - M(String, format_custom_row_before_delimiter, "", "Delimiter before field of the first column (for CustomSeparated format)", 0) \ - M(String, format_custom_row_after_delimiter, "\n", "Delimiter after field of the last column (for CustomSeparated format)", 0) \ - M(String, format_custom_row_between_delimiter, "", "Delimiter between rows (for CustomSeparated format)", 0) \ - M(String, format_custom_result_before_delimiter, "", "Prefix before result set (for CustomSeparated format)", 0) \ - M(String, format_custom_result_after_delimiter, "", "Suffix after result set (for CustomSeparated format)", 0) \ + M(EscapingRule, format_custom_escaping_rule, "Escaped", R"( +Field escaping rule (for CustomSeparated format) +)", 0) \ + M(String, format_custom_field_delimiter, "\t", R"( +Delimiter between fields (for CustomSeparated format) +)", 0) \ + M(String, format_custom_row_before_delimiter, "", R"( +Delimiter before field of the first column (for CustomSeparated format) +)", 0) \ + M(String, format_custom_row_after_delimiter, "\n", R"( +Delimiter after field of the last column (for CustomSeparated format) +)", 0) \ + M(String, format_custom_row_between_delimiter, "", R"( +Delimiter between rows (for CustomSeparated format) +)", 0) \ + M(String, format_custom_result_before_delimiter, "", R"( +Prefix before result set (for CustomSeparated format) +)", 0) \ + M(String, format_custom_result_after_delimiter, "", R"( +Suffix after result set (for CustomSeparated format) +)", 0) \ \ - M(String, format_regexp, "", "Regular expression (for Regexp format)", 0) \ - M(EscapingRule, format_regexp_escaping_rule, "Raw", "Field escaping rule (for Regexp format)", 0) \ - M(Bool, format_regexp_skip_unmatched, false, "Skip lines unmatched by regular expression (for Regexp format)", 
0) \ + M(String, format_regexp, "", R"( +Regular expression (for Regexp format) +)", 0) \ + M(EscapingRule, format_regexp_escaping_rule, "Raw", R"( +Field escaping rule (for Regexp format) +)", 0) \ + M(Bool, format_regexp_skip_unmatched, false, R"( +Skip lines unmatched by regular expression (for Regexp format) +)", 0) \ \ - M(Bool, output_format_enable_streaming, false, "Enable streaming in output formats that support it.", 0) \ - M(Bool, output_format_write_statistics, true, "Write statistics about read rows, bytes, time elapsed in suitable output formats.", 0) \ - M(Bool, output_format_pretty_row_numbers, true, "Add row numbers before each row for pretty output format", 0) \ - M(Bool, output_format_pretty_highlight_digit_groups, true, "If enabled and if output is a terminal, highlight every digit corresponding to the number of thousands, millions, etc. with underline.", 0) \ - M(UInt64, output_format_pretty_single_large_number_tip_threshold, 1'000'000, "Print a readable number tip on the right side of the table if the block consists of a single number which exceeds this value (except 0)", 0) \ - M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \ + M(Bool, output_format_enable_streaming, false, R"( +Enable streaming in output formats that support it. + +Disabled by default. +)", 0) \ + M(Bool, output_format_write_statistics, true, R"( +Write statistics about read rows, bytes, time elapsed in suitable output formats. + +Enabled by default +)", 0) \ + M(Bool, output_format_pretty_row_numbers, true, R"( +Add row numbers before each row for pretty output format +)", 0) \ + M(Bool, output_format_pretty_highlight_digit_groups, true, R"( +If enabled and if output is a terminal, highlight every digit corresponding to the number of thousands, millions, etc. with underline. +)", 0) \ + M(UInt64, output_format_pretty_single_large_number_tip_threshold, 1'000'000, R"( +Print a readable number tip on the right side of the table if the block consists of a single number which exceeds this value (except 0) +)", 0) \ + M(Bool, insert_distributed_one_random_shard, false, R"( +Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table when there is no distributed key. + +By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. + +Possible values: + +- 0 — Insertion is rejected if there are multiple shards and no distributed key is given. +- 1 — Insertion is done randomly among all available shards when no distributed key is given. +)", 0) \ \ - M(Bool, exact_rows_before_limit, false, "When enabled, ClickHouse will provide exact value for rows_before_limit_at_least statistic, but with the cost that the data before limit will have to be read completely", 0) \ - M(Bool, rows_before_aggregation, false, "When enabled, ClickHouse will provide exact value for rows_before_aggregation statistic, represents the number of rows read before aggregation", 0) \ - M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if there are joining expressions in the WHERE section. 
Values: 0 - no rewrite, 1 - apply if possible for comma/cross, 2 - force rewrite all comma joins, cross - if possible", 0) \ + M(Bool, exact_rows_before_limit, false, R"( +When enabled, ClickHouse will provide exact value for rows_before_limit_at_least statistic, but with the cost that the data before limit will have to be read completely +)", 0) \ + M(Bool, rows_before_aggregation, false, R"( +When enabled, ClickHouse will provide exact value for rows_before_aggregation statistic, represents the number of rows read before aggregation +)", 0) \ + M(UInt64, cross_to_inner_join_rewrite, 1, R"( +Use inner join instead of comma/cross join if there are joining expressions in the WHERE section. Values: 0 - no rewrite, 1 - apply if possible for comma/cross, 2 - force rewrite all comma joins, cross - if possible +)", 0) \ \ - M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ - M(Bool, output_format_arrow_use_signed_indexes_for_dictionary, true, "Use signed integers for dictionary indexes in Arrow format", 0) \ - M(Bool, output_format_arrow_use_64_bit_indexes_for_dictionary, false, "Always use 64 bit integers for dictionary indexes in Arrow format", 0) \ - M(Bool, output_format_arrow_string_as_string, true, "Use Arrow String type instead of Binary for String columns", 0) \ - M(Bool, output_format_arrow_fixed_string_as_fixed_byte_array, true, "Use Arrow FIXED_SIZE_BINARY type instead of Binary for FixedString columns.", 0) \ - M(ArrowCompression, output_format_arrow_compression_method, "lz4_frame", "Compression method for Arrow output format. Supported codecs: lz4_frame, zstd, none (uncompressed)", 0) \ + M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, R"( +Enable output LowCardinality type as Dictionary Arrow type +)", 0) \ + M(Bool, output_format_arrow_use_signed_indexes_for_dictionary, true, R"( +Use signed integers for dictionary indexes in Arrow format +)", 0) \ + M(Bool, output_format_arrow_use_64_bit_indexes_for_dictionary, false, R"( +Always use 64 bit integers for dictionary indexes in Arrow format +)", 0) \ + M(Bool, output_format_arrow_string_as_string, true, R"( +Use Arrow String type instead of Binary for String columns +)", 0) \ + M(Bool, output_format_arrow_fixed_string_as_fixed_byte_array, true, R"( +Use Arrow FIXED_SIZE_BINARY type instead of Binary for FixedString columns. +)", 0) \ + M(ArrowCompression, output_format_arrow_compression_method, "lz4_frame", R"( +Compression method for Arrow output format. Supported codecs: lz4_frame, zstd, none (uncompressed) +)", 0) \ \ - M(Bool, output_format_orc_string_as_string, true, "Use ORC String type instead of Binary for String columns", 0) \ - M(ORCCompression, output_format_orc_compression_method, "zstd", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \ - M(UInt64, output_format_orc_row_index_stride, 10'000, "Target row index stride in ORC output format", 0) \ - M(Double, output_format_orc_dictionary_key_size_threshold, 0.0, "For a string column in ORC output format, if the number of distinct values is greater than this fraction of the total number of non-null rows, turn off dictionary encoding. Otherwise dictionary encoding is enabled", 0) \ + M(Bool, output_format_orc_string_as_string, true, R"( +Use ORC String type instead of Binary for String columns +)", 0) \ + M(ORCCompression, output_format_orc_compression_method, "zstd", R"( +Compression method for ORC output format. 
Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed) +)", 0) \ + M(UInt64, output_format_orc_row_index_stride, 10'000, R"( +Target row index stride in ORC output format +)", 0) \ + M(Double, output_format_orc_dictionary_key_size_threshold, 0.0, R"( +For a string column in ORC output format, if the number of distinct values is greater than this fraction of the total number of non-null rows, turn off dictionary encoding. Otherwise dictionary encoding is enabled +)", 0) \ \ - M(CapnProtoEnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::CapnProtoEnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ + M(CapnProtoEnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::CapnProtoEnumComparingMode::BY_VALUES, R"( +How to map ClickHouse Enum and CapnProto Enum +)", 0) \ \ - M(Bool, format_capn_proto_use_autogenerated_schema, true, "Use autogenerated CapnProto schema when format_schema is not set", 0) \ - M(Bool, format_protobuf_use_autogenerated_schema, true, "Use autogenerated Protobuf when format_schema is not set", 0) \ - M(String, output_format_schema, "", "The path to the file where the automatically generated schema will be saved", 0) \ + M(Bool, format_capn_proto_use_autogenerated_schema, true, R"( +Use autogenerated CapnProto schema when format_schema is not set +)", 0) \ + M(Bool, format_protobuf_use_autogenerated_schema, true, R"( +Use autogenerated Protobuf when format_schema is not set +)", 0) \ + M(String, output_format_schema, "", R"( +The path to the file where the automatically generated schema will be saved in [Cap’n Proto](../../interfaces/formats.md#capnproto-capnproto) or [Protobuf](../../interfaces/formats.md#protobuf-protobuf) formats. +)", 0) \ \ - M(String, input_format_mysql_dump_table_name, "", "Name of the table in MySQL dump from which to read data", 0) \ - M(Bool, input_format_mysql_dump_map_column_names, true, "Match columns from table in MySQL dump and columns from ClickHouse table by names", 0) \ + M(String, input_format_mysql_dump_table_name, "", R"( +Name of the table in MySQL dump from which to read data +)", 0) \ + M(Bool, input_format_mysql_dump_map_column_names, true, R"( +Match columns from table in MySQL dump and columns from ClickHouse table by names +)", 0) \ \ - M(UInt64, output_format_sql_insert_max_batch_size, DEFAULT_BLOCK_SIZE, "The maximum number of rows in one INSERT statement.", 0) \ - M(String, output_format_sql_insert_table_name, "table", "The name of table in the output INSERT query", 0) \ - M(Bool, output_format_sql_insert_include_column_names, true, "Include column names in INSERT query", 0) \ - M(Bool, output_format_sql_insert_use_replace, false, "Use REPLACE statement instead of INSERT", 0) \ - M(Bool, output_format_sql_insert_quote_names, true, "Quote column names with '`' characters", 0) \ + M(UInt64, output_format_sql_insert_max_batch_size, DEFAULT_BLOCK_SIZE, R"( +The maximum number of rows in one INSERT statement. 
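+
+As a purely illustrative sketch (not part of the original description; the column alias and row count are arbitrary), a query of this shape would split the generated output into `INSERT` statements of at most two rows each:
+
+```sql
+SELECT number AS id
+FROM numbers(5)
+SETTINGS output_format_sql_insert_max_batch_size = 2
+FORMAT SQLInsert;
+```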
+)", 0) \ + M(String, output_format_sql_insert_table_name, "table", R"( +The name of table in the output INSERT query +)", 0) \ + M(Bool, output_format_sql_insert_include_column_names, true, R"( +Include column names in INSERT query +)", 0) \ + M(Bool, output_format_sql_insert_use_replace, false, R"( +Use REPLACE statement instead of INSERT +)", 0) \ + M(Bool, output_format_sql_insert_quote_names, true, R"( +Quote column names with '`' characters +)", 0) \ \ - M(Bool, output_format_values_escape_quote_with_quote, false, "If true escape ' with '', otherwise quoted with \\'", 0) \ + M(Bool, output_format_values_escape_quote_with_quote, false, R"( +If true escape ' with '', otherwise quoted with \\' +)", 0) \ \ - M(Bool, output_format_bson_string_as_string, false, "Use BSON String type instead of Binary for String columns.", 0) \ - M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \ + M(Bool, output_format_bson_string_as_string, false, R"( +Use BSON String type instead of Binary for String columns. +)", 0) \ + M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, R"( +Skip fields with unsupported types while schema inference for format BSON. +)", 0) \ \ - M(Bool, format_display_secrets_in_show_and_select, false, "Do not hide secrets in SHOW and SELECT queries.", IMPORTANT) \ - M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ - M(Bool, regexp_dict_flag_case_insensitive, false, "Use case-insensitive matching for a regexp_tree dictionary. Can be overridden in individual expressions with (?i) and (?-i).", 0) \ - M(Bool, regexp_dict_flag_dotall, false, "Allow '.' to match newline characters for a regexp_tree dictionary.", 0) \ + M(Bool, format_display_secrets_in_show_and_select, false, R"( +Enables or disables showing secrets in `SHOW` and `SELECT` queries for tables, databases, +table functions, and dictionaries. + +User wishing to see secrets must also have +[`display_secrets_in_show_and_select` server setting](../server-configuration-parameters/settings#display_secrets_in_show_and_select) +turned on and a +[`displaySecretsInShowAndSelect`](../../sql-reference/statements/grant#display-secrets) privilege. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", IMPORTANT) \ + M(Bool, regexp_dict_allow_hyperscan, true, R"( +Allow regexp_tree dictionary using Hyperscan library. +)", 0) \ + M(Bool, regexp_dict_flag_case_insensitive, false, R"( +Use case-insensitive matching for a regexp_tree dictionary. Can be overridden in individual expressions with (?i) and (?-i). +)", 0) \ + M(Bool, regexp_dict_flag_dotall, false, R"( +Allow '.' to match newline characters for a regexp_tree dictionary. +)", 0) \ \ - M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading dictionary source in several threads. It's supported only by dictionaries with local CLICKHOUSE source.", 0) \ - M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \ - M(DateTimeOverflowBehavior, date_time_overflow_behavior, "ignore", "Overflow mode for Date, Date32, DateTime, DateTime64 types. 
Possible values: 'ignore', 'throw', 'saturate'.", 0) \ - M(Bool, validate_experimental_and_suspicious_types_inside_nested_types, true, "Validate usage of experimental and suspicious types inside nested types like Array/Map/Tuple", 0) \ + M(Bool, dictionary_use_async_executor, false, R"( +Execute a pipeline for reading dictionary source in several threads. It's supported only by dictionaries with local CLICKHOUSE source. +)", 0) \ + M(Bool, precise_float_parsing, false, R"( +Prefer more precise (but slower) float parsing algorithm +)", 0) \ + M(DateTimeOverflowBehavior, date_time_overflow_behavior, "ignore", R"( +Overflow mode for Date, Date32, DateTime, DateTime64 types. Possible values: 'ignore', 'throw', 'saturate'. +)", 0) \ + M(Bool, validate_experimental_and_suspicious_types_inside_nested_types, true, R"( +Validate usage of experimental and suspicious types inside nested types like Array/Map/Tuple +)", 0) \ \ - M(IdentifierQuotingRule, show_create_query_identifier_quoting_rule, IdentifierQuotingRule::WhenNecessary, "Set the quoting rule for identifiers in SHOW CREATE query", 0) \ - M(IdentifierQuotingStyle, show_create_query_identifier_quoting_style, IdentifierQuotingStyle::Backticks, "Set the quoting style for identifiers in SHOW CREATE query", 0) \ + M(IdentifierQuotingRule, show_create_query_identifier_quoting_rule, IdentifierQuotingRule::WhenNecessary, R"( +Set the quoting rule for identifiers in SHOW CREATE query +)", 0) \ + M(IdentifierQuotingStyle, show_create_query_identifier_quoting_style, IdentifierQuotingStyle::Backticks, R"( +Set the quoting style for identifiers in SHOW CREATE query +)", 0) \ // End of FORMAT_FACTORY_SETTINGS diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 14442bd1f2f..cad34521886 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -53,361 +52,2225 @@ namespace ErrorCodes #define OBSOLETE_SETTINGS(M, ALIAS) #else #define COMMON_SETTINGS(M, ALIAS) \ - M(Dialect, dialect, Dialect::clickhouse, "Which dialect will be used to parse query", 0)\ - M(UInt64, min_compress_block_size, 65536, "The actual size of the block to compress, if the uncompressed data is less than max_compress_block_size, is no less than this value and no less than the volume of data for one mark.", 0) \ - M(UInt64, max_compress_block_size, 1048576, "The maximum size of blocks of uncompressed data before compressing for writing to a table.", 0) \ - M(UInt64, max_block_size, DEFAULT_BLOCK_SIZE, "Maximum block size in rows for reading", 0) \ - M(UInt64, max_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "The maximum block size for insertion, if we control the creation of blocks for insertion.", 0) \ - M(UInt64, min_insert_block_size_rows, DEFAULT_INSERT_BLOCK_SIZE, "Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough.", 0) \ - M(UInt64, min_insert_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256), "Squash blocks passed to INSERT query to a specified size in bytes if blocks are not big enough.", 0) \ - M(UInt64, min_insert_block_size_rows_for_materialized_views, 0, "Like min_insert_block_size_rows, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_rows)", 0) \ - M(UInt64, min_insert_block_size_bytes_for_materialized_views, 0, "Like min_insert_block_size_bytes, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_bytes)", 0) \ - M(UInt64, min_external_table_block_size_rows, 
DEFAULT_INSERT_BLOCK_SIZE, "Squash blocks passed to external table to specified size in rows, if blocks are not big enough.", 0) \ - M(UInt64, min_external_table_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256), "Squash blocks passed to the external table to a specified size in bytes, if blocks are not big enough.", 0) \ - M(UInt64, max_joined_block_size_rows, DEFAULT_BLOCK_SIZE, "Maximum block size for JOIN result (if join algorithm supports it). 0 means unlimited.", 0) \ - M(UInt64, max_insert_threads, 0, "The maximum number of threads to execute the INSERT SELECT query. Values 0 or 1 mean that INSERT SELECT is not run in parallel. Higher values will lead to higher memory usage. Parallel INSERT SELECT has effect only if the SELECT part is run on parallel, see 'max_threads' setting.", 0) \ - M(UInt64, max_insert_delayed_streams_for_parallel_write, 0, "The maximum number of streams (columns) to delay final part flush. Default - auto (1000 in case of underlying storage supports parallel write, for example S3 and disabled otherwise)", 0) \ - M(MaxThreads, max_final_threads, 0, "The maximum number of threads to read from table with FINAL.", 0) \ - M(UInt64, max_threads_for_indexes, 0, "The maximum number of threads process indices.", 0) \ - M(MaxThreads, max_threads, 0, "The maximum number of threads to execute the request. By default, it is determined automatically.", 0) \ - M(Bool, use_concurrency_control, true, "Respect the server's concurrency control (see the `concurrent_threads_soft_limit_num` and `concurrent_threads_soft_limit_ratio_to_cores` global server settings). If disabled, it allows using a larger number of threads even if the server is overloaded (not recommended for normal usage, and needed mostly for tests).", 0) \ - M(MaxThreads, max_download_threads, 4, "The maximum number of threads to download data (e.g. for URL engine).", 0) \ - M(MaxThreads, max_parsing_threads, 0, "The maximum number of threads to parse data in input formats that support parallel parsing. By default, it is determined automatically", 0) \ - M(UInt64, max_download_buffer_size, 10*1024*1024, "The maximal size of buffer for parallel downloading (e.g. for URL engine) per each thread.", 0) \ - M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \ - M(UInt64, max_read_buffer_size_local_fs, 128*1024, "The maximum size of the buffer to read from local filesystem. If set to 0 then max_read_buffer_size will be used.", 0) \ - M(UInt64, max_read_buffer_size_remote_fs, 0, "The maximum size of the buffer to read from remote filesystem. If set to 0 then max_read_buffer_size will be used.", 0) \ - M(UInt64, max_distributed_connections, 1024, "The maximum number of connections for distributed processing of one query (should be greater than max_threads).", 0) \ - M(UInt64, max_query_size, DBMS_DEFAULT_MAX_QUERY_SIZE, "The maximum number of bytes of a query string parsed by the SQL parser. 
Data in the VALUES clause of INSERT queries is processed by a separate stream parser (that consumes O(1) RAM) and not affected by this restriction.", 0) \ - M(UInt64, interactive_delay, 100000, "The interval in microseconds to check if the request is cancelled, and to send progress info.", 0) \ - M(Seconds, connect_timeout, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connection timeout if there are no replicas.", 0) \ - M(Milliseconds, handshake_timeout_ms, 10000, "Timeout for receiving HELLO packet from replicas.", 0) \ - M(Milliseconds, connect_timeout_with_failover_ms, 1000, "Connection timeout for selecting first healthy replica.", 0) \ - M(Milliseconds, connect_timeout_with_failover_secure_ms, 1000, "Connection timeout for selecting first healthy replica (for secure connections).", 0) \ - M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "Timeout for receiving data from the network, in seconds. If no bytes were received in this interval, the exception is thrown. If you set this setting on the client, the 'send_timeout' for the socket will also be set on the corresponding connection end on the server.", 0) \ - M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, "Timeout for sending data to the network, in seconds. If a client needs to send some data but is not able to send any bytes in this interval, the exception is thrown. If you set this setting on the client, the 'receive_timeout' for the socket will also be set on the corresponding connection end on the server.", 0) \ - M(Seconds, tcp_keep_alive_timeout, DEFAULT_TCP_KEEP_ALIVE_TIMEOUT /* less than DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC */, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \ - M(Milliseconds, hedged_connection_timeout_ms, 50, "Connection timeout for establishing connection with replica for Hedged requests", 0) \ - M(Milliseconds, receive_data_timeout_ms, 2000, "Connection timeout for receiving first packet of data or packet with positive progress from replica", 0) \ - M(Bool, use_hedged_requests, true, "Use hedged requests for distributed queries", 0) \ - M(Bool, allow_changing_replica_until_first_data_packet, false, "Allow HedgedConnections to change replica until receiving first data packet", 0) \ - M(Milliseconds, queue_max_wait_ms, 0, "The wait time in the request queue, if the number of concurrent requests exceeds the maximum.", 0) \ - M(Milliseconds, connection_pool_max_wait_ms, 0, "The wait time when the connection pool is full.", 0) \ - M(Milliseconds, replace_running_query_max_wait_ms, 5000, "The wait time for running query with the same query_id to finish when setting 'replace_running_query' is active.", 0) \ - M(Milliseconds, kafka_max_wait_ms, 5000, "The wait time for reading from Kafka before retry.", 0) \ - M(Milliseconds, rabbitmq_max_wait_ms, 5000, "The wait time for reading from RabbitMQ before retry.", 0) \ - M(UInt64, poll_interval, DBMS_DEFAULT_POLL_INTERVAL, "Block at the query wait loop on the server for the specified number of seconds.", 0) \ - M(UInt64, idle_connection_timeout, 3600, "Close idle TCP connections after specified number of seconds.", 0) \ - M(UInt64, distributed_connections_pool_size, 1024, "Maximum number of connections with one remote server in the pool.", 0) \ - M(UInt64, connections_with_failover_max_tries, 3, "The maximum number of attempts to connect to replicas.", 0) \ - M(UInt64, s3_strict_upload_part_size, S3::DEFAULT_STRICT_UPLOAD_PART_SIZE, "The exact size of part to upload during multipart upload to S3 (some 
implementations does not supports variable size parts).", 0) \ - M(UInt64, azure_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to Azure blob storage.", 0) \ - M(UInt64, azure_max_blocks_in_multipart_upload, 50000, "Maximum number of blocks in multipart upload for Azure.", 0) \ - M(UInt64, s3_min_upload_part_size, S3::DEFAULT_MIN_UPLOAD_PART_SIZE, "The minimum size of part to upload during multipart upload to S3.", 0) \ - M(UInt64, s3_max_upload_part_size, S3::DEFAULT_MAX_UPLOAD_PART_SIZE, "The maximum size of part to upload during multipart upload to S3.", 0) \ - M(UInt64, azure_min_upload_part_size, 16*1024*1024, "The minimum size of part to upload during multipart upload to Azure blob storage.", 0) \ - M(UInt64, azure_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage.", 0) \ - M(UInt64, s3_upload_part_size_multiply_factor, S3::DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \ - M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, S3::DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ - M(UInt64, s3_max_part_number, S3::DEFAULT_MAX_PART_NUMBER, "Maximum part number number for s3 upload part.", 0) \ - M(UInt64, s3_max_single_operation_copy_size, S3::DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE, "Maximum size for a single copy operation in s3", 0) \ - M(UInt64, azure_upload_part_size_multiply_factor, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage.", 0) \ - M(UInt64, azure_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor.", 0) \ - M(UInt64, s3_max_inflight_parts_for_one_file, S3::DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ - M(UInt64, azure_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 
0 means unlimited.", 0) \ - M(UInt64, s3_max_single_part_upload_size, S3::DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE, "The maximum size of object to upload using singlepart upload to S3.", 0) \ - M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ - M(UInt64, azure_max_single_part_copy_size, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage.", 0) \ - M(UInt64, s3_max_single_read_retries, S3::DEFAULT_MAX_SINGLE_READ_TRIES, "The maximum number of retries during single S3 read.", 0) \ - M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single Azure blob storage read.", 0) \ - M(UInt64, azure_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write", 0) \ - M(UInt64, s3_max_unexpected_write_error_retries, S3::DEFAULT_MAX_UNEXPECTED_WRITE_ERROR_RETRIES, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ - M(UInt64, s3_max_redirects, S3::DEFAULT_MAX_REDIRECTS, "Max number of S3 redirects hops allowed.", 0) \ - M(UInt64, s3_max_connections, S3::DEFAULT_MAX_CONNECTIONS, "The maximum number of connections per server.", 0) \ - M(UInt64, s3_max_get_rps, 0, "Limit on S3 GET request per second rate before throttling. Zero means unlimited.", 0) \ - M(UInt64, s3_max_get_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_get_rps`", 0) \ - M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate before throttling. Zero means unlimited.", 0) \ - M(UInt64, s3_max_put_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_put_rps`", 0) \ - M(UInt64, s3_list_object_keys_size, S3::DEFAULT_LIST_OBJECT_KEYS_SIZE, "Maximum number of files that could be returned in batch by ListObject request", 0) \ - M(Bool, s3_use_adaptive_timeouts, S3::DEFAULT_USE_ADAPTIVE_TIMEOUTS, "When adaptive timeouts are enabled first two attempts are made with low receive and send timeout", 0) \ - M(UInt64, azure_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ - M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ - M(Bool, azure_truncate_on_insert, false, "Enables or disables truncate before insert in azure engine tables.", 0) \ - M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \ - M(Bool, s3_skip_empty_files, false, "Allow to skip empty files in s3 table engine", 0) \ - M(Bool, azure_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in azure engine tables", 0) \ - M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ - M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. 
It may lead to slightly higher memory usage", 0) \ - M(Bool, azure_allow_parallel_part_upload, true, "Use multiple threads for azure multipart upload.", 0) \ - M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, hdfs_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, azure_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, s3_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in S3 table engine", 0) \ - M(Bool, hdfs_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in HDFS table engine", 0) \ - M(Bool, azure_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in AzureBlobStorage table engine", 0) \ - M(UInt64, azure_sdk_max_retries, 10, "Maximum number of retries in azure sdk", 0) \ - M(UInt64, azure_sdk_retry_initial_backoff_ms, 10, "Minimal backoff between retries in azure sdk", 0) \ - M(UInt64, azure_sdk_retry_max_backoff_ms, 1000, "Maximal backoff between retries in azure sdk", 0) \ - M(Bool, s3_validate_request_settings, true, "Validate S3 request settings", 0) \ - M(Bool, s3_disable_checksum, S3::DEFAULT_DISABLE_CHECKSUM, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ - M(UInt64, s3_retry_attempts, S3::DEFAULT_RETRY_ATTEMPTS, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ - M(UInt64, s3_request_timeout_ms, S3::DEFAULT_REQUEST_TIMEOUT_MS, "Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long.", 0) \ - M(UInt64, s3_connect_timeout_ms, S3::DEFAULT_CONNECT_TIMEOUT_MS, "Connection timeout for host from s3 disks.", 0) \ - M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \ - M(String, s3queue_default_zookeeper_path, "/clickhouse/s3queue/", "Default zookeeper path prefix for S3Queue engine", 0) \ - M(Bool, s3queue_enable_logging_to_s3queue_log, false, "Enable writing to system.s3queue_log. The value can be overwritten per table with table settings", 0) \ - M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \ - M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \ - M(Bool, hdfs_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in hdfs engine tables", 0) \ - M(Bool, hdfs_skip_empty_files, false, "Allow to skip empty files in hdfs table engine", 0) \ - M(Bool, azure_skip_empty_files, false, "Allow to skip empty files in azure table engine", 0) \ - M(UInt64, hsts_max_age, 0, "Expired time for HSTS. 0 means disable HSTS.", 0) \ - M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. 
They can be output in JSON-formats.", IMPORTANT) \ - M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \ - M(Bool, replace_running_query, false, "Whether the running request should be canceled with the same id as the new one.", 0) \ - M(UInt64, max_remote_read_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for read.", 0) \ - M(UInt64, max_remote_write_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for write.", 0) \ - M(UInt64, max_local_read_bandwidth, 0, "The maximum speed of local reads in bytes per second.", 0) \ - M(UInt64, max_local_write_bandwidth, 0, "The maximum speed of local writes in bytes per second.", 0) \ - M(Bool, stream_like_engine_allow_direct_select, false, "Allow direct SELECT query for Kafka, RabbitMQ, FileLog, Redis Streams, and NATS engines. In case there are attached materialized views, SELECT query is not allowed even if this setting is enabled.", 0) \ - M(String, stream_like_engine_insert_queue, "", "When stream-like engine reads from multiple queues, the user will need to select one queue to insert into when writing. Used by Redis Streams and NATS.", 0) \ - M(Bool, dictionary_validate_primary_key_type, false, "Validate primary key type for dictionaries. By default id type for simple layouts will be implicitly converted to UInt64.", 0) \ - M(Bool, distributed_insert_skip_read_only_replicas, false, "If true, INSERT into Distributed will skip read-only replicas.", 0) \ - M(Bool, distributed_foreground_insert, false, "If the setting is enabled, insert query into distributed waits until data are sent to all nodes in a cluster. \n\nEnables or disables synchronous data insertion into a `Distributed` table.\n\nBy default, when inserting data into a Distributed table, the ClickHouse server sends data to cluster nodes in the background. When `distributed_foreground_insert` = 1, the data is processed synchronously, and the `INSERT` operation succeeds only after all the data is saved on all shards (at least one replica for each shard if `internal_replication` is true).", 0) ALIAS(insert_distributed_sync) \ - M(UInt64, distributed_background_insert_timeout, 0, "Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout.", 0) ALIAS(insert_distributed_timeout) \ - M(Milliseconds, distributed_background_insert_sleep_time_ms, 100, "Sleep time for background INSERTs into Distributed, in case of any errors delay grows exponentially.", 0) ALIAS(distributed_directory_monitor_sleep_time_ms) \ - M(Milliseconds, distributed_background_insert_max_sleep_time_ms, 30000, "Maximum sleep time for background INSERTs into Distributed, it limits exponential growth too.", 0) ALIAS(distributed_directory_monitor_max_sleep_time_ms) \ + M(Dialect, dialect, Dialect::clickhouse, R"( +Which dialect will be used to parse query +)", 0)\ + M(UInt64, min_compress_block_size, 65536, R"( +For [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. In order to reduce latency when processing queries, a block is compressed when writing the next mark if its size is at least `min_compress_block_size`. By default, 65,536. + +The actual size of the block, if the uncompressed data is less than `max_compress_block_size`, is no less than this value and no less than the volume of data for one mark. + +Let’s look at an example. 
Assume that `index_granularity` was set to 8192 during table creation. + +We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, the total will be 32 KB of data. Since min_compress_block_size = 65,536, a compressed block will be formed for every two marks. + +We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won’t be decompressed. + +:::note +This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +::: +)", 0) \ + M(UInt64, max_compress_block_size, 1048576, R"( +The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying a smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. + +:::note +This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +::: + +Don’t confuse blocks for compression (a chunk of memory consisting of bytes) with blocks for query processing (a set of rows from a table). +)", 0) \ + M(UInt64, max_block_size, DEFAULT_BLOCK_SIZE, R"( +In ClickHouse, data is processed by blocks, which are sets of column parts. The internal processing cycles for a single block are efficient but there are noticeable costs when processing each block. + +The `max_block_size` setting indicates the recommended maximum number of rows to include in a single block when loading data from tables. Blocks the size of `max_block_size` are not always loaded from the table: if ClickHouse determines that less data needs to be retrieved, a smaller block is processed. + +The block size should not be too small to avoid noticeable costs when processing each block. It should also not be too large to ensure that queries with a LIMIT clause execute quickly after processing the first block. When setting `max_block_size`, the goal should be to avoid consuming too much memory when extracting a large number of columns in multiple threads and to preserve at least some cache locality. +)", 0) \ + M(UInt64, max_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, R"( +The size of blocks (in a count of rows) to form for insertion into a table. +This setting only applies in cases when the server forms the blocks. +For example, for an INSERT via the HTTP interface, the server parses the data format and forms blocks of the specified size. +But when using clickhouse-client, the client parses the data itself, and the ‘max_insert_block_size’ setting on the server does not affect the size of the inserted blocks. +The setting also does not have a purpose when using INSERT SELECT, since data is inserted using the same blocks that are formed after SELECT. + +The default is slightly more than `max_block_size`. The reason for this is that certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allow sorting more data in RAM. 
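+
+As a minimal illustration (not part of the original description), the effective server-side value can be inspected through `system.settings`:
+
+```sql
+SELECT name, value, changed
+FROM system.settings
+WHERE name = 'max_insert_block_size';
+```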
+)", 0) \ + M(UInt64, min_insert_block_size_rows, DEFAULT_INSERT_BLOCK_SIZE, R"( +Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. + +Possible values: + +- Positive integer. +- 0 — Squashing disabled. +)", 0) \ + M(UInt64, min_insert_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256), R"( +Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. + +Possible values: + +- Positive integer. +- 0 — Squashing disabled. +)", 0) \ + M(UInt64, min_insert_block_size_rows_for_materialized_views, 0, R"( +Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. + +Possible values: + +- Any positive integer. +- 0 — Squashing disabled. + +**See Also** + +- [min_insert_block_size_rows](#min-insert-block-size-rows) +)", 0) \ + M(UInt64, min_insert_block_size_bytes_for_materialized_views, 0, R"( +Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. + +Possible values: + +- Any positive integer. +- 0 — Squashing disabled. + +**See also** + +- [min_insert_block_size_bytes](#min-insert-block-size-bytes) +)", 0) \ + M(UInt64, min_external_table_block_size_rows, DEFAULT_INSERT_BLOCK_SIZE, R"( +Squash blocks passed to external table to specified size in rows, if blocks are not big enough. +)", 0) \ + M(UInt64, min_external_table_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256), R"( +Squash blocks passed to the external table to a specified size in bytes, if blocks are not big enough. +)", 0) \ + M(UInt64, max_joined_block_size_rows, DEFAULT_BLOCK_SIZE, R"( +Maximum block size for JOIN result (if join algorithm supports it). 0 means unlimited. +)", 0) \ + M(UInt64, max_insert_threads, 0, R"( +The maximum number of threads to execute the `INSERT SELECT` query. + +Possible values: + +- 0 (or 1) — `INSERT SELECT` no parallel execution. +- Positive integer. Bigger than 1. + +Cloud default value: from `2` to `4`, depending on the service size. + +Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max_threads](#max_threads) setting. +Higher values will lead to higher memory usage. +)", 0) \ + M(UInt64, max_insert_delayed_streams_for_parallel_write, 0, R"( +The maximum number of streams (columns) to delay final part flush. Default - auto (1000 in case of underlying storage supports parallel write, for example S3 and disabled otherwise) +)", 0) \ + M(MaxThreads, max_final_threads, 0, R"( +Sets the maximum number of parallel threads for the `SELECT` query data read phase with the [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. + +Possible values: + +- Positive integer. +- 0 or 1 — Disabled. `SELECT` queries are executed in a single thread. 
+)", 0) \ + M(UInt64, max_threads_for_indexes, 0, R"( +The maximum number of threads process indices. +)", 0) \ + M(MaxThreads, max_threads, 0, R"( +The maximum number of query processing threads, excluding threads for retrieving data from remote servers (see the ‘max_distributed_connections’ parameter). + +This parameter applies to threads that perform the same stages of the query processing pipeline in parallel. +For example, when reading from a table, if it is possible to evaluate expressions with functions, filter with WHERE and pre-aggregate for GROUP BY in parallel using at least ‘max_threads’ number of threads, then ‘max_threads’ are used. + +For queries that are completed quickly because of a LIMIT, you can set a lower ‘max_threads’. For example, if the necessary number of entries are located in every block and max_threads = 8, then 8 blocks are retrieved, although it would have been enough to read just one. + +The smaller the `max_threads` value, the less memory is consumed. +)", 0) \ + M(Bool, use_concurrency_control, true, R"( +Respect the server's concurrency control (see the `concurrent_threads_soft_limit_num` and `concurrent_threads_soft_limit_ratio_to_cores` global server settings). If disabled, it allows using a larger number of threads even if the server is overloaded (not recommended for normal usage, and needed mostly for tests). +)", 0) \ + M(MaxThreads, max_download_threads, 4, R"( +The maximum number of threads to download data (e.g. for URL engine). +)", 0) \ + M(MaxThreads, max_parsing_threads, 0, R"( +The maximum number of threads to parse data in input formats that support parallel parsing. By default, it is determined automatically +)", 0) \ + M(UInt64, max_download_buffer_size, 10*1024*1024, R"( +The maximal size of buffer for parallel downloading (e.g. for URL engine) per each thread. +)", 0) \ + M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, R"( +The maximum size of the buffer to read from the filesystem. +)", 0) \ + M(UInt64, max_read_buffer_size_local_fs, 128*1024, R"( +The maximum size of the buffer to read from local filesystem. If set to 0 then max_read_buffer_size will be used. +)", 0) \ + M(UInt64, max_read_buffer_size_remote_fs, 0, R"( +The maximum size of the buffer to read from remote filesystem. If set to 0 then max_read_buffer_size will be used. +)", 0) \ + M(UInt64, max_distributed_connections, 1024, R"( +The maximum number of simultaneous connections with remote servers for distributed processing of a single query to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. + +The following parameters are only used when creating Distributed tables (and when launching a server), so there is no reason to change them at runtime. +)", 0) \ + M(UInt64, max_query_size, DBMS_DEFAULT_MAX_QUERY_SIZE, R"( +The maximum number of bytes of a query string parsed by the SQL parser. +Data in the VALUES clause of INSERT queries is processed by a separate stream parser (that consumes O(1) RAM) and not affected by this restriction. + +:::note +`max_query_size` cannot be set within an SQL query (e.g., `SELECT now() SETTINGS max_query_size=10000`) because ClickHouse needs to allocate a buffer to parse the query, and this buffer size is determined by the `max_query_size` setting, which must be configured before the query is executed. +::: +)", 0) \ + M(UInt64, interactive_delay, 100000, R"( +The interval in microseconds for checking whether request execution has been canceled and sending the progress. 
+)", 0) \ + M(Seconds, connect_timeout, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, R"( +Connection timeout if there are no replicas. +)", 0) \ + M(Milliseconds, handshake_timeout_ms, 10000, R"( +Timeout in milliseconds for receiving Hello packet from replicas during handshake. +)", 0) \ + M(Milliseconds, connect_timeout_with_failover_ms, 1000, R"( +The timeout in milliseconds for connecting to a remote server for a Distributed table engine, if the ‘shard’ and ‘replica’ sections are used in the cluster definition. +If unsuccessful, several attempts are made to connect to various replicas. +)", 0) \ + M(Milliseconds, connect_timeout_with_failover_secure_ms, 1000, R"( +Connection timeout for selecting first healthy replica (for secure connections). +)", 0) \ + M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, R"( +Timeout for receiving data from the network, in seconds. If no bytes were received in this interval, the exception is thrown. If you set this setting on the client, the 'send_timeout' for the socket will also be set on the corresponding connection end on the server. +)", 0) \ + M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, R"( +Timeout for sending data to the network, in seconds. If a client needs to send some data but is not able to send any bytes in this interval, the exception is thrown. If you set this setting on the client, the 'receive_timeout' for the socket will also be set on the corresponding connection end on the server. +)", 0) \ + M(Seconds, tcp_keep_alive_timeout, DEFAULT_TCP_KEEP_ALIVE_TIMEOUT /* less than DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC */, R"( +The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes +)", 0) \ + M(Milliseconds, hedged_connection_timeout_ms, 50, R"( +Connection timeout for establishing connection with replica for Hedged requests +)", 0) \ + M(Milliseconds, receive_data_timeout_ms, 2000, R"( +Connection timeout for receiving first packet of data or packet with positive progress from replica +)", 0) \ + M(Bool, use_hedged_requests, true, R"( +Enables hedged requests logic for remote queries. It allows to establish many connections with different replicas for query. +New connection is enabled in case existent connection(s) with replica(s) were not established within `hedged_connection_timeout` +or no data was received within `receive_data_timeout`. Query uses the first connection which send non empty progress packet (or data packet, if `allow_changing_replica_until_first_data_packet`); +other connections are cancelled. Queries with `max_parallel_replicas > 1` are supported. + +Enabled by default. + +Disabled by default on Cloud. +)", 0) \ + M(Bool, allow_changing_replica_until_first_data_packet, false, R"( +If it's enabled, in hedged requests we can start new connection until receiving first data packet even if we have already made some progress +(but progress haven't updated for `receive_data_timeout` timeout), otherwise we disable changing replica after the first time we made progress. +)", 0) \ + M(Milliseconds, queue_max_wait_ms, 0, R"( +The wait time in the request queue, if the number of concurrent requests exceeds the maximum. +)", 0) \ + M(Milliseconds, connection_pool_max_wait_ms, 0, R"( +The wait time in milliseconds for a connection when the connection pool is full. + +Possible values: + +- Positive integer. +- 0 — Infinite timeout. 
+)", 0) \ + M(Milliseconds, replace_running_query_max_wait_ms, 5000, R"( +The wait time for running the query with the same `query_id` to finish, when the [replace_running_query](#replace-running-query) setting is active. + +Possible values: + +- Positive integer. +- 0 — Throwing an exception that does not allow to run a new query if the server already executes a query with the same `query_id`. +)", 0) \ + M(Milliseconds, kafka_max_wait_ms, 5000, R"( +The wait time in milliseconds for reading messages from [Kafka](../../engines/table-engines/integrations/kafka.md/#kafka) before retry. + +Possible values: + +- Positive integer. +- 0 — Infinite timeout. + +See also: + +- [Apache Kafka](https://kafka.apache.org/) +)", 0) \ + M(Milliseconds, rabbitmq_max_wait_ms, 5000, R"( +The wait time for reading from RabbitMQ before retry. +)", 0) \ + M(UInt64, poll_interval, DBMS_DEFAULT_POLL_INTERVAL, R"( +Block at the query wait loop on the server for the specified number of seconds. +)", 0) \ + M(UInt64, idle_connection_timeout, 3600, R"( +Timeout to close idle TCP connections after specified number of seconds. + +Possible values: + +- Positive integer (0 - close immediately, after 0 seconds). +)", 0) \ + M(UInt64, distributed_connections_pool_size, 1024, R"( +The maximum number of simultaneous connections with remote servers for distributed processing of all queries to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. +)", 0) \ + M(UInt64, connections_with_failover_max_tries, 3, R"( +The maximum number of connection attempts with each replica for the Distributed table engine. +)", 0) \ + M(UInt64, s3_strict_upload_part_size, S3::DEFAULT_STRICT_UPLOAD_PART_SIZE, R"( +The exact size of part to upload during multipart upload to S3 (some implementations does not supports variable size parts). +)", 0) \ + M(UInt64, azure_strict_upload_part_size, 0, R"( +The exact size of part to upload during multipart upload to Azure blob storage. +)", 0) \ + M(UInt64, azure_max_blocks_in_multipart_upload, 50000, R"( +Maximum number of blocks in multipart upload for Azure. +)", 0) \ + M(UInt64, s3_min_upload_part_size, S3::DEFAULT_MIN_UPLOAD_PART_SIZE, R"( +The minimum size of part to upload during multipart upload to S3. +)", 0) \ + M(UInt64, s3_max_upload_part_size, S3::DEFAULT_MAX_UPLOAD_PART_SIZE, R"( +The maximum size of part to upload during multipart upload to S3. +)", 0) \ + M(UInt64, azure_min_upload_part_size, 16*1024*1024, R"( +The minimum size of part to upload during multipart upload to Azure blob storage. +)", 0) \ + M(UInt64, azure_max_upload_part_size, 5ull*1024*1024*1024, R"( +The maximum size of part to upload during multipart upload to Azure blob storage. +)", 0) \ + M(UInt64, s3_upload_part_size_multiply_factor, S3::DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR, R"( +Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3. +)", 0) \ + M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, S3::DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD, R"( +Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor. +)", 0) \ + M(UInt64, s3_max_part_number, S3::DEFAULT_MAX_PART_NUMBER, R"( +Maximum part number number for s3 upload part. 
+)", 0) \ + M(UInt64, s3_max_single_operation_copy_size, S3::DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE, R"( +Maximum size for a single copy operation in s3 +)", 0) \ + M(UInt64, azure_upload_part_size_multiply_factor, 2, R"( +Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage. +)", 0) \ + M(UInt64, azure_upload_part_size_multiply_parts_count_threshold, 500, R"( +Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor. +)", 0) \ + M(UInt64, s3_max_inflight_parts_for_one_file, S3::DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE, R"( +The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited. +)", 0) \ + M(UInt64, azure_max_inflight_parts_for_one_file, 20, R"( +The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited. +)", 0) \ + M(UInt64, s3_max_single_part_upload_size, S3::DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE, R"( +The maximum size of object to upload using singlepart upload to S3. +)", 0) \ + M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, R"( +The maximum size of object to upload using singlepart upload to Azure blob storage. +)", 0) \ + M(UInt64, azure_max_single_part_copy_size, 256*1024*1024, R"( +The maximum size of object to copy using single part copy to Azure blob storage. +)", 0) \ + M(UInt64, s3_max_single_read_retries, S3::DEFAULT_MAX_SINGLE_READ_TRIES, R"( +The maximum number of retries during single S3 read. +)", 0) \ + M(UInt64, azure_max_single_read_retries, 4, R"( +The maximum number of retries during single Azure blob storage read. +)", 0) \ + M(UInt64, azure_max_unexpected_write_error_retries, 4, R"( +The maximum number of retries in case of unexpected errors during Azure blob storage write +)", 0) \ + M(UInt64, s3_max_unexpected_write_error_retries, S3::DEFAULT_MAX_UNEXPECTED_WRITE_ERROR_RETRIES, R"( +The maximum number of retries in case of unexpected errors during S3 write. +)", 0) \ + M(UInt64, s3_max_redirects, S3::DEFAULT_MAX_REDIRECTS, R"( +Max number of S3 redirects hops allowed. +)", 0) \ + M(UInt64, s3_max_connections, S3::DEFAULT_MAX_CONNECTIONS, R"( +The maximum number of connections per server. +)", 0) \ + M(UInt64, s3_max_get_rps, 0, R"( +Limit on S3 GET request per second rate before throttling. Zero means unlimited. +)", 0) \ + M(UInt64, s3_max_get_burst, 0, R"( +Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_get_rps` +)", 0) \ + M(UInt64, s3_max_put_rps, 0, R"( +Limit on S3 PUT request per second rate before throttling. Zero means unlimited. +)", 0) \ + M(UInt64, s3_max_put_burst, 0, R"( +Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_put_rps` +)", 0) \ + M(UInt64, s3_list_object_keys_size, S3::DEFAULT_LIST_OBJECT_KEYS_SIZE, R"( +Maximum number of files that could be returned in batch by ListObject request +)", 0) \ + M(Bool, s3_use_adaptive_timeouts, S3::DEFAULT_USE_ADAPTIVE_TIMEOUTS, R"( +When set to `true` than for all s3 requests first two attempts are made with low send and receive timeouts. +When set to `false` than all attempts are made with identical timeouts. 
+)", 0) \ + M(UInt64, azure_list_object_keys_size, 1000, R"( +Maximum number of files that could be returned in batch by ListObject request +)", 0) \ + M(Bool, s3_truncate_on_insert, false, R"( +Enables or disables truncate before inserts in s3 engine tables. If disabled, an exception will be thrown on insert attempts if an S3 object already exists. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query replaces existing content of the file with the new data. +)", 0) \ + M(Bool, azure_truncate_on_insert, false, R"( +Enables or disables truncate before insert in azure engine tables. +)", 0) \ + M(Bool, s3_create_new_file_on_insert, false, R"( +Enables or disables creating a new file on each insert in s3 engine tables. If enabled, on each insert a new S3 object will be created with the key, similar to this pattern: + +initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query creates a new file. +)", 0) \ + M(Bool, s3_skip_empty_files, false, R"( +Enables or disables skipping empty files in [S3](../../engines/table-engines/integrations/s3.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. +)", 0) \ + M(Bool, azure_create_new_file_on_insert, false, R"( +Enables or disables creating a new file on each insert in azure engine tables +)", 0) \ + M(Bool, s3_check_objects_after_upload, false, R"( +Check each uploaded object to s3 with head request to be sure that upload was successful +)", 0) \ + M(Bool, s3_allow_parallel_part_upload, true, R"( +Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage +)", 0) \ + M(Bool, azure_allow_parallel_part_upload, true, R"( +Use multiple threads for azure multipart upload. +)", 0) \ + M(Bool, s3_throw_on_zero_files_match, false, R"( +Throw an error, when ListObjects request cannot match any files +)", 0) \ + M(Bool, hdfs_throw_on_zero_files_match, false, R"( +Throw an error if matched zero files according to glob expansion rules. + +Possible values: +- 1 — `SELECT` throws an exception. +- 0 — `SELECT` returns empty result. +)", 0) \ + M(Bool, azure_throw_on_zero_files_match, false, R"( +Throw an error if matched zero files according to glob expansion rules. + +Possible values: +- 1 — `SELECT` throws an exception. +- 0 — `SELECT` returns empty result. +)", 0) \ + M(Bool, s3_ignore_file_doesnt_exist, false, R"( +Ignore absence of file if it does not exist when reading certain keys. + +Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. +)", 0) \ + M(Bool, hdfs_ignore_file_doesnt_exist, false, R"( +Ignore absence of file if it does not exist when reading certain keys. + +Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. +)", 0) \ + M(Bool, azure_ignore_file_doesnt_exist, false, R"( +Ignore absence of file if it does not exist when reading certain keys. + +Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. 
+)", 0) \ + M(UInt64, azure_sdk_max_retries, 10, R"( +Maximum number of retries in azure sdk +)", 0) \ + M(UInt64, azure_sdk_retry_initial_backoff_ms, 10, R"( +Minimal backoff between retries in azure sdk +)", 0) \ + M(UInt64, azure_sdk_retry_max_backoff_ms, 1000, R"( +Maximal backoff between retries in azure sdk +)", 0) \ + M(Bool, s3_validate_request_settings, true, R"( +Enables s3 request settings validation. + +Possible values: +- 1 — validate settings. +- 0 — do not validate settings. +)", 0) \ + M(Bool, s3_disable_checksum, S3::DEFAULT_DISABLE_CHECKSUM, R"( +Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth. +)", 0) \ + M(UInt64, s3_retry_attempts, S3::DEFAULT_RETRY_ATTEMPTS, R"( +Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries +)", 0) \ + M(UInt64, s3_request_timeout_ms, S3::DEFAULT_REQUEST_TIMEOUT_MS, R"( +Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long. +)", 0) \ + M(UInt64, s3_connect_timeout_ms, S3::DEFAULT_CONNECT_TIMEOUT_MS, R"( +Connection timeout for host from s3 disks. +)", 0) \ + M(Bool, enable_s3_requests_logging, false, R"( +Enable very explicit logging of S3 requests. Makes sense for debug only. +)", 0) \ + M(String, s3queue_default_zookeeper_path, "/clickhouse/s3queue/", R"( +Default zookeeper path prefix for S3Queue engine +)", 0) \ + M(Bool, s3queue_enable_logging_to_s3queue_log, false, R"( +Enable writing to system.s3queue_log. The value can be overwritten per table with table settings +)", 0) \ + M(UInt64, hdfs_replication, 0, R"( +The actual number of replications can be specified when the hdfs file is created. +)", 0) \ + M(Bool, hdfs_truncate_on_insert, false, R"( +Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query replaces existing content of the file with the new data. +)", 0) \ + M(Bool, hdfs_create_new_file_on_insert, false, R"( +Enables or disables creating a new file on each insert in HDFS engine tables. If enabled, on each insert a new HDFS file will be created with the name, similar to this pattern: + +initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query creates a new file. +)", 0) \ + M(Bool, hdfs_skip_empty_files, false, R"( +Enables or disables skipping empty files in [HDFS](../../engines/table-engines/integrations/hdfs.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. +)", 0) \ + M(Bool, azure_skip_empty_files, false, R"( +Enables or disables skipping empty files in S3 engine. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. +)", 0) \ + M(UInt64, hsts_max_age, 0, R"( +Expired time for HSTS. 0 means disable HSTS. 
+)", 0) \ + M(Bool, extremes, false, R"( +Whether to count extreme values (the minimums and maximums in columns of a query result). Accepts 0 or 1. By default, 0 (disabled). +For more information, see the section “Extreme values”. +)", IMPORTANT) \ + M(Bool, use_uncompressed_cache, false, R"( +Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled). +Using the uncompressed cache (only for tables in the MergeTree family) can significantly reduce latency and increase throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. + +For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically to save space for truly small queries. This means that you can keep the ‘use_uncompressed_cache’ setting always set to 1. +)", 0) \ + M(Bool, replace_running_query, false, R"( +When using the HTTP interface, the ‘query_id’ parameter can be passed. This is any string that serves as the query identifier. +If a query from the same user with the same ‘query_id’ already exists at this time, the behaviour depends on the ‘replace_running_query’ parameter. + +`0` (default) – Throw an exception (do not allow the query to run if a query with the same ‘query_id’ is already running). + +`1` – Cancel the old query and start running the new one. + +Set this parameter to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn’t finished yet, it should be cancelled. +)", 0) \ + M(UInt64, max_remote_read_network_bandwidth, 0, R"( +The maximum speed of data exchange over the network in bytes per second for read. +)", 0) \ + M(UInt64, max_remote_write_network_bandwidth, 0, R"( +The maximum speed of data exchange over the network in bytes per second for write. +)", 0) \ + M(UInt64, max_local_read_bandwidth, 0, R"( +The maximum speed of local reads in bytes per second. +)", 0) \ + M(UInt64, max_local_write_bandwidth, 0, R"( +The maximum speed of local writes in bytes per second. +)", 0) \ + M(Bool, stream_like_engine_allow_direct_select, false, R"( +Allow direct SELECT query for Kafka, RabbitMQ, FileLog, Redis Streams, and NATS engines. In case there are attached materialized views, SELECT query is not allowed even if this setting is enabled. +)", 0) \ + M(String, stream_like_engine_insert_queue, "", R"( +When stream-like engine reads from multiple queues, the user will need to select one queue to insert into when writing. Used by Redis Streams and NATS. +)", 0) \ + M(Bool, dictionary_validate_primary_key_type, false, R"( +Validate primary key type for dictionaries. By default id type for simple layouts will be implicitly converted to UInt64. +)", 0) \ + M(Bool, distributed_insert_skip_read_only_replicas, false, R"( +Enables skipping read-only replicas for INSERT queries into Distributed. + +Possible values: + +- 0 — INSERT was as usual, if it will go to read-only replica it will fail +- 1 — Initiator will skip read-only replicas before sending data to shards. 
+)", 0) \ + M(Bool, distributed_foreground_insert, false, R"( +Enables or disables synchronous data insertion into a [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table. + +By default, when inserting data into a `Distributed` table, the ClickHouse server sends data to cluster nodes in background mode. When `distributed_foreground_insert=1`, the data is processed synchronously, and the `INSERT` operation succeeds only after all the data is saved on all shards (at least one replica for each shard if `internal_replication` is true). + +Possible values: + +- 0 — Data is inserted in background mode. +- 1 — Data is inserted in synchronous mode. + +Cloud default value: `1`. + +**See Also** + +- [Distributed Table Engine](../../engines/table-engines/special/distributed.md/#distributed) +- [Managing Distributed Tables](../../sql-reference/statements/system.md/#query-language-system-distributed) +)", 0) ALIAS(insert_distributed_sync) \ + M(UInt64, distributed_background_insert_timeout, 0, R"( +Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout. +)", 0) ALIAS(insert_distributed_timeout) \ + M(Milliseconds, distributed_background_insert_sleep_time_ms, 100, R"( +Base interval for the [Distributed](../../engines/table-engines/special/distributed.md) table engine to send data. The actual interval grows exponentially in the event of errors. + +Possible values: + +- A positive integer number of milliseconds. +)", 0) ALIAS(distributed_directory_monitor_sleep_time_ms) \ + M(Milliseconds, distributed_background_insert_max_sleep_time_ms, 30000, R"( +Maximum interval for the [Distributed](../../engines/table-engines/special/distributed.md) table engine to send data. Limits exponential growth of the interval set in the [distributed_background_insert_sleep_time_ms](#distributed_background_insert_sleep_time_ms) setting. + +Possible values: + +- A positive integer number of milliseconds. +)", 0) ALIAS(distributed_directory_monitor_max_sleep_time_ms) \ \ - M(Bool, distributed_background_insert_batch, false, "Should background INSERTs into Distributed be batched into bigger blocks.", 0) ALIAS(distributed_directory_monitor_batch_inserts) \ - M(Bool, distributed_background_insert_split_batch_on_failure, false, "Should batches of the background INSERT into Distributed be split into smaller batches in case of failures.", 0) ALIAS(distributed_directory_monitor_split_batch_on_failure) \ + M(Bool, distributed_background_insert_batch, false, R"( +Enables/disables inserted data sending in batches. + +When batch sending is enabled, the [Distributed](../../engines/table-engines/special/distributed.md) table engine tries to send multiple files of inserted data in one operation instead of sending them separately. Batch sending improves cluster performance by better-utilizing server and network resources. + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. +)", 0) ALIAS(distributed_directory_monitor_batch_inserts) \ + M(Bool, distributed_background_insert_split_batch_on_failure, false, R"( +Enables/disables splitting batches on failures. + +Sometimes sending particular batch to the remote shard may fail, because of some complex pipeline after (i.e. `MATERIALIZED VIEW` with `GROUP BY`) due to `Memory limit exceeded` or similar errors. In this case, retrying will not help (and this will stuck distributed sends for the table) but sending files from that batch one by one may succeed INSERT. 
+ +So installing this setting to `1` will disable batching for such batches (i.e. temporary disables `distributed_background_insert_batch` for failed batches). + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. + +:::note +This setting also affects broken batches (that may appears because of abnormal server (machine) termination and no `fsync_after_insert`/`fsync_directories` for [Distributed](../../engines/table-engines/special/distributed.md) table engine). +::: + +:::note +You should not rely on automatic batch splitting, since this may hurt performance. +::: +)", 0) ALIAS(distributed_directory_monitor_split_batch_on_failure) \ \ - M(Bool, optimize_move_to_prewhere, true, "Allows disabling WHERE to PREWHERE optimization in SELECT queries from MergeTree.", 0) \ - M(Bool, optimize_move_to_prewhere_if_final, false, "If the query has `FINAL`, the optimization `move_to_prewhere` is not always correct and it is enabled only if both settings `optimize_move_to_prewhere` and `optimize_move_to_prewhere_if_final` are turned on", 0) \ - M(Bool, move_all_conditions_to_prewhere, true, "Move all viable conditions from WHERE to PREWHERE", 0) \ - M(Bool, enable_multiple_prewhere_read_steps, true, "Move more conditions from WHERE to PREWHERE and do reads from disk and filtering in multiple steps if there are multiple conditions combined with AND", 0) \ - M(Bool, move_primary_key_columns_to_end_of_prewhere, true, "Move PREWHERE conditions containing primary key columns to the end of AND chain. It is likely that these conditions are taken into account during primary key analysis and thus will not contribute a lot to PREWHERE filtering.", 0) \ + M(Bool, optimize_move_to_prewhere, true, R"( +Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries. + +Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. + +Possible values: + +- 0 — Automatic `PREWHERE` optimization is disabled. +- 1 — Automatic `PREWHERE` optimization is enabled. +)", 0) \ + M(Bool, optimize_move_to_prewhere_if_final, false, R"( +Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries with [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. + +Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. + +Possible values: + +- 0 — Automatic `PREWHERE` optimization in `SELECT` queries with `FINAL` modifier is disabled. +- 1 — Automatic `PREWHERE` optimization in `SELECT` queries with `FINAL` modifier is enabled. + +**See Also** + +- [optimize_move_to_prewhere](#optimize_move_to_prewhere) setting +)", 0) \ + M(Bool, move_all_conditions_to_prewhere, true, R"( +Move all viable conditions from WHERE to PREWHERE +)", 0) \ + M(Bool, enable_multiple_prewhere_read_steps, true, R"( +Move more conditions from WHERE to PREWHERE and do reads from disk and filtering in multiple steps if there are multiple conditions combined with AND +)", 0) \ + M(Bool, move_primary_key_columns_to_end_of_prewhere, true, R"( +Move PREWHERE conditions containing primary key columns to the end of AND chain. It is likely that these conditions are taken into account during primary key analysis and thus will not contribute a lot to PREWHERE filtering. +)", 0) \ \ - M(UInt64, alter_sync, 1, "Wait for actions to manipulate the partitions. 
0 - do not wait, 1 - wait for execution only of itself, 2 - wait for everyone.", 0) ALIAS(replication_alter_partitions_sync) \ - M(Int64, replication_wait_for_inactive_replica_timeout, 120, "Wait for inactive replica to execute ALTER/OPTIMIZE. Time in seconds, 0 - do not wait, negative - wait for unlimited time.", 0) \ - M(Bool, alter_move_to_space_execute_async, false, "Execute ALTER TABLE MOVE ... TO [DISK|VOLUME] asynchronously", 0) \ + M(UInt64, alter_sync, 1, R"( +Allows to set up waiting for actions to be executed on replicas by [ALTER](../../sql-reference/statements/alter/index.md), [OPTIMIZE](../../sql-reference/statements/optimize.md) or [TRUNCATE](../../sql-reference/statements/truncate.md) queries. + +Possible values: + +- 0 — Do not wait. +- 1 — Wait for own execution. +- 2 — Wait for everyone. + +Cloud default value: `0`. + +:::note +`alter_sync` is applicable to `Replicated` tables only, it does nothing to alters of not `Replicated` tables. +::: +)", 0) ALIAS(replication_alter_partitions_sync) \ + M(Int64, replication_wait_for_inactive_replica_timeout, 120, R"( +Specifies how long (in seconds) to wait for inactive replicas to execute [ALTER](../../sql-reference/statements/alter/index.md), [OPTIMIZE](../../sql-reference/statements/optimize.md) or [TRUNCATE](../../sql-reference/statements/truncate.md) queries. + +Possible values: + +- 0 — Do not wait. +- Negative integer — Wait for unlimited time. +- Positive integer — The number of seconds to wait. +)", 0) \ + M(Bool, alter_move_to_space_execute_async, false, R"( +Execute ALTER TABLE MOVE ... TO [DISK|VOLUME] asynchronously +)", 0) \ \ - M(LoadBalancing, load_balancing, LoadBalancing::RANDOM, "Which replicas (among healthy replicas) to preferably send a query to (on the first attempt) for distributed processing.", 0) \ - M(UInt64, load_balancing_first_offset, 0, "Which replica to preferably send a query when FIRST_OR_RANDOM load balancing strategy is used.", 0) \ + M(LoadBalancing, load_balancing, LoadBalancing::RANDOM, R"( +Specifies the algorithm of replicas selection that is used for distributed query processing. + +ClickHouse supports the following algorithms of choosing replicas: + +- [Random](#load_balancing-random) (by default) +- [Nearest hostname](#load_balancing-nearest_hostname) +- [Hostname levenshtein distance](#load_balancing-hostname_levenshtein_distance) +- [In order](#load_balancing-in_order) +- [First or random](#load_balancing-first_or_random) +- [Round robin](#load_balancing-round_robin) + +See also: + +- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors) + +### Random (by Default) {#load_balancing-random} + +``` sql +load_balancing = random +``` + +The number of errors is counted for each replica. The query is sent to the replica with the fewest errors, and if there are several of these, to anyone of them. +Disadvantages: Server proximity is not accounted for; if the replicas have different data, you will also get different data. + +### Nearest Hostname {#load_balancing-nearest_hostname} + +``` sql +load_balancing = nearest_hostname +``` + +The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. 
If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a hostname that is most similar to the server’s hostname in the config file (for the number of different characters in identical positions, up to the minimum length of both hostnames). + +For instance, example01-01-1 and example01-01-2 are different in one position, while example01-01-1 and example01-02-2 differ in two places. +This method might seem primitive, but it does not require external data about network topology, and it does not compare IP addresses, which would be complicated for our IPv6 addresses. + +Thus, if there are equivalent replicas, the closest one by name is preferred. +We can also assume that when sending a query to the same server, in the absence of failures, a distributed query will also go to the same servers. So even if different data is placed on the replicas, the query will return mostly the same results. + +### Hostname levenshtein distance {#load_balancing-hostname_levenshtein_distance} + +``` sql +load_balancing = hostname_levenshtein_distance +``` + +Just like `nearest_hostname`, but it compares hostname in a [levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) manner. For example: + +``` text +example-clickhouse-0-0 ample-clickhouse-0-0 +1 + +example-clickhouse-0-0 example-clickhouse-1-10 +2 + +example-clickhouse-0-0 example-clickhouse-12-0 +3 +``` + +### In Order {#load_balancing-in_order} + +``` sql +load_balancing = in_order +``` + +Replicas with the same number of errors are accessed in the same order as they are specified in the configuration. +This method is appropriate when you know exactly which replica is preferable. + +### First or Random {#load_balancing-first_or_random} + +``` sql +load_balancing = first_or_random +``` + +This algorithm chooses the first replica in the set or a random replica if the first is unavailable. It’s effective in cross-replication topology setups, but useless in other configurations. + +The `first_or_random` algorithm solves the problem of the `in_order` algorithm. With `in_order`, if one replica goes down, the next one gets a double load while the remaining replicas handle the usual amount of traffic. When using the `first_or_random` algorithm, the load is evenly distributed among replicas that are still available. + +It's possible to explicitly define what the first replica is by using the setting `load_balancing_first_offset`. This gives more control to rebalance query workloads among replicas. + +### Round Robin {#load_balancing-round_robin} + +``` sql +load_balancing = round_robin +``` + +This algorithm uses a round-robin policy across replicas with the same number of errors (only the queries with `round_robin` policy is accounted). +)", 0) \ + M(UInt64, load_balancing_first_offset, 0, R"( +Which replica to preferably send a query when FIRST_OR_RANDOM load balancing strategy is used. +)", 0) \ \ - M(TotalsMode, totals_mode, TotalsMode::AFTER_HAVING_EXCLUSIVE, "How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = ‘any’ are present.", IMPORTANT) \ - M(Float, totals_auto_threshold, 0.5, "The threshold for totals_mode = 'auto'.", 0) \ + M(TotalsMode, totals_mode, TotalsMode::AFTER_HAVING_EXCLUSIVE, R"( +How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = ‘any’ are present. +See the section “WITH TOTALS modifier”. 
+)", IMPORTANT) \ + M(Float, totals_auto_threshold, 0.5, R"( +The threshold for `totals_mode = 'auto'`. +See the section “WITH TOTALS modifier”. +)", 0) \ \ - M(Bool, allow_suspicious_low_cardinality_types, false, "In CREATE TABLE statement allows specifying LowCardinality modifier for types of small fixed size (8 or less). Enabling this may increase merge times and memory consumption.", 0) \ - M(Bool, allow_suspicious_fixed_string_types, false, "In CREATE TABLE statement allows creating columns of type FixedString(n) with n > 256. FixedString with length >= 256 is suspicious and most likely indicates a misuse", 0) \ - M(Bool, allow_suspicious_indices, false, "Reject primary/secondary indexes and sorting keys with identical expressions", 0) \ - M(Bool, allow_suspicious_ttl_expressions, false, "Reject TTL expressions that don't depend on any of table's columns. It indicates a user error most of the time.", 0) \ - M(Bool, allow_suspicious_variant_types, false, "In CREATE TABLE statement allows specifying Variant type with similar variant types (for example, with different numeric or date types). Enabling this setting may introduce some ambiguity when working with values with similar types.", 0) \ - M(Bool, allow_suspicious_primary_key, false, "Forbid suspicious PRIMARY KEY/ORDER BY for MergeTree (i.e. SimpleAggregateFunction)", 0) \ - M(Bool, allow_suspicious_types_in_group_by, false, "Allow suspicious types like Variant/Dynamic in GROUP BY clause", 0) \ - M(Bool, allow_suspicious_types_in_order_by, false, "Allow suspicious types like Variant/Dynamic in ORDER BY clause", 0) \ - M(Bool, compile_expressions, false, "Compile some scalar functions and operators to native code. Due to a bug in the LLVM compiler infrastructure, on AArch64 machines, it is known to lead to a nullptr dereference and, consequently, server crash. Do not enable this setting.", 0) \ - M(UInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled", 0) \ - M(Bool, compile_aggregate_expressions, true, "Compile aggregate functions to native code.", 0) \ - M(UInt64, min_count_to_compile_aggregate_expression, 3, "The number of identical aggregate expressions before they are JIT-compiled", 0) \ - M(Bool, compile_sort_description, true, "Compile sort description to native code.", 0) \ - M(UInt64, min_count_to_compile_sort_description, 3, "The number of identical sort descriptions before they are JIT-compiled", 0) \ - M(UInt64, group_by_two_level_threshold, 100000, "From what number of keys, a two-level aggregation starts. 0 - the threshold is not set.", 0) \ - M(UInt64, group_by_two_level_threshold_bytes, 50000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \ - M(Bool, distributed_aggregation_memory_efficient, true, "Is the memory-saving mode of distributed aggregation enabled.", 0) \ - M(UInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 
0 means - same as 'max_threads'.", 0) \ - M(Bool, enable_memory_bound_merging_of_aggregation_results, true, "Enable memory bound merging strategy for aggregation.", 0) \ - M(Bool, enable_positional_arguments, true, "Enable positional arguments in ORDER BY, GROUP BY and LIMIT BY", 0) \ - M(Bool, enable_extended_results_for_datetime_functions, false, "Enable date functions like toLastDayOfMonth return Date32 results (instead of Date results) for Date32/DateTime64 arguments.", 0) \ - M(Bool, allow_nonconst_timezone_arguments, false, "Allow non-const timezone arguments in certain time-related functions like toTimeZone(), fromUnixTimestamp*(), snowflakeToDateTime*()", 0) \ - M(Bool, function_locate_has_mysql_compatible_argument_order, true, "Function locate() has arguments (needle, haystack[, start_pos]) like in MySQL instead of (haystack, needle[, start_pos]) like function position()", 0) \ + M(Bool, allow_suspicious_low_cardinality_types, false, R"( +Allows or restricts using [LowCardinality](../../sql-reference/data-types/lowcardinality.md) with data types with fixed size of 8 bytes or less: numeric data types and `FixedString(8_bytes_or_less)`. + +For small fixed values using of `LowCardinality` is usually inefficient, because ClickHouse stores a numeric index for each row. As a result: + +- Disk space usage can rise. +- RAM consumption can be higher, depending on a dictionary size. +- Some functions can work slower due to extra coding/encoding operations. + +Merge times in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine tables can grow due to all the reasons described above. + +Possible values: + +- 1 — Usage of `LowCardinality` is not restricted. +- 0 — Usage of `LowCardinality` is restricted. +)", 0) \ + M(Bool, allow_suspicious_fixed_string_types, false, R"( +In CREATE TABLE statement allows creating columns of type FixedString(n) with n > 256. FixedString with length >= 256 is suspicious and most likely indicates a misuse +)", 0) \ + M(Bool, allow_suspicious_indices, false, R"( +Reject primary/secondary indexes and sorting keys with identical expressions +)", 0) \ + M(Bool, allow_suspicious_ttl_expressions, false, R"( +Reject TTL expressions that don't depend on any of table's columns. It indicates a user error most of the time. +)", 0) \ + M(Bool, allow_suspicious_variant_types, false, R"( +In CREATE TABLE statement allows specifying Variant type with similar variant types (for example, with different numeric or date types). Enabling this setting may introduce some ambiguity when working with values with similar types. +)", 0) \ + M(Bool, allow_suspicious_primary_key, false, R"( +Allow suspicious `PRIMARY KEY`/`ORDER BY` for MergeTree (i.e. SimpleAggregateFunction). +)", 0) \ + M(Bool, allow_suspicious_types_in_group_by, false, R"( +Allows or restricts using [Variant](../../sql-reference/data-types/variant.md) and [Dynamic](../../sql-reference/data-types/dynamic.md) types in GROUP BY keys. +)", 0) \ + M(Bool, allow_suspicious_types_in_order_by, false, R"( +Allows or restricts using [Variant](../../sql-reference/data-types/variant.md) and [Dynamic](../../sql-reference/data-types/dynamic.md) types in ORDER BY keys. +)", 0) \ + M(Bool, compile_expressions, false, R"( +Compile some scalar functions and operators to native code. Due to a bug in the LLVM compiler infrastructure, on AArch64 machines, it is known to lead to a nullptr dereference and, consequently, server crash. Do not enable this setting. 
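For `allow_suspicious_low_cardinality_types` documented above, a minimal sketch; the table name is hypothetical:

```sql
-- Sketch only: lc_demo is a placeholder name.
-- LowCardinality(UInt16) wraps a fixed-size type of 8 bytes or less, so it is rejected by default.
SET allow_suspicious_low_cardinality_types = 1;
CREATE TABLE lc_demo (id LowCardinality(UInt16)) ENGINE = MergeTree ORDER BY id;
```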
+)", 0) \ + M(UInt64, min_count_to_compile_expression, 3, R"( +Minimum count of executing same expression before it is get compiled. +)", 0) \ + M(Bool, compile_aggregate_expressions, true, R"( +Enables or disables JIT-compilation of aggregate functions to native code. Enabling this setting can improve the performance. + +Possible values: + +- 0 — Aggregation is done without JIT compilation. +- 1 — Aggregation is done using JIT compilation. + +**See Also** + +- [min_count_to_compile_aggregate_expression](#min_count_to_compile_aggregate_expression) +)", 0) \ + M(UInt64, min_count_to_compile_aggregate_expression, 3, R"( +The minimum number of identical aggregate expressions to start JIT-compilation. Works only if the [compile_aggregate_expressions](#compile_aggregate_expressions) setting is enabled. + +Possible values: + +- Positive integer. +- 0 — Identical aggregate expressions are always JIT-compiled. +)", 0) \ + M(Bool, compile_sort_description, true, R"( +Compile sort description to native code. +)", 0) \ + M(UInt64, min_count_to_compile_sort_description, 3, R"( +The number of identical sort descriptions before they are JIT-compiled +)", 0) \ + M(UInt64, group_by_two_level_threshold, 100000, R"( +From what number of keys, a two-level aggregation starts. 0 - the threshold is not set. +)", 0) \ + M(UInt64, group_by_two_level_threshold_bytes, 50000000, R"( +From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered. +)", 0) \ + M(Bool, distributed_aggregation_memory_efficient, true, R"( +Is the memory-saving mode of distributed aggregation enabled. +)", 0) \ + M(UInt64, aggregation_memory_efficient_merge_threads, 0, R"( +Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 0 means - same as 'max_threads'. +)", 0) \ + M(Bool, enable_memory_bound_merging_of_aggregation_results, true, R"( +Enable memory bound merging strategy for aggregation. +)", 0) \ + M(Bool, enable_positional_arguments, true, R"( +Enables or disables supporting positional arguments for [GROUP BY](../../sql-reference/statements/select/group-by.md), [LIMIT BY](../../sql-reference/statements/select/limit-by.md), [ORDER BY](../../sql-reference/statements/select/order-by.md) statements. + +Possible values: + +- 0 — Positional arguments aren't supported. +- 1 — Positional arguments are supported: column numbers can use instead of column names. 
+ +**Example** + +Query: + +```sql +CREATE TABLE positional_arguments(one Int, two Int, three Int) ENGINE=Memory(); + +INSERT INTO positional_arguments VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20); + +SELECT * FROM positional_arguments ORDER BY 2,3; +``` + +Result: + +```text +┌─one─┬─two─┬─three─┐ +│ 30 │ 10 │ 20 │ +│ 20 │ 20 │ 10 │ +│ 10 │ 20 │ 30 │ +└─────┴─────┴───────┘ +``` +)", 0) \ + M(Bool, enable_extended_results_for_datetime_functions, false, R"( +Enables or disables returning results of type: +- `Date32` with extended range (compared to type `Date`) for functions [toStartOfYear](../../sql-reference/functions/date-time-functions.md#tostartofyear), [toStartOfISOYear](../../sql-reference/functions/date-time-functions.md#tostartofisoyear), [toStartOfQuarter](../../sql-reference/functions/date-time-functions.md#tostartofquarter), [toStartOfMonth](../../sql-reference/functions/date-time-functions.md#tostartofmonth), [toLastDayOfMonth](../../sql-reference/functions/date-time-functions.md#tolastdayofmonth), [toStartOfWeek](../../sql-reference/functions/date-time-functions.md#tostartofweek), [toLastDayOfWeek](../../sql-reference/functions/date-time-functions.md#tolastdayofweek) and [toMonday](../../sql-reference/functions/date-time-functions.md#tomonday). +- `DateTime64` with extended range (compared to type `DateTime`) for functions [toStartOfDay](../../sql-reference/functions/date-time-functions.md#tostartofday), [toStartOfHour](../../sql-reference/functions/date-time-functions.md#tostartofhour), [toStartOfMinute](../../sql-reference/functions/date-time-functions.md#tostartofminute), [toStartOfFiveMinutes](../../sql-reference/functions/date-time-functions.md#tostartoffiveminutes), [toStartOfTenMinutes](../../sql-reference/functions/date-time-functions.md#tostartoftenminutes), [toStartOfFifteenMinutes](../../sql-reference/functions/date-time-functions.md#tostartoffifteenminutes) and [timeSlot](../../sql-reference/functions/date-time-functions.md#timeslot). + +Possible values: + +- 0 — Functions return `Date` or `DateTime` for all types of arguments. +- 1 — Functions return `Date32` or `DateTime64` for `Date32` or `DateTime64` arguments and `Date` or `DateTime` otherwise. +)", 0) \ + M(Bool, allow_nonconst_timezone_arguments, false, R"( +Allow non-const timezone arguments in certain time-related functions like toTimeZone(), fromUnixTimestamp*(), snowflakeToDateTime*() +)", 0) \ + M(Bool, function_locate_has_mysql_compatible_argument_order, true, R"( +Controls the order of arguments in function [locate](../../sql-reference/functions/string-search-functions.md#locate). + +Possible values: + +- 0 — Function `locate` accepts arguments `(haystack, needle[, start_pos])`. +- 1 — Function `locate` accepts arguments `(needle, haystack, [, start_pos])` (MySQL-compatible behavior) +)", 0) \ \ - M(Bool, group_by_use_nulls, false, "Treat columns mentioned in ROLLUP, CUBE or GROUPING SETS as Nullable", 0) \ + M(Bool, group_by_use_nulls, false, R"( +Changes the way the [GROUP BY clause](/docs/en/sql-reference/statements/select/group-by.md) treats the types of aggregation keys. +When the `ROLLUP`, `CUBE`, or `GROUPING SETS` specifiers are used, some aggregation keys may not be used to produce some result rows. +Columns for these keys are filled with either default value or `NULL` in corresponding rows depending on this setting. + +Possible values: + +- 0 — The default value for the aggregation key type is used to produce missing values. 
+- 1 — ClickHouse executes `GROUP BY` the same way as the SQL standard says. The types of aggregation keys are converted to [Nullable](/docs/en/sql-reference/data-types/nullable.md/#data_type-nullable). Columns for corresponding aggregation keys are filled with [NULL](/docs/en/sql-reference/syntax.md) for rows that didn't use it. + +See also: + +- [GROUP BY clause](/docs/en/sql-reference/statements/select/group-by.md) +)", 0) \ \ - M(Bool, skip_unavailable_shards, false, "If true, ClickHouse silently skips unavailable shards. Shard is marked as unavailable when: 1) The shard cannot be reached due to a connection failure. 2) Shard is unresolvable through DNS. 3) Table does not exist on the shard.", 0) \ + M(Bool, skip_unavailable_shards, false, R"( +Enables or disables silently skipping of unavailable shards. + +Shard is considered unavailable if all its replicas are unavailable. A replica is unavailable in the following cases: + +- ClickHouse can’t connect to replica for any reason. + + When connecting to a replica, ClickHouse performs several attempts. If all these attempts fail, the replica is considered unavailable. + +- Replica can’t be resolved through DNS. + + If replica’s hostname can’t be resolved through DNS, it can indicate the following situations: + + - Replica’s host has no DNS record. It can occur in systems with dynamic DNS, for example, [Kubernetes](https://kubernetes.io), where nodes can be unresolvable during downtime, and this is not an error. + + - Configuration error. ClickHouse configuration file contains a wrong hostname. + +Possible values: + +- 1 — skipping enabled. + + If a shard is unavailable, ClickHouse returns a result based on partial data and does not report node availability issues. + +- 0 — skipping disabled. + + If a shard is unavailable, ClickHouse throws an exception. +)", 0) \ \ - M(UInt64, parallel_distributed_insert_select, 0, "Process distributed INSERT SELECT query in the same cluster on local tables on every shard; if set to 1 - SELECT is executed on each shard; if set to 2 - SELECT and INSERT are executed on each shard", 0) \ - M(UInt64, distributed_group_by_no_merge, 0, "If 1, Do not merge aggregation states from different servers for distributed queries (shards will process query up to the Complete stage, initiator just proxies the data from the shards). If 2 the initiator will apply ORDER BY and LIMIT stages (it is not in case when shard process query up to the Complete stage)", 0) \ - M(UInt64, distributed_push_down_limit, 1, "If 1, LIMIT will be applied on each shard separately. Usually you don't need to use it, since this will be done automatically if it is possible, i.e. for simple query SELECT FROM LIMIT.", 0) \ - M(Bool, optimize_distributed_group_by_sharding_key, true, "Optimize GROUP BY sharding_key queries (by avoiding costly aggregation on the initiator server).", 0) \ - M(UInt64, optimize_skip_unused_shards_limit, 1000, "Limit for number of sharding key values, turns off optimize_skip_unused_shards if the limit is reached", 0) \ - M(Bool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. 
Optimization to skip unused shards if SELECT query filters by sharding_key.", 0) \ - M(Bool, optimize_skip_unused_shards_rewrite_in, true, "Rewrite IN in query for remote shards to exclude values that does not belong to the shard (requires optimize_skip_unused_shards)", 0) \ - M(Bool, allow_nondeterministic_optimize_skip_unused_shards, false, "Allow non-deterministic functions (includes dictGet) in sharding_key for optimize_skip_unused_shards", 0) \ - M(UInt64, force_optimize_skip_unused_shards, 0, "Throw an exception if unused shards cannot be skipped (1 - throw only if the table has the sharding key, 2 - always throw.", 0) \ - M(UInt64, optimize_skip_unused_shards_nesting, 0, "Same as optimize_skip_unused_shards, but accept nesting level until which it will work.", 0) \ - M(UInt64, force_optimize_skip_unused_shards_nesting, 0, "Same as force_optimize_skip_unused_shards, but accept nesting level until which it will work.", 0) \ + M(UInt64, parallel_distributed_insert_select, 0, R"( +Enables parallel distributed `INSERT ... SELECT` query. + +If we execute `INSERT INTO distributed_table_a SELECT ... FROM distributed_table_b` queries and both tables use the same cluster, and both tables are either [replicated](../../engines/table-engines/mergetree-family/replication.md) or non-replicated, then this query is processed locally on every shard. + +Possible values: + +- 0 — Disabled. +- 1 — `SELECT` will be executed on each shard from the underlying table of the distributed engine. +- 2 — `SELECT` and `INSERT` will be executed on each shard from/to the underlying table of the distributed engine. +)", 0) \ + M(UInt64, distributed_group_by_no_merge, 0, R"( +Do not merge aggregation states from different servers for distributed query processing, you can use this in case it is for certain that there are different keys on different shards + +Possible values: + +- `0` — Disabled (final query processing is done on the initiator node). +- `1` - Do not merge aggregation states from different servers for distributed query processing (query completely processed on the shard, initiator only proxy the data), can be used in case it is for certain that there are different keys on different shards. +- `2` - Same as `1` but applies `ORDER BY` and `LIMIT` (it is not possible when the query processed completely on the remote node, like for `distributed_group_by_no_merge=1`) on the initiator (can be used for queries with `ORDER BY` and/or `LIMIT`). + +**Example** + +```sql +SELECT * +FROM remote('127.0.0.{2,3}', system.one) +GROUP BY dummy +LIMIT 1 +SETTINGS distributed_group_by_no_merge = 1 +FORMAT PrettyCompactMonoBlock + +┌─dummy─┐ +│ 0 │ +│ 0 │ +└───────┘ +``` + +```sql +SELECT * +FROM remote('127.0.0.{2,3}', system.one) +GROUP BY dummy +LIMIT 1 +SETTINGS distributed_group_by_no_merge = 2 +FORMAT PrettyCompactMonoBlock + +┌─dummy─┐ +│ 0 │ +└───────┘ +``` +)", 0) \ + M(UInt64, distributed_push_down_limit, 1, R"( +Enables or disables [LIMIT](#limit) applying on each shard separately. + +This will allow to avoid: +- Sending extra rows over network; +- Processing rows behind the limit on the initiator. + +Starting from 21.9 version you cannot get inaccurate results anymore, since `distributed_push_down_limit` changes query execution only if at least one of the conditions met: +- [distributed_group_by_no_merge](#distributed-group-by-no-merge) > 0. +- Query **does not have** `GROUP BY`/`DISTINCT`/`LIMIT BY`, but it has `ORDER BY`/`LIMIT`. 
+- Query **has** `GROUP BY`/`DISTINCT`/`LIMIT BY` with `ORDER BY`/`LIMIT` and: + - [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled. + - [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) is enabled. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +See also: + +- [distributed_group_by_no_merge](#distributed-group-by-no-merge) +- [optimize_skip_unused_shards](#optimize-skip-unused-shards) +- [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) +)", 0) \ + M(Bool, optimize_distributed_group_by_sharding_key, true, R"( +Optimize `GROUP BY sharding_key` queries, by avoiding costly aggregation on the initiator server (which will reduce memory usage for the query on the initiator server). + +The following types of queries are supported (and all combinations of them): + +- `SELECT DISTINCT [..., ]sharding_key[, ...] FROM dist` +- `SELECT ... FROM dist GROUP BY sharding_key[, ...]` +- `SELECT ... FROM dist GROUP BY sharding_key[, ...] ORDER BY x` +- `SELECT ... FROM dist GROUP BY sharding_key[, ...] LIMIT 1` +- `SELECT ... FROM dist GROUP BY sharding_key[, ...] LIMIT 1 BY x` + +The following types of queries are not supported (support for some of them may be added later): + +- `SELECT ... GROUP BY sharding_key[, ...] WITH TOTALS` +- `SELECT ... GROUP BY sharding_key[, ...] WITH ROLLUP` +- `SELECT ... GROUP BY sharding_key[, ...] WITH CUBE` +- `SELECT ... GROUP BY sharding_key[, ...] SETTINGS extremes=1` + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +See also: + +- [distributed_group_by_no_merge](#distributed-group-by-no-merge) +- [distributed_push_down_limit](#distributed-push-down-limit) +- [optimize_skip_unused_shards](#optimize-skip-unused-shards) + +:::note +Right now it requires `optimize_skip_unused_shards` (the reason behind this is that one day it may be enabled by default, and it will work correctly only if data was inserted via Distributed table, i.e. data is distributed according to sharding_key). +::: +)", 0) \ + M(UInt64, optimize_skip_unused_shards_limit, 1000, R"( +Limit for number of sharding key values, turns off `optimize_skip_unused_shards` if the limit is reached. + +Too many values may require significant amount for processing, while the benefit is doubtful, since if you have huge number of values in `IN (...)`, then most likely the query will be sent to all shards anyway. +)", 0) \ + M(Bool, optimize_skip_unused_shards, false, R"( +Enables or disables skipping of unused shards for [SELECT](../../sql-reference/statements/select/index.md) queries that have sharding key condition in `WHERE/PREWHERE` (assuming that the data is distributed by sharding key, otherwise a query yields incorrect result). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Bool, optimize_skip_unused_shards_rewrite_in, true, R"( +Rewrite IN in query for remote shards to exclude values that does not belong to the shard (requires optimize_skip_unused_shards). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Bool, allow_nondeterministic_optimize_skip_unused_shards, false, R"( +Allow nondeterministic (like `rand` or `dictGet`, since later has some caveats with updates) functions in sharding key. + +Possible values: + +- 0 — Disallowed. +- 1 — Allowed. 
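A minimal sketch for the `optimize_skip_unused_shards` family described above; `dist` is a hypothetical Distributed table sharded by `user_id`:

```sql
-- Sketch only: dist and user_id are placeholders.
SELECT count()
FROM dist
WHERE user_id = 42
SETTINGS optimize_skip_unused_shards = 1, force_optimize_skip_unused_shards = 1;
```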
+)", 0) \ + M(UInt64, force_optimize_skip_unused_shards, 0, R"( +Enables or disables query execution if [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled and skipping of unused shards is not possible. If the skipping is not possible and the setting is enabled, an exception will be thrown. + +Possible values: + +- 0 — Disabled. ClickHouse does not throw an exception. +- 1 — Enabled. Query execution is disabled only if the table has a sharding key. +- 2 — Enabled. Query execution is disabled regardless of whether a sharding key is defined for the table. +)", 0) \ + M(UInt64, optimize_skip_unused_shards_nesting, 0, R"( +Controls [`optimize_skip_unused_shards`](#optimize-skip-unused-shards) (hence still requires [`optimize_skip_unused_shards`](#optimize-skip-unused-shards)) depends on the nesting level of the distributed query (case when you have `Distributed` table that look into another `Distributed` table). + +Possible values: + +- 0 — Disabled, `optimize_skip_unused_shards` works always. +- 1 — Enables `optimize_skip_unused_shards` only for the first level. +- 2 — Enables `optimize_skip_unused_shards` up to the second level. +)", 0) \ + M(UInt64, force_optimize_skip_unused_shards_nesting, 0, R"( +Controls [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards) (hence still requires [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards)) depends on the nesting level of the distributed query (case when you have `Distributed` table that look into another `Distributed` table). + +Possible values: + +- 0 - Disabled, `force_optimize_skip_unused_shards` works always. +- 1 — Enables `force_optimize_skip_unused_shards` only for the first level. +- 2 — Enables `force_optimize_skip_unused_shards` up to the second level. +)", 0) \ \ - M(Bool, input_format_parallel_parsing, true, "Enable parallel parsing for some data formats.", 0) \ - M(UInt64, min_chunk_bytes_for_parallel_parsing, (10 * 1024 * 1024), "The minimum chunk size in bytes, which each thread will parse in parallel.", 0) \ - M(Bool, output_format_parallel_formatting, true, "Enable parallel formatting for some data formats.", 0) \ - M(UInt64, output_format_compression_level, 3, "Default compression level if query output is compressed. The setting is applied when `SELECT` query has `INTO OUTFILE` or when inserting to table function `file`, `url`, `hdfs`, `s3`, and `azureBlobStorage`.", 0) \ - M(UInt64, output_format_compression_zstd_window_log, 0, "Can be used when the output compression method is `zstd`. If greater than `0`, this setting explicitly sets compression window size (power of `2`) and enables a long-range mode for zstd compression.", 0) \ - M(Bool, enable_parsing_to_custom_serialization, true, "If true then data can be parsed directly to columns with custom serialization (e.g. Sparse) according to hints for serialization got from the table.", 0) \ + M(Bool, input_format_parallel_parsing, true, R"( +Enables or disables order-preserving parallel parsing of data formats. Supported only for [TSV](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [CSV](../../interfaces/formats.md/#csv) and [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) formats. + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. +)", 0) \ + M(UInt64, min_chunk_bytes_for_parallel_parsing, (10 * 1024 * 1024), R"( +- Type: unsigned int +- Default value: 1 MiB + +The minimum chunk size in bytes, which each thread will parse in parallel. 
+)", 0) \ + M(Bool, output_format_parallel_formatting, true, R"( +Enables or disables parallel formatting of data formats. Supported only for [TSV](../../interfaces/formats.md/#tabseparated), [TSKV](../../interfaces/formats.md/#tskv), [CSV](../../interfaces/formats.md/#csv) and [JSONEachRow](../../interfaces/formats.md/#jsoneachrow) formats. + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. +)", 0) \ + M(UInt64, output_format_compression_level, 3, R"( +Default compression level if query output is compressed. The setting is applied when `SELECT` query has `INTO OUTFILE` or when writing to table functions `file`, `url`, `hdfs`, `s3`, or `azureBlobStorage`. + +Possible values: from `1` to `22` +)", 0) \ + M(UInt64, output_format_compression_zstd_window_log, 0, R"( +Can be used when the output compression method is `zstd`. If greater than `0`, this setting explicitly sets compression window size (power of `2`) and enables a long-range mode for zstd compression. This can help to achieve a better compression ratio. + +Possible values: non-negative numbers. Note that if the value is too small or too big, `zstdlib` will throw an exception. Typical values are from `20` (window size = `1MB`) to `30` (window size = `1GB`). +)", 0) \ + M(Bool, enable_parsing_to_custom_serialization, true, R"( +If true then data can be parsed directly to columns with custom serialization (e.g. Sparse) according to hints for serialization got from the table. +)", 0) \ \ - M(UInt64, merge_tree_min_rows_for_concurrent_read, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized.", 0) \ - M(UInt64, merge_tree_min_bytes_for_concurrent_read, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized.", 0) \ - M(UInt64, merge_tree_min_rows_for_seek, 0, "You can skip reading more than that number of rows at the price of one seek per file.", 0) \ - M(UInt64, merge_tree_min_bytes_for_seek, 0, "You can skip reading more than that number of bytes at the price of one seek per file.", 0) \ - M(UInt64, merge_tree_coarse_index_granularity, 8, "If the index segment can contain the required keys, divide it into as many parts and recursively check them.", 0) \ - M(UInt64, merge_tree_max_rows_to_use_cache, (128 * 8192), "The maximum number of rows per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \ - M(UInt64, merge_tree_max_bytes_to_use_cache, (192 * 10 * 1024 * 1024), "The maximum number of bytes per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \ - M(Bool, do_not_merge_across_partitions_select_final, false, "Merge parts only in one partition in select final", 0) \ - M(Bool, split_parts_ranges_into_intersecting_and_non_intersecting_final, true, "Split parts ranges into intersecting and non intersecting during FINAL optimization", 0) \ - M(Bool, split_intersecting_parts_ranges_into_layers_final, true, "Split intersecting parts ranges into layers during FINAL optimization", 0) \ + M(UInt64, merge_tree_min_rows_for_concurrent_read, (20 * 8192), R"( +If the number of rows to be read from a file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `merge_tree_min_rows_for_concurrent_read` then ClickHouse tries to perform a concurrent reading from this file on several threads. 
+ +Possible values: + +- Positive integer. +)", 0) \ + M(UInt64, merge_tree_min_bytes_for_concurrent_read, (24 * 10 * 1024 * 1024), R"( +If the number of bytes to read from one file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine table exceeds `merge_tree_min_bytes_for_concurrent_read`, then ClickHouse tries to concurrently read from this file in several threads. + +Possible value: + +- Positive integer. +)", 0) \ + M(UInt64, merge_tree_min_rows_for_seek, 0, R"( +If the distance between two data blocks to be read in one file is less than `merge_tree_min_rows_for_seek` rows, then ClickHouse does not seek through the file but reads the data sequentially. + +Possible values: + +- Any positive integer. +)", 0) \ + M(UInt64, merge_tree_min_bytes_for_seek, 0, R"( +If the distance between two data blocks to be read in one file is less than `merge_tree_min_bytes_for_seek` bytes, then ClickHouse sequentially reads a range of file that contains both blocks, thus avoiding extra seek. + +Possible values: + +- Any positive integer. +)", 0) \ + M(UInt64, merge_tree_coarse_index_granularity, 8, R"( +When searching for data, ClickHouse checks the data marks in the index file. If ClickHouse finds that required keys are in some range, it divides this range into `merge_tree_coarse_index_granularity` subranges and searches the required keys there recursively. + +Possible values: + +- Any positive even integer. +)", 0) \ + M(UInt64, merge_tree_max_rows_to_use_cache, (128 * 8192), R"( +If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it does not use the cache of uncompressed blocks. + +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. + +Possible values: + +- Any positive integer. +)", 0) \ + M(UInt64, merge_tree_max_bytes_to_use_cache, (192 * 10 * 1024 * 1024), R"( +If ClickHouse should read more than `merge_tree_max_bytes_to_use_cache` bytes in one query, it does not use the cache of uncompressed blocks. + +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md/#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. + +Possible values: + +- Any positive integer. 
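A minimal sketch of how the cache thresholds above interact with `use_uncompressed_cache`; the table and filter are hypothetical:

```sql
-- Sketch only: hits and CounterID are placeholders.
SELECT count()
FROM hits
WHERE CounterID = 62
SETTINGS use_uncompressed_cache = 1,
         merge_tree_max_rows_to_use_cache = 2000000,
         merge_tree_max_bytes_to_use_cache = 4000000000;
```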
+)", 0) \ + M(Bool, do_not_merge_across_partitions_select_final, false, R"( +Merge parts only in one partition in select final +)", 0) \ + M(Bool, split_parts_ranges_into_intersecting_and_non_intersecting_final, true, R"( +Split parts ranges into intersecting and non intersecting during FINAL optimization +)", 0) \ + M(Bool, split_intersecting_parts_ranges_into_layers_final, true, R"( +Split intersecting parts ranges into layers during FINAL optimization +)", 0) \ \ - M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \ - M(Bool, mysql_map_string_to_text_in_show_columns, true, "If enabled, String type will be mapped to TEXT in SHOW [FULL] COLUMNS, BLOB otherwise. Has an effect only when the connection is made through the MySQL wire protocol.", 0) \ - M(Bool, mysql_map_fixed_string_to_text_in_show_columns, true, "If enabled, FixedString type will be mapped to TEXT in SHOW [FULL] COLUMNS, BLOB otherwise. Has an effect only when the connection is made through the MySQL wire protocol.", 0) \ + M(UInt64, mysql_max_rows_to_insert, 65536, R"( +The maximum number of rows in MySQL batch insertion of the MySQL storage engine +)", 0) \ + M(Bool, mysql_map_string_to_text_in_show_columns, true, R"( +When enabled, [String](../../sql-reference/data-types/string.md) ClickHouse data type will be displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). + +Has an effect only when the connection is made through the MySQL wire protocol. + +- 0 - Use `BLOB`. +- 1 - Use `TEXT`. +)", 0) \ + M(Bool, mysql_map_fixed_string_to_text_in_show_columns, true, R"( +When enabled, [FixedString](../../sql-reference/data-types/fixedstring.md) ClickHouse data type will be displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). + +Has an effect only when the connection is made through the MySQL wire protocol. + +- 0 - Use `BLOB`. +- 1 - Use `TEXT`. +)", 0) \ \ - M(UInt64, optimize_min_equality_disjunction_chain_length, 3, "The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization ", 0) \ - M(UInt64, optimize_min_inequality_conjunction_chain_length, 3, "The minimum length of the expression `expr <> x1 AND ... expr <> xN` for optimization ", 0) \ + M(UInt64, optimize_min_equality_disjunction_chain_length, 3, R"( +The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization +)", 0) \ + M(UInt64, optimize_min_inequality_conjunction_chain_length, 3, R"( +The minimum length of the expression `expr <> x1 AND ... expr <> xN` for optimization +)", 0) \ \ - M(UInt64, min_bytes_to_use_direct_io, 0, "The minimum number of bytes for reading the data with O_DIRECT option during SELECT queries execution. 0 - disabled.", 0) \ - M(UInt64, min_bytes_to_use_mmap_io, 0, "The minimum number of bytes for reading the data with mmap option during SELECT queries execution. 0 - disabled.", 0) \ - M(Bool, checksum_on_read, true, "Validate checksums on reading. It is enabled by default and should be always enabled in production. Please do not expect any benefits in disabling this setting. It may only be used for experiments and benchmarks. The setting is only applicable for tables of MergeTree family. Checksums are always validated for other table engines and when receiving data over the network.", 0) \ + M(UInt64, min_bytes_to_use_direct_io, 0, R"( +The minimum data volume required for using direct I/O access to the storage disk. 
+ +ClickHouse uses this setting when reading data from tables. If the total storage volume of all the data to be read exceeds `min_bytes_to_use_direct_io` bytes, then ClickHouse reads the data from the storage disk with the `O_DIRECT` option. + +Possible values: + +- 0 — Direct I/O is disabled. +- Positive integer. +)", 0) \ + M(UInt64, min_bytes_to_use_mmap_io, 0, R"( +This is an experimental setting. Sets the minimum amount of memory for reading large files without copying data from the kernel to userspace. Recommended threshold is about 64 MB, because [mmap/munmap](https://en.wikipedia.org/wiki/Mmap) is slow. It makes sense only for large files and helps only if data reside in the page cache. + +Possible values: + +- Positive integer. +- 0 — Big files read with only copying data from kernel to userspace. +)", 0) \ + M(Bool, checksum_on_read, true, R"( +Validate checksums on reading. It is enabled by default and should be always enabled in production. Please do not expect any benefits in disabling this setting. It may only be used for experiments and benchmarks. The setting is only applicable for tables of MergeTree family. Checksums are always validated for other table engines and when receiving data over the network. +)", 0) \ \ - M(Bool, force_index_by_date, false, "Throw an exception if there is a partition key in a table, and it is not used.", 0) \ - M(Bool, force_primary_key, false, "Throw an exception if there is primary key in a table, and it is not used.", 0) \ - M(Bool, use_skip_indexes, true, "Use data skipping indexes during query execution.", 0) \ - M(Bool, use_skip_indexes_if_final, false, "If query has FINAL, then skipping data based on indexes may produce incorrect result, hence disabled by default.", 0) \ - M(Bool, materialize_skip_indexes_on_insert, true, "If true skip indexes are calculated on inserts, otherwise skip indexes will be calculated only during merges", 0) \ - M(Bool, materialize_statistics_on_insert, true, "If true statistics are calculated on inserts, otherwise statistics will be calculated only during merges", 0) \ - M(String, ignore_data_skipping_indices, "", "Comma separated list of strings or literals with the name of the data skipping indices that should be excluded during query execution.", 0) \ + M(Bool, force_index_by_date, false, R"( +Disables query execution if the index can’t be used by date. + +Works with tables in the MergeTree family. + +If `force_index_by_date=1`, ClickHouse checks whether the query has a date key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For example, the condition `Date != ' 2000-01-01 '` is acceptable even when it matches all the data in the table (i.e., running the query requires a full scan). For more information about ranges of data in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). +)", 0) \ + M(Bool, force_primary_key, false, R"( +Disables query execution if indexing by the primary key is not possible. + +Works with tables in the MergeTree family. + +If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. 
For more information about data ranges in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). +)", 0) \ + M(Bool, use_skip_indexes, true, R"( +Use data skipping indexes during query execution. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Bool, use_skip_indexes_if_final, false, R"( +Controls whether skipping indexes are used when executing a query with the FINAL modifier. + +By default, this setting is disabled because skip indexes may exclude rows (granules) containing the latest data, which could lead to incorrect results. When enabled, skipping indexes are applied even with the FINAL modifier, potentially improving performance but with the risk of missing recent updates. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Bool, materialize_skip_indexes_on_insert, true, R"( +If true skip indexes are calculated on inserts, otherwise skip indexes will be calculated only during merges +)", 0) \ + M(Bool, materialize_statistics_on_insert, true, R"( +If true statistics are calculated on inserts, otherwise statistics will be calculated only during merges +)", 0) \ + M(String, ignore_data_skipping_indices, "", R"( +Ignores the skipping indexes specified if used by the query. + +Consider the following example: + +```sql +CREATE TABLE data +( + key Int, + x Int, + y Int, + INDEX x_idx x TYPE minmax GRANULARITY 1, + INDEX y_idx y TYPE minmax GRANULARITY 1, + INDEX xy_idx (x,y) TYPE minmax GRANULARITY 1 +) +Engine=MergeTree() +ORDER BY key; + +INSERT INTO data VALUES (1, 2, 3); + +SELECT * FROM data; +SELECT * FROM data SETTINGS ignore_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error. +SELECT * FROM data SETTINGS ignore_data_skipping_indices='x_idx'; -- Ok. +SELECT * FROM data SETTINGS ignore_data_skipping_indices='na_idx'; -- Ok. + +SELECT * FROM data WHERE x = 1 AND y = 1 SETTINGS ignore_data_skipping_indices='xy_idx',force_data_skipping_indices='xy_idx' ; -- query will produce INDEX_NOT_USED error, since xy_idx is explicitly ignored. +SELECT * FROM data WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; +``` + +The query without ignoring any indexes: +```sql +EXPLAIN indexes = 1 SELECT * FROM data WHERE x = 1 AND y = 2; + +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + ReadFromMergeTree (default.data) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 + Skip + Name: xy_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 +``` + +Ignoring the `xy_idx` index: +```sql +EXPLAIN indexes = 1 SELECT * FROM data WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; + +Expression ((Projection + Before ORDER BY)) + Filter (WHERE) + ReadFromMergeTree (default.data) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 +``` + +Works with tables in the MergeTree family. 
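+
+For illustration only, a sketch that reuses the `data` table above: several indexes can be listed in a single comma-separated value, as in the original one-line description of this setting.
+
+```sql
+-- Ignore both single-column indexes; the primary key and xy_idx remain usable.
+SELECT * FROM data WHERE x = 1 AND y = 2
+SETTINGS ignore_data_skipping_indices = 'x_idx, y_idx';
+```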
+)", 0) \ \ - M(String, force_data_skipping_indices, "", "Comma separated list of strings or literals with the name of the data skipping indices that should be used during query execution, otherwise an exception will be thrown.", 0) \ + M(String, force_data_skipping_indices, "", R"( +Disables query execution if passed data skipping indices wasn't used. + +Consider the following example: + +```sql +CREATE TABLE data +( + key Int, + d1 Int, + d1_null Nullable(Int), + INDEX d1_idx d1 TYPE minmax GRANULARITY 1, + INDEX d1_null_idx assumeNotNull(d1_null) TYPE minmax GRANULARITY 1 +) +Engine=MergeTree() +ORDER BY key; + +SELECT * FROM data_01515; +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error. +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_idx'; -- query will produce INDEX_NOT_USED error. +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx'; -- Ok. +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`'; -- Ok (example of full featured parser). +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- query will produce INDEX_NOT_USED error, since d1_null_idx is not used. +SELECT * FROM data_01515 WHERE d1 = 0 AND assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- Ok. +``` +)", 0) \ \ - M(Float, max_streams_to_max_threads_ratio, 1, "Allows you to use more sources than the number of threads - to more evenly distribute work across threads. It is assumed that this is a temporary solution since it will be possible in the future to make the number of sources equal to the number of threads, but for each source to dynamically select available work for itself.", 0) \ - M(Float, max_streams_multiplier_for_merge_tables, 5, "Ask more streams when reading from Merge table. Streams will be spread across tables that Merge table will use. This allows more even distribution of work across threads and is especially helpful when merged tables differ in size.", 0) \ + M(Float, max_streams_to_max_threads_ratio, 1, R"( +Allows you to use more sources than the number of threads - to more evenly distribute work across threads. It is assumed that this is a temporary solution since it will be possible in the future to make the number of sources equal to the number of threads, but for each source to dynamically select available work for itself. +)", 0) \ + M(Float, max_streams_multiplier_for_merge_tables, 5, R"( +Ask more streams when reading from Merge table. Streams will be spread across tables that Merge table will use. This allows more even distribution of work across threads and is especially helpful when merged tables differ in size. +)", 0) \ \ - M(String, network_compression_method, "LZ4", "Allows you to select the method of data compression when writing.", 0) \ + M(String, network_compression_method, "LZ4", R"( +Sets the method of data compression that is used for communication between servers and between server and [clickhouse-client](../../interfaces/cli.md). + +Possible values: + +- `LZ4` — sets LZ4 compression method. +- `ZSTD` — sets ZSTD compression method. + +**See Also** + +- [network_zstd_compression_level](#network_zstd_compression_level) +)", 0) \ \ - M(Int64, network_zstd_compression_level, 1, "Allows you to select the level of ZSTD compression.", 0) \ + M(Int64, network_zstd_compression_level, 1, R"( +Adjusts the level of ZSTD compression. 
Used only when [network_compression_method](#network_compression_method) is set to `ZSTD`. + +Possible values: + +- Positive integer from 1 to 15. +)", 0) \ \ - M(Int64, zstd_window_log_max, 0, "Allows you to select the max window log of ZSTD (it will not be used for MergeTree family)", 0) \ + M(Int64, zstd_window_log_max, 0, R"( +Allows you to select the max window log of ZSTD (it will not be used for MergeTree family) +)", 0) \ \ - M(UInt64, priority, 0, "Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities.", 0) \ - M(Int64, os_thread_priority, 0, "If non zero - set corresponding 'nice' value for query processing threads. Can be used to adjust query priority for OS scheduler.", 0) \ + M(UInt64, priority, 0, R"( +Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities. +)", 0) \ + M(Int64, os_thread_priority, 0, R"( +Sets the priority ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) for threads that execute queries. The OS scheduler considers this priority when choosing the next thread to run on each available CPU core. + +:::note +To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments do not allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start. +::: + +Possible values: + +- You can set values in the range `[-20, 19]`. + +Lower values mean higher priority. Threads with low `nice` priority values are executed more frequently than threads with high values. High values are preferable for long-running non-interactive queries because it allows them to quickly give up resources in favour of short interactive queries when they arrive. +)", 0) \ \ - M(Bool, log_queries, true, "Log requests and write the log to the system table.", 0) \ - M(Bool, log_formatted_queries, false, "Log formatted queries and write the log to the system table.", 0) \ - M(LogQueriesType, log_queries_min_type, QueryLogElementType::QUERY_START, "Minimal type in query_log to log, possible values (from low to high): QUERY_START, QUERY_FINISH, EXCEPTION_BEFORE_START, EXCEPTION_WHILE_PROCESSING.", 0) \ - M(Milliseconds, log_queries_min_query_duration_ms, 0, "Minimal time for the query to run, to get to the query_log/query_thread_log/query_views_log.", 0) \ - M(UInt64, log_queries_cut_to_length, 100000, "If query length is greater than a specified threshold (in bytes), then cut query when writing to query log. Also limit the length of printed query in ordinary text log.", 0) \ - M(Float, log_queries_probability, 1., "Log queries with the specified probability.", 0) \ + M(Bool, log_queries, true, R"( +Setting up query logging. + +Queries sent to ClickHouse with this setup are logged according to the rules in the [query_log](../../operations/server-configuration-parameters/settings.md/#query-log) server configuration parameter. + +Example: + +``` text +log_queries=1 +``` +)", 0) \ + M(Bool, log_formatted_queries, false, R"( +Allows to log formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table (populates `formatted_query` column in the [system.query_log](../../operations/system-tables/query_log.md)). + +Possible values: + +- 0 — Formatted queries are not logged in the system table. +- 1 — Formatted queries are logged in the system table. 
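+
+A minimal sketch of inspecting the column (assumes query logging is enabled; the filter on `query` is illustrative only):
+
+```sql
+SET log_formatted_queries = 1;
+SELECT 1 + 1;
+SYSTEM FLUSH LOGS;
+-- formatted_query is populated only for queries run while the setting was enabled.
+SELECT query, formatted_query
+FROM system.query_log
+WHERE query LIKE 'SELECT 1 + 1%'
+ORDER BY event_time DESC
+LIMIT 1;
+```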
+)", 0) \ + M(LogQueriesType, log_queries_min_type, QueryLogElementType::QUERY_START, R"( +`query_log` minimal type to log. + +Possible values: +- `QUERY_START` (`=1`) +- `QUERY_FINISH` (`=2`) +- `EXCEPTION_BEFORE_START` (`=3`) +- `EXCEPTION_WHILE_PROCESSING` (`=4`) + +Can be used to limit which entities will go to `query_log`, say you are interested only in errors, then you can use `EXCEPTION_WHILE_PROCESSING`: + +``` text +log_queries_min_type='EXCEPTION_WHILE_PROCESSING' +``` +)", 0) \ + M(Milliseconds, log_queries_min_query_duration_ms, 0, R"( +If enabled (non-zero), queries faster than the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables: + +- `system.query_log` +- `system.query_thread_log` + +Only the queries with the following type will get to the log: + +- `QUERY_FINISH` +- `EXCEPTION_WHILE_PROCESSING` + +- Type: milliseconds +- Default value: 0 (any query) +)", 0) \ + M(UInt64, log_queries_cut_to_length, 100000, R"( +If query length is greater than a specified threshold (in bytes), then cut query when writing to query log. Also limit the length of printed query in ordinary text log. +)", 0) \ + M(Float, log_queries_probability, 1., R"( +Allows a user to write to [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), and [query_views_log](../../operations/system-tables/query_views_log.md) system tables only a sample of queries selected randomly with the specified probability. It helps to reduce the load with a large volume of queries in a second. + +Possible values: + +- 0 — Queries are not logged in the system tables. +- Positive floating-point number in the range [0..1]. For example, if the setting value is `0.5`, about half of the queries are logged in the system tables. +- 1 — All queries are logged in the system tables. +)", 0) \ \ - M(Bool, log_processors_profiles, true, "Log Processors profile events.", 0) \ - M(DistributedProductMode, distributed_product_mode, DistributedProductMode::DENY, "How are distributed subqueries performed inside IN or JOIN sections?", IMPORTANT) \ + M(Bool, log_processors_profiles, true, R"( +Write time that processor spent during execution/waiting for data to `system.processors_profile_log` table. + +See also: + +- [`system.processors_profile_log`](../../operations/system-tables/processors_profile_log.md) +- [`EXPLAIN PIPELINE`](../../sql-reference/statements/explain.md#explain-pipeline) +)", 0) \ + M(DistributedProductMode, distributed_product_mode, DistributedProductMode::DENY, R"( +Changes the behaviour of [distributed subqueries](../../sql-reference/operators/in.md). + +ClickHouse applies this setting when the query contains the product of distributed tables, i.e. when the query for a distributed table contains a non-GLOBAL subquery for the distributed table. + +Restrictions: + +- Only applied for IN and JOIN subqueries. +- Only if the FROM section uses a distributed table containing more than one shard. +- If the subquery concerns a distributed table containing more than one shard. +- Not used for a table-valued [remote](../../sql-reference/table-functions/remote.md) function. + +Possible values: + +- `deny` — Default value. Prohibits using these types of subqueries (returns the “Double-distributed in/JOIN subqueries is denied” exception). 
+- `local` — Replaces the database and table in the subquery with local ones for the destination server (shard), leaving the normal `IN`/`JOIN.` +- `global` — Replaces the `IN`/`JOIN` query with `GLOBAL IN`/`GLOBAL JOIN.` +- `allow` — Allows the use of these types of subqueries. +)", IMPORTANT) \ \ - M(UInt64, max_concurrent_queries_for_all_users, 0, "The maximum number of concurrent requests for all users.", 0) \ - M(UInt64, max_concurrent_queries_for_user, 0, "The maximum number of concurrent requests per user.", 0) \ + M(UInt64, max_concurrent_queries_for_all_users, 0, R"( +Throw exception if the value of this setting is less or equal than the current number of simultaneously processed queries. + +Example: `max_concurrent_queries_for_all_users` can be set to 99 for all users and database administrator can set it to 100 for itself to run queries for investigation even when the server is overloaded. + +Modifying the setting for one query or user does not affect other queries. + +Possible values: + +- Positive integer. +- 0 — No limit. + +**Example** + +``` xml +99 +``` + +**See Also** + +- [max_concurrent_queries](/docs/en/operations/server-configuration-parameters/settings.md/#max_concurrent_queries) +)", 0) \ + M(UInt64, max_concurrent_queries_for_user, 0, R"( +The maximum number of simultaneously processed queries per user. + +Possible values: + +- Positive integer. +- 0 — No limit. + +**Example** + +``` xml +5 +``` +)", 0) \ \ - M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of inserting blocks should be performed", 0) \ - M(Bool, async_insert_deduplicate, false, "For async INSERT queries in the replicated table, specifies that deduplication of inserting blocks should be performed", 0) \ + M(Bool, insert_deduplicate, true, R"( +Enables or disables block deduplication of `INSERT` (for Replicated\* tables). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). +For the replicated tables by default the only 100 of the most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)). +For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window). +)", 0) \ + M(Bool, async_insert_deduplicate, false, R"( +For async INSERT queries in the replicated table, specifies that deduplication of inserting blocks should be performed +)", 0) \ \ - M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 
0 - disabled, 'auto' - use majority", 0) \ - M(Milliseconds, insert_quorum_timeout, 600000, "If the quorum of replicas did not meet in a specified time (in milliseconds), an exception will be thrown and insertion is aborted.", 0) \ - M(Bool, insert_quorum_parallel, true, "For quorum INSERT queries - enable to make parallel inserts without linearizability", 0) \ - M(UInt64, select_sequential_consistency, 0, "For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; do not read the parts that have not yet been written with the quorum.", 0) \ - M(UInt64, table_function_remote_max_addresses, 1000, "The maximum number of different shards and the maximum number of replicas of one shard in the `remote` function.", 0) \ - M(Milliseconds, read_backoff_min_latency_ms, 1000, "Setting to reduce the number of threads in case of slow reads. Pay attention only to reads that took at least that much time.", 0) \ - M(UInt64, read_backoff_max_throughput, 1048576, "Settings to reduce the number of threads in case of slow reads. Count events when the read bandwidth is less than that many bytes per second.", 0) \ - M(Milliseconds, read_backoff_min_interval_between_events_ms, 1000, "Settings to reduce the number of threads in case of slow reads. Do not pay attention to the event, if the previous one has passed less than a certain amount of time.", 0) \ - M(UInt64, read_backoff_min_events, 2, "Settings to reduce the number of threads in case of slow reads. The number of events after which the number of threads will be reduced.", 0) \ + M(UInt64Auto, insert_quorum, 0, R"( +:::note +This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information. +::: + +Enables the quorum writes. + +- If `insert_quorum < 2`, the quorum writes are disabled. +- If `insert_quorum >= 2`, the quorum writes are enabled. +- If `insert_quorum = 'auto'`, use majority number (`number_of_replicas / 2 + 1`) as quorum number. + +Quorum writes + +`INSERT` succeeds only when ClickHouse manages to correctly write data to the `insert_quorum` of replicas during the `insert_quorum_timeout`. If for any reason the number of replicas with successful writes does not reach the `insert_quorum`, the write is considered failed and ClickHouse will delete the inserted block from all the replicas where data has already been written. + +When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#select_sequential_consistency). + +ClickHouse generates an exception: + +- If the number of available replicas at the time of the query is less than the `insert_quorum`. +- When `insert_quorum_parallel` is disabled and an attempt to write data is made when the previous block has not yet been inserted in `insert_quorum` of replicas. This situation may occur if the user tries to perform another `INSERT` query to the same table before the previous one with `insert_quorum` is completed. 
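+
+A hedged sketch of a quorum write (the table `t` is hypothetical and is assumed to be a Replicated\* table; `'auto'` requests a majority of replicas, as described above):
+
+```sql
+SET insert_quorum = 'auto';        -- a majority of replicas must acknowledge the block
+SET insert_quorum_timeout = 60000; -- 60 seconds instead of the default 600000 ms
+INSERT INTO t VALUES (1, 'first'); -- returns only after the quorum is reached
+```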
+ +See also: + +- [insert_quorum_timeout](#insert_quorum_timeout) +- [insert_quorum_parallel](#insert_quorum_parallel) +- [select_sequential_consistency](#select_sequential_consistency) +)", 0) \ + M(Milliseconds, insert_quorum_timeout, 600000, R"( +Write to a quorum timeout in milliseconds. If the timeout has passed and no write has taken place yet, ClickHouse will generate an exception and the client must repeat the query to write the same block to the same or any other replica. + +See also: + +- [insert_quorum](#insert_quorum) +- [insert_quorum_parallel](#insert_quorum_parallel) +- [select_sequential_consistency](#select_sequential_consistency) +)", 0) \ + M(Bool, insert_quorum_parallel, true, R"( +:::note +This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information. +::: + +Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +See also: + +- [insert_quorum](#insert_quorum) +- [insert_quorum_timeout](#insert_quorum_timeout) +- [select_sequential_consistency](#select_sequential_consistency) +)", 0) \ + M(UInt64, select_sequential_consistency, 0, R"( +:::note +This setting differ in behavior between SharedMergeTree and ReplicatedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information about the behavior of `select_sequential_consistency` in SharedMergeTree. +::: + +Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Usage + +When sequential consistency is enabled, ClickHouse allows the client to execute the `SELECT` query only for those replicas that contain data from all previous `INSERT` queries executed with `insert_quorum`. If the client refers to a partial replica, ClickHouse will generate an exception. The SELECT query will not include data that has not yet been written to the quorum of replicas. + +When `insert_quorum_parallel` is enabled (the default), then `select_sequential_consistency` does not work. This is because parallel `INSERT` queries can be written to different sets of quorum replicas so there is no guarantee a single replica will have received all writes. + +See also: + +- [insert_quorum](#insert_quorum) +- [insert_quorum_timeout](#insert_quorum_timeout) +- [insert_quorum_parallel](#insert_quorum_parallel) +)", 0) \ + M(UInt64, table_function_remote_max_addresses, 1000, R"( +Sets the maximum number of addresses generated from patterns for the [remote](../../sql-reference/table-functions/remote.md) function. + +Possible values: + +- Positive integer. +)", 0) \ + M(Milliseconds, read_backoff_min_latency_ms, 1000, R"( +Setting to reduce the number of threads in case of slow reads. Pay attention only to reads that took at least that much time. +)", 0) \ + M(UInt64, read_backoff_max_throughput, 1048576, R"( +Settings to reduce the number of threads in case of slow reads. Count events when the read bandwidth is less than that many bytes per second. +)", 0) \ + M(Milliseconds, read_backoff_min_interval_between_events_ms, 1000, R"( +Settings to reduce the number of threads in case of slow reads. 
Do not pay attention to the event, if the previous one has passed less than a certain amount of time. +)", 0) \ + M(UInt64, read_backoff_min_events, 2, R"( +Settings to reduce the number of threads in case of slow reads. The number of events after which the number of threads will be reduced. +)", 0) \ \ - M(UInt64, read_backoff_min_concurrency, 1, "Settings to try keeping the minimal number of threads in case of slow reads.", 0) \ + M(UInt64, read_backoff_min_concurrency, 1, R"( +Settings to try keeping the minimal number of threads in case of slow reads. +)", 0) \ \ - M(Float, memory_tracker_fault_probability, 0., "For testing of `exception safety` - throw an exception every time you allocate memory with the specified probability.", 0) \ - M(Float, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability, 0.0, "For testing of `PartsSplitter` - split read ranges into intersecting and non intersecting every time you read from MergeTree with the specified probability.", 0) \ + M(Float, memory_tracker_fault_probability, 0., R"( +For testing of `exception safety` - throw an exception every time you allocate memory with the specified probability. +)", 0) \ + M(Float, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability, 0.0, R"( +For testing of `PartsSplitter` - split read ranges into intersecting and non intersecting every time you read from MergeTree with the specified probability. +)", 0) \ \ - M(Bool, enable_http_compression, false, "Compress the result if the client over HTTP said that it understands data compressed by gzip, deflate, zstd, br, lz4, bz2, xz.", 0) \ - M(Int64, http_zlib_compression_level, 3, "Compression level - used if the client on HTTP said that it understands data compressed by gzip or deflate.", 0) \ + M(Bool, enable_http_compression, false, R"( +Enables or disables data compression in the response to an HTTP request. + +For more information, read the [HTTP interface description](../../interfaces/http.md). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ + M(Int64, http_zlib_compression_level, 3, R"( +Sets the level of data compression in the response to an HTTP request if [enable_http_compression = 1](#enable_http_compression). + +Possible values: Numbers from 1 to 9. +)", 0) \ \ - M(Bool, http_native_compression_disable_checksumming_on_decompress, false, "If you uncompress the POST data from the client compressed by the native format, do not check the checksum.", 0) \ + M(Bool, http_native_compression_disable_checksumming_on_decompress, false, R"( +Enables or disables checksum verification when decompressing the HTTP POST data from the client. Used only for ClickHouse native compression format (not used with `gzip` or `deflate`). + +For more information, read the [HTTP interface description](../../interfaces/http.md). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. +)", 0) \ \ - M(String, count_distinct_implementation, "uniqExact", "What aggregate function to use for the implementation of count(DISTINCT ...)", 0) \ + M(String, count_distinct_implementation, "uniqExact", R"( +Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction. 
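+
+For illustration only (the `visits` table and `user_id` column are hypothetical), one of the functions from the list of possible values below can be chosen per query:
+
+```sql
+SET count_distinct_implementation = 'uniqCombined';
+-- count(DISTINCT user_id) is now executed as uniqCombined(user_id),
+-- an approximate but more memory-efficient alternative to the default uniqExact.
+SELECT count(DISTINCT user_id) FROM visits;
+```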
+
+Possible values:
+
+- [uniq](../../sql-reference/aggregate-functions/reference/uniq.md/#agg_function-uniq)
+- [uniqCombined](../../sql-reference/aggregate-functions/reference/uniqcombined.md/#agg_function-uniqcombined)
+- [uniqCombined64](../../sql-reference/aggregate-functions/reference/uniqcombined64.md/#agg_function-uniqcombined64)
+- [uniqHLL12](../../sql-reference/aggregate-functions/reference/uniqhll12.md/#agg_function-uniqhll12)
+- [uniqExact](../../sql-reference/aggregate-functions/reference/uniqexact.md/#agg_function-uniqexact)
+)", 0) \
 \
- M(Bool, add_http_cors_header, false, "Write add http CORS header.", 0) \
+ M(Bool, add_http_cors_header, false, R"(
+Add the HTTP CORS header to the response.
+)", 0) \
 \
- M(UInt64, max_http_get_redirects, 0, "Max number of HTTP GET redirects hops allowed. Ensures additional security measures are in place to prevent a malicious server from redirecting your requests to unexpected services.\n\nIt is the case when an external server redirects to another address, but that address appears to be internal to the company's infrastructure, and by sending an HTTP request to an internal server, you could request an internal API from the internal network, bypassing the auth, or even query other services, such as Redis or Memcached. When you don't have an internal infrastructure (including something running on your localhost), or you trust the server, it is safe to allow redirects. Although keep in mind, that if the URL uses HTTP instead of HTTPS, and you will have to trust not only the remote server but also your ISP and every network in the middle.", 0) \
+ M(UInt64, max_http_get_redirects, 0, R"(
+Max number of HTTP GET redirect hops allowed. Ensures additional security measures are in place to prevent a malicious server from redirecting your requests to unexpected services.
+
+This is the case when an external server redirects to another address that appears to be internal to the company's infrastructure: by sending an HTTP request to an internal server, you could request an internal API from the internal network, bypassing the authentication, or even query other services, such as Redis or Memcached. When you don't have an internal infrastructure (including something running on your localhost), or you trust the server, it is safe to allow redirects. Keep in mind, though, that if the URL uses HTTP instead of HTTPS, you will have to trust not only the remote server but also your ISP and every network in the middle.
+)", 0) \
 \
- M(Bool, use_client_time_zone, false, "Use client timezone for interpreting DateTime string values, instead of adopting server timezone.", 0) \
+ M(Bool, use_client_time_zone, false, R"(
+Use client timezone for interpreting DateTime string values, instead of adopting server timezone.
+)", 0) \
 \
- M(Bool, send_progress_in_http_headers, false, "Send progress notifications using X-ClickHouse-Progress headers. Some clients do not support high amount of HTTP headers (Python requests in particular), so it is disabled by default.", 0) \
+ M(Bool, send_progress_in_http_headers, false, R"(
+Enables or disables `X-ClickHouse-Progress` HTTP response headers in `clickhouse-server` responses.
+
+For more information, read the [HTTP interface description](../../interfaces/http.md).
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+)", 0) \ \ - M(UInt64, http_headers_progress_interval_ms, 100, "Do not send HTTP headers X-ClickHouse-Progress more frequently than at each specified interval.", 0) \ - M(Bool, http_wait_end_of_query, false, "Enable HTTP response buffering on the server-side.", 0) \ - M(Bool, http_write_exception_in_output_format, true, "Write exception in output format to produce valid output. Works with JSON and XML formats.", 0) \ - M(UInt64, http_response_buffer_size, 0, "The number of bytes to buffer in the server memory before sending a HTTP response to the client or flushing to disk (when http_wait_end_of_query is enabled).", 0) \ + M(UInt64, http_headers_progress_interval_ms, 100, R"( +Do not send HTTP headers X-ClickHouse-Progress more frequently than at each specified interval. +)", 0) \ + M(Bool, http_wait_end_of_query, false, R"( +Enable HTTP response buffering on the server-side. +)", 0) \ + M(Bool, http_write_exception_in_output_format, true, R"( +Write exception in output format to produce valid output. Works with JSON and XML formats. +)", 0) \ + M(UInt64, http_response_buffer_size, 0, R"( +The number of bytes to buffer in the server memory before sending a HTTP response to the client or flushing to disk (when http_wait_end_of_query is enabled). +)", 0) \ \ - M(Bool, fsync_metadata, true, "Do fsync after changing the metadata for tables and databases (.sql files). Could be disabled in case of poor latency on a server with high load of DDL queries and high load of the disk subsystem.", 0) \ + M(Bool, fsync_metadata, true, R"( +Enables or disables [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) when writing `.sql` files. Enabled by default. + +It makes sense to disable it if the server has millions of tiny tables that are constantly being created and destroyed. +)", 0) \ \ - M(Bool, join_use_nulls, false, "Use NULLs for non-joined rows of outer JOINs for types that can be inside Nullable. If false, use default value of corresponding columns data type.", IMPORTANT) \ + M(Bool, join_use_nulls, false, R"( +Sets the type of [JOIN](../../sql-reference/statements/select/join.md) behaviour. When merging tables, empty cells may appear. ClickHouse fills them differently based on this setting. + +Possible values: + +- 0 — The empty cells are filled with the default value of the corresponding field type. +- 1 — `JOIN` behaves the same way as in standard SQL. The type of the corresponding field is converted to [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable), and empty cells are filled with [NULL](../../sql-reference/syntax.md). +)", IMPORTANT) \ \ - M(UInt64, join_output_by_rowlist_perkey_rows_threshold, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join.", 0) \ - M(JoinStrictness, join_default_strictness, JoinStrictness::All, "Set default strictness in JOIN query. Possible values: empty string, 'ANY', 'ALL'. If empty, query without strictness will throw an exception.", 0) \ - M(Bool, any_join_distinct_right_table_keys, false, "Enable old ANY JOIN logic with many-to-one left-to-right table keys mapping for all ANY JOINs. It leads to confusing not equal results for 't1 ANY LEFT JOIN t2' and 't2 ANY RIGHT JOIN t1'. 
ANY RIGHT JOIN needs one-to-many keys mapping to be consistent with LEFT one.", IMPORTANT) \ - M(Bool, single_join_prefer_left_table, true, "For single JOIN in case of identifier ambiguity prefer left table", IMPORTANT) \ + M(UInt64, join_output_by_rowlist_perkey_rows_threshold, 5, R"( +The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join. +)", 0) \ + M(JoinStrictness, join_default_strictness, JoinStrictness::All, R"( +Sets default strictness for [JOIN clauses](../../sql-reference/statements/select/join.md/#select-join). + +Possible values: + +- `ALL` — If the right table has several matching rows, ClickHouse creates a [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) from matching rows. This is the normal `JOIN` behaviour from standard SQL. +- `ANY` — If the right table has several matching rows, only the first one found is joined. If the right table has only one matching row, the results of `ANY` and `ALL` are the same. +- `ASOF` — For joining sequences with an uncertain match. +- `Empty string` — If `ALL` or `ANY` is not specified in the query, ClickHouse throws an exception. +)", 0) \ + M(Bool, any_join_distinct_right_table_keys, false, R"( +Enables legacy ClickHouse server behaviour in `ANY INNER|LEFT JOIN` operations. + +:::note +Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour. +::: + +When the legacy behaviour is enabled: + +- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are not equal because ClickHouse uses the logic with many-to-one left-to-right table keys mapping. +- Results of `ANY INNER JOIN` operations contain all rows from the left table like the `SEMI LEFT JOIN` operations do. + +When the legacy behaviour is disabled: + +- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are equal because ClickHouse uses the logic which provides one-to-many keys mapping in `ANY RIGHT JOIN` operations. +- Results of `ANY INNER JOIN` operations contain one row per key from both the left and right tables. + +Possible values: + +- 0 — Legacy behaviour is disabled. +- 1 — Legacy behaviour is enabled. + +See also: + +- [JOIN strictness](../../sql-reference/statements/select/join.md/#join-settings) +)", IMPORTANT) \ + M(Bool, single_join_prefer_left_table, true, R"( +For single JOIN in case of identifier ambiguity prefer left table +)", IMPORTANT) \ \ - M(UInt64, preferred_block_size_bytes, 1000000, "This setting adjusts the data block size for query processing and represents additional fine-tuning to the more rough 'max_block_size' setting. If the columns are large and with 'max_block_size' rows the block size is likely to be larger than the specified amount of bytes, its size will be lowered for better CPU cache locality.", 0) \ + M(UInt64, preferred_block_size_bytes, 1000000, R"( +This setting adjusts the data block size for query processing and represents additional fine-tuning to the more rough 'max_block_size' setting. If the columns are large and with 'max_block_size' rows the block size is likely to be larger than the specified amount of bytes, its size will be lowered for better CPU cache locality. +)", 0) \ \ - M(UInt64, max_replica_delay_for_distributed_queries, 300, "If set, distributed queries of Replicated tables will choose servers with replication delay in seconds less than the specified value (not inclusive). 
Zero means do not take delay into account.", 0) \ - M(Bool, fallback_to_stale_replicas_for_distributed_queries, true, "Suppose max_replica_delay_for_distributed_queries is set and all replicas for the queried table are stale. If this setting is enabled, the query will be performed anyway, otherwise the error will be reported.", 0) \ - M(UInt64, preferred_max_column_in_block_size_bytes, 0, "Limit on max column size in block while reading. Helps to decrease cache misses count. Should be close to L2 cache size.", 0) \ + M(UInt64, max_replica_delay_for_distributed_queries, 300, R"( +Disables lagging replicas for distributed queries. See [Replication](../../engines/table-engines/mergetree-family/replication.md). + +Sets the time in seconds. If a replica's lag is greater than or equal to the set value, this replica is not used. + +Possible values: + +- Positive integer. +- 0 — Replica lags are not checked. + +To prevent the use of any replica with a non-zero lag, set this parameter to 1. + +Used when performing `SELECT` from a distributed table that points to replicated tables. +)", 0) \ + M(Bool, fallback_to_stale_replicas_for_distributed_queries, true, R"( +Forces a query to an out-of-date replica if updated data is not available. See [Replication](../../engines/table-engines/mergetree-family/replication.md). + +ClickHouse selects the most relevant from the outdated replicas of the table. + +Used when performing `SELECT` from a distributed table that points to replicated tables. + +By default, 1 (enabled). +)", 0) \ + M(UInt64, preferred_max_column_in_block_size_bytes, 0, R"( +Limit on max column size in block while reading. Helps to decrease cache misses count. Should be close to L2 cache size. +)", 0) \ \ - M(UInt64, parts_to_delay_insert, 0, "If the destination table contains at least that many active parts in a single partition, artificially slow down insert into table.", 0) \ - M(UInt64, parts_to_throw_insert, 0, "If more than this number active parts in a single partition of the destination table, throw 'Too many parts ...' exception.", 0) \ - M(UInt64, number_of_mutations_to_delay, 0, "If the mutated table contains at least that many unfinished mutations, artificially slow down mutations of table. 0 - disabled", 0) \ - M(UInt64, number_of_mutations_to_throw, 0, "If the mutated table contains at least that many unfinished mutations, throw 'Too many mutations ...' exception. 0 - disabled", 0) \ - M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in the cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. A negative value means infinite. Zero means async mode.", 0) \ - M(Milliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.", 0) \ - M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \ - M(UInt64, min_free_disk_bytes_to_perform_insert, 0, "Minimum free disk space bytes to perform an insert.", 0) \ - M(Double, min_free_disk_ratio_to_perform_insert, 0.0, "Minimum free disk space ratio to perform an insert.", 0) \ + M(UInt64, parts_to_delay_insert, 0, R"( +If the destination table contains at least that many active parts in a single partition, artificially slow down insert into table. +)", 0) \ + M(UInt64, parts_to_throw_insert, 0, R"( +If more than this number active parts in a single partition of the destination table, throw 'Too many parts ...' exception. 
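+
+A hedged sketch (the table `t` is hypothetical; how this query-level value interacts with the MergeTree-level setting of the same name is not covered here):
+
+```sql
+-- Fail fast if the target partition already holds more than 100 active parts.
+INSERT INTO t SETTINGS parts_to_throw_insert = 100 VALUES (1, 'x');
+```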
+)", 0) \ + M(UInt64, number_of_mutations_to_delay, 0, R"( +If the mutated table contains at least that many unfinished mutations, artificially slow down mutations of table. 0 - disabled +)", 0) \ + M(UInt64, number_of_mutations_to_throw, 0, R"( +If the mutated table contains at least that many unfinished mutations, throw 'Too many mutations ...' exception. 0 - disabled +)", 0) \ + M(Int64, distributed_ddl_task_timeout, 180, R"( +Sets timeout for DDL query responses from all hosts in cluster. If a DDL request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. + +Possible values: + +- Positive integer. +- 0 — Async mode. +- Negative integer — infinite timeout. +)", 0) \ + M(Milliseconds, stream_flush_interval_ms, 7500, R"( +Works for tables with streaming in the case of a timeout, or when a thread generates [max_insert_block_size](#max_insert_block_size) rows. + +The default value is 7500. + +The smaller the value, the more often data is flushed into the table. Setting the value too low leads to poor performance. +)", 0) \ + M(Milliseconds, stream_poll_timeout_ms, 500, R"( +Timeout for polling data from/to streaming storages. +)", 0) \ + M(UInt64, min_free_disk_bytes_to_perform_insert, 0, R"( +Minimum free disk space bytes to perform an insert. +)", 0) \ + M(Float, min_free_disk_ratio_to_perform_insert, 0.0, R"( +Minimum free disk space ratio to perform an insert. +)", 0) \ \ - M(Bool, final, false, "Query with the FINAL modifier by default. If the engine does not support the FINAL, it does not have any effect. On queries with multiple tables, FINAL is applied only to those that support it. It also works on distributed tables", 0) \ + M(Bool, final, false, R"( +Automatically applies [FINAL](../../sql-reference/statements/select/from.md#final-modifier) modifier to all tables in a query, to tables where [FINAL](../../sql-reference/statements/select/from.md#final-modifier) is applicable, including joined tables and tables in sub-queries, and +distributed tables. + +Possible values: + +- 0 - disabled +- 1 - enabled + +Example: + +```sql +CREATE TABLE test +( + key Int64, + some String +) +ENGINE = ReplacingMergeTree +ORDER BY key; + +INSERT INTO test FORMAT Values (1, 'first'); +INSERT INTO test FORMAT Values (1, 'second'); + +SELECT * FROM test; +┌─key─┬─some───┐ +│ 1 │ second │ +└─────┴────────┘ +┌─key─┬─some──┐ +│ 1 │ first │ +└─────┴───────┘ + +SELECT * FROM test SETTINGS final = 1; +┌─key─┬─some───┐ +│ 1 │ second │ +└─────┴────────┘ + +SET final = 1; +SELECT * FROM test; +┌─key─┬─some───┐ +│ 1 │ second │ +└─────┴────────┘ +``` +)", 0) \ \ - M(Bool, partial_result_on_first_cancel, false, "Allows query to return a partial result after cancel.", 0) \ + M(Bool, partial_result_on_first_cancel, false, R"( +Allows query to return a partial result after cancel. +)", 0) \ \ - M(Bool, ignore_on_cluster_for_replicated_udf_queries, false, "Ignore ON CLUSTER clause for replicated UDF management queries.", 0) \ - M(Bool, ignore_on_cluster_for_replicated_access_entities_queries, false, "Ignore ON CLUSTER clause for replicated access entities management queries.", 0) \ - M(Bool, ignore_on_cluster_for_replicated_named_collections_queries, false, "Ignore ON CLUSTER clause for replicated named collections management queries.", 0) \ + M(Bool, ignore_on_cluster_for_replicated_udf_queries, false, R"( +Ignore ON CLUSTER clause for replicated UDF management queries. 
+)", 0) \ + M(Bool, ignore_on_cluster_for_replicated_access_entities_queries, false, R"( +Ignore ON CLUSTER clause for replicated access entities management queries. +)", 0) \ + M(Bool, ignore_on_cluster_for_replicated_named_collections_queries, false, R"( +Ignore ON CLUSTER clause for replicated named collections management queries. +)", 0) \ /** Settings for testing hedged requests */ \ - M(Milliseconds, sleep_in_send_tables_status_ms, 0, "Time to sleep in sending tables status response in TCPHandler", 0) \ - M(Milliseconds, sleep_in_send_data_ms, 0, "Time to sleep in sending data in TCPHandler", 0) \ - M(Milliseconds, sleep_after_receiving_query_ms, 0, "Time to sleep after receiving query in TCPHandler", 0) \ - M(UInt64, unknown_packet_in_send_data, 0, "Send unknown packet instead of data Nth data packet", 0) \ + M(Milliseconds, sleep_in_send_tables_status_ms, 0, R"( +Time to sleep in sending tables status response in TCPHandler +)", 0) \ + M(Milliseconds, sleep_in_send_data_ms, 0, R"( +Time to sleep in sending data in TCPHandler +)", 0) \ + M(Milliseconds, sleep_after_receiving_query_ms, 0, R"( +Time to sleep after receiving query in TCPHandler +)", 0) \ + M(UInt64, unknown_packet_in_send_data, 0, R"( +Send unknown packet instead of data Nth data packet +)", 0) \ \ - M(Bool, insert_allow_materialized_columns, false, "If setting is enabled, Allow materialized columns in INSERT.", 0) \ - M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \ - M(Seconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout", 0) \ - M(Seconds, http_receive_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP receive timeout", 0) \ - M(UInt64, http_max_uri_size, 1048576, "Maximum URI length of HTTP request", 0) \ - M(UInt64, http_max_fields, 1000000, "Maximum number of fields in HTTP header", 0) \ - M(UInt64, http_max_field_name_size, 128 * 1024, "Maximum length of field name in HTTP header", 0) \ - M(UInt64, http_max_field_value_size, 128 * 1024, "Maximum length of field value in HTTP header", 0) \ - M(Bool, http_skip_not_found_url_for_globs, true, "Skip URLs for globs with HTTP_NOT_FOUND error", 0) \ - M(Bool, http_make_head_request, true, "Allows the execution of a `HEAD` request while reading data from HTTP to retrieve information about the file to be read, such as its size", 0) \ - M(Bool, optimize_throw_if_noop, false, "If setting is enabled and OPTIMIZE query didn't actually assign a merge then an explanatory exception is thrown", 0) \ - M(Bool, use_index_for_in_with_subqueries, true, "Try using an index if there is a subquery or a table expression on the right side of the IN operator.", 0) \ - M(UInt64, use_index_for_in_with_subqueries_max_values, 0, "The maximum size of the set in the right-hand side of the IN operator to use table index for filtering. It allows to avoid performance degradation and higher memory usage due to the preparation of additional data structures for large queries. Zero means no limit.", 0) \ - M(Bool, analyze_index_with_space_filling_curves, true, "If a table has a space-filling curve in its index, e.g. `ORDER BY mortonEncode(x, y)`, and the query has conditions on its arguments, e.g. 
`x >= 10 AND x <= 20 AND y >= 20 AND y <= 30`, use the space-filling curve for index analysis.", 0) \ - M(Bool, joined_subquery_requires_alias, true, "Force joined subqueries and table functions to have aliases for correct name qualification.", 0) \ - M(Bool, empty_result_for_aggregation_by_empty_set, false, "Return empty result when aggregating without keys on empty set.", 0) \ - M(Bool, empty_result_for_aggregation_by_constant_keys_on_empty_set, true, "Return empty result when aggregating by constant keys on empty set.", 0) \ - M(Bool, allow_distributed_ddl, true, "If it is set to true, then a user is allowed to executed distributed DDL queries.", 0) \ - M(Bool, allow_suspicious_codecs, false, "If it is set to true, allow to specify meaningless compression codecs.", 0) \ - M(Bool, enable_deflate_qpl_codec, false, "Enable/disable the DEFLATE_QPL codec.", 0) \ - M(Bool, enable_zstd_qat_codec, false, "Enable/disable the ZSTD_QAT codec.", 0) \ - M(UInt64, query_profiler_real_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for real clock timer of query profiler (in nanoseconds). Set 0 value to turn off the real clock query profiler. The recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ - M(UInt64, query_profiler_cpu_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, "Period for CPU clock timer of query profiler (in nanoseconds). Set 0 value to turn off the CPU clock query profiler. The recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ - M(Bool, metrics_perf_events_enabled, false, "If enabled, some of the perf events will be measured throughout queries' execution.", 0) \ - M(String, metrics_perf_events_list, "", "Comma separated list of perf metrics that will be measured throughout queries' execution. Empty means all events. See PerfEventInfo in sources for the available events.", 0) \ - M(Float, opentelemetry_start_trace_probability, 0., "Probability to start an OpenTelemetry trace for an incoming query.", 0) \ - M(Bool, opentelemetry_trace_processors, false, "Collect OpenTelemetry spans for processors.", 0) \ - M(Bool, prefer_column_name_to_alias, false, "Prefer using column names instead of aliases if possible.", 0) \ + M(Bool, insert_allow_materialized_columns, false, R"( +If setting is enabled, Allow materialized columns in INSERT. +)", 0) \ + M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, R"( +HTTP connection timeout (in seconds). + +Possible values: + +- Any positive integer. +- 0 - Disabled (infinite timeout). +)", 0) \ + M(Seconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, R"( +HTTP send timeout (in seconds). + +Possible values: + +- Any positive integer. +- 0 - Disabled (infinite timeout). + +:::note +It's applicable only to the default profile. A server reboot is required for the changes to take effect. +::: +)", 0) \ + M(Seconds, http_receive_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, R"( +HTTP receive timeout (in seconds). + +Possible values: + +- Any positive integer. +- 0 - Disabled (infinite timeout). +)", 0) \ + M(UInt64, http_max_uri_size, 1048576, R"( +Sets the maximum URI length of an HTTP request. + +Possible values: + +- Positive integer. 
+)", 0) \ + M(UInt64, http_max_fields, 1000000, R"( +Maximum number of fields in HTTP header +)", 0) \ + M(UInt64, http_max_field_name_size, 128 * 1024, R"( +Maximum length of field name in HTTP header +)", 0) \ + M(UInt64, http_max_field_value_size, 128 * 1024, R"( +Maximum length of field value in HTTP header +)", 0) \ + M(Bool, http_skip_not_found_url_for_globs, true, R"( +Skip URLs for globs with HTTP_NOT_FOUND error +)", 0) \ + M(Bool, http_make_head_request, true, R"( +The `http_make_head_request` setting allows the execution of a `HEAD` request while reading data from HTTP to retrieve information about the file to be read, such as its size. Since it's enabled by default, it may be desirable to disable this setting in cases where the server does not support `HEAD` requests. +)", 0) \ + M(Bool, optimize_throw_if_noop, false, R"( +Enables or disables throwing an exception if an [OPTIMIZE](../../sql-reference/statements/optimize.md) query didn’t perform a merge. + +By default, `OPTIMIZE` returns successfully even if it didn’t do anything. This setting lets you differentiate these situations and get the reason in an exception message. + +Possible values: + +- 1 — Throwing an exception is enabled. +- 0 — Throwing an exception is disabled. +)", 0) \ + M(Bool, use_index_for_in_with_subqueries, true, R"( +Try using an index if there is a subquery or a table expression on the right side of the IN operator. +)", 0) \ + M(UInt64, use_index_for_in_with_subqueries_max_values, 0, R"( +The maximum size of the set in the right-hand side of the IN operator to use table index for filtering. It allows to avoid performance degradation and higher memory usage due to the preparation of additional data structures for large queries. Zero means no limit. +)", 0) \ + M(Bool, analyze_index_with_space_filling_curves, true, R"( +If a table has a space-filling curve in its index, e.g. `ORDER BY mortonEncode(x, y)` or `ORDER BY hilbertEncode(x, y)`, and the query has conditions on its arguments, e.g. `x >= 10 AND x <= 20 AND y >= 20 AND y <= 30`, use the space-filling curve for index analysis. +)", 0) \ + M(Bool, joined_subquery_requires_alias, true, R"( +Force joined subqueries and table functions to have aliases for correct name qualification. +)", 0) \ + M(Bool, empty_result_for_aggregation_by_empty_set, false, R"( +Return empty result when aggregating without keys on empty set. +)", 0) \ + M(Bool, empty_result_for_aggregation_by_constant_keys_on_empty_set, true, R"( +Return empty result when aggregating by constant keys on empty set. +)", 0) \ + M(Bool, allow_distributed_ddl, true, R"( +If it is set to true, then a user is allowed to executed distributed DDL queries. +)", 0) \ + M(Bool, allow_suspicious_codecs, false, R"( +If it is set to true, allow to specify meaningless compression codecs. +)", 0) \ + M(Bool, enable_deflate_qpl_codec, false, R"( +If turned on, the DEFLATE_QPL codec may be used to compress columns. + +Possible values: + +- 0 - Disabled +- 1 - Enabled +)", 0) \ + M(Bool, enable_zstd_qat_codec, false, R"( +If turned on, the ZSTD_QAT codec may be used to compress columns. + +Possible values: + +- 0 - Disabled +- 1 - Enabled +)", 0) \ + M(UInt64, query_profiler_real_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, R"( +Sets the period for a real clock timer of the [query profiler](../../operations/optimizing-performance/sampling-query-profiler.md). Real clock timer counts wall-clock time. + +Possible values: + +- Positive integer number, in nanoseconds. 
+ + Recommended values: + + - 10000000 (100 times a second) nanoseconds and less for single queries. + - 1000000000 (once a second) for cluster-wide profiling. + +- 0 for turning off the timer. + +**Temporarily disabled in ClickHouse Cloud.** + +See also: + +- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) +)", 0) \ + M(UInt64, query_profiler_cpu_time_period_ns, QUERY_PROFILER_DEFAULT_SAMPLE_RATE_NS, R"( +Sets the period for a CPU clock timer of the [query profiler](../../operations/optimizing-performance/sampling-query-profiler.md). This timer counts only CPU time. + +Possible values: + +- A positive integer number of nanoseconds. + + Recommended values: + + - 10000000 (100 times a second) nanoseconds and more for single queries. + - 1000000000 (once a second) for cluster-wide profiling. + +- 0 for turning off the timer. + +**Temporarily disabled in ClickHouse Cloud.** + +See also: + +- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) +)", 0) \ + M(Bool, metrics_perf_events_enabled, false, R"( +If enabled, some of the perf events will be measured throughout queries' execution. +)", 0) \ + M(String, metrics_perf_events_list, "", R"( +Comma separated list of perf metrics that will be measured throughout queries' execution. Empty means all events. See PerfEventInfo in sources for the available events. +)", 0) \ + M(Float, opentelemetry_start_trace_probability, 0., R"( +Sets the probability that the ClickHouse can start a trace for executed queries (if no parent [trace context](https://www.w3.org/TR/trace-context/) is supplied). + +Possible values: + +- 0 — The trace for all executed queries is disabled (if no parent trace context is supplied). +- Positive floating-point number in the range [0..1]. For example, if the setting value is `0,5`, ClickHouse can start a trace on average for half of the queries. +- 1 — The trace for all executed queries is enabled. +)", 0) \ + M(Bool, opentelemetry_trace_processors, false, R"( +Collect OpenTelemetry spans for processors. +)", 0) \ + M(Bool, prefer_column_name_to_alias, false, R"( +Enables or disables using the original column names instead of aliases in query expressions and clauses. It especially matters when alias is the same as the column name, see [Expression Aliases](../../sql-reference/syntax.md/#notes-on-usage). Enable this setting to make aliases syntax rules in ClickHouse more compatible with most other database engines. + +Possible values: + +- 0 — The column name is substituted with the alias. +- 1 — The column name is not substituted with the alias. + +**Example** + +The difference between enabled and disabled: + +Query: + +```sql +SET prefer_column_name_to_alias = 0; +SELECT avg(number) AS number, max(number) FROM numbers(10); +``` + +Result: + +```text +Received exception from server (version 21.5.1): +Code: 184. DB::Exception: Received from localhost:9000. DB::Exception: Aggregate function avg(number) is found inside another aggregate function in query: While processing avg(number) AS number. +``` + +Query: + +```sql +SET prefer_column_name_to_alias = 1; +SELECT avg(number) AS number, max(number) FROM numbers(10); +``` + +Result: + +```text +┌─number─┬─max(number)─┐ +│ 4.5 │ 9 │ +└────────┴─────────────┘ +``` +)", 0) \ \ - M(Bool, prefer_global_in_and_join, false, "If enabled, all IN/JOIN operators will be rewritten as GLOBAL IN/JOIN. 
It's useful when the to-be-joined tables are only available on the initiator and we need to always scatter their data on-the-fly during distributed processing with the GLOBAL keyword. It's also useful to reduce the need to access the external sources joining external tables.", 0) \ - M(Bool, enable_vertical_final, true, "If enable, remove duplicated rows during FINAL by marking rows as deleted and filtering them later instead of merging rows", 0) \ + M(Bool, prefer_global_in_and_join, false, R"( +Enables the replacement of `IN`/`JOIN` operators with `GLOBAL IN`/`GLOBAL JOIN`. + +Possible values: + +- 0 — Disabled. `IN`/`JOIN` operators are not replaced with `GLOBAL IN`/`GLOBAL JOIN`. +- 1 — Enabled. `IN`/`JOIN` operators are replaced with `GLOBAL IN`/`GLOBAL JOIN`. + +**Usage** + +Although `SET distributed_product_mode=global` can change the behavior of queries for distributed tables, it's not suitable for local tables or tables from external resources. This is when the `prefer_global_in_and_join` setting comes into play. + +For example, suppose we have query serving nodes that contain local tables, which are not suitable for distribution. We need to scatter their data on the fly during distributed processing with the `GLOBAL` keyword — `GLOBAL IN`/`GLOBAL JOIN`. + +Another use case of `prefer_global_in_and_join` is accessing tables created by external engines. This setting helps to reduce the number of calls to external sources while joining such tables: only one call per query. + +**See also:** + +- [Distributed subqueries](../../sql-reference/operators/in.md/#select-distributed-subqueries) for more information on how to use `GLOBAL IN`/`GLOBAL JOIN` +)", 0) \ + M(Bool, enable_vertical_final, true, R"( +If enabled, remove duplicated rows during FINAL by marking rows as deleted and filtering them later instead of merging rows +)", 0) \ \ \ /** Limits during query execution are part of the settings. \ @@ -417,587 +2280,3534 @@ namespace ErrorCodes * Almost all limits apply to each stream individually. \ */ \ \ - M(UInt64, max_rows_to_read, 0, "Limit on read rows from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server.", 0) \ - M(UInt64, max_bytes_to_read, 0, "Limit on read bytes (after decompression) from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server.", 0) \ - M(OverflowMode, read_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ + M(UInt64, max_rows_to_read, 0, R"( +Limit on read rows from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server. +)", 0) \ + M(UInt64, max_bytes_to_read, 0, R"( +Limit on read bytes (after decompression) from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server. +)", 0) \ + M(OverflowMode, read_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ \ - M(UInt64, max_rows_to_read_leaf, 0, "Limit on read rows on the leaf nodes for distributed queries. Limit is applied for local reads only, excluding the final merge stage on the root node. Note, the setting is unstable with prefer_localhost_replica=1.", 0) \ - M(UInt64, max_bytes_to_read_leaf, 0, "Limit on read bytes (after decompression) on the leaf nodes for distributed queries.
Limit is applied for local reads only, excluding the final merge stage on the root node. Note, the setting is unstable with prefer_localhost_replica=1.", 0) \ - M(OverflowMode, read_overflow_mode_leaf, OverflowMode::THROW, "What to do when the leaf limit is exceeded.", 0) \ + M(UInt64, max_rows_to_read_leaf, 0, R"( +Limit on read rows on the leaf nodes for distributed queries. Limit is applied for local reads only, excluding the final merge stage on the root node. Note, the setting is unstable with prefer_localhost_replica=1. +)", 0) \ + M(UInt64, max_bytes_to_read_leaf, 0, R"( +Limit on read bytes (after decompression) on the leaf nodes for distributed queries. Limit is applied for local reads only, excluding the final merge stage on the root node. Note, the setting is unstable with prefer_localhost_replica=1. +)", 0) \ + M(OverflowMode, read_overflow_mode_leaf, OverflowMode::THROW, R"( +What to do when the leaf limit is exceeded. +)", 0) \ \ - M(UInt64, max_rows_to_group_by, 0, "If aggregation during GROUP BY is generating more than the specified number of rows (unique GROUP BY keys), the behavior will be determined by the 'group_by_overflow_mode' which by default is - throw an exception, but can be also switched to an approximate GROUP BY mode.", 0) \ - M(OverflowModeGroupBy, group_by_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ - M(UInt64, max_bytes_before_external_group_by, 0, "If memory usage during GROUP BY operation is exceeding this threshold in bytes, activate the 'external aggregation' mode (spill data to disk). Recommended value is half of the available system memory.", 0) \ + M(UInt64, max_rows_to_group_by, 0, R"( +If aggregation during GROUP BY is generating more than the specified number of rows (unique GROUP BY keys), the behavior will be determined by the 'group_by_overflow_mode' which by default is - throw an exception, but can be also switched to an approximate GROUP BY mode. +)", 0) \ + M(OverflowModeGroupBy, group_by_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ + M(UInt64, max_bytes_before_external_group_by, 0, R"( +If memory usage during GROUP BY operation is exceeding this threshold in bytes, activate the 'external aggregation' mode (spill data to disk). Recommended value is half of the available system memory. +)", 0) \ \ - M(UInt64, max_rows_to_sort, 0, "If more than the specified amount of records have to be processed for ORDER BY operation, the behavior will be determined by the 'sort_overflow_mode' which by default is - throw an exception", 0) \ - M(UInt64, max_bytes_to_sort, 0, "If more than the specified amount of (uncompressed) bytes have to be processed for ORDER BY operation, the behavior will be determined by the 'sort_overflow_mode' which by default is - throw an exception", 0) \ - M(OverflowMode, sort_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ - M(UInt64, prefer_external_sort_block_bytes, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging.", 0) \ - M(UInt64, max_bytes_before_external_sort, 0, "If memory usage during ORDER BY operation is exceeding this threshold in bytes, activate the 'external sorting' mode (spill data to disk). 
Recommended value is half of the available system memory.", 0) \ - M(UInt64, max_bytes_before_remerge_sort, 1000000000, "In case of ORDER BY with LIMIT, when memory usage is higher than specified threshold, perform additional steps of merging blocks before final merge to keep just top LIMIT rows.", 0) \ - M(Float, remerge_sort_lowered_memory_bytes_ratio, 2., "If memory usage after remerge does not reduced by this ratio, remerge will be disabled.", 0) \ + M(UInt64, max_rows_to_sort, 0, R"( +If more than the specified amount of records have to be processed for ORDER BY operation, the behavior will be determined by the 'sort_overflow_mode' which by default is - throw an exception +)", 0) \ + M(UInt64, max_bytes_to_sort, 0, R"( +If more than the specified amount of (uncompressed) bytes have to be processed for ORDER BY operation, the behavior will be determined by the 'sort_overflow_mode' which by default is - throw an exception +)", 0) \ + M(OverflowMode, sort_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ + M(UInt64, prefer_external_sort_block_bytes, DEFAULT_BLOCK_SIZE * 256, R"( +Prefer maximum block bytes for external sort, reduce the memory usage during merging. +)", 0) \ + M(UInt64, max_bytes_before_external_sort, 0, R"( +If memory usage during ORDER BY operation is exceeding this threshold in bytes, activate the 'external sorting' mode (spill data to disk). Recommended value is half of the available system memory. +)", 0) \ + M(UInt64, max_bytes_before_remerge_sort, 1000000000, R"( +In case of ORDER BY with LIMIT, when memory usage is higher than specified threshold, perform additional steps of merging blocks before final merge to keep just top LIMIT rows. +)", 0) \ + M(Float, remerge_sort_lowered_memory_bytes_ratio, 2., R"( +If memory usage after remerge does not reduced by this ratio, remerge will be disabled. +)", 0) \ \ - M(UInt64, max_result_rows, 0, "Limit on result size in rows. The query will stop after processing a block of data if the threshold is met, but it will not cut the last block of the result, therefore the result size can be larger than the threshold.", 0) \ - M(UInt64, max_result_bytes, 0, "Limit on result size in bytes (uncompressed). The query will stop after processing a block of data if the threshold is met, but it will not cut the last block of the result, therefore the result size can be larger than the threshold. Caveats: the result size in memory is taken into account for this threshold. Even if the result size is small, it can reference larger data structures in memory, representing dictionaries of LowCardinality columns, and Arenas of AggregateFunction columns, so the threshold can be exceeded despite the small result size. The setting is fairly low level and should be used with caution.", 0) \ - M(OverflowMode, result_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ + M(UInt64, max_result_rows, 0, R"( +Limit on result size in rows. The query will stop after processing a block of data if the threshold is met, but it will not cut the last block of the result, therefore the result size can be larger than the threshold. +)", 0) \ + M(UInt64, max_result_bytes, 0, R"( +Limit on result size in bytes (uncompressed). The query will stop after processing a block of data if the threshold is met, but it will not cut the last block of the result, therefore the result size can be larger than the threshold. Caveats: the result size in memory is taken into account for this threshold. 
Even if the result size is small, it can reference larger data structures in memory, representing dictionaries of LowCardinality columns, and Arenas of AggregateFunction columns, so the threshold can be exceeded despite the small result size. The setting is fairly low level and should be used with caution. +)", 0) \ + M(OverflowMode, result_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ \ /* TODO: Check also when merging and finalizing aggregate functions. */ \ - M(Seconds, max_execution_time, 0, "If query runtime exceeds the specified number of seconds, the behavior will be determined by the 'timeout_overflow_mode', which by default is - throw an exception. Note that the timeout is checked and the query can stop only in designated places during data processing. It currently cannot stop during merging of aggregation states or during query analysis, and the actual run time will be higher than the value of this setting.", 0) \ - M(OverflowMode, timeout_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ - M(Seconds, max_execution_time_leaf, 0, "Similar semantic to max_execution_time but only apply on leaf node for distributed queries, the time out behavior will be determined by 'timeout_overflow_mode_leaf' which by default is - throw an exception", 0) \ - M(OverflowMode, timeout_overflow_mode_leaf, OverflowMode::THROW, "What to do when the leaf limit is exceeded.", 0) \ + M(Seconds, max_execution_time, 0, R"( +If query runtime exceeds the specified number of seconds, the behavior will be determined by the 'timeout_overflow_mode', which by default is - throw an exception. Note that the timeout is checked and the query can stop only in designated places during data processing. It currently cannot stop during merging of aggregation states or during query analysis, and the actual run time will be higher than the value of this setting. +)", 0) \ + M(OverflowMode, timeout_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ + M(Seconds, max_execution_time_leaf, 0, R"( +Similar semantic to max_execution_time but only apply on leaf node for distributed queries, the time out behavior will be determined by 'timeout_overflow_mode_leaf' which by default is - throw an exception +)", 0) \ + M(OverflowMode, timeout_overflow_mode_leaf, OverflowMode::THROW, R"( +What to do when the leaf limit is exceeded. +)", 0) \ \ - M(UInt64, min_execution_speed, 0, "Minimum number of execution rows per second.", 0) \ - M(UInt64, max_execution_speed, 0, "Maximum number of execution rows per second.", 0) \ - M(UInt64, min_execution_speed_bytes, 0, "Minimum number of execution bytes per second.", 0) \ - M(UInt64, max_execution_speed_bytes, 0, "Maximum number of execution bytes per second.", 0) \ - M(Seconds, timeout_before_checking_execution_speed, 10, "Check that the speed is not too low after the specified time has elapsed.", 0) \ - M(Seconds, max_estimated_execution_time, 0, "Maximum query estimate execution time in seconds.", 0) \ + M(UInt64, min_execution_speed, 0, R"( +Minimum number of execution rows per second. +)", 0) \ + M(UInt64, max_execution_speed, 0, R"( +Maximum number of execution rows per second. +)", 0) \ + M(UInt64, min_execution_speed_bytes, 0, R"( +Minimum number of execution bytes per second. +)", 0) \ + M(UInt64, max_execution_speed_bytes, 0, R"( +Maximum number of execution bytes per second. 
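+ +For example, a minimal sketch that caps a query's read throughput at roughly 100 MB/s (an illustrative value; the limit is only enforced after `timeout_before_checking_execution_speed` has elapsed): + +```sql +SET max_execution_speed_bytes = 100000000; +SELECT sum(number) FROM numbers(1000000000); +```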
+)", 0) \ + M(Seconds, timeout_before_checking_execution_speed, 10, R"( +Check that the speed is not too low after the specified time has elapsed. +)", 0) \ + M(Seconds, max_estimated_execution_time, 0, R"( +Maximum query estimate execution time in seconds. +)", 0) \ \ - M(UInt64, max_columns_to_read, 0, "If a query requires reading more than specified number of columns, exception is thrown. Zero value means unlimited. This setting is useful to prevent too complex queries.", 0) \ - M(UInt64, max_temporary_columns, 0, "If a query generates more than the specified number of temporary columns in memory as a result of intermediate calculation, the exception is thrown. Zero value means unlimited. This setting is useful to prevent too complex queries.", 0) \ - M(UInt64, max_temporary_non_const_columns, 0, "Similar to the 'max_temporary_columns' setting but applies only to non-constant columns. This makes sense because constant columns are cheap and it is reasonable to allow more of them.", 0) \ + M(UInt64, max_columns_to_read, 0, R"( +If a query requires reading more than specified number of columns, exception is thrown. Zero value means unlimited. This setting is useful to prevent too complex queries. +)", 0) \ + M(UInt64, max_temporary_columns, 0, R"( +If a query generates more than the specified number of temporary columns in memory as a result of intermediate calculation, the exception is thrown. Zero value means unlimited. This setting is useful to prevent too complex queries. +)", 0) \ + M(UInt64, max_temporary_non_const_columns, 0, R"( +Similar to the 'max_temporary_columns' setting but applies only to non-constant columns. This makes sense because constant columns are cheap and it is reasonable to allow more of them. +)", 0) \ \ - M(UInt64, max_sessions_for_user, 0, "Maximum number of simultaneous sessions for a user.", 0) \ + M(UInt64, max_sessions_for_user, 0, R"( +Maximum number of simultaneous sessions for a user. +)", 0) \ \ - M(UInt64, max_subquery_depth, 100, "If a query has more than the specified number of nested subqueries, throw an exception. This allows you to have a sanity check to protect the users of your cluster from going insane with their queries.", 0) \ - M(UInt64, max_analyze_depth, 5000, "Maximum number of analyses performed by interpreter.", 0) \ - M(UInt64, max_ast_depth, 1000, "Maximum depth of query syntax tree. Checked after parsing.", 0) \ - M(UInt64, max_ast_elements, 50000, "Maximum size of query syntax tree in number of nodes. Checked after parsing.", 0) \ - M(UInt64, max_expanded_ast_elements, 500000, "Maximum size of query syntax tree in number of nodes after expansion of aliases and the asterisk.", 0) \ + M(UInt64, max_subquery_depth, 100, R"( +If a query has more than the specified number of nested subqueries, throw an exception. This allows you to have a sanity check to protect the users of your cluster from going insane with their queries. +)", 0) \ + M(UInt64, max_analyze_depth, 5000, R"( +Maximum number of analyses performed by interpreter. +)", 0) \ + M(UInt64, max_ast_depth, 1000, R"( +Maximum depth of query syntax tree. Checked after parsing. +)", 0) \ + M(UInt64, max_ast_elements, 50000, R"( +Maximum size of query syntax tree in number of nodes. Checked after parsing. +)", 0) \ + M(UInt64, max_expanded_ast_elements, 500000, R"( +Maximum size of query syntax tree in number of nodes after expansion of aliases and the asterisk. +)", 0) \ \ - M(UInt64, readonly, 0, "0 - no read-only restrictions. 
1 - only read requests, as well as changing explicitly allowed settings. 2 - only read requests, as well as changing settings, except for the 'readonly' setting.", 0) \ + M(UInt64, readonly, 0, R"( +0 - no read-only restrictions. 1 - only read requests, as well as changing explicitly allowed settings. 2 - only read requests, as well as changing settings, except for the 'readonly' setting. +)", 0) \ \ - M(UInt64, max_rows_in_set, 0, "Maximum size of the set (in number of elements) resulting from the execution of the IN section.", 0) \ - M(UInt64, max_bytes_in_set, 0, "Maximum size of the set (in bytes in memory) resulting from the execution of the IN section.", 0) \ - M(OverflowMode, set_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ + M(UInt64, max_rows_in_set, 0, R"( +Maximum size of the set (in number of elements) resulting from the execution of the IN section. +)", 0) \ + M(UInt64, max_bytes_in_set, 0, R"( +Maximum size of the set (in bytes in memory) resulting from the execution of the IN section. +)", 0) \ + M(OverflowMode, set_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ \ - M(UInt64, max_rows_in_join, 0, "Maximum size of the hash table for JOIN (in number of rows).", 0) \ - M(UInt64, max_bytes_in_join, 0, "Maximum size of the hash table for JOIN (in number of bytes in memory).", 0) \ - M(OverflowMode, join_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ - M(Bool, join_any_take_last_row, false, "When disabled (default) ANY JOIN will take the first found row for a key. When enabled, it will take the last row seen if there are multiple rows for the same key. Can be applied only to hash join and storage join.", IMPORTANT) \ - M(JoinAlgorithm, join_algorithm, JoinAlgorithm::DEFAULT, "Specify join algorithm.", 0) \ - M(UInt64, cross_join_min_rows_to_compress, 10000000, "Minimal count of rows to compress block in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached.", 0) \ - M(UInt64, cross_join_min_bytes_to_compress, 1_GiB, "Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached.", 0) \ - M(UInt64, default_max_bytes_in_join, 1000000000, "Maximum size of right-side table if limit is required but max_bytes_in_join is not set.", 0) \ - M(UInt64, partial_merge_join_left_table_buffer_bytes, 0, "If not 0 group left table blocks in bigger ones for left-side table in partial merge join. It uses up to 2x of specified memory per joining thread.", 0) \ - M(UInt64, partial_merge_join_rows_in_right_blocks, 65536, "Split right-hand joining data in blocks of specified size. It's a portion of data indexed by min-max values and possibly unloaded on disk.", 0) \ - M(UInt64, join_on_disk_max_files_to_merge, 64, "For MergeJoin on disk set how much files it's allowed to sort simultaneously. Then this value bigger then more memory used and then less disk I/O needed. Minimum is 2.", 0) \ - M(UInt64, max_rows_in_set_to_optimize_join, 0, "Maximal size of the set to filter joined tables by each other row sets before joining. 0 - disable.", 0) \ + M(UInt64, max_rows_in_join, 0, R"( +Maximum size of the hash table for JOIN (in number of rows). +)", 0) \ + M(UInt64, max_bytes_in_join, 0, R"( +Maximum size of the hash table for JOIN (in number of bytes in memory). 
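+ +For example, a minimal sketch that caps the JOIN hash table at roughly 1 GB (an illustrative value) and lets `join_overflow_mode`, documented below, decide what happens once the cap is reached: + +```sql +SET max_bytes_in_join = 1000000000, join_overflow_mode = 'break'; +```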
+)", 0) \ + M(OverflowMode, join_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ + M(Bool, join_any_take_last_row, false, R"( +Changes the behaviour of join operations with `ANY` strictness. + +:::note +This setting applies only for `JOIN` operations with [Join](../../engines/table-engines/special/join.md) engine tables. +::: + +Possible values: + +- 0 — If the right table has more than one matching row, only the first one found is joined. +- 1 — If the right table has more than one matching row, only the last one found is joined. + +See also: + +- [JOIN clause](../../sql-reference/statements/select/join.md/#select-join) +- [Join table engine](../../engines/table-engines/special/join.md) +- [join_default_strictness](#join_default_strictness) +)", IMPORTANT) \ + M(JoinAlgorithm, join_algorithm, JoinAlgorithm::DEFAULT, R"( +Specifies which [JOIN](../../sql-reference/statements/select/join.md) algorithm is used. + +Several algorithms can be specified, and an available one would be chosen for a particular query based on kind/strictness and table engine. + +Possible values: + +- default + + This is the equivalent of `hash` or `direct`, if possible (same as `direct,hash`) + +- grace_hash + + [Grace hash join](https://en.wikipedia.org/wiki/Hash_join#Grace_hash_join) is used. Grace hash provides an algorithm option that provides performant complex joins while limiting memory use. + + The first phase of a grace join reads the right table and splits it into N buckets depending on the hash value of key columns (initially, N is `grace_hash_join_initial_buckets`). This is done in a way to ensure that each bucket can be processed independently. Rows from the first bucket are added to an in-memory hash table while the others are saved to disk. If the hash table grows beyond the memory limit (e.g., as set by [`max_bytes_in_join`](/docs/en/operations/settings/query-complexity.md/#max_bytes_in_join)), the number of buckets is increased and the assigned bucket for each row. Any rows which don’t belong to the current bucket are flushed and reassigned. + + Supports `INNER/LEFT/RIGHT/FULL ALL/ANY JOIN`. + +- hash + + [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section. + +- parallel_hash + + A variation of `hash` join that splits the data into buckets and builds several hashtables instead of one concurrently to speed up this process. + + When using the `hash` algorithm, the right part of `JOIN` is uploaded into RAM. + +- partial_merge + + A variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted. + + The `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported). + + When using the `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic realization. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by the `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks. + +- direct + + This algorithm can be applied when the storage for the right table supports key-value requests. 
+ + The `direct` algorithm performs a lookup in the right table using rows from the left table as keys. It's supported only by special storage such as [Dictionary](../../engines/table-engines/special/dictionary.md/#dictionary) or [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) and only the `LEFT` and `INNER` JOINs. + +- auto + + When set to `auto`, `hash` join is tried first, and the algorithm is switched on the fly to another algorithm if the memory limit is violated. + +- full_sorting_merge + + [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting joined tables before joining. + +- prefer_partial_merge + + ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`. +)", 0) \ + M(UInt64, cross_join_min_rows_to_compress, 10000000, R"( +Minimal count of rows to compress block in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. +)", 0) \ + M(UInt64, cross_join_min_bytes_to_compress, 1_GiB, R"( +Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. +)", 0) \ + M(UInt64, default_max_bytes_in_join, 1000000000, R"( +Maximum size of right-side table if limit is required but max_bytes_in_join is not set. +)", 0) \ + M(UInt64, partial_merge_join_left_table_buffer_bytes, 0, R"( +If not 0 group left table blocks in bigger ones for left-side table in partial merge join. It uses up to 2x of specified memory per joining thread. +)", 0) \ + M(UInt64, partial_merge_join_rows_in_right_blocks, 65536, R"( +Limits sizes of right-hand join data blocks in partial merge join algorithm for [JOIN](../../sql-reference/statements/select/join.md) queries. + +ClickHouse server: + +1. Splits right-hand join data into blocks with up to the specified number of rows. +2. Indexes each block with its minimum and maximum values. +3. Unloads prepared blocks to disk if it is possible. + +Possible values: + +- Any positive integer. Recommended range of values: \[1000, 100000\]. +)", 0) \ + M(UInt64, join_on_disk_max_files_to_merge, 64, R"( +Limits the number of files allowed for parallel sorting in MergeJoin operations when they are executed on disk. + +The bigger the value of the setting, the more RAM is used and the less disk I/O is needed. + +Possible values: + +- Any positive integer, starting from 2. +)", 0) \ + M(UInt64, max_rows_in_set_to_optimize_join, 0, R"( +Maximal size of the set to filter joined tables by each other's row sets before joining. + +Possible values: + +- 0 — Disable. +- Any positive integer. +)", 0) \ \ - M(Bool, compatibility_ignore_collation_in_create_table, true, "Compatibility ignore collation in create table", 0) \ + M(Bool, compatibility_ignore_collation_in_create_table, true, R"( +Compatibility ignore collation in create table +)", 0) \ \ - M(String, temporary_files_codec, "LZ4", "Set compression codec for temporary files produced by (JOINs, external GROUP BY, external ORDER BY). I.e. LZ4, NONE.", 0) \ + M(String, temporary_files_codec, "LZ4", R"( +Sets compression codec for temporary files used in sorting and joining operations on disk. + +Possible values: + +- LZ4 — [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression is applied. +- NONE — No compression is applied. 
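+ +For example, a minimal sketch that writes temporary files for external sorting without compression (whether this helps depends on the disk and CPU trade-off; the threshold below is illustrative): + +```sql +SET temporary_files_codec = 'NONE', max_bytes_before_external_sort = 10000000000; +```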
+)", 0) \ \ - M(UInt64, max_rows_to_transfer, 0, "Maximum size (in rows) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.", 0) \ - M(UInt64, max_bytes_to_transfer, 0, "Maximum size (in uncompressed bytes) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.", 0) \ - M(OverflowMode, transfer_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ + M(UInt64, max_rows_to_transfer, 0, R"( +Maximum size (in rows) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed. +)", 0) \ + M(UInt64, max_bytes_to_transfer, 0, R"( +Maximum size (in uncompressed bytes) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed. +)", 0) \ + M(OverflowMode, transfer_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ \ - M(UInt64, max_rows_in_distinct, 0, "Maximum number of elements during execution of DISTINCT.", 0) \ - M(UInt64, max_bytes_in_distinct, 0, "Maximum total size of the state (in uncompressed bytes) in memory for the execution of DISTINCT.", 0) \ - M(OverflowMode, distinct_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ + M(UInt64, max_rows_in_distinct, 0, R"( +Maximum number of elements during execution of DISTINCT. +)", 0) \ + M(UInt64, max_bytes_in_distinct, 0, R"( +Maximum total size of the state (in uncompressed bytes) in memory for the execution of DISTINCT. +)", 0) \ + M(OverflowMode, distinct_overflow_mode, OverflowMode::THROW, R"( +What to do when the limit is exceeded. +)", 0) \ \ - M(UInt64, max_memory_usage, 0, "Maximum memory usage for processing of single query. Zero means unlimited.", 0) \ - M(UInt64, memory_overcommit_ratio_denominator, 1_GiB, "It represents soft memory limit on the user level. This value is used to compute query overcommit ratio.", 0) \ - M(UInt64, max_memory_usage_for_user, 0, "Maximum memory usage for processing all concurrently running queries for the user. Zero means unlimited.", 0) \ - M(UInt64, memory_overcommit_ratio_denominator_for_user, 1_GiB, "It represents soft memory limit on the global level. This value is used to compute query overcommit ratio.", 0) \ - M(UInt64, max_untracked_memory, (4 * 1024 * 1024), "Small allocations and deallocations are grouped in thread local variable and tracked or profiled only when an amount (in absolute value) becomes larger than the specified value. If the value is higher than 'memory_profiler_step' it will be effectively lowered to 'memory_profiler_step'.", 0) \ - M(UInt64, memory_profiler_step, (4 * 1024 * 1024), "Whenever query memory usage becomes larger than every next step in a number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down query processing.", 0) \ - M(Float, memory_profiler_sample_probability, 0., "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless of the size of the allocation (can be changed with `memory_profiler_sample_min_allocation_size` and `memory_profiler_sample_max_allocation_size`). Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. 
You may want to set 'max_untracked_memory' to 0 for extra fine-grained sampling.", 0) \ - M(UInt64, memory_profiler_sample_min_allocation_size, 0, "Collect random allocations of size greater or equal than the specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected.", 0) \ - M(UInt64, memory_profiler_sample_max_allocation_size, 0, "Collect random allocations of size less or equal than the specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected.", 0) \ - M(Bool, trace_profile_events, false, "Send to system.trace_log profile event and value of increment on each increment with 'ProfileEvent' trace_type", 0) \ + M(UInt64, max_memory_usage, 0, R"( +Maximum memory usage for processing of single query. Zero means unlimited. +)", 0) \ + M(UInt64, memory_overcommit_ratio_denominator, 1_GiB, R"( +It represents the soft memory limit when the hard limit is reached on the global level. +This value is used to compute the overcommit ratio for the query. +Zero means skip the query. +Read more about [memory overcommit](memory-overcommit.md). +)", 0) \ + M(UInt64, max_memory_usage_for_user, 0, R"( +Maximum memory usage for processing all concurrently running queries for the user. Zero means unlimited. +)", 0) \ + M(UInt64, memory_overcommit_ratio_denominator_for_user, 1_GiB, R"( +It represents the soft memory limit when the hard limit is reached on the user level. +This value is used to compute the overcommit ratio for the query. +Zero means skip the query. +Read more about [memory overcommit](memory-overcommit.md). +)", 0) \ + M(UInt64, max_untracked_memory, (4 * 1024 * 1024), R"( +Small allocations and deallocations are grouped in thread local variable and tracked or profiled only when an amount (in absolute value) becomes larger than the specified value. If the value is higher than 'memory_profiler_step' it will be effectively lowered to 'memory_profiler_step'. +)", 0) \ + M(UInt64, memory_profiler_step, (4 * 1024 * 1024), R"( +Sets the step of memory profiler. Whenever query memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stacktrace and will write it into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log). + +Possible values: + +- A positive integer number of bytes. + +- 0 for turning off the memory profiler. +)", 0) \ + M(Float, memory_profiler_sample_probability, 0., R"( +Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless of the size of the allocation (can be changed with `memory_profiler_sample_min_allocation_size` and `memory_profiler_sample_max_allocation_size`). Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine-grained sampling. +)", 0) \ + M(UInt64, memory_profiler_sample_min_allocation_size, 0, R"( +Collect random allocations of size greater or equal than the specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected. 
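+ +For example, a minimal sketch that samples roughly 1% of allocations of at least 1 MiB (illustrative values), with `max_untracked_memory` set to 0 as recommended above: + +```sql +SET memory_profiler_sample_probability = 0.01, memory_profiler_sample_min_allocation_size = 1048576, max_untracked_memory = 0; +```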
+)", 0) \ + M(UInt64, memory_profiler_sample_max_allocation_size, 0, R"( +Collect random allocations of size less or equal than the specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold work as expected. +)", 0) \ + M(Bool, trace_profile_events, false, R"( +Enables or disables collecting stacktraces on each update of profile events along with the name of profile event and the value of increment and sending them into [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log). + +Possible values: + +- 1 — Tracing of profile events enabled. +- 0 — Tracing of profile events disabled. +)", 0) \ \ - M(UInt64, memory_usage_overcommit_max_wait_microseconds, 5'000'000, "Maximum time thread will wait for memory to be freed in the case of memory overcommit. If timeout is reached and memory is not freed, exception is thrown.", 0) \ + M(UInt64, memory_usage_overcommit_max_wait_microseconds, 5'000'000, R"( +Maximum time thread will wait for memory to be freed in the case of memory overcommit on a user level. +If the timeout is reached and memory is not freed, an exception is thrown. +Read more about [memory overcommit](memory-overcommit.md). +)", 0) \ \ - M(UInt64, max_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for a query. Zero means unlimited.", 0) \ - M(UInt64, max_network_bytes, 0, "The maximum number of bytes (compressed) to receive or transmit over the network for execution of the query.", 0) \ - M(UInt64, max_network_bandwidth_for_user, 0, "The maximum speed of data exchange over the network in bytes per second for all concurrently running user queries. Zero means unlimited.", 0)\ - M(UInt64, max_network_bandwidth_for_all_users, 0, "The maximum speed of data exchange over the network in bytes per second for all concurrently running queries. Zero means unlimited.", 0) \ + M(UInt64, max_network_bandwidth, 0, R"( +Limits the speed of the data exchange over the network in bytes per second. This setting applies to every query. + +Possible values: + +- Positive integer. +- 0 — Bandwidth control is disabled. +)", 0) \ + M(UInt64, max_network_bytes, 0, R"( +Limits the data volume (in bytes) that is received or transmitted over the network when executing a query. This setting applies to every individual query. + +Possible values: + +- Positive integer. +- 0 — Data volume control is disabled. +)", 0) \ + M(UInt64, max_network_bandwidth_for_user, 0, R"( +Limits the speed of the data exchange over the network in bytes per second. This setting applies to all concurrently running queries performed by a single user. + +Possible values: + +- Positive integer. +- 0 — Control of the data speed is disabled. +)", 0)\ + M(UInt64, max_network_bandwidth_for_all_users, 0, R"( +Limits the speed that data is exchanged at over the network in bytes per second. This setting applies to all concurrently running queries on the server. + +Possible values: + +- Positive integer. +- 0 — Control of the data speed is disabled. +)", 0) \ \ - M(UInt64, max_temporary_data_on_disk_size_for_user, 0, "The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running user queries. Zero means unlimited.", 0)\ - M(UInt64, max_temporary_data_on_disk_size_for_query, 0, "The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running queries. 
Zero means unlimited.", 0)\ + M(UInt64, max_temporary_data_on_disk_size_for_user, 0, R"( +The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running user queries. Zero means unlimited. +)", 0)\ + M(UInt64, max_temporary_data_on_disk_size_for_query, 0, R"( +The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running queries. Zero means unlimited. +)", 0)\ \ - M(UInt64, backup_restore_keeper_max_retries, 20, "Max retries for keeper operations during backup or restore", 0) \ - M(UInt64, backup_restore_keeper_retry_initial_backoff_ms, 100, "Initial backoff timeout for [Zoo]Keeper operations during backup or restore", 0) \ - M(UInt64, backup_restore_keeper_retry_max_backoff_ms, 5000, "Max backoff timeout for [Zoo]Keeper operations during backup or restore", 0) \ - M(Float, backup_restore_keeper_fault_injection_probability, 0.0f, "Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f]", 0) \ - M(UInt64, backup_restore_keeper_fault_injection_seed, 0, "0 - random seed, otherwise the setting value", 0) \ - M(UInt64, backup_restore_keeper_value_max_size, 1048576, "Maximum size of data of a [Zoo]Keeper's node during backup", 0) \ - M(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, "Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore", 0) \ - M(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, "Maximum size of batch for multi request to [Zoo]Keeper during backup or restore", 0) \ - M(UInt64, backup_restore_s3_retry_attempts, 1000, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries. It takes place only for backup/restore.", 0) \ - M(UInt64, max_backup_bandwidth, 0, "The maximum read speed in bytes per second for particular backup on server. Zero means unlimited.", 0) \ + M(UInt64, backup_restore_keeper_max_retries, 20, R"( +Max retries for keeper operations during backup or restore +)", 0) \ + M(UInt64, backup_restore_keeper_retry_initial_backoff_ms, 100, R"( +Initial backoff timeout for [Zoo]Keeper operations during backup or restore +)", 0) \ + M(UInt64, backup_restore_keeper_retry_max_backoff_ms, 5000, R"( +Max backoff timeout for [Zoo]Keeper operations during backup or restore +)", 0) \ + M(Float, backup_restore_keeper_fault_injection_probability, 0.0f, R"( +Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f] +)", 0) \ + M(UInt64, backup_restore_keeper_fault_injection_seed, 0, R"( +0 - random seed, otherwise the setting value +)", 0) \ + M(UInt64, backup_restore_keeper_value_max_size, 1048576, R"( +Maximum size of data of a [Zoo]Keeper's node during backup +)", 0) \ + M(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, R"( +Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore +)", 0) \ + M(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, R"( +Maximum size of batch for multi request to [Zoo]Keeper during backup or restore +)", 0) \ + M(UInt64, backup_restore_s3_retry_attempts, 1000, R"( +Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries. It takes place only for backup/restore. +)", 0) \ + M(UInt64, max_backup_bandwidth, 0, R"( +The maximum read speed in bytes per second for particular backup on server. Zero means unlimited. 
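+ +For example, a minimal sketch that throttles a single backup to roughly 100 MB/s (the table name and the 'backups' disk below are illustrative): + +```sql +SET max_backup_bandwidth = 100000000; +BACKUP TABLE db.my_table TO Disk('backups', 'my_table_backup.zip'); +```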
+)", 0) \ \ - M(Bool, log_profile_events, true, "Log query performance statistics into the query_log, query_thread_log and query_views_log.", 0) \ - M(Bool, log_query_settings, true, "Log query settings into the query_log.", 0) \ - M(Bool, log_query_threads, false, "Log query threads into system.query_thread_log table. This setting have effect only when 'log_queries' is true.", 0) \ - M(Bool, log_query_views, true, "Log query dependent views into system.query_views_log table. This setting have effect only when 'log_queries' is true.", 0) \ - M(String, log_comment, "", "Log comment into the system.query_log table and server log. It can be set to arbitrary string no longer than max_query_size.", 0) \ - M(LogsLevel, send_logs_level, LogsLevel::fatal, "Send server text logs with specified minimum level to client. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ - M(String, send_logs_source_regexp, "", "Send server text logs with specified regexp to match log source name. Empty means all sources.", 0) \ - M(Bool, enable_optimize_predicate_expression, true, "If it is set to true, optimize predicates to subqueries.", 0) \ - M(Bool, enable_optimize_predicate_expression_to_final_subquery, true, "Allow push predicate to final subquery.", 0) \ - M(Bool, allow_push_predicate_when_subquery_contains_with, true, "Allows push predicate when subquery contains WITH clause", 0) \ + M(Bool, log_profile_events, true, R"( +Log query performance statistics into the query_log, query_thread_log and query_views_log. +)", 0) \ + M(Bool, log_query_settings, true, R"( +Log query settings into the query_log. +)", 0) \ + M(Bool, log_query_threads, false, R"( +Setting up query threads logging. + +Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#query_thread_log) server configuration parameter. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +**Example** + +``` text +log_query_threads=1 +``` +)", 0) \ + M(Bool, log_query_views, true, R"( +Setting up query views logging. + +When a query run by ClickHouse with this setting enabled has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#query_views_log) server configuration parameter. + +Example: + +``` text +log_query_views=1 +``` +)", 0) \ + M(String, log_comment, "", R"( +Specifies the value for the `log_comment` field of the [system.query_log](../system-tables/query_log.md) table and comment text for the server log. + +It can be used to improve the readability of server logs. Additionally, it helps to select queries related to the test from the `system.query_log` after running [clickhouse-test](../../development/tests.md). + +Possible values: + +- Any string no longer than [max_query_size](#max_query_size). If the max_query_size is exceeded, the server throws an exception. 
+ +**Example** + +Query: + +``` sql +SET log_comment = 'log_comment test', log_queries = 1; +SELECT 1; +SYSTEM FLUSH LOGS; +SELECT type, query FROM system.query_log WHERE log_comment = 'log_comment test' AND event_date >= yesterday() ORDER BY event_time DESC LIMIT 2; +``` + +Result: + +``` text +┌─type────────┬─query─────┐ +│ QueryStart │ SELECT 1; │ +│ QueryFinish │ SELECT 1; │ +└─────────────┴───────────┘ +``` +)", 0) \ + M(LogsLevel, send_logs_level, LogsLevel::fatal, R"( +Send server text logs with specified minimum level to client. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none' +)", 0) \ + M(String, send_logs_source_regexp, "", R"( +Send server text logs with specified regexp to match log source name. Empty means all sources. +)", 0) \ + M(Bool, enable_optimize_predicate_expression, true, R"( +Turns on predicate pushdown in `SELECT` queries. + +Predicate pushdown may significantly reduce network traffic for distributed queries. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Usage + +Consider the following queries: + +1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` +2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` + +If `enable_optimize_predicate_expression = 1`, then the execution time of these queries is equal because ClickHouse applies `WHERE` to the subquery when processing it. + +If `enable_optimize_predicate_expression = 0`, then the execution time of the second query is much longer because the `WHERE` clause applies to all the data after the subquery finishes. +)", 0) \ + M(Bool, enable_optimize_predicate_expression_to_final_subquery, true, R"( +Allow push predicate to final subquery. +)", 0) \ + M(Bool, allow_push_predicate_when_subquery_contains_with, true, R"( +Allows push predicate when subquery contains WITH clause +)", 0) \ \ - M(UInt64, low_cardinality_max_dictionary_size, 8192, "Maximum size (in rows) of shared global dictionary for LowCardinality type.", 0) \ - M(Bool, low_cardinality_use_single_dictionary_for_part, false, "LowCardinality type serialization setting. If is true, than will use additional keys when global dictionary overflows. Otherwise, will create several shared dictionaries.", 0) \ - M(Bool, decimal_check_overflow, true, "Check overflow of decimal arithmetic/comparison operations", 0) \ - M(Bool, allow_custom_error_code_in_throwif, false, "Enable custom error code in function throwIf(). If true, thrown exceptions may have unexpected error codes.", 0) \ + M(UInt64, low_cardinality_max_dictionary_size, 8192, R"( +Sets a maximum size in rows of a shared global dictionary for the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type that can be written to a storage file system. This setting prevents issues with RAM in case of unlimited dictionary growth. All the data that can’t be encoded due to maximum dictionary size limitation ClickHouse writes in an ordinary method. + +Possible values: + +- Any positive integer. +)", 0) \ + M(Bool, low_cardinality_use_single_dictionary_for_part, false, R"( +Turns on or turns off using of single dictionary for the data part. + +By default, the ClickHouse server monitors the size of dictionaries and if a dictionary overflows then the server starts to write the next one. To prohibit creating several dictionaries set `low_cardinality_use_single_dictionary_for_part = 1`. + +Possible values: + +- 1 — Creating several dictionaries for the data part is prohibited. 
+- 0 — Creating several dictionaries for the data part is not prohibited. +)", 0) \ + M(Bool, decimal_check_overflow, true, R"( +Check overflow of decimal arithmetic/comparison operations +)", 0) \ + M(Bool, allow_custom_error_code_in_throwif, false, R"( +Enable custom error code in function throwIf(). If true, thrown exceptions may have unexpected error codes. +)", 0) \ \ - M(Bool, prefer_localhost_replica, true, "If it's true then queries will be always sent to the local replica (if it exists). If it's false then the replica to send a query will be chosen between local and remote ones according to load_balancing", 0) \ - M(UInt64, max_fetch_partition_retries_count, 5, "Amount of retries while fetching partition from another host.", 0) \ - M(UInt64, http_max_multipart_form_data_size, 1024 * 1024 * 1024, "Limit on size of multipart/form-data content. This setting cannot be parsed from URL parameters and should be set in a user profile. Note that content is parsed and external tables are created in memory before the start of query execution. And this is the only limit that has an effect on that stage (limits on max memory usage and max execution time have no effect while reading HTTP form data).", 0) \ - M(Bool, calculate_text_stack_trace, true, "Calculate text stack trace in case of exceptions during query execution. This is the default. It requires symbol lookups that may slow down fuzzing tests when a huge amount of wrong queries are executed. In normal cases, you should not disable this option.", 0) \ - M(Bool, enable_job_stack_trace, false, "Output stack trace of a job creator when job results in exception", 0) \ - M(Bool, allow_ddl, true, "If it is set to true, then a user is allowed to executed DDL queries.", 0) \ - M(Bool, parallel_view_processing, false, "Enables pushing to attached views concurrently instead of sequentially.", 0) \ - M(Bool, enable_unaligned_array_join, false, "Allow ARRAY JOIN with multiple arrays that have different sizes. When this settings is enabled, arrays will be resized to the longest one.", 0) \ - M(Bool, optimize_read_in_order, true, "Enable ORDER BY optimization for reading data in corresponding order in MergeTree tables.", 0) \ - M(Bool, optimize_read_in_window_order, true, "Enable ORDER BY optimization in window clause for reading data in corresponding order in MergeTree tables.", 0) \ - M(Bool, optimize_aggregation_in_order, false, "Enable GROUP BY optimization for aggregating data in corresponding order in MergeTree tables.", 0) \ - M(Bool, read_in_order_use_buffering, true, "Use buffering before merging while reading in order of primary key. It increases the parallelism of query execution", 0) \ - M(UInt64, aggregation_in_order_max_block_bytes, 50000000, "Maximal size of block in bytes accumulated during aggregation in order of primary key. Lower block size allows to parallelize more final merge stage of aggregation.", 0) \ - M(UInt64, read_in_order_two_level_merge_threshold, 100, "Minimal number of parts to read to run preliminary merge step during multithread reading in order of primary key.", 0) \ - M(Bool, low_cardinality_allow_in_native_format, true, "Use LowCardinality type in Native format. 
Otherwise, convert LowCardinality columns to ordinary for select query, and convert ordinary columns to required LowCardinality for insert query.", 0) \ - M(Bool, cancel_http_readonly_queries_on_client_close, false, "Cancel HTTP readonly queries when a client closes the connection without waiting for response.", 0) \ - M(Bool, external_table_functions_use_nulls, true, "If it is set to true, external table functions will implicitly use Nullable type if needed. Otherwise, NULLs will be substituted with default values. Currently supported only by 'mysql', 'postgresql' and 'odbc' table functions.", 0) \ - M(Bool, external_table_strict_query, false, "If it is set to true, transforming expression to local filter is forbidden for queries to external tables.", 0) \ + M(Bool, prefer_localhost_replica, true, R"( +Enables/disables preferential use of the localhost replica when processing distributed queries. + +Possible values: + +- 1 — ClickHouse always sends a query to the localhost replica if it exists. +- 0 — ClickHouse uses the balancing strategy specified by the [load_balancing](#load_balancing) setting. + +:::note +Disable this setting if you use [max_parallel_replicas](#max_parallel_replicas) without [parallel_replicas_custom_key](#parallel_replicas_custom_key). +If [parallel_replicas_custom_key](#parallel_replicas_custom_key) is set, disable this setting only if it's used on a cluster with multiple shards containing multiple replicas. +If it's used on a cluster with a single shard and multiple replicas, disabling this setting will have negative effects. +::: +)", 0) \ + M(UInt64, max_fetch_partition_retries_count, 5, R"( +Amount of retries while fetching partition from another host. +)", 0) \ + M(UInt64, http_max_multipart_form_data_size, 1024 * 1024 * 1024, R"( +Limit on size of multipart/form-data content. This setting cannot be parsed from URL parameters and should be set in a user profile. Note that content is parsed and external tables are created in memory before the start of query execution. And this is the only limit that has an effect on that stage (limits on max memory usage and max execution time have no effect while reading HTTP form data). +)", 0) \ + M(Bool, calculate_text_stack_trace, true, R"( +Calculate text stack trace in case of exceptions during query execution. This is the default. It requires symbol lookups that may slow down fuzzing tests when a huge amount of wrong queries are executed. In normal cases, you should not disable this option. +)", 0) \ + M(Bool, enable_job_stack_trace, false, R"( +Output stack trace of a job creator when job results in exception +)", 0) \ + M(Bool, allow_ddl, true, R"( +If it is set to true, then a user is allowed to execute DDL queries. +)", 0) \ + M(Bool, parallel_view_processing, false, R"( +Enables pushing to attached views concurrently instead of sequentially. +)", 0) \ + M(Bool, enable_unaligned_array_join, false, R"( +Allow ARRAY JOIN with multiple arrays that have different sizes. When this setting is enabled, arrays will be resized to the longest one. +)", 0) \ + M(Bool, optimize_read_in_order, true, R"( +Enables [ORDER BY](../../sql-reference/statements/select/order-by.md/#optimize_read_in_order) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for reading data from [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. + +Possible values: + +- 0 — `ORDER BY` optimization is disabled. +- 1 — `ORDER BY` optimization is enabled.
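+ +**Example** + +A minimal sketch: on a table sorted by `event_date` (the table name `hits` is hypothetical), the optimization lets the following query read data in primary key order instead of sorting the whole result: + +```sql +SELECT * FROM hits ORDER BY event_date DESC LIMIT 10 SETTINGS optimize_read_in_order = 1; +```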
+ +**See Also** + +- [ORDER BY Clause](../../sql-reference/statements/select/order-by.md/#optimize_read_in_order) +)", 0) \ + M(Bool, optimize_read_in_window_order, true, R"( +Enable ORDER BY optimization in window clause for reading data in corresponding order in MergeTree tables. +)", 0) \ + M(Bool, optimize_aggregation_in_order, false, R"( +Enables [GROUP BY](../../sql-reference/statements/select/group-by.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for aggregating data in corresponding order in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. + +Possible values: + +- 0 — `GROUP BY` optimization is disabled. +- 1 — `GROUP BY` optimization is enabled. + +**See Also** + +- [GROUP BY optimization](../../sql-reference/statements/select/group-by.md/#aggregation-in-order) +)", 0) \ + M(Bool, read_in_order_use_buffering, true, R"( +Use buffering before merging while reading in order of primary key. It increases the parallelism of query execution +)", 0) \ + M(UInt64, aggregation_in_order_max_block_bytes, 50000000, R"( +Maximal size of block in bytes accumulated during aggregation in order of primary key. Lower block size allows to parallelize more final merge stage of aggregation. +)", 0) \ + M(UInt64, read_in_order_two_level_merge_threshold, 100, R"( +Minimal number of parts to read to run preliminary merge step during multithread reading in order of primary key. +)", 0) \ + M(Bool, low_cardinality_allow_in_native_format, true, R"( +Allows or restricts using the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type with the [Native](../../interfaces/formats.md/#native) format. + +If usage of `LowCardinality` is restricted, ClickHouse server converts `LowCardinality`-columns to ordinary ones for `SELECT` queries, and convert ordinary columns to `LowCardinality`-columns for `INSERT` queries. + +This setting is required mainly for third-party clients which do not support `LowCardinality` data type. + +Possible values: + +- 1 — Usage of `LowCardinality` is not restricted. +- 0 — Usage of `LowCardinality` is restricted. +)", 0) \ + M(Bool, cancel_http_readonly_queries_on_client_close, false, R"( +Cancels HTTP read-only queries (e.g. SELECT) when a client closes the connection without waiting for the response. + +Cloud default value: `1`. +)", 0) \ + M(Bool, external_table_functions_use_nulls, true, R"( +Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md) table functions use Nullable columns. + +Possible values: + +- 0 — The table function explicitly uses Nullable columns. +- 1 — The table function implicitly uses Nullable columns. + +**Usage** + +If the setting is set to `0`, the table function does not make Nullable columns and inserts default values instead of NULL. This is also applicable for NULL values inside arrays. +)", 0) \ + M(Bool, external_table_strict_query, false, R"( +If it is set to true, transforming expression to local filter is forbidden for queries to external tables. +)", 0) \ \ - M(Bool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.", 0) \ - M(UInt64, max_hyperscan_regexp_length, 0, "Max length of regexp than can be used in hyperscan multi-match functions. 
Zero means unlimited.", 0) \ - M(UInt64, max_hyperscan_regexp_total_length, 0, "Max total length of all regexps than can be used in hyperscan multi-match functions (per every function). Zero means unlimited.", 0) \ - M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \ - M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \ - M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \ - M(Bool, splitby_max_substrings_includes_remaining_string, false, "Functions 'splitBy*()' with 'max_substrings' argument > 0 include the remaining string as the last element in the result", 0) \ + M(Bool, allow_hyperscan, true, R"( +Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage. +)", 0) \ + M(UInt64, max_hyperscan_regexp_length, 0, R"( +Defines the maximum length for each regular expression in the [hyperscan multi-match functions](../../sql-reference/functions/string-search-functions.md/#multimatchanyhaystack-pattern1-pattern2-patternn). + +Possible values: + +- Positive integer. +- 0 - The length is not limited. + +**Example** + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bcd','c','d']) SETTINGS max_hyperscan_regexp_length = 3; +``` + +Result: + +```text +┌─multiMatchAny('abcd', ['ab', 'bcd', 'c', 'd'])─┐ +│ 1 │ +└────────────────────────────────────────────────┘ +``` + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bcd','c','d']) SETTINGS max_hyperscan_regexp_length = 2; +``` + +Result: + +```text +Exception: Regexp length too large. +``` + +**See Also** + +- [max_hyperscan_regexp_total_length](#max-hyperscan-regexp-total-length) +)", 0) \ + M(UInt64, max_hyperscan_regexp_total_length, 0, R"( +Sets the maximum length total of all regular expressions in each [hyperscan multi-match function](../../sql-reference/functions/string-search-functions.md/#multimatchanyhaystack-pattern1-pattern2-patternn). + +Possible values: + +- Positive integer. +- 0 - The length is not limited. + +**Example** + +Query: + +```sql +SELECT multiMatchAny('abcd', ['a','b','c','d']) SETTINGS max_hyperscan_regexp_total_length = 5; +``` + +Result: + +```text +┌─multiMatchAny('abcd', ['a', 'b', 'c', 'd'])─┐ +│ 1 │ +└─────────────────────────────────────────────┘ +``` + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bc','c','d']) SETTINGS max_hyperscan_regexp_total_length = 5; +``` + +Result: + +```text +Exception: Total regexp lengths too large. +``` + +**See Also** + +- [max_hyperscan_regexp_length](#max-hyperscan-regexp-length) +)", 0) \ + M(Bool, reject_expensive_hyperscan_regexps, true, R"( +Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion) +)", 0) \ + M(Bool, allow_simdjson, true, R"( +Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used. +)", 0) \ + M(Bool, allow_introspection_functions, false, R"( +Enables or disables [introspection functions](../../sql-reference/functions/introspection.md) for query profiling. + +Possible values: + +- 1 — Introspection functions enabled. +- 0 — Introspection functions disabled. 
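+
+**Example**
+
+A small illustration; the mangled symbol below is an arbitrary sample, not taken from a real binary:
+
+```sql
+SET allow_introspection_functions = 1;
+
+-- demangle() is one of the introspection functions gated by this setting
+SELECT demangle('_ZN3Foo3barEv');   -- returns 'Foo::bar()'
+```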
+ +**See Also** + +- [Sampling Query Profiler](../../operations/optimizing-performance/sampling-query-profiler.md) +- System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) +)", 0) \ + M(Bool, splitby_max_substrings_includes_remaining_string, false, R"( +Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array. + +Possible values: + +- `0` - The remaining string will not be included in the last element of the result array. +- `1` - The remaining string will be included in the last element of the result array. This is the behavior of Spark's [`split()`](https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.split.html) function and Python's ['string.split()'](https://docs.python.org/3/library/stdtypes.html#str.split) method. +)", 0) \ \ - M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \ - M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' prints a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \ - M(Bool, formatdatetime_parsedatetime_m_is_month_name, true, "Formatter '%M' in functions 'formatDateTime()' and 'parseDateTime()' print/parse the month name instead of minutes.", 0) \ - M(Bool, parsedatetime_parse_without_leading_zeros, true, "Formatters '%c', '%l' and '%k' in function 'parseDateTime()' parse months and hours without leading zeros.", 0) \ - M(Bool, formatdatetime_format_without_leading_zeros, false, "Formatters '%c', '%l' and '%k' in function 'formatDateTime()' print months and hours without leading zeros.", 0) \ + M(Bool, allow_execute_multiif_columnar, true, R"( +Allow execute multiIf function columnar +)", 0) \ + M(Bool, formatdatetime_f_prints_single_zero, false, R"( +Formatter '%f' in function 'formatDateTime()' prints a single zero instead of six zeros if the formatted value has no fractional seconds. +)", 0) \ + M(Bool, formatdatetime_parsedatetime_m_is_month_name, true, R"( +Formatter '%M' in functions 'formatDateTime()' and 'parseDateTime()' print/parse the month name instead of minutes. +)", 0) \ + M(Bool, parsedatetime_parse_without_leading_zeros, true, R"( +Formatters '%c', '%l' and '%k' in function 'parseDateTime()' parse months and hours without leading zeros. +)", 0) \ + M(Bool, formatdatetime_format_without_leading_zeros, false, R"( +Formatters '%c', '%l' and '%k' in function 'formatDateTime()' print months and hours without leading zeros. +)", 0) \ \ - M(UInt64, max_partitions_per_insert_block, 100, "Limit maximum number of partitions in the single INSERTed block. Zero means unlimited. Throw an exception if the block contains too many partitions. This setting is a safety threshold because using a large number of partitions is a common misconception.", 0) \ - M(Bool, throw_on_max_partitions_per_insert_block, true, "Used with max_partitions_per_insert_block. If true (default), an exception will be thrown when max_partitions_per_insert_block is reached. If false, details of the insert query reaching this limit with the number of partitions will be logged. This can be useful if you're trying to understand the impact on users when changing max_partitions_per_insert_block.", 0) \ - M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. 
<= 0 means unlimited.", 0) \ - M(Bool, check_query_single_value_result, true, "Return check query result as single 1/0 value", 0) \ - M(Bool, allow_drop_detached, false, "Allow ALTER TABLE ... DROP DETACHED PART[ITION] ... queries", 0) \ - M(UInt64, max_table_size_to_drop, 50000000000lu, "If the size of a table is greater than this value (in bytes) then the table could not be dropped with any DROP query.", 0) \ - M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \ + M(UInt64, max_partitions_per_insert_block, 100, R"( +Limit maximum number of partitions in the single INSERTed block. Zero means unlimited. Throw an exception if the block contains too many partitions. This setting is a safety threshold because using a large number of partitions is a common misconception. +)", 0) \ + M(Bool, throw_on_max_partitions_per_insert_block, true, R"( +Used with max_partitions_per_insert_block. If true (default), an exception will be thrown when max_partitions_per_insert_block is reached. If false, details of the insert query reaching this limit with the number of partitions will be logged. This can be useful if you're trying to understand the impact on users when changing max_partitions_per_insert_block. +)", 0) \ + M(Int64, max_partitions_to_read, -1, R"( +Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. +)", 0) \ + M(Bool, check_query_single_value_result, true, R"( +Defines the level of detail for the [CHECK TABLE](../../sql-reference/statements/check-table.md/#checking-mergetree-tables) query result for `MergeTree` family engines . + +Possible values: + +- 0 — the query shows a check status for every individual data part of a table. +- 1 — the query shows the general table check status. +)", 0) \ + M(Bool, allow_drop_detached, false, R"( +Allow ALTER TABLE ... DROP DETACHED PART[ITION] ... queries +)", 0) \ \ - M(UInt64, postgresql_connection_pool_size, 16, "Connection pool size for PostgreSQL table engine and database engine.", 0) \ - M(UInt64, postgresql_connection_attempt_timeout, 2, "Connection timeout to PostgreSQL table engine and database engine in seconds.", 0) \ - M(UInt64, postgresql_connection_pool_wait_timeout, 5000, "Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on empty pool.", 0) \ - M(UInt64, postgresql_connection_pool_retries, 2, "Connection pool push/pop retries number for PostgreSQL table engine and database engine.", 0) \ - M(Bool, postgresql_connection_pool_auto_close_connection, false, "Close connection before returning connection to the pool.", 0) \ - M(UInt64, glob_expansion_max_elements, 1000, "Maximum number of allowed addresses (For external storages, table functions, etc).", 0) \ - M(UInt64, odbc_bridge_connection_pool_size, 16, "Connection pool size for each connection settings string in ODBC bridge.", 0) \ - M(Bool, odbc_bridge_use_connection_pooling, true, "Use connection pooling in ODBC bridge. If set to false, a new connection is created every time", 0) \ + M(UInt64, max_table_size_to_drop, 50000000000lu, R"( +Restriction on deleting tables in query time. The value 0 means that you can delete all tables without any restrictions. + +Cloud default value: 1 TB. 
+ +:::note +This query setting overwrites its server setting equivalent, see [max_table_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-table-size-to-drop) +::: +)", 0) \ + M(UInt64, max_partition_size_to_drop, 50000000000lu, R"( +Restriction on dropping partitions in query time. The value 0 means that you can drop partitions without any restrictions. + +Cloud default value: 1 TB. + +:::note +This query setting overwrites its server setting equivalent, see [max_partition_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-partition-size-to-drop) +::: +)", 0) \ \ - M(Seconds, distributed_replica_error_half_life, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_DECREASE_ERROR_PERIOD, "Time period reduces replica error counter by 2 times.", 0) \ - M(UInt64, distributed_replica_error_cap, DBMS_CONNECTION_POOL_WITH_FAILOVER_MAX_ERROR_COUNT, "Max number of errors per replica, prevents piling up an incredible amount of errors if replica was offline for some time and allows it to be reconsidered in a shorter amount of time.", 0) \ - M(UInt64, distributed_replica_max_ignored_errors, 0, "Number of errors that will be ignored while choosing replicas", 0) \ + M(UInt64, postgresql_connection_pool_size, 16, R"( +Connection pool size for PostgreSQL table engine and database engine. +)", 0) \ + M(UInt64, postgresql_connection_attempt_timeout, 2, R"( +Connection timeout in seconds of a single attempt to connect PostgreSQL end-point. +The value is passed as a `connect_timeout` parameter of the connection URL. +)", 0) \ + M(UInt64, postgresql_connection_pool_wait_timeout, 5000, R"( +Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on empty pool. +)", 0) \ + M(UInt64, postgresql_connection_pool_retries, 2, R"( +Connection pool push/pop retries number for PostgreSQL table engine and database engine. +)", 0) \ + M(Bool, postgresql_connection_pool_auto_close_connection, false, R"( +Close connection before returning connection to the pool. +)", 0) \ + M(UInt64, glob_expansion_max_elements, 1000, R"( +Maximum number of allowed addresses (For external storages, table functions, etc). +)", 0) \ + M(UInt64, odbc_bridge_connection_pool_size, 16, R"( +Connection pool size for each connection settings string in ODBC bridge. +)", 0) \ + M(Bool, odbc_bridge_use_connection_pooling, true, R"( +Use connection pooling in ODBC bridge. If set to false, a new connection is created every time. +)", 0) \ \ - M(UInt64, min_free_disk_space_for_temporary_data, 0, "The minimum disk space to keep while writing temporary data used in external sorting and aggregation.", 0) \ + M(Seconds, distributed_replica_error_half_life, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_DECREASE_ERROR_PERIOD, R"( +- Type: seconds +- Default value: 60 seconds + +Controls how fast errors in distributed tables are zeroed. If a replica is unavailable for some time, accumulates 5 errors, and distributed_replica_error_half_life is set to 1 second, then the replica is considered normal 3 seconds after the last error. 
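+
+A rough way to observe the decaying per-replica error counters is to query `system.clusters` (this sketch assumes its `errors_count` and `estimated_recovery_time` columns):
+
+```sql
+SELECT cluster, shard_num, replica_num, host_name, errors_count, estimated_recovery_time
+FROM system.clusters;
+```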
+ +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_cap](#distributed_replica_error_cap) +- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors) +)", 0) \ + M(UInt64, distributed_replica_error_cap, DBMS_CONNECTION_POOL_WITH_FAILOVER_MAX_ERROR_COUNT, R"( +- Type: unsigned int +- Default value: 1000 + +The error count of each replica is capped at this value, preventing a single replica from accumulating too many errors. + +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_half_life](#distributed_replica_error_half_life) +- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors) +)", 0) \ + M(UInt64, distributed_replica_max_ignored_errors, 0, R"( +- Type: unsigned int +- Default value: 0 + +The number of errors that will be ignored while choosing replicas (according to `load_balancing` algorithm). + +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_cap](#distributed_replica_error_cap) +- [distributed_replica_error_half_life](#distributed_replica_error_half_life) +)", 0) \ \ - M(DefaultTableEngine, default_temporary_table_engine, DefaultTableEngine::Memory, "Default table engine used when ENGINE is not set in CREATE TEMPORARY statement.",0) \ - M(DefaultTableEngine, default_table_engine, DefaultTableEngine::MergeTree, "Default table engine used when ENGINE is not set in CREATE statement.",0) \ - M(Bool, show_table_uuid_in_table_create_query_if_not_nil, false, "For tables in databases with Engine=Atomic show UUID of the table in its CREATE query.", 0) \ - M(Bool, database_atomic_wait_for_drop_and_detach_synchronously, false, "When executing DROP or DETACH TABLE in Atomic database, wait for table data to be finally dropped or detached.", 0) \ - M(Bool, enable_scalar_subquery_optimization, true, "If it is set to true, prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once.", 0) \ - M(Bool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \ - M(Bool, optimize_trivial_approximate_count_query, false, "Use an approximate value for trivial count optimization of storages that support such estimations.", 0) \ - M(Bool, optimize_count_from_files, true, "Optimize counting rows from files in supported input formats", 0) \ - M(Bool, use_cache_for_count_from_files, true, "Use cache to count the number of rows in files", 0) \ - M(Bool, optimize_respect_aliases, true, "If it is set to true, it will respect aliases in WHERE/GROUP BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count", 0) \ - M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait for the current server. 
2 - wait for all replicas if they exist.", 0) \ - M(Bool, enable_lightweight_delete, true, "Enable lightweight DELETE mutations for mergetree tables.", 0) ALIAS(allow_experimental_lightweight_delete) \ - M(UInt64, lightweight_deletes_sync, 2, "The same as 'mutations_sync', but controls only execution of lightweight deletes", 0) \ - M(Bool, apply_deleted_mask, true, "Enables filtering out rows deleted with lightweight DELETE. If disabled, a query will be able to read those rows. This is useful for debugging and \"undelete\" scenarios", 0) \ - M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ - M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \ - M(Bool, rewrite_count_distinct_if_with_count_distinct_implementation, false, "Rewrite countDistinctIf with count_distinct_implementation configuration", 0) \ - M(Bool, convert_query_to_cnf, false, "Convert SELECT query to CNF", 0) \ - M(Bool, optimize_or_like_chain, false, "Optimize multiple OR LIKE into multiMatchAny. This optimization should not be enabled by default, because it defies index analysis in some cases.", 0) \ - M(Bool, optimize_arithmetic_operations_in_aggregate_functions, true, "Move arithmetic operations out of aggregation functions", 0) \ - M(Bool, optimize_redundant_functions_in_order_by, true, "Remove functions from ORDER BY if its argument is also in ORDER BY", 0) \ - M(Bool, optimize_if_chain_to_multiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \ - M(Bool, optimize_multiif_to_if, true, "Replace 'multiIf' with only one condition to 'if'.", 0) \ - M(Bool, optimize_if_transform_strings_to_enum, false, "Replaces string-type arguments in If and Transform to enum. Disabled by default cause it could make inconsistent change in distributed query that would lead to its fail.", 0) \ - M(Bool, optimize_functions_to_subcolumns, true, "Transform functions to subcolumns, if possible, to reduce amount of read data. E.g. 'length(arr)' -> 'arr.size0', 'col IS NULL' -> 'col.null' ", 0) \ - M(Bool, optimize_using_constraints, false, "Use constraints for query optimization", 0) \ - M(Bool, optimize_substitute_columns, false, "Use constraints for column substitution", 0) \ - M(Bool, optimize_append_index, false, "Use constraints in order to append index condition (indexHint)", 0) \ - M(Bool, optimize_time_filter_with_preimage, true, "Optimize Date and DateTime predicates by converting functions into equivalent comparisons without conversions (e.g. toYear(col) = 2023 -> col >= '2023-01-01' AND col <= '2023-12-31')", 0) \ - M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \ - M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there are constants there", 0) \ - M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views. Use true to always deduplicate in dependent tables.", 0) \ - M(Bool, throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert, true, "Throw exception on INSERT query when the setting `deduplicate_blocks_in_dependent_materialized_views` is enabled along with `async_insert`. 
It guarantees correctness, because these features can't work together.", 0) \ - M(Bool, materialized_views_ignore_errors, false, "Allows to ignore errors for MATERIALIZED VIEW, and deliver original block to the table regardless of MVs", 0) \ - M(Bool, ignore_materialized_views_with_dropped_target_table, false, "Ignore MVs with dropped target table during pushing to views", 0) \ - M(Bool, allow_materialized_view_with_bad_select, true, "Allow CREATE MATERIALIZED VIEW with SELECT query that references nonexistent tables or columns. It must still be syntactically valid. Doesn't apply to refreshable MVs. Doesn't apply if the MV schema needs to be inferred from the SELECT query (i.e. if the CREATE has no column list and no TO table). Can be used for creating MV before its source table.", 0) \ - M(Bool, use_compact_format_in_distributed_parts_names, true, "Changes format of directories names for distributed table insert parts.", 0) \ - M(Bool, validate_polygons, true, "Throw exception if polygon is invalid in function pointInPolygon (e.g. self-tangent, self-intersecting). If the setting is false, the function will accept invalid polygons but may silently return wrong result.", 0) \ - M(UInt64, max_parser_depth, DBMS_DEFAULT_MAX_PARSER_DEPTH, "Maximum parser depth (recursion depth of recursive descend parser).", 0) \ - M(UInt64, max_parser_backtracks, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS, "Maximum parser backtracking (how many times it tries different alternatives in the recursive descend parsing process).", 0) \ - M(UInt64, max_recursive_cte_evaluation_depth, DBMS_RECURSIVE_CTE_MAX_EVALUATION_DEPTH, "Maximum limit on recursive CTE evaluation depth", 0) \ - M(Bool, allow_settings_after_format_in_insert, false, "Allow SETTINGS after FORMAT, but note, that this is not always safe (note: this is a compatibility setting).", 0) \ - M(Seconds, periodic_live_view_refresh, 60, "Interval after which periodically refreshed live view is forced to refresh.", 0) \ - M(Bool, transform_null_in, false, "If enabled, NULL values will be matched with 'IN' operator as if they are considered equal.", 0) \ - M(Bool, allow_nondeterministic_mutations, false, "Allow non-deterministic functions in ALTER UPDATE/ALTER DELETE statements", 0) \ - M(Seconds, lock_acquire_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "How long locking request should wait before failing", 0) \ - M(Bool, materialize_ttl_after_modify, true, "Apply TTL for old data, after ALTER MODIFY TTL query", 0) \ - M(String, function_implementation, "", "Choose function implementation for specific target or variant (experimental). If empty enable all of them.", 0) \ - M(Bool, data_type_default_nullable, false, "Data types without NULL or NOT NULL will make Nullable", 0) \ - M(Bool, cast_keep_nullable, false, "CAST operator keep Nullable for result data type", 0) \ - M(Bool, cast_ipv4_ipv6_default_on_conversion_error, false, "CAST operator into IPv4, CAST operator into IPV6 type, toIPv4, toIPv6 functions will return default value instead of throwing exception on conversion error.", 0) \ - M(Bool, alter_partition_verbose_result, false, "Output information about affected parts. Currently works only for FREEZE and ATTACH commands.", 0) \ - M(Bool, system_events_show_zero_values, false, "When querying system.events or system.metrics tables, include all metrics, even with zero values.", 0) \ - M(MySQLDataTypesSupport, mysql_datatypes_support_level, MySQLDataTypesSupportList{}, "Defines how MySQL types are converted to corresponding ClickHouse types. 
A comma separated list in any combination of 'decimal', 'datetime64', 'date2Date32' or 'date2String'. decimal: convert NUMERIC and DECIMAL types to Decimal when precision allows it. datetime64: convert DATETIME and TIMESTAMP types to DateTime64 instead of DateTime when precision is not 0. date2Date32: convert DATE to Date32 instead of Date. Takes precedence over date2String. date2String: convert DATE to String instead of Date. Overridden by datetime64.", 0) \ - M(Bool, optimize_trivial_insert_select, false, "Optimize trivial 'INSERT INTO table SELECT ... FROM TABLES' query", 0) \ - M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \ - M(Bool, enable_global_with_statement, true, "Propagate WITH statements to UNION queries and all subqueries", 0) \ - M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \ - M(Bool, optimize_syntax_fuse_functions, false, "Allow apply fuse aggregating function. Available only with `allow_experimental_analyzer`", 0) \ - M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \ - M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \ - M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \ - M(Bool, optimize_skip_merged_partitions, false, "Skip partitions with one part with level > 0 in optimize final", 0) \ - M(Bool, optimize_on_insert, true, "Do the same transformation for inserted block of data as if merge was done on this block.", 0) \ - M(Bool, optimize_use_projections, true, "Automatically choose projections to perform SELECT query", 0) ALIAS(allow_experimental_projection_optimization) \ - M(Bool, optimize_use_implicit_projections, true, "Automatically choose implicit projections to perform SELECT query", 0) \ - M(Bool, force_optimize_projection, false, "If projection optimization is enabled, SELECT queries need to use projection", 0) \ - M(String, force_optimize_projection_name, "", "If it is set to a non-empty string, check that this projection is used in the query at least once.", 0) \ - M(String, preferred_optimize_projection_name, "", "If it is set to a non-empty string, ClickHouse tries to apply specified projection", 0) \ - M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \ - M(Bool, async_query_sending_for_remote, true, "Asynchronously create connections and send query to shards in remote query", 0) \ - M(Bool, insert_null_as_default, true, "Insert DEFAULT values instead of NULL in INSERT SELECT (UNION ALL)", 0) \ - M(Bool, describe_extend_object_types, false, "Deduce concrete type of columns of type Object in DESCRIBE query", 0) \ - M(Bool, describe_include_subcolumns, false, "If true, subcolumns of all table columns will be included into result of DESCRIBE query", 0) \ - M(Bool, describe_include_virtual_columns, false, "If true, virtual columns of table will be included into result of DESCRIBE query", 0) \ - M(Bool, describe_compact_output, false, "If true, include only column names and types into result of DESCRIBE query", 0) \ - M(Bool, apply_mutations_on_fly, false, "Only available in ClickHouse Cloud", 0) \ - M(Bool, mutations_execute_nondeterministic_on_initiator, false, "If true nondeterministic function are executed on initiator and replaced to literals 
in UPDATE and DELETE queries", 0) \ - M(Bool, mutations_execute_subqueries_on_initiator, false, "If true scalar subqueries are executed on initiator and replaced to literals in UPDATE and DELETE queries", 0) \ - M(UInt64, mutations_max_literal_size_to_replace, 16384, "The maximum size of serialized literal in bytes to replace in UPDATE and DELETE queries", 0) \ + M(UInt64, min_free_disk_space_for_temporary_data, 0, R"( +The minimum disk space to keep while writing temporary data used in external sorting and aggregation. +)", 0) \ \ - M(Float, create_replicated_merge_tree_fault_injection_probability, 0.0f, "The probability of a fault injection during table creation after creating metadata in ZooKeeper", 0) \ + M(DefaultTableEngine, default_temporary_table_engine, DefaultTableEngine::Memory, R"( +Same as [default_table_engine](#default_table_engine) but for temporary tables. + +In this example, any new temporary table that does not specify an `Engine` will use the `Log` table engine: + +Query: + +```sql +SET default_temporary_table_engine = 'Log'; + +CREATE TEMPORARY TABLE my_table ( + x UInt32, + y UInt32 +); + +SHOW CREATE TEMPORARY TABLE my_table; +``` + +Result: + +```response +┌─statement────────────────────────────────────────────────────────────────┐ +│ CREATE TEMPORARY TABLE default.my_table +( + `x` UInt32, + `y` UInt32 +) +ENGINE = Log +└──────────────────────────────────────────────────────────────────────────┘ +``` +)", 0) \ + M(DefaultTableEngine, default_table_engine, DefaultTableEngine::MergeTree, R"( +Default table engine to use when `ENGINE` is not set in a `CREATE` statement. + +Possible values: + +- a string representing any valid table engine name + +Cloud default value: `SharedMergeTree`. + +**Example** + +Query: + +```sql +SET default_table_engine = 'Log'; + +SELECT name, value, changed FROM system.settings WHERE name = 'default_table_engine'; +``` + +Result: + +```response +┌─name─────────────────┬─value─┬─changed─┐ +│ default_table_engine │ Log │ 1 │ +└──────────────────────┴───────┴─────────┘ +``` + +In this example, any new table that does not specify an `Engine` will use the `Log` table engine: + +Query: + +```sql +CREATE TABLE my_table ( + x UInt32, + y UInt32 +); + +SHOW CREATE TABLE my_table; +``` + +Result: + +```response +┌─statement────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.my_table +( + `x` UInt32, + `y` UInt32 +) +ENGINE = Log +└──────────────────────────────────────────────────────────────────────────┘ +``` +)", 0) \ + M(Bool, show_table_uuid_in_table_create_query_if_not_nil, false, R"( +Sets the `SHOW TABLE` query display. + +Possible values: + +- 0 — The query will be displayed without table UUID. +- 1 — The query will be displayed with table UUID. +)", 0) \ + M(Bool, database_atomic_wait_for_drop_and_detach_synchronously, false, R"( +Adds a modifier `SYNC` to all `DROP` and `DETACH` queries. + +Possible values: + +- 0 — Queries will be executed with delay. +- 1 — Queries will be executed without delay. +)", 0) \ + M(Bool, enable_scalar_subquery_optimization, true, R"( +If it is set to true, prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once. +)", 0) \ + M(Bool, optimize_trivial_count_query, true, R"( +Enables or disables the optimization to trivial query `SELECT count() FROM table` using metadata from MergeTree. If you need to use row-level security, disable this setting. + +Possible values: + + - 0 — Optimization disabled. 
+ - 1 — Optimization enabled. + +See also: + +- [optimize_functions_to_subcolumns](#optimize-functions-to-subcolumns) +)", 0) \ + M(Bool, optimize_trivial_approximate_count_query, false, R"( +Use an approximate value for trivial count optimization of storages that support such estimation, for example, EmbeddedRocksDB. + +Possible values: + + - 0 — Optimization disabled. + - 1 — Optimization enabled. +)", 0) \ + M(Bool, optimize_count_from_files, true, R"( +Enables or disables the optimization of counting the number of rows from files in different input formats. It applies to table functions/engines `file`/`s3`/`url`/`hdfs`/`azureBlobStorage`. + +Possible values: + +- 0 — Optimization disabled. +- 1 — Optimization enabled. +)", 0) \ + M(Bool, use_cache_for_count_from_files, true, R"( +Enables caching of the number of rows when counting from files in table functions `file`/`s3`/`url`/`hdfs`/`azureBlobStorage`. + +Enabled by default. +)", 0) \ + M(Bool, optimize_respect_aliases, true, R"( +If it is set to true, aliases in WHERE/GROUP BY/ORDER BY are respected, which helps with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count. +)", 0) \ + M(UInt64, mutations_sync, 0, R"( +Allows executing `ALTER TABLE ... UPDATE|DELETE|MATERIALIZE INDEX|MATERIALIZE PROJECTION|MATERIALIZE COLUMN` queries ([mutations](../../sql-reference/statements/alter/index.md#mutations)) synchronously. + +Possible values: + +- 0 - Mutations execute asynchronously. +- 1 - The query waits for all mutations to complete on the current server. +- 2 - The query waits for all mutations to complete on all replicas (if they exist). +)", 0) \ + M(Bool, enable_lightweight_delete, true, R"( +Enable lightweight DELETE mutations for mergetree tables. +)", 0) ALIAS(allow_experimental_lightweight_delete) \ + M(UInt64, lightweight_deletes_sync, 2, R"( +The same as [`mutations_sync`](#mutations_sync), but controls only execution of lightweight deletes. + +Possible values: + +- 0 - Mutations execute asynchronously. +- 1 - The query waits for the lightweight deletes to complete on the current server. +- 2 - The query waits for the lightweight deletes to complete on all replicas (if they exist). + +**See Also** + +- [Synchronicity of ALTER Queries](../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) +- [Mutations](../../sql-reference/statements/alter/index.md#mutations) +)", 0) \ + M(Bool, apply_deleted_mask, true, R"( +Enables filtering out rows deleted with lightweight DELETE. If disabled, a query will be able to read those rows. This is useful for debugging and \"undelete\" scenarios. +)", 0) \ + M(Bool, optimize_normalize_count_variants, true, R"( +Rewrite aggregate functions that are semantically equal to count() as count(). +)", 0) \ + M(Bool, optimize_injective_functions_inside_uniq, true, R"( +Delete injective functions of one argument inside uniq*() functions. +)", 0) \ + M(Bool, rewrite_count_distinct_if_with_count_distinct_implementation, false, R"( +Allows you to rewrite `countDistinctIf` with the [count_distinct_implementation](#count_distinct_implementation) setting. + +Possible values: + +- true — Allow. +- false — Disallow. +)", 0) \ + M(Bool, convert_query_to_cnf, false, R"( +When set to `true`, a `SELECT` query will be converted to conjunctive normal form (CNF). There are scenarios where rewriting a query in CNF may execute faster (view this [GitHub issue](https://github.com/ClickHouse/ClickHouse/issues/11749) for an explanation).
+ +For example, notice how the following `SELECT` query is not modified (the default behavior): + +```sql +EXPLAIN SYNTAX +SELECT * +FROM +( + SELECT number AS x + FROM numbers(20) +) AS a +WHERE ((x >= 1) AND (x <= 5)) OR ((x >= 10) AND (x <= 15)) +SETTINGS convert_query_to_cnf = false; +``` + +The result is: + +```response +┌─explain────────────────────────────────────────────────────────┐ +│ SELECT x │ +│ FROM │ +│ ( │ +│ SELECT number AS x │ +│ FROM numbers(20) │ +│ WHERE ((x >= 1) AND (x <= 5)) OR ((x >= 10) AND (x <= 15)) │ +│ ) AS a │ +│ WHERE ((x >= 1) AND (x <= 5)) OR ((x >= 10) AND (x <= 15)) │ +│ SETTINGS convert_query_to_cnf = 0 │ +└────────────────────────────────────────────────────────────────┘ +``` + +Let's set `convert_query_to_cnf` to `true` and see what changes: + +```sql +EXPLAIN SYNTAX +SELECT * +FROM +( + SELECT number AS x + FROM numbers(20) +) AS a +WHERE ((x >= 1) AND (x <= 5)) OR ((x >= 10) AND (x <= 15)) +SETTINGS convert_query_to_cnf = true; +``` + +Notice the `WHERE` clause is rewritten in CNF, but the result set is the identical - the Boolean logic is unchanged: + +```response +┌─explain───────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ SELECT x │ +│ FROM │ +│ ( │ +│ SELECT number AS x │ +│ FROM numbers(20) │ +│ WHERE ((x <= 15) OR (x <= 5)) AND ((x <= 15) OR (x >= 1)) AND ((x >= 10) OR (x <= 5)) AND ((x >= 10) OR (x >= 1)) │ +│ ) AS a │ +│ WHERE ((x >= 10) OR (x >= 1)) AND ((x >= 10) OR (x <= 5)) AND ((x <= 15) OR (x >= 1)) AND ((x <= 15) OR (x <= 5)) │ +│ SETTINGS convert_query_to_cnf = 1 │ +└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Possible values: true, false +)", 0) \ + M(Bool, optimize_or_like_chain, false, R"( +Optimize multiple OR LIKE into multiMatchAny. This optimization should not be enabled by default, because it defies index analysis in some cases. +)", 0) \ + M(Bool, optimize_arithmetic_operations_in_aggregate_functions, true, R"( +Move arithmetic operations out of aggregation functions +)", 0) \ + M(Bool, optimize_redundant_functions_in_order_by, true, R"( +Remove functions from ORDER BY if its argument is also in ORDER BY +)", 0) \ + M(Bool, optimize_if_chain_to_multiif, false, R"( +Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types. +)", 0) \ + M(Bool, optimize_multiif_to_if, true, R"( +Replace 'multiIf' with only one condition to 'if'. +)", 0) \ + M(Bool, optimize_if_transform_strings_to_enum, false, R"( +Replaces string-type arguments in If and Transform to enum. Disabled by default cause it could make inconsistent change in distributed query that would lead to its fail. +)", 0) \ + M(Bool, optimize_functions_to_subcolumns, true, R"( +Enables or disables optimization by transforming some functions to reading subcolumns. This reduces the amount of data to read. + +These functions can be transformed: + +- [length](../../sql-reference/functions/array-functions.md/#array_functions-length) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. +- [empty](../../sql-reference/functions/array-functions.md/#function-empty) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. +- [notEmpty](../../sql-reference/functions/array-functions.md/#function-notempty) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. 
+- [isNull](../../sql-reference/operators/index.md#operator-is-null) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. +- [isNotNull](../../sql-reference/operators/index.md#is-not-null) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. +- [count](../../sql-reference/aggregate-functions/reference/count.md) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. +- [mapKeys](../../sql-reference/functions/tuple-map-functions.md/#mapkeys) to read the [keys](../../sql-reference/data-types/map.md/#map-subcolumns) subcolumn. +- [mapValues](../../sql-reference/functions/tuple-map-functions.md/#mapvalues) to read the [values](../../sql-reference/data-types/map.md/#map-subcolumns) subcolumn. + +Possible values: + +- 0 — Optimization disabled. +- 1 — Optimization enabled. +)", 0) \ + M(Bool, optimize_using_constraints, false, R"( +Use [constraints](../../sql-reference/statements/create/table.md#constraints) for query optimization. The default is `false`. + +Possible values: + +- true, false +)", 0) \ + M(Bool, optimize_substitute_columns, false, R"( +Use [constraints](../../sql-reference/statements/create/table.md#constraints) for column substitution. The default is `false`. + +Possible values: + +- true, false +)", 0) \ + M(Bool, optimize_append_index, false, R"( +Use [constraints](../../sql-reference/statements/create/table.md#constraints) in order to append index condition. The default is `false`. + +Possible values: + +- true, false +)", 0) \ + M(Bool, optimize_time_filter_with_preimage, true, R"( +Optimize Date and DateTime predicates by converting functions into equivalent comparisons without conversions (e.g. toYear(col) = 2023 -> col >= '2023-01-01' AND col <= '2023-12-31') +)", 0) \ + M(Bool, normalize_function_names, true, R"( +Normalize function names to their canonical names +)", 0) \ + M(Bool, enable_early_constant_folding, true, R"( +Enable query optimization where we analyze function and subqueries results and rewrite query if there are constants there +)", 0) \ + M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, R"( +Enables or disables the deduplication check for materialized views that receive data from Replicated\* tables. + +Possible values: + + 0 — Disabled. + 1 — Enabled. + +Usage + +By default, deduplication is not performed for materialized views but is done upstream, in the source table. +If an INSERTed block is skipped due to deduplication in the source table, there will be no insertion into attached materialized views. This behaviour exists to enable the insertion of highly aggregated data into materialized views, for cases where inserted blocks are the same after materialized view aggregation but derived from different INSERTs into the source table. +At the same time, this behaviour “breaks” `INSERT` idempotency. If an `INSERT` into the main table was successful and `INSERT` into a materialized view failed (e.g. because of communication failure with ClickHouse Keeper) a client will get an error and can retry the operation. However, the materialized view won’t receive the second insert because it will be discarded by deduplication in the main (source) table. The setting `deduplicate_blocks_in_dependent_materialized_views` allows for changing this behaviour. 
On retry, a materialized view will receive the repeat insert and will perform a deduplication check by itself, +ignoring check result for the source table, and will insert rows lost because of the first failure. +)", 0) \ + M(Bool, throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert, true, R"( +Throw exception on INSERT query when the setting `deduplicate_blocks_in_dependent_materialized_views` is enabled along with `async_insert`. It guarantees correctness, because these features can't work together. +)", 0) \ + M(Bool, materialized_views_ignore_errors, false, R"( +Allows to ignore errors for MATERIALIZED VIEW, and deliver original block to the table regardless of MVs +)", 0) \ + M(Bool, ignore_materialized_views_with_dropped_target_table, false, R"( +Ignore MVs with dropped target table during pushing to views +)", 0) \ + M(Bool, allow_materialized_view_with_bad_select, true, R"( +Allow CREATE MATERIALIZED VIEW with SELECT query that references nonexistent tables or columns. It must still be syntactically valid. Doesn't apply to refreshable MVs. Doesn't apply if the MV schema needs to be inferred from the SELECT query (i.e. if the CREATE has no column list and no TO table). Can be used for creating MV before its source table. +)", 0) \ + M(Bool, use_compact_format_in_distributed_parts_names, true, R"( +Uses compact format for storing blocks for background (`distributed_foreground_insert`) INSERT into tables with `Distributed` engine. + +Possible values: + +- 0 — Uses `user[:password]@host:port#default_database` directory format. +- 1 — Uses `[shard{shard_index}[_replica{replica_index}]]` directory format. + +:::note +- with `use_compact_format_in_distributed_parts_names=0` changes from cluster definition will not be applied for background INSERT. +- with `use_compact_format_in_distributed_parts_names=1` changing the order of the nodes in the cluster definition, will change the `shard_index`/`replica_index` so be aware. +::: +)", 0) \ + M(Bool, validate_polygons, true, R"( +Enables or disables throwing an exception in the [pointInPolygon](../../sql-reference/functions/geo/index.md#pointinpolygon) function, if the polygon is self-intersecting or self-tangent. + +Possible values: + +- 0 — Throwing an exception is disabled. `pointInPolygon` accepts invalid polygons and returns possibly incorrect results for them. +- 1 — Throwing an exception is enabled. +)", 0) \ + M(UInt64, max_parser_depth, DBMS_DEFAULT_MAX_PARSER_DEPTH, R"( +Limits maximum recursion depth in the recursive descent parser. Allows controlling the stack size. + +Possible values: + +- Positive integer. +- 0 — Recursion depth is unlimited. +)", 0) \ + M(UInt64, max_parser_backtracks, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS, R"( +Maximum parser backtracking (how many times it tries different alternatives in the recursive descend parsing process). +)", 0) \ + M(UInt64, max_recursive_cte_evaluation_depth, DBMS_RECURSIVE_CTE_MAX_EVALUATION_DEPTH, R"( +Maximum limit on recursive CTE evaluation depth +)", 0) \ + M(Bool, allow_settings_after_format_in_insert, false, R"( +Control whether `SETTINGS` after `FORMAT` in `INSERT` queries is allowed or not. It is not recommended to use this, since this may interpret part of `SETTINGS` as values. 
+ +Example: + +```sql +INSERT INTO FUNCTION null('foo String') SETTINGS max_threads=1 VALUES ('bar'); +``` + +But the following query will work only with `allow_settings_after_format_in_insert`: + +```sql +SET allow_settings_after_format_in_insert=1; +INSERT INTO FUNCTION null('foo String') VALUES ('bar') SETTINGS max_threads=1; +``` + +Possible values: + +- 0 — Disallow. +- 1 — Allow. + +:::note +Use this setting only for backward compatibility if your use cases depend on old syntax. +::: +)", 0) \ + M(Seconds, periodic_live_view_refresh, 60, R"( +Interval after which a periodically refreshed live view is forced to refresh. +)", 0) \ + M(Bool, transform_null_in, false, R"( +Enables equality of [NULL](../../sql-reference/syntax.md/#null-literal) values for [IN](../../sql-reference/operators/in.md) operator. + +By default, `NULL` values can’t be compared because `NULL` means undefined value. Thus, comparison `expr = NULL` must always return `false`. With this setting `NULL = NULL` returns `true` for `IN` operator. + +Possible values: + +- 0 — Comparison of `NULL` values in `IN` operator returns `false`. +- 1 — Comparison of `NULL` values in `IN` operator returns `true`. + +**Example** + +Consider the `null_in` table: + +``` text +┌──idx─┬─────i─┐ +│ 1 │ 1 │ +│ 2 │ NULL │ +│ 3 │ 3 │ +└──────┴───────┘ +``` + +Query: + +``` sql +SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 0; +``` + +Result: + +``` text +┌──idx─┬────i─┐ +│ 1 │ 1 │ +└──────┴──────┘ +``` + +Query: + +``` sql +SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1; +``` + +Result: + +``` text +┌──idx─┬─────i─┐ +│ 1 │ 1 │ +│ 2 │ NULL │ +└──────┴───────┘ +``` + +**See Also** + +- [NULL Processing in IN Operators](../../sql-reference/operators/in.md/#in-null-processing) +)", 0) \ + M(Bool, allow_nondeterministic_mutations, false, R"( +User-level setting that allows mutations on replicated tables to make use of non-deterministic functions such as `dictGet`. + +Given that, for example, dictionaries can be out of sync across nodes, mutations that pull values from them are disallowed on replicated tables by default. Enabling this setting allows this behavior, making it the user's responsibility to ensure that the data used is in sync across all nodes. + +**Example** + +``` xml +<profiles> +    <default> +        <allow_nondeterministic_mutations>1</allow_nondeterministic_mutations> + +        <!-- Other settings --> +    </default> +</profiles> +``` +)", 0) \ + M(Seconds, lock_acquire_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, R"( +Defines how many seconds a locking request waits before failing. + +Locking timeout is used to protect from deadlocks while executing read/write operations with tables. When the timeout expires and the locking request fails, the ClickHouse server throws an exception "Locking attempt timed out! Possible deadlock avoided. Client should retry." with error code `DEADLOCK_AVOIDED`. + +Possible values: + +- Positive integer (in seconds). +- 0 — No locking timeout. +)", 0) \ + M(Bool, materialize_ttl_after_modify, true, R"( +Apply TTL for old data, after ALTER MODIFY TTL query +)", 0) \ + M(String, function_implementation, "", R"( +Choose function implementation for specific target or variant (experimental). If empty, enable all of them. +)", 0) \ + M(Bool, data_type_default_nullable, false, R"( +Allows data types without explicit modifiers [NULL or NOT NULL](../../sql-reference/statements/create/table.md/#null-modifiers) in the column definition to be [Nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable).
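+
+For example (a hypothetical table, shown only to illustrate the effect; see the possible values below):
+
+```sql
+SET data_type_default_nullable = 1;
+
+CREATE TABLE nullable_by_default (x Int32, s String) ENGINE = MergeTree ORDER BY tuple();
+
+-- both columns are created as Nullable because no NULL/NOT NULL modifier was given
+SHOW CREATE TABLE nullable_by_default;
+```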
+ +Possible values: + +- 1 — The data types in column definitions are set to `Nullable` by default. +- 0 — The data types in column definitions are set to not `Nullable` by default. +)", 0) \ + M(Bool, cast_keep_nullable, false, R"( +Enables or disables keeping of the `Nullable` data type in [CAST](../../sql-reference/functions/type-conversion-functions.md/#castx-t) operations. + +When the setting is enabled and the argument of `CAST` function is `Nullable`, the result is also transformed to `Nullable` type. When the setting is disabled, the result always has the destination type exactly. + +Possible values: + +- 0 — The `CAST` result has exactly the destination type specified. +- 1 — If the argument type is `Nullable`, the `CAST` result is transformed to `Nullable(DestinationDataType)`. + +**Examples** + +The following query results in the destination data type exactly: + +```sql +SET cast_keep_nullable = 0; +SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); +``` + +Result: + +```text +┌─x─┬─toTypeName(CAST(toNullable(toInt32(0)), 'Int32'))─┐ +│ 0 │ Int32 │ +└───┴───────────────────────────────────────────────────┘ +``` + +The following query results in the `Nullable` modification on the destination data type: + +```sql +SET cast_keep_nullable = 1; +SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); +``` + +Result: + +```text +┌─x─┬─toTypeName(CAST(toNullable(toInt32(0)), 'Int32'))─┐ +│ 0 │ Nullable(Int32) │ +└───┴───────────────────────────────────────────────────┘ +``` + +**See Also** + +- [CAST](../../sql-reference/functions/type-conversion-functions.md/#type_conversion_function-cast) function +)", 0) \ + M(Bool, cast_ipv4_ipv6_default_on_conversion_error, false, R"( +CAST operator into IPv4, CAST operator into IPV6 type, toIPv4, toIPv6 functions will return default value instead of throwing exception on conversion error. +)", 0) \ + M(Bool, alter_partition_verbose_result, false, R"( +Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied. +Applicable to [ATTACH PARTITION|PART](../../sql-reference/statements/alter/partition.md/#alter_attach-partition) and to [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md/#alter_freeze-partition). + +Possible values: + +- 0 — disable verbosity. +- 1 — enable verbosity. 
+ +**Example** + +```sql +CREATE TABLE test(a Int64, d Date, s String) ENGINE = MergeTree PARTITION BY toYYYYMM(d) ORDER BY a; +INSERT INTO test VALUES(1, '2021-01-01', ''); +INSERT INTO test VALUES(1, '2021-01-01', ''); +ALTER TABLE test DETACH PARTITION ID '202101'; + +ALTER TABLE test ATTACH PARTITION ID '202101' SETTINGS alter_partition_verbose_result = 1; + +┌─command_type─────┬─partition_id─┬─part_name────┬─old_part_name─┐ +│ ATTACH PARTITION │ 202101 │ 202101_7_7_0 │ 202101_5_5_0 │ +│ ATTACH PARTITION │ 202101 │ 202101_8_8_0 │ 202101_6_6_0 │ +└──────────────────┴──────────────┴──────────────┴───────────────┘ + +ALTER TABLE test FREEZE SETTINGS alter_partition_verbose_result = 1; + +┌─command_type─┬─partition_id─┬─part_name────┬─backup_name─┬─backup_path───────────────────┬─part_backup_path────────────────────────────────────────────┐ +│ FREEZE ALL │ 202101 │ 202101_7_7_0 │ 8 │ /var/lib/clickhouse/shadow/8/ │ /var/lib/clickhouse/shadow/8/data/default/test/202101_7_7_0 │ +│ FREEZE ALL │ 202101 │ 202101_8_8_0 │ 8 │ /var/lib/clickhouse/shadow/8/ │ /var/lib/clickhouse/shadow/8/data/default/test/202101_8_8_0 │ +└──────────────┴──────────────┴──────────────┴─────────────┴───────────────────────────────┴─────────────────────────────────────────────────────────────┘ +``` +)", 0) \ + M(Bool, system_events_show_zero_values, false, R"( +Allows to select zero-valued events from [`system.events`](../../operations/system-tables/events.md). + +Some monitoring systems require passing all the metrics values to them for each checkpoint, even if the metric value is zero. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +**Examples** + +Query + +```sql +SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; +``` + +Result + +```text +Ok. +``` + +Query +```sql +SET system_events_show_zero_values = 1; +SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; +``` + +Result + +```text +┌─event────────────────────┬─value─┬─description───────────────────────────────────────────┐ +│ QueryMemoryLimitExceeded │ 0 │ Number of times when memory limit exceeded for query. │ +└──────────────────────────┴───────┴───────────────────────────────────────────────────────┘ +``` +)", 0) \ + M(MySQLDataTypesSupport, mysql_datatypes_support_level, MySQLDataTypesSupportList{}, R"( +Defines how MySQL types are converted to corresponding ClickHouse types. A comma separated list in any combination of `decimal`, `datetime64`, `date2Date32` or `date2String`. +- `decimal`: convert `NUMERIC` and `DECIMAL` types to `Decimal` when precision allows it. +- `datetime64`: convert `DATETIME` and `TIMESTAMP` types to `DateTime64` instead of `DateTime` when precision is not `0`. +- `date2Date32`: convert `DATE` to `Date32` instead of `Date`. Takes precedence over `date2String`. +- `date2String`: convert `DATE` to `String` instead of `Date`. Overridden by `datetime64`. +)", 0) \ + M(Bool, optimize_trivial_insert_select, false, R"( +Optimize trivial 'INSERT INTO table SELECT ... FROM TABLES' query +)", 0) \ + M(Bool, allow_non_metadata_alters, true, R"( +Allow to execute alters which affects not only tables metadata, but also data on disk +)", 0) \ + M(Bool, enable_global_with_statement, true, R"( +Propagate WITH statements to UNION queries and all subqueries +)", 0) \ + M(Bool, aggregate_functions_null_for_empty, false, R"( +Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md/#agg-functions-combinator-ornull) suffix to them. 
Enable it for SQL standard compatibility. +It is implemented via query rewrite (similar to [count_distinct_implementation](#count_distinct_implementation) setting) to get consistent results for distributed queries. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +**Example** + +Consider the following query with aggregate functions: +```sql +SELECT SUM(-1), MAX(0) FROM system.one WHERE 0; +``` + +With `aggregate_functions_null_for_empty = 0` it would produce: +```text +┌─SUM(-1)─┬─MAX(0)─┐ +│ 0 │ 0 │ +└─────────┴────────┘ +``` + +With `aggregate_functions_null_for_empty = 1` the result would be: +```text +┌─SUMOrNull(-1)─┬─MAXOrNull(0)─┐ +│ NULL │ NULL │ +└───────────────┴──────────────┘ +``` +)", 0) \ + M(Bool, optimize_syntax_fuse_functions, false, R"( +Enables to fuse aggregate functions with identical argument. It rewrites query contains at least two aggregate functions from [sum](../../sql-reference/aggregate-functions/reference/sum.md/#agg_function-sum), [count](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) or [avg](../../sql-reference/aggregate-functions/reference/avg.md/#agg_function-avg) with identical argument to [sumCount](../../sql-reference/aggregate-functions/reference/sumcount.md/#agg_function-sumCount). + +Possible values: + +- 0 — Functions with identical argument are not fused. +- 1 — Functions with identical argument are fused. + +**Example** + +Query: + +``` sql +CREATE TABLE fuse_tbl(a Int8, b Int8) Engine = Log; +SET optimize_syntax_fuse_functions = 1; +EXPLAIN SYNTAX SELECT sum(a), sum(b), count(b), avg(b) from fuse_tbl FORMAT TSV; +``` + +Result: + +``` text +SELECT + sum(a), + sumCount(b).1, + sumCount(b).2, + (sumCount(b).1) / (sumCount(b).2) +FROM fuse_tbl +``` +)", 0) \ + M(Bool, flatten_nested, true, R"( +Sets the data format of a [nested](../../sql-reference/data-types/nested-data-structures/index.md) columns. + +Possible values: + +- 1 — Nested column is flattened to separate arrays. +- 0 — Nested column stays a single array of tuples. + +**Usage** + +If the setting is set to `0`, it is possible to use an arbitrary level of nesting. 
+ +**Examples** + +Query: + +``` sql +SET flatten_nested = 1; +CREATE TABLE t_nest (`n` Nested(a UInt32, b UInt32)) ENGINE = MergeTree ORDER BY tuple(); + +SHOW CREATE TABLE t_nest; +``` + +Result: + +``` text +┌─statement───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.t_nest +( + `n.a` Array(UInt32), + `n.b` Array(UInt32) +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS index_granularity = 8192 │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SET flatten_nested = 0; + +CREATE TABLE t_nest (`n` Nested(a UInt32, b UInt32)) ENGINE = MergeTree ORDER BY tuple(); + +SHOW CREATE TABLE t_nest; +``` + +Result: + +``` text +┌─statement──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.t_nest +( + `n` Nested(a UInt32, b UInt32) +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS index_granularity = 8192 │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` +)", 0) \ + M(Bool, asterisk_include_materialized_columns, false, R"( +Include [MATERIALIZED](../../sql-reference/statements/create/table.md#materialized) columns for wildcard query (`SELECT *`). + +Possible values: + +- 0 - disabled +- 1 - enabled +)", 0) \ + M(Bool, asterisk_include_alias_columns, false, R"( +Include [ALIAS](../../sql-reference/statements/create/table.md#alias) columns for wildcard query (`SELECT *`). + +Possible values: + +- 0 - disabled +- 1 - enabled +)", 0) \ + M(Bool, optimize_skip_merged_partitions, false, R"( +Enables or disables optimization for [OPTIMIZE TABLE ... FINAL](../../sql-reference/statements/optimize.md) query if there is only one part with level > 0 and it doesn't have expired TTL. + +- `OPTIMIZE TABLE ... FINAL SETTINGS optimize_skip_merged_partitions=1` + +By default, `OPTIMIZE TABLE ... FINAL` query rewrites the one part even if there is only a single part. + +Possible values: + +- 1 - Enable optimization. +- 0 - Disable optimization. +)", 0) \ + M(Bool, optimize_on_insert, true, R"( +Enables or disables data transformation before the insertion, as if merge was done on this block (according to table engine). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +**Example** + +The difference between enabled and disabled: + +Query: + +```sql +SET optimize_on_insert = 1; + +CREATE TABLE test1 (`FirstTable` UInt32) ENGINE = ReplacingMergeTree ORDER BY FirstTable; + +INSERT INTO test1 SELECT number % 2 FROM numbers(5); + +SELECT * FROM test1; + +SET optimize_on_insert = 0; + +CREATE TABLE test2 (`SecondTable` UInt32) ENGINE = ReplacingMergeTree ORDER BY SecondTable; + +INSERT INTO test2 SELECT number % 2 FROM numbers(5); + +SELECT * FROM test2; +``` + +Result: + +``` text +┌─FirstTable─┐ +│ 0 │ +│ 1 │ +└────────────┘ + +┌─SecondTable─┐ +│ 0 │ +│ 0 │ +│ 0 │ +│ 1 │ +│ 1 │ +└─────────────┘ +``` + +Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md/#materialized) and [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md) behaviour. 
+)", 0) \ + M(Bool, optimize_use_projections, true, R"( +Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md/#projections) optimization when processing `SELECT` queries. + +Possible values: + +- 0 — Projection optimization disabled. +- 1 — Projection optimization enabled. +)", 0) ALIAS(allow_experimental_projection_optimization) \ + M(Bool, optimize_use_implicit_projections, true, R"( +Automatically choose implicit projections to perform SELECT query +)", 0) \ + M(Bool, force_optimize_projection, false, R"( +Enables or disables the obligatory use of [projections](../../engines/table-engines/mergetree-family/mergetree.md/#projections) in `SELECT` queries, when projection optimization is enabled (see [optimize_use_projections](#optimize_use_projections) setting). + +Possible values: + +- 0 — Projection optimization is not obligatory. +- 1 — Projection optimization is obligatory. +)", 0) \ + M(String, force_optimize_projection_name, "", R"( +If it is set to a non-empty string, check that this projection is used in the query at least once. + +Possible values: + +- string: name of projection that used in a query +)", 0) \ + M(String, preferred_optimize_projection_name, "", R"( +If it is set to a non-empty string, ClickHouse will try to apply specified projection in query. + + +Possible values: + +- string: name of preferred projection +)", 0) \ + M(Bool, async_socket_for_remote, true, R"( +Enables asynchronous read from socket while executing remote query. + +Enabled by default. +)", 0) \ + M(Bool, async_query_sending_for_remote, true, R"( +Enables asynchronous connection creation and query sending while executing remote query. + +Enabled by default. +)", 0) \ + M(Bool, insert_null_as_default, true, R"( +Enables or disables the insertion of [default values](../../sql-reference/statements/create/table.md/#create-default-values) instead of [NULL](../../sql-reference/syntax.md/#null-literal) into columns with not [nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable) data type. +If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. + +This setting is applicable to [INSERT ... SELECT](../../sql-reference/statements/insert-into.md/#inserting-the-results-of-select) queries. Note that `SELECT` subqueries may be concatenated with `UNION ALL` clause. + +Possible values: + +- 0 — Inserting `NULL` into a not nullable column causes an exception. +- 1 — Default column value is inserted instead of `NULL`. +)", 0) \ + M(Bool, describe_extend_object_types, false, R"( +Deduce concrete type of columns of type Object in DESCRIBE query +)", 0) \ + M(Bool, describe_include_subcolumns, false, R"( +Enables describing subcolumns for a [DESCRIBE](../../sql-reference/statements/describe-table.md) query. For example, members of a [Tuple](../../sql-reference/data-types/tuple.md) or subcolumns of a [Map](../../sql-reference/data-types/map.md/#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md/#finding-null) or an [Array](../../sql-reference/data-types/array.md/#array-size) data type. + +Possible values: + +- 0 — Subcolumns are not included in `DESCRIBE` queries. +- 1 — Subcolumns are included in `DESCRIBE` queries. + +**Example** + +See an example for the [DESCRIBE](../../sql-reference/statements/describe-table.md) statement. 
+)", 0) \ + M(Bool, describe_include_virtual_columns, false, R"( +If true, virtual columns of table will be included into result of DESCRIBE query +)", 0) \ + M(Bool, describe_compact_output, false, R"( +If true, include only column names and types into result of DESCRIBE query +)", 0) \ + M(Bool, apply_mutations_on_fly, false, R"( +If true, mutations (UPDATEs and DELETEs) which are not materialized in data part will be applied on SELECTs. Only available in ClickHouse Cloud. +)", 0) \ + M(Bool, mutations_execute_nondeterministic_on_initiator, false, R"( +If true constant nondeterministic functions (e.g. function `now()`) are executed on initiator and replaced to literals in `UPDATE` and `DELETE` queries. It helps to keep data in sync on replicas while executing mutations with constant nondeterministic functions. Default value: `false`. +)", 0) \ + M(Bool, mutations_execute_subqueries_on_initiator, false, R"( +If true scalar subqueries are executed on initiator and replaced to literals in `UPDATE` and `DELETE` queries. Default value: `false`. +)", 0) \ + M(UInt64, mutations_max_literal_size_to_replace, 16384, R"( +The maximum size of serialized literal in bytes to replace in `UPDATE` and `DELETE` queries. Takes effect only if at least one the two settings above is enabled. Default value: 16384 (16 KiB). +)", 0) \ \ - M(Bool, use_query_cache, false, "Enable the query cache", 0) \ - M(Bool, enable_writes_to_query_cache, true, "Enable storing results of SELECT queries in the query cache", 0) \ - M(Bool, enable_reads_from_query_cache, true, "Enable reading results of SELECT queries from the query cache", 0) \ - M(QueryCacheNondeterministicFunctionHandling, query_cache_nondeterministic_function_handling, QueryCacheNondeterministicFunctionHandling::Throw, "How the query cache handles queries with non-deterministic functions, e.g. now()", 0) \ - M(QueryCacheSystemTableHandling, query_cache_system_table_handling, QueryCacheSystemTableHandling::Throw, "How the query cache handles queries against system tables, i.e. tables in databases 'system.*' and 'information_schema.*'", 0) \ - M(UInt64, query_cache_max_size_in_bytes, 0, "The maximum amount of memory (in bytes) the current user may allocate in the query cache. 0 means unlimited. ", 0) \ - M(UInt64, query_cache_max_entries, 0, "The maximum number of query results the current user may store in the query cache. 0 means unlimited.", 0) \ - M(UInt64, query_cache_min_query_runs, 0, "Minimum number a SELECT query must run before its result is stored in the query cache", 0) \ - M(Milliseconds, query_cache_min_query_duration, 0, "Minimum time in milliseconds for a query to run for its result to be stored in the query cache.", 0) \ - M(Bool, query_cache_compress_entries, true, "Compress cache entries.", 0) \ - M(Bool, query_cache_squash_partial_results, true, "Squash partial result blocks to blocks of size 'max_block_size'. Reduces performance of inserts into the query cache but improves the compressability of cache entries.", 0) \ - M(Seconds, query_cache_ttl, 60, "After this time in seconds entries in the query cache become stale", 0) \ - M(Bool, query_cache_share_between_users, false, "Allow other users to read entry in the query cache", 0) \ - M(String, query_cache_tag, "", "A string which acts as a label for query cache entries. 
The same queries with different tags are considered different by the query cache.", 0) \ - M(Bool, enable_sharing_sets_for_mutations, true, "Allow sharing set objects build for IN subqueries between different tasks of the same mutation. This reduces memory usage and CPU consumption", 0) \ + M(Float, create_replicated_merge_tree_fault_injection_probability, 0.0f, R"( +The probability of a fault injection during table creation after creating metadata in ZooKeeper +)", 0) \ \ - M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ - M(Bool, optimize_rewrite_aggregate_function_with_if, true, "Rewrite aggregate functions with if expression as argument when logically equivalent. For example, avg(if(cond, col, null)) can be rewritten to avgIf(cond, col)", 0) \ - M(Bool, optimize_rewrite_array_exists_to_has, false, "Rewrite arrayExists() functions to has() when logically equivalent. For example, arrayExists(x -> x = 1, arr) can be rewritten to has(arr, 1)", 0) \ - M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \ + M(Bool, use_query_cache, false, R"( +If turned on, `SELECT` queries may utilize the [query cache](../query-cache.md). Parameters [enable_reads_from_query_cache](#enable-reads-from-query-cache) +and [enable_writes_to_query_cache](#enable-writes-to-query-cache) control in more detail how the cache is used. + +Possible values: + +- 0 - Disabled +- 1 - Enabled +)", 0) \ + M(Bool, enable_writes_to_query_cache, true, R"( +If turned on, results of `SELECT` queries are stored in the [query cache](../query-cache.md). + +Possible values: + +- 0 - Disabled +- 1 - Enabled +)", 0) \ + M(Bool, enable_reads_from_query_cache, true, R"( +If turned on, results of `SELECT` queries are retrieved from the [query cache](../query-cache.md). + +Possible values: + +- 0 - Disabled +- 1 - Enabled +)", 0) \ + M(QueryCacheNondeterministicFunctionHandling, query_cache_nondeterministic_function_handling, QueryCacheNondeterministicFunctionHandling::Throw, R"( +Controls how the [query cache](../query-cache.md) handles `SELECT` queries with non-deterministic functions like `rand()` or `now()`. + +Possible values: + +- `'throw'` - Throw an exception and don't cache the query result. +- `'save'` - Cache the query result. +- `'ignore'` - Don't cache the query result and don't throw an exception. +)", 0) \ + M(QueryCacheSystemTableHandling, query_cache_system_table_handling, QueryCacheSystemTableHandling::Throw, R"( +Controls how the [query cache](../query-cache.md) handles `SELECT` queries against system tables, i.e. tables in databases `system.*` and `information_schema.*`. + +Possible values: + +- `'throw'` - Throw an exception and don't cache the query result. +- `'save'` - Cache the query result. +- `'ignore'` - Don't cache the query result and don't throw an exception. +)", 0) \ + M(UInt64, query_cache_max_size_in_bytes, 0, R"( +The maximum amount of memory (in bytes) the current user may allocate in the [query cache](../query-cache.md). 0 means unlimited. + +Possible values: + +- Positive integer >= 0. +)", 0) \ + M(UInt64, query_cache_max_entries, 0, R"( +The maximum number of query results the current user may store in the [query cache](../query-cache.md). 0 means unlimited. + +Possible values: + +- Positive integer >= 0. 
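+
+A minimal usage sketch (the query is illustrative, and the `system.query_cache` columns shown may differ between versions):
+
+```sql
+SET query_cache_max_entries = 100;   -- cap the number of cached results for this user
+
+SELECT sum(number) FROM numbers(1000000) SETTINGS use_query_cache = 1;
+
+-- Inspect what is currently cached.
+SELECT query, result_size FROM system.query_cache;
+```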
+)", 0) \ + M(UInt64, query_cache_min_query_runs, 0, R"( +Minimum number of times a `SELECT` query must run before its result is stored in the [query cache](../query-cache.md). + +Possible values: + +- Positive integer >= 0. +)", 0) \ + M(Milliseconds, query_cache_min_query_duration, 0, R"( +Minimum duration in milliseconds a query needs to run for its result to be stored in the [query cache](../query-cache.md). + +Possible values: + +- Positive integer >= 0. +)", 0) \ + M(Bool, query_cache_compress_entries, true, R"( +Compress entries in the [query cache](../query-cache.md). Lessens the memory consumption of the query cache at the cost of slower inserts into / reads from it. + +Possible values: + +- 0 - Disabled +- 1 - Enabled +)", 0) \ + M(Bool, query_cache_squash_partial_results, true, R"( +Squash partial result blocks to blocks of size [max_block_size](#setting-max_block_size). Reduces performance of inserts into the [query cache](../query-cache.md) but improves the compressability of cache entries (see [query_cache_compress-entries](#query-cache-compress-entries)). + +Possible values: + +- 0 - Disabled +- 1 - Enabled +)", 0) \ + M(Seconds, query_cache_ttl, 60, R"( +After this time in seconds entries in the [query cache](../query-cache.md) become stale. + +Possible values: + +- Positive integer >= 0. +)", 0) \ + M(Bool, query_cache_share_between_users, false, R"( +If turned on, the result of `SELECT` queries cached in the [query cache](../query-cache.md) can be read by other users. +It is not recommended to enable this setting due to security reasons. + +Possible values: + +- 0 - Disabled +- 1 - Enabled +)", 0) \ + M(String, query_cache_tag, "", R"( +A string which acts as a label for [query cache](../query-cache.md) entries. +The same queries with different tags are considered different by the query cache. + +Possible values: + +- Any string +)", 0) \ + M(Bool, enable_sharing_sets_for_mutations, true, R"( +Allow sharing set objects build for IN subqueries between different tasks of the same mutation. This reduces memory usage and CPU consumption +)", 0) \ \ - M(Bool, collect_hash_table_stats_during_aggregation, true, "Enable collecting hash table statistics to optimize memory allocation", 0) \ - M(UInt64, max_size_to_preallocate_for_aggregation, 100'000'000, "For how many elements it is allowed to preallocate space in all hash tables in total before aggregation", 0) \ + M(Bool, optimize_rewrite_sum_if_to_count_if, true, R"( +Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent +)", 0) \ + M(Bool, optimize_rewrite_aggregate_function_with_if, true, R"( +Rewrite aggregate functions with if expression as argument when logically equivalent. +For example, `avg(if(cond, col, null))` can be rewritten to `avgOrNullIf(cond, col)`. It may improve performance. + +:::note +Supported only with experimental analyzer (`enable_analyzer = 1`). +::: +)", 0) \ + M(Bool, optimize_rewrite_array_exists_to_has, false, R"( +Rewrite arrayExists() functions to has() when logically equivalent. For example, arrayExists(x -> x = 1, arr) can be rewritten to has(arr, 1) +)", 0) \ + M(UInt64, insert_shard_id, 0, R"( +If not `0`, specifies the shard of [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table into which the data will be inserted synchronously. + +If `insert_shard_id` value is incorrect, the server will throw an exception. 
+ +To get the number of shards on `requested_cluster`, you can check server config or use this query: + +``` sql +SELECT uniq(shard_num) FROM system.clusters WHERE cluster = 'requested_cluster'; +``` + +Possible values: + +- 0 — Disabled. +- Any number from `1` to `shards_num` of corresponding [Distributed](../../engines/table-engines/special/distributed.md/#distributed) table. + +**Example** + +Query: + +```sql +CREATE TABLE x AS system.numbers ENGINE = MergeTree ORDER BY number; +CREATE TABLE x_dist AS x ENGINE = Distributed('test_cluster_two_shards_localhost', currentDatabase(), x); +INSERT INTO x_dist SELECT * FROM numbers(5) SETTINGS insert_shard_id = 1; +SELECT * FROM x_dist ORDER BY number ASC; +``` + +Result: + +``` text +┌─number─┐ +│ 0 │ +│ 0 │ +│ 1 │ +│ 1 │ +│ 2 │ +│ 2 │ +│ 3 │ +│ 3 │ +│ 4 │ +│ 4 │ +└────────┘ +``` +)", 0) \ \ - M(Bool, collect_hash_table_stats_during_joins, true, "Enable collecting hash table statistics to optimize memory allocation", 0) \ - M(UInt64, max_size_to_preallocate_for_joins, 100'000'000, "For how many elements it is allowed to preallocate space in all hash tables in total before join", 0) \ + M(Bool, collect_hash_table_stats_during_aggregation, true, R"( +Enable collecting hash table statistics to optimize memory allocation +)", 0) \ + M(UInt64, max_size_to_preallocate_for_aggregation, 100'000'000, R"( +For how many elements it is allowed to preallocate space in all hash tables in total before aggregation +)", 0) \ \ - M(Bool, kafka_disable_num_consumers_limit, false, "Disable limit on kafka_num_consumers that depends on the number of available CPU cores", 0) \ - M(Bool, allow_experimental_kafka_offsets_storage_in_keeper, false, "Allow experimental feature to store Kafka related offsets in ClickHouse Keeper. When enabled a ClickHouse Keeper path and replica name can be specified to the Kafka table engine. As a result instead of the regular Kafka engine, a new type of storage engine will be used that stores the committed offsets primarily in ClickHouse Keeper", 0) \ - M(Bool, enable_software_prefetch_in_aggregation, true, "Enable use of software prefetch in aggregation", 0) \ - M(Bool, allow_aggregate_partitions_independently, false, "Enable independent aggregation of partitions on separate threads when partition key suits group by key. 
Beneficial when number of partitions close to number of cores and partitions have roughly the same size", 0) \ - M(Bool, force_aggregate_partitions_independently, false, "Force the use of optimization when it is applicable, but heuristics decided not to use it", 0) \ - M(UInt64, max_number_of_partitions_for_independent_aggregation, 128, "Maximal number of partitions in table to apply optimization", 0) \ - M(Float, min_hit_rate_to_use_consecutive_keys_optimization, 0.5, "Minimal hit rate of a cache which is used for consecutive keys optimization in aggregation to keep it enabled", 0) \ + M(Bool, collect_hash_table_stats_during_joins, true, R"( +Enable collecting hash table statistics to optimize memory allocation +)", 0) \ + M(UInt64, max_size_to_preallocate_for_joins, 100'000'000, R"( +For how many elements it is allowed to preallocate space in all hash tables in total before join +)", 0) \ \ - M(Bool, engine_file_empty_if_not_exists, false, "Allows to select data from a file engine table without file", 0) \ - M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ - M(Bool, engine_file_allow_create_multiple_files, false, "Enables or disables creating a new file on each insert in file engine tables if format has suffix.", 0) \ - M(Bool, engine_file_skip_empty_files, false, "Allows to skip empty files in file table engine", 0) \ - M(Bool, engine_url_skip_empty_files, false, "Allows to skip empty files in the URL table engine", 0) \ - M(Bool, enable_url_encoding, true, " Allows to enable/disable decoding/encoding path in URI in the URL table engine", 0) \ - M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ - M(Bool, database_replicated_enforce_synchronous_settings, false, "Enforces synchronous waiting for some queries (see also database_atomic_wait_for_drop_and_detach_synchronously, mutation_sync, alter_sync). Not recommended to enable these settings.", 0) \ - M(UInt64, max_distributed_depth, 5, "Maximum distributed query depth", 0) \ - M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \ - M(Bool, database_replicated_allow_only_replicated_engine, false, "Allow to create only Replicated tables in database with engine Replicated", 0) \ - M(UInt64, database_replicated_allow_replicated_engine_arguments, 0, "0 - Don't allow to explicitly specify ZooKeeper path and replica name for *MergeTree tables in Replicated databases. 1 - Allow. 2 - Allow, but ignore the specified path and use default one instead.", 0) \ - M(UInt64, database_replicated_allow_explicit_uuid, 0, "0 - Don't allow to explicitly specify UUIDs for tables in Replicated databases. 1 - Allow. 2 - Allow, but ignore the specified UUID and generate a random one instead.", 0) \ - M(Bool, database_replicated_allow_heavy_create, false, "Allow long-running DDL queries (CREATE AS SELECT and POPULATE) in Replicated database engine. 
Note that it can block DDL queue for a long time.", 0) \ - M(Bool, cloud_mode, false, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, cloud_mode_engine, 1, "Only available in ClickHouse Cloud", 0) \ - M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result, one of: 'none', 'throw', 'null_status_on_timeout', 'never_throw', 'none_only_active', 'throw_only_active', 'null_status_on_timeout_only_active'", 0) \ - M(UInt64, distributed_ddl_entry_format_version, 5, "Compatibility version of distributed DDL (ON CLUSTER) queries", 0) \ + M(Bool, kafka_disable_num_consumers_limit, false, R"( +Disable limit on kafka_num_consumers that depends on the number of available CPU cores. +)", 0) \ + M(Bool, allow_experimental_kafka_offsets_storage_in_keeper, false, R"( +Allow experimental feature to store Kafka related offsets in ClickHouse Keeper. When enabled a ClickHouse Keeper path and replica name can be specified to the Kafka table engine. As a result instead of the regular Kafka engine, a new type of storage engine will be used that stores the committed offsets primarily in ClickHouse Keeper +)", 0) \ + M(Bool, enable_software_prefetch_in_aggregation, true, R"( +Enable use of software prefetch in aggregation +)", 0) \ + M(Bool, allow_aggregate_partitions_independently, false, R"( +Enable independent aggregation of partitions on separate threads when partition key suits group by key. Beneficial when number of partitions close to number of cores and partitions have roughly the same size +)", 0) \ + M(Bool, force_aggregate_partitions_independently, false, R"( +Force the use of optimization when it is applicable, but heuristics decided not to use it +)", 0) \ + M(UInt64, max_number_of_partitions_for_independent_aggregation, 128, R"( +Maximal number of partitions in table to apply optimization +)", 0) \ + M(Float, min_hit_rate_to_use_consecutive_keys_optimization, 0.5, R"( +Minimal hit rate of a cache which is used for consecutive keys optimization in aggregation to keep it enabled +)", 0) \ \ - M(UInt64, external_storage_max_read_rows, 0, "Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \ - M(UInt64, external_storage_max_read_bytes, 0, "Limit maximum number of bytes when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \ - M(UInt64, external_storage_connect_timeout_sec, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connect timeout in seconds. Now supported only for MySQL", 0) \ - M(UInt64, external_storage_rw_timeout_sec, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "Read/write timeout in seconds. Now supported only for MySQL", 0) \ + M(Bool, engine_file_empty_if_not_exists, false, R"( +Allows to select data from a file engine table without file. + +Possible values: +- 0 — `SELECT` throws exception. +- 1 — `SELECT` returns empty result. +)", 0) \ + M(Bool, engine_file_truncate_on_insert, false, R"( +Enables or disables truncate before insert in [File](../../engines/table-engines/special/file.md) engine tables. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query replaces existing content of the file with the new data. 
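+
+A minimal sketch of the difference (the table name is illustrative):
+
+```sql
+CREATE TABLE file_tbl (`id` UInt32) ENGINE = File(TabSeparated);
+
+INSERT INTO file_tbl VALUES (1), (2);
+
+SET engine_file_truncate_on_insert = 1;
+
+-- With the setting enabled, this insert is expected to replace the file contents,
+-- so the subsequent SELECT returns only the row with id = 3.
+INSERT INTO file_tbl VALUES (3);
+
+SELECT * FROM file_tbl;
+```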
+)", 0) \ + M(Bool, engine_file_allow_create_multiple_files, false, R"( +Enables or disables creating a new file on each insert in file engine tables if the format has the suffix (`JSON`, `ORC`, `Parquet`, etc.). If enabled, on each insert a new file will be created with a name following this pattern: + +`data.Parquet` -> `data.1.Parquet` -> `data.2.Parquet`, etc. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` query creates a new file. +)", 0) \ + M(Bool, engine_file_skip_empty_files, false, R"( +Enables or disables skipping empty files in [File](../../engines/table-engines/special/file.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. +)", 0) \ + M(Bool, engine_url_skip_empty_files, false, R"( +Enables or disables skipping empty files in [URL](../../engines/table-engines/special/url.md) engine tables. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. +)", 0) \ + M(Bool, enable_url_encoding, true, R"( +Allows to enable/disable decoding/encoding path in uri in [URL](../../engines/table-engines/special/url.md) engine tables. + +Enabled by default. +)", 0) \ + M(UInt64, database_replicated_initial_query_timeout_sec, 300, R"( +Sets how long initial DDL query should wait for Replicated database to process previous DDL queue entries in seconds. + +Possible values: + +- Positive integer. +- 0 — Unlimited. +)", 0) \ + M(Bool, database_replicated_enforce_synchronous_settings, false, R"( +Enforces synchronous waiting for some queries (see also database_atomic_wait_for_drop_and_detach_synchronously, mutation_sync, alter_sync). Not recommended to enable these settings. +)", 0) \ + M(UInt64, max_distributed_depth, 5, R"( +Limits the maximum depth of recursive queries for [Distributed](../../engines/table-engines/special/distributed.md) tables. + +If the value is exceeded, the server throws an exception. + +Possible values: + +- Positive integer. +- 0 — Unlimited depth. +)", 0) \ + M(Bool, database_replicated_always_detach_permanently, false, R"( +Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated +)", 0) \ + M(Bool, database_replicated_allow_only_replicated_engine, false, R"( +Allow to create only Replicated tables in database with engine Replicated +)", 0) \ + M(UInt64, database_replicated_allow_replicated_engine_arguments, 0, R"( +0 - Don't allow to explicitly specify ZooKeeper path and replica name for *MergeTree tables in Replicated databases. 1 - Allow. 2 - Allow, but ignore the specified path and use default one instead. 3 - Allow and don't log a warning. +)", 0) \ + M(UInt64, database_replicated_allow_explicit_uuid, 0, R"( +0 - Don't allow to explicitly specify UUIDs for tables in Replicated databases. 1 - Allow. 2 - Allow, but ignore the specified UUID and generate a random one instead. +)", 0) \ + M(Bool, database_replicated_allow_heavy_create, false, R"( +Allow long-running DDL queries (CREATE AS SELECT and POPULATE) in Replicated database engine. Note that it can block DDL queue for a long time. +)", 0) \ + M(Bool, cloud_mode, false, R"( +Cloud mode +)", 0) \ + M(UInt64, cloud_mode_engine, 1, R"( +The engine family allowed in Cloud. 0 - allow everything, 1 - rewrite DDLs to use *ReplicatedMergeTree, 2 - rewrite DDLs to use SharedMergeTree. 
UInt64 to minimize public part +)", 0) \ + M(UInt64, cloud_mode_database_engine, 1, R"( +The database engine allowed in Cloud. 1 - rewrite DDLs to use Replicated database, 2 - rewrite DDLs to use Shared database +)", 0) \ + M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, R"( +Sets format of distributed DDL query result. + +Possible values: + +- `throw` — Returns result set with query execution status for all hosts where query is finished. If query has failed on some hosts, then it will rethrow the first exception. If query is not finished yet on some hosts and [distributed_ddl_task_timeout](#distributed_ddl_task_timeout) exceeded, then it throws `TIMEOUT_EXCEEDED` exception. +- `none` — Is similar to throw, but distributed DDL query returns no result set. +- `null_status_on_timeout` — Returns `NULL` as execution status in some rows of result set instead of throwing `TIMEOUT_EXCEEDED` if query is not finished on the corresponding hosts. +- `never_throw` — Do not throw `TIMEOUT_EXCEEDED` and do not rethrow exceptions if query has failed on some hosts. +- `none_only_active` - similar to `none`, but doesn't wait for inactive replicas of the `Replicated` database. Note: with this mode it's impossible to figure out that the query was not executed on some replica and will be executed in background. +- `null_status_on_timeout_only_active` — similar to `null_status_on_timeout`, but doesn't wait for inactive replicas of the `Replicated` database +- `throw_only_active` — similar to `throw`, but doesn't wait for inactive replicas of the `Replicated` database + +Cloud default value: `none`. +)", 0) \ + M(UInt64, distributed_ddl_entry_format_version, 5, R"( +Compatibility version of distributed DDL (ON CLUSTER) queries +)", 0) \ \ - M(SetOperationMode, union_default_mode, SetOperationMode::Unspecified, "Set default mode in UNION query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception.", 0) \ - M(SetOperationMode, intersect_default_mode, SetOperationMode::ALL, "Set default mode in INTERSECT query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception.", 0) \ - M(SetOperationMode, except_default_mode, SetOperationMode::ALL, "Set default mode in EXCEPT query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception.", 0) \ - M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \ - M(Bool, optimize_injective_functions_in_group_by, true, "Replaces injective functions by it's arguments in GROUP BY section", 0) \ - M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \ - M(Bool, optimize_group_by_constant_keys, true, "Optimize GROUP BY when all keys in block are constant", 0) \ - M(Bool, legacy_column_name_of_tuple_literal, false, "List all names of element of large tuple literals in their column names instead of hash. This settings exists only for compatibility reasons. 
It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher.", 0) \ - M(Bool, enable_named_columns_in_function_tuple, true, "Generate named tuples in function tuple() when all names are unique and can be treated as unquoted identifiers.", 0) \ + M(UInt64, external_storage_max_read_rows, 0, R"( +Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled +)", 0) \ + M(UInt64, external_storage_max_read_bytes, 0, R"( +Limit maximum number of bytes when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled +)", 0) \ + M(UInt64, external_storage_connect_timeout_sec, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, R"( +Connect timeout in seconds. Now supported only for MySQL +)", 0) \ + M(UInt64, external_storage_rw_timeout_sec, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, R"( +Read/write timeout in seconds. Now supported only for MySQL +)", 0) \ \ - M(Bool, query_plan_enable_optimizations, true, "Globally enable/disable query optimization at the query plan level", 0) \ - M(UInt64, query_plan_max_optimizations_to_apply, 10000, "Limit the total number of optimizations applied to query plan. If zero, ignored. If limit reached, throw exception", 0) \ - M(Bool, query_plan_lift_up_array_join, true, "Allow to move array joins up in the query plan", 0) \ - M(Bool, query_plan_push_down_limit, true, "Allow to move LIMITs down in the query plan", 0) \ - M(Bool, query_plan_split_filter, true, "Allow to split filters in the query plan", 0) \ - M(Bool, query_plan_merge_expressions, true, "Allow to merge expressions in the query plan", 0) \ - M(Bool, query_plan_merge_filters, false, "Allow to merge filters in the query plan", 0) \ - M(Bool, query_plan_filter_push_down, true, "Allow to push down filter by predicate query plan step", 0) \ - M(Bool, query_plan_convert_outer_join_to_inner_join, true, "Allow to convert OUTER JOIN to INNER JOIN if filter after JOIN always filters default values", 0) \ - M(Bool, query_plan_optimize_prewhere, true, "Allow to push down filter to PREWHERE expression for supported storages", 0) \ - M(Bool, query_plan_execute_functions_after_sorting, true, "Allow to re-order functions after sorting", 0) \ - M(Bool, query_plan_reuse_storage_ordering_for_window_functions, true, "Allow to use the storage sorting for window functions", 0) \ - M(Bool, query_plan_lift_up_union, true, "Allow to move UNIONs up so that more parts of the query plan can be optimized", 0) \ - M(Bool, query_plan_read_in_order, true, "Use query plan for read-in-order optimization", 0) \ - M(Bool, query_plan_aggregation_in_order, true, "Use query plan for aggregation-in-order optimization", 0) \ - M(Bool, query_plan_remove_redundant_sorting, true, "Remove redundant sorting in query plan. 
For example, sorting steps related to ORDER BY clauses in subqueries", 0) \ - M(Bool, query_plan_remove_redundant_distinct, true, "Remove redundant Distinct step in query plan", 0) \ - M(Bool, query_plan_enable_multithreading_after_window_functions, true, "Enable multithreading after evaluating window functions to allow parallel stream processing", 0) \ - M(UInt64, regexp_max_matches_per_row, 1000, "Max matches of any single regexp per row, used to safeguard 'extractAllGroupsHorizontal' against consuming too much memory with greedy RE.", 0) \ + M(SetOperationMode, union_default_mode, SetOperationMode::Unspecified, R"( +Sets a mode for combining `SELECT` query results. The setting is only used when shared with [UNION](../../sql-reference/statements/select/union.md) without explicitly specifying the `UNION ALL` or `UNION DISTINCT`. + +Possible values: + +- `'DISTINCT'` — ClickHouse outputs rows as a result of combining queries removing duplicate rows. +- `'ALL'` — ClickHouse outputs all rows as a result of combining queries including duplicate rows. +- `''` — ClickHouse generates an exception when used with `UNION`. + +See examples in [UNION](../../sql-reference/statements/select/union.md). +)", 0) \ + M(SetOperationMode, intersect_default_mode, SetOperationMode::ALL, R"( +Set default mode in INTERSECT query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception. +)", 0) \ + M(SetOperationMode, except_default_mode, SetOperationMode::ALL, R"( +Set default mode in EXCEPT query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception. +)", 0) \ + M(Bool, optimize_aggregators_of_group_by_keys, true, R"( +Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section +)", 0) \ + M(Bool, optimize_injective_functions_in_group_by, true, R"( +Replaces injective functions by it's arguments in GROUP BY section +)", 0) \ + M(Bool, optimize_group_by_function_keys, true, R"( +Eliminates functions of other keys in GROUP BY section +)", 0) \ + M(Bool, optimize_group_by_constant_keys, true, R"( +Optimize GROUP BY when all keys in block are constant +)", 0) \ + M(Bool, legacy_column_name_of_tuple_literal, false, R"( +List all names of element of large tuple literals in their column names instead of hash. This settings exists only for compatibility reasons. It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher. +)", 0) \ + M(Bool, enable_named_columns_in_function_tuple, true, R"( +Generate named tuples in function tuple() when all names are unique and can be treated as unquoted identifiers. +)", 0) \ \ - M(UInt64, limit, 0, "Limit on read rows from the most 'end' result for select query, default 0 means no limit length", 0) \ - M(UInt64, offset, 0, "Offset on read rows from the most 'end' result for select query", 0) \ + M(Bool, query_plan_enable_optimizations, true, R"( +Toggles query optimization at the query plan level. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. 
+::: + +Possible values: + +- 0 - Disable all optimizations at the query plan level +- 1 - Enable optimizations at the query plan level (but individual optimizations may still be disabled via their individual settings) +)", 0) \ + M(UInt64, query_plan_max_optimizations_to_apply, 10000, R"( +Limits the total number of optimizations applied to query plan, see setting [query_plan_enable_optimizations](#query_plan_enable_optimizations). +Useful to avoid long optimization times for complex queries. +If the actual number of optimizations exceeds this setting, an exception is thrown. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: +)", 0) \ + M(Bool, query_plan_lift_up_array_join, true, R"( +Toggles a query-plan-level optimization which moves ARRAY JOINs up in the execution plan. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_push_down_limit, true, R"( +Toggles a query-plan-level optimization which moves LIMITs down in the execution plan. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_split_filter, true, R"( +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Toggles a query-plan-level optimization which splits filters into expressions. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_merge_expressions, true, R"( +Toggles a query-plan-level optimization which merges consecutive filters. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_merge_filters, false, R"( +Allow to merge filters in the query plan +)", 0) \ + M(Bool, query_plan_filter_push_down, true, R"( +Toggles a query-plan-level optimization which moves filters down in the execution plan. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. 
+::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_convert_outer_join_to_inner_join, true, R"( +Allow to convert OUTER JOIN to INNER JOIN if filter after JOIN always filters default values +)", 0) \ + M(Bool, query_plan_optimize_prewhere, true, R"( +Allow to push down filter to PREWHERE expression for supported storages +)", 0) \ + M(Bool, query_plan_execute_functions_after_sorting, true, R"( +Toggles a query-plan-level optimization which moves expressions after sorting steps. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_reuse_storage_ordering_for_window_functions, true, R"( +Toggles a query-plan-level optimization which uses storage sorting when sorting for window functions. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_lift_up_union, true, R"( +Toggles a query-plan-level optimization which moves larger subtrees of the query plan into union to enable further optimizations. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_read_in_order, true, R"( +Toggles the read in-order optimization query-plan-level optimization. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_aggregation_in_order, true, R"( +Toggles the aggregation in-order query-plan-level optimization. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_remove_redundant_sorting, true, R"( +Toggles a query-plan-level optimization which removes redundant sorting steps, e.g. in subqueries. +Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_remove_redundant_distinct, true, R"( +Toggles a query-plan-level optimization which removes redundant DISTINCT steps. 
+Only takes effect if setting [query_plan_enable_optimizations](#query_plan_enable_optimizations) is 1. + +:::note +This is an expert-level setting which should only be used for debugging by developers. The setting may change in future in backward-incompatible ways or be removed. +::: + +Possible values: + +- 0 - Disable +- 1 - Enable +)", 0) \ + M(Bool, query_plan_enable_multithreading_after_window_functions, true, R"( +Enable multithreading after evaluating window functions to allow parallel stream processing +)", 0) \ + M(UInt64, regexp_max_matches_per_row, 1000, R"( +Sets the maximum number of matches for a single regular expression per row. Use it to protect against memory overload when using greedy regular expression in the [extractAllGroupsHorizontal](../../sql-reference/functions/string-search-functions.md/#extractallgroups-horizontal) function. + +Possible values: + +- Positive integer. +)", 0) \ \ - M(UInt64, function_range_max_elements_in_block, 500000000, "Maximum number of values generated by function `range` per block of data (sum of array sizes for every row in a block, see also 'max_block_size' and 'min_insert_block_size_rows'). It is a safety threshold.", 0) \ - M(UInt64, function_sleep_max_microseconds_per_block, 3000000, "Maximum number of microseconds the function `sleep` is allowed to sleep for each block. If a user called it with a larger value, it throws an exception. It is a safety threshold.", 0) \ - M(UInt64, function_visible_width_behavior, 1, "The version of `visibleWidth` behavior. 0 - only count the number of code points; 1 - correctly count zero-width and combining characters, count full-width characters as two, estimate the tab width, count delete characters.", 0) \ - M(ShortCircuitFunctionEvaluation, short_circuit_function_evaluation, ShortCircuitFunctionEvaluation::ENABLE, "Setting for short-circuit function evaluation configuration. Possible values: 'enable' - use short-circuit function evaluation for functions that are suitable for it, 'disable' - disable short-circuit function evaluation, 'force_enable' - use short-circuit function evaluation for all functions.", 0) \ + M(UInt64, limit, 0, R"( +Sets the maximum number of rows to get from the query result. It adjusts the value set by the [LIMIT](../../sql-reference/statements/select/limit.md/#limit-clause) clause, so that the limit, specified in the query, cannot exceed the limit, set by this setting. + +Possible values: + +- 0 — The number of rows is not limited. +- Positive integer. +)", 0) \ + M(UInt64, offset, 0, R"( +Sets the number of rows to skip before starting to return rows from the query. It adjusts the offset set by the [OFFSET](../../sql-reference/statements/select/offset.md/#offset-fetch) clause, so that these two values are summarized. + +Possible values: + +- 0 — No rows are skipped . +- Positive integer. + +**Example** + +Input table: + +``` sql +CREATE TABLE test (i UInt64) ENGINE = MergeTree() ORDER BY i; +INSERT INTO test SELECT number FROM numbers(500); +``` + +Query: + +``` sql +SET limit = 5; +SET offset = 7; +SELECT * FROM test LIMIT 10 OFFSET 100; +``` +Result: + +``` text +┌───i─┐ +│ 107 │ +│ 108 │ +│ 109 │ +└─────┘ +``` +)", 0) \ \ - M(LocalFSReadMethod, storage_file_read_method, LocalFSReadMethod::pread, "Method of reading data from storage file, one of: read, pread, mmap. 
The mmap method does not apply to clickhouse-server (it's intended for clickhouse-local).", 0) \ - M(String, local_filesystem_read_method, "pread_threadpool", "Method of reading data from local filesystem, one of: read, pread, mmap, io_uring, pread_threadpool. The 'io_uring' method is experimental and does not work for Log, TinyLog, StripeLog, File, Set and Join, and other tables with append-able files in presence of concurrent reads and writes.", 0) \ - M(String, remote_filesystem_read_method, "threadpool", "Method of reading data from remote filesystem, one of: read, threadpool.", 0) \ - M(Bool, local_filesystem_read_prefetch, false, "Should use prefetching when reading data from local filesystem.", 0) \ - M(Bool, remote_filesystem_read_prefetch, true, "Should use prefetching when reading data from remote filesystem.", 0) \ - M(Int64, read_priority, 0, "Priority to read data from local filesystem or remote filesystem. Only supported for 'pread_threadpool' method for local filesystem and for `threadpool` method for remote filesystem.", 0) \ - M(UInt64, merge_tree_min_rows_for_concurrent_read_for_remote_filesystem, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized, when reading from remote filesystem.", 0) \ - M(UInt64, merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized, when reading from remote filesystem.", 0) \ - M(UInt64, remote_read_min_bytes_for_seek, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes required for remote read (url, s3) to do seek, instead of read with ignore.", 0) \ - M(UInt64, merge_tree_min_bytes_per_task_for_remote_reading, 2 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes to read per task.", 0) ALIAS(filesystem_prefetch_min_bytes_for_single_read_task) \ - M(Bool, merge_tree_use_const_size_tasks_for_remote_reading, true, "Whether to use constant size tasks for reading from a remote table.", 0) \ - M(Bool, merge_tree_determine_task_size_by_prewhere_columns, true, "Whether to use only prewhere columns size to determine reading task size.", 0) \ - M(UInt64, merge_tree_compact_parts_min_granules_to_multibuffer_read, 16, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, function_range_max_elements_in_block, 500000000, R"( +Sets the safety threshold for data volume generated by function [range](../../sql-reference/functions/array-functions.md/#range). Defines the maximum number of values generated by function per block of data (sum of array sizes for every row in a block). + +Possible values: + +- Positive integer. + +**See Also** + +- [max_block_size](#setting-max_block_size) +- [min_insert_block_size_rows](#min-insert-block-size-rows) +)", 0) \ + M(UInt64, function_sleep_max_microseconds_per_block, 3000000, R"( +Maximum number of microseconds the function `sleep` is allowed to sleep for each block. If a user called it with a larger value, it throws an exception. It is a safety threshold. +)", 0) \ + M(UInt64, function_visible_width_behavior, 1, R"( +The version of `visibleWidth` behavior. 0 - only count the number of code points; 1 - correctly count zero-width and combining characters, count full-width characters as two, estimate the tab width, count delete characters. 
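+
+A small sketch of the difference (the widths in the comments follow the description above and are expected, not guaranteed, values):
+
+```sql
+-- 0: count code points only, so a full-width character counts as 1.
+SELECT visibleWidth('中') SETTINGS function_visible_width_behavior = 0;
+
+-- 1: full-width characters count as two, so the result is expected to be 2.
+SELECT visibleWidth('中') SETTINGS function_visible_width_behavior = 1;
+```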
+)", 0) \ + M(ShortCircuitFunctionEvaluation, short_circuit_function_evaluation, ShortCircuitFunctionEvaluation::ENABLE, R"( +Allows calculating the [if](../../sql-reference/functions/conditional-functions.md/#if), [multiIf](../../sql-reference/functions/conditional-functions.md/#multiif), [and](../../sql-reference/functions/logical-functions.md/#logical-and-function), and [or](../../sql-reference/functions/logical-functions.md/#logical-or-function) functions according to a [short scheme](https://en.wikipedia.org/wiki/Short-circuit_evaluation). This helps optimize the execution of complex expressions in these functions and prevent possible exceptions (such as division by zero when it is not expected). + +Possible values: + +- `enable` — Enables short-circuit function evaluation for functions that are suitable for it (can throw an exception or computationally heavy). +- `force_enable` — Enables short-circuit function evaluation for all functions. +- `disable` — Disables short-circuit function evaluation. +)", 0) \ \ - M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background. If wait_for_async_insert is false, INSERT query is processed almost instantly, otherwise client will wait until data will be flushed to table", 0) \ - M(Bool, wait_for_async_insert, true, "If true wait for processing of asynchronous insertion", 0) \ - M(Seconds, wait_for_async_insert_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "Timeout for waiting for processing asynchronous insertion", 0) \ - M(UInt64, async_insert_max_data_size, 10485760, "Maximum size in bytes of unparsed data collected per query before being inserted", 0) \ - M(UInt64, async_insert_max_query_number, 450, "Maximum number of insert queries before being inserted", 0) \ - M(Milliseconds, async_insert_poll_timeout_ms, 10, "Timeout for polling data from asynchronous insert queue", 0) \ - M(Bool, async_insert_use_adaptive_busy_timeout, true, "If it is set to true, use adaptive busy timeout for asynchronous inserts", 0) \ - M(Milliseconds, async_insert_busy_timeout_min_ms, 50, "If auto-adjusting is enabled through async_insert_use_adaptive_busy_timeout, minimum time to wait before dumping collected data per query since the first data appeared. It also serves as the initial value for the adaptive algorithm", 0) \ - M(Milliseconds, async_insert_busy_timeout_max_ms, 200, "Maximum time to wait before dumping collected data per query since the first data appeared.", 0) ALIAS(async_insert_busy_timeout_ms) \ - M(Double, async_insert_busy_timeout_increase_rate, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout increases", 0) \ - M(Double, async_insert_busy_timeout_decrease_rate, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout decreases", 0) \ + M(LocalFSReadMethod, storage_file_read_method, LocalFSReadMethod::pread, R"( +Method of reading data from storage file, one of: `read`, `pread`, `mmap`. The mmap method does not apply to clickhouse-server (it's intended for clickhouse-local). +)", 0) \ + M(String, local_filesystem_read_method, "pread_threadpool", R"( +Method of reading data from local filesystem, one of: read, pread, mmap, io_uring, pread_threadpool. The 'io_uring' method is experimental and does not work for Log, TinyLog, StripeLog, File, Set and Join, and other tables with append-able files in presence of concurrent reads and writes. 
+)", 0) \ + M(String, remote_filesystem_read_method, "threadpool", R"( +Method of reading data from remote filesystem, one of: read, threadpool. +)", 0) \ + M(Bool, local_filesystem_read_prefetch, false, R"( +Should use prefetching when reading data from local filesystem. +)", 0) \ + M(Bool, remote_filesystem_read_prefetch, true, R"( +Should use prefetching when reading data from remote filesystem. +)", 0) \ + M(Int64, read_priority, 0, R"( +Priority to read data from local filesystem or remote filesystem. Only supported for 'pread_threadpool' method for local filesystem and for `threadpool` method for remote filesystem. +)", 0) \ + M(UInt64, merge_tree_min_rows_for_concurrent_read_for_remote_filesystem, (20 * 8192), R"( +The minimum number of lines to read from one file before the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. +)", 0) \ + M(UInt64, merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem, (24 * 10 * 1024 * 1024), R"( +The minimum number of bytes to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. +)", 0) \ + M(UInt64, remote_read_min_bytes_for_seek, 4 * DBMS_DEFAULT_BUFFER_SIZE, R"( +Min bytes required for remote read (url, s3) to do seek, instead of read with ignore. +)", 0) \ + M(UInt64, merge_tree_min_bytes_per_task_for_remote_reading, 2 * DBMS_DEFAULT_BUFFER_SIZE, R"( +Min bytes to read per task. +)", 0) ALIAS(filesystem_prefetch_min_bytes_for_single_read_task) \ + M(Bool, merge_tree_use_const_size_tasks_for_remote_reading, true, R"( +Whether to use constant size tasks for reading from a remote table. +)", 0) \ + M(Bool, merge_tree_determine_task_size_by_prewhere_columns, true, R"( +Whether to use only prewhere columns size to determine reading task size. +)", 0) \ + M(UInt64, merge_tree_compact_parts_min_granules_to_multibuffer_read, 16, R"( +Only available in ClickHouse Cloud. Number of granules in stripe of compact part of MergeTree tables to use multibuffer reader, which supports parallel reading and prefetch. In case of reading from remote fs using of multibuffer reader increases number of read request. +)", 0) \ \ - M(UInt64, remote_fs_read_max_backoff_ms, 10000, "Max wait time when trying to read data for remote disk", 0) \ - M(UInt64, remote_fs_read_backoff_max_tries, 5, "Max attempts to read with backoff", 0) \ - M(Bool, enable_filesystem_cache, true, "Use cache for remote filesystem. This setting does not turn on/off cache for disks (must be done via disk config), but allows to bypass cache for some queries if intended", 0) \ - M(Bool, enable_filesystem_cache_on_write_operations, false, "Write into cache on write operations. To actually work this setting requires be added to disk config too", 0) \ - M(Bool, enable_filesystem_cache_log, false, "Allows to record the filesystem caching log for each query", 0) \ - M(Bool, read_from_filesystem_cache_if_exists_otherwise_bypass_cache, false, "Allow to use the filesystem cache in passive mode - benefit from the existing cache entries, but don't put more entries into the cache. 
If you set this setting for heavy ad-hoc queries and leave it disabled for short real-time queries, this will allows to avoid cache threshing by too heavy queries and to improve the overall system efficiency.", 0) \ - M(Bool, skip_download_if_exceeds_query_cache, true, "Skip download from remote filesystem if exceeds query cache size", 0) \ - M(UInt64, filesystem_cache_max_download_size, (128UL * 1024 * 1024 * 1024), "Max remote filesystem cache size that can be downloaded by a single query", 0) \ - M(Bool, throw_on_error_from_cache_on_write_operations, false, "Ignore error from cache when caching on write operations (INSERT, merges)", 0) \ - M(UInt64, filesystem_cache_segments_batch_size, 20, "Limit on size of a single batch of file segments that a read buffer can request from cache. Too low value will lead to excessive requests to cache, too large may slow down eviction from cache", 0) \ - M(UInt64, filesystem_cache_reserve_space_wait_lock_timeout_milliseconds, 1000, "Wait time to lock cache for space reservation in filesystem cache", 0) \ - M(UInt64, temporary_data_in_cache_reserve_space_wait_lock_timeout_milliseconds, (10 * 60 * 1000), "Wait time to lock cache for space reservation for temporary data in filesystem cache", 0) \ + M(Bool, async_insert, false, R"( +If true, data from INSERT query is stored in queue and later flushed to table in background. If wait_for_async_insert is false, INSERT query is processed almost instantly, otherwise client will wait until data will be flushed to table +)", 0) \ + M(Bool, wait_for_async_insert, true, R"( +If true wait for processing of asynchronous insertion +)", 0) \ + M(Seconds, wait_for_async_insert_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, R"( +Timeout for waiting for processing asynchronous insertion +)", 0) \ + M(UInt64, async_insert_max_data_size, 10485760, R"( +Maximum size in bytes of unparsed data collected per query before being inserted +)", 0) \ + M(UInt64, async_insert_max_query_number, 450, R"( +Maximum number of insert queries before being inserted +)", 0) \ + M(Milliseconds, async_insert_poll_timeout_ms, 10, R"( +Timeout for polling data from asynchronous insert queue +)", 0) \ + M(Bool, async_insert_use_adaptive_busy_timeout, true, R"( +If it is set to true, use adaptive busy timeout for asynchronous inserts +)", 0) \ + M(Milliseconds, async_insert_busy_timeout_min_ms, 50, R"( +If auto-adjusting is enabled through async_insert_use_adaptive_busy_timeout, minimum time to wait before dumping collected data per query since the first data appeared. It also serves as the initial value for the adaptive algorithm +)", 0) \ + M(Milliseconds, async_insert_busy_timeout_max_ms, 200, R"( +Maximum time to wait before dumping collected data per query since the first data appeared. 
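+
+A minimal sketch of how these asynchronous insert settings are typically exercised per query (the table name and timeout value are illustrative):
+
+```sql
+CREATE TABLE async_tbl (`x` UInt64) ENGINE = MergeTree ORDER BY x;
+
+INSERT INTO async_tbl
+SETTINGS async_insert = 1, wait_for_async_insert = 1, async_insert_busy_timeout_max_ms = 500
+VALUES (1);
+```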
+)", 0) ALIAS(async_insert_busy_timeout_ms) \ + M(Double, async_insert_busy_timeout_increase_rate, 0.2, R"( +The exponential growth rate at which the adaptive asynchronous insert timeout increases +)", 0) \ + M(Double, async_insert_busy_timeout_decrease_rate, 0.2, R"( +The exponential growth rate at which the adaptive asynchronous insert timeout decreases +)", 0) \ \ - M(Bool, use_page_cache_for_disks_without_file_cache, false, "Use userspace page cache for remote disks that don't have filesystem cache enabled.", 0) \ - M(Bool, read_from_page_cache_if_exists_otherwise_bypass_cache, false, "Use userspace page cache in passive mode, similar to read_from_filesystem_cache_if_exists_otherwise_bypass_cache.", 0) \ - M(Bool, page_cache_inject_eviction, false, "Userspace page cache will sometimes invalidate some pages at random. Intended for testing.", 0) \ + M(UInt64, remote_fs_read_max_backoff_ms, 10000, R"( +Max wait time when trying to read data for remote disk +)", 0) \ + M(UInt64, remote_fs_read_backoff_max_tries, 5, R"( +Max attempts to read with backoff +)", 0) \ + M(Bool, enable_filesystem_cache, true, R"( +Use cache for remote filesystem. This setting does not turn on/off cache for disks (must be done via disk config), but allows to bypass cache for some queries if intended +)", 0) \ + M(Bool, enable_filesystem_cache_on_write_operations, false, R"( +Write into cache on write operations. To actually work, this setting also needs to be enabled in the disk config +)", 0) \ + M(Bool, enable_filesystem_cache_log, false, R"( +Allows to record the filesystem caching log for each query +)", 0) \ + M(Bool, read_from_filesystem_cache_if_exists_otherwise_bypass_cache, false, R"( +Allow to use the filesystem cache in passive mode - benefit from the existing cache entries, but don't put more entries into the cache. If you set this setting for heavy ad-hoc queries and leave it disabled for short real-time queries, this allows to avoid cache thrashing by too heavy queries and to improve the overall system efficiency. +)", 0) \ + M(Bool, skip_download_if_exceeds_query_cache, true, R"( +Skip download from remote filesystem if it exceeds query cache size +)", 0) \ + M(UInt64, filesystem_cache_max_download_size, (128UL * 1024 * 1024 * 1024), R"( +Max remote filesystem cache size that can be downloaded by a single query +)", 0) \ + M(Bool, throw_on_error_from_cache_on_write_operations, false, R"( +Ignore error from cache when caching on write operations (INSERT, merges) +)", 0) \ + M(UInt64, filesystem_cache_segments_batch_size, 20, R"( +Limit on size of a single batch of file segments that a read buffer can request from cache. Too low a value will lead to excessive requests to cache, too large a value may slow down eviction from cache +)", 0) \ + M(UInt64, filesystem_cache_reserve_space_wait_lock_timeout_milliseconds, 1000, R"( +Wait time to lock cache for space reservation in filesystem cache +)", 0) \ + M(UInt64, temporary_data_in_cache_reserve_space_wait_lock_timeout_milliseconds, (10 * 60 * 1000), R"( +Wait time to lock cache for space reservation for temporary data in filesystem cache +)", 0) \ \ - M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \ - M(Bool, enable_filesystem_read_prefetches_log, false, "Log to system.filesystem prefetch_log during query. 
Should be used only for testing or debugging, not recommended to be turned on by default", 0) \ - M(Bool, allow_prefetched_read_pool_for_remote_filesystem, true, "Prefer prefetched threadpool if all parts are on remote filesystem", 0) \ - M(Bool, allow_prefetched_read_pool_for_local_filesystem, false, "Prefer prefetched threadpool if all parts are on local filesystem", 0) \ + M(Bool, use_page_cache_for_disks_without_file_cache, false, R"( +Use userspace page cache for remote disks that don't have filesystem cache enabled. +)", 0) \ + M(Bool, read_from_page_cache_if_exists_otherwise_bypass_cache, false, R"( +Use userspace page cache in passive mode, similar to read_from_filesystem_cache_if_exists_otherwise_bypass_cache. +)", 0) \ + M(Bool, page_cache_inject_eviction, false, R"( +Userspace page cache will sometimes invalidate some pages at random. Intended for testing. +)", 0) \ \ - M(UInt64, prefetch_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the prefetch buffer to read from the filesystem.", 0) \ - M(UInt64, filesystem_prefetch_step_bytes, 0, "Prefetch step in bytes. Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task", 0) \ - M(UInt64, filesystem_prefetch_step_marks, 0, "Prefetch step in marks. Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task", 0) \ - M(UInt64, filesystem_prefetch_max_memory_usage, "1Gi", "Maximum memory usage for prefetches.", 0) \ - M(UInt64, filesystem_prefetches_limit, 200, "Maximum number of prefetches. Zero means unlimited. A setting `filesystem_prefetches_max_memory_usage` is more recommended if you want to limit the number of prefetches", 0) \ + M(Bool, load_marks_asynchronously, false, R"( +Load MergeTree marks asynchronously +)", 0) \ + M(Bool, enable_filesystem_read_prefetches_log, false, R"( +Log to system.filesystem prefetch_log during query. Should be used only for testing or debugging, not recommended to be turned on by default +)", 0) \ + M(Bool, allow_prefetched_read_pool_for_remote_filesystem, true, R"( +Prefer prefetched threadpool if all parts are on remote filesystem +)", 0) \ + M(Bool, allow_prefetched_read_pool_for_local_filesystem, false, R"( +Prefer prefetched threadpool if all parts are on local filesystem +)", 0) \ \ - M(UInt64, use_structure_from_insertion_table_in_table_functions, 2, "Use structure from insertion table instead of schema inference from data. Possible values: 0 - disabled, 1 - enabled, 2 - auto", 0) \ + M(UInt64, prefetch_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, R"( +The maximum size of the prefetch buffer to read from the filesystem. +)", 0) \ + M(UInt64, filesystem_prefetch_step_bytes, 0, R"( +Prefetch step in bytes. Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task +)", 0) \ + M(UInt64, filesystem_prefetch_step_marks, 0, R"( +Prefetch step in marks. Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. 
The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task +)", 0) \ + M(UInt64, filesystem_prefetch_max_memory_usage, "1Gi", R"( +Maximum memory usage for prefetches. +)", 0) \ + M(UInt64, filesystem_prefetches_limit, 200, R"( +Maximum number of prefetches. Zero means unlimited. A setting `filesystem_prefetches_max_memory_usage` is more recommended if you want to limit the number of prefetches +)", 0) \ \ - M(UInt64, http_max_tries, 10, "Max attempts to read via http.", 0) \ - M(UInt64, http_retry_initial_backoff_ms, 100, "Min milliseconds for backoff, when retrying read via http", 0) \ - M(UInt64, http_retry_max_backoff_ms, 10000, "Max milliseconds for backoff, when retrying read via http", 0) \ + M(UInt64, use_structure_from_insertion_table_in_table_functions, 2, R"( +Use structure from insertion table instead of schema inference from data. Possible values: 0 - disabled, 1 - enabled, 2 - auto +)", 0) \ \ - M(Bool, force_remove_data_recursively_on_drop, false, "Recursively remove data on DROP query. Avoids 'Directory not empty' error, but may silently remove detached data", 0) \ - M(Bool, check_table_dependencies, true, "Check that DDL query (such as DROP TABLE or RENAME) will not break dependencies", 0) \ - M(Bool, check_referential_table_dependencies, false, "Check that DDL query (such as DROP TABLE or RENAME) will not break referential dependencies", 0) \ - M(Bool, use_local_cache_for_remote_storage, true, "Use local cache for remote storage like HDFS or S3, it's used for remote table engine only", 0) \ + M(UInt64, http_max_tries, 10, R"( +Max attempts to read via http. +)", 0) \ + M(UInt64, http_retry_initial_backoff_ms, 100, R"( +Min milliseconds for backoff, when retrying read via http +)", 0) \ + M(UInt64, http_retry_max_backoff_ms, 10000, R"( +Max milliseconds for backoff, when retrying read via http +)", 0) \ \ - M(Bool, allow_unrestricted_reads_from_keeper, false, "Allow unrestricted (without condition on path) reads from system.zookeeper table, can be handy, but is not safe for zookeeper", 0) \ - M(Bool, allow_deprecated_database_ordinary, false, "Allow to create databases with deprecated Ordinary engine", 0) \ - M(Bool, allow_deprecated_syntax_for_merge_tree, false, "Allow to create *MergeTree tables with deprecated engine definition syntax", 0) \ - M(Bool, allow_asynchronous_read_from_io_pool_for_merge_tree, false, "Use background I/O pool to read from MergeTree tables. This setting may increase performance for I/O bound queries", 0) \ - M(UInt64, max_streams_for_merge_tree_reading, 0, "If is not zero, limit the number of reading streams for MergeTree table.", 0) \ + M(Bool, force_remove_data_recursively_on_drop, false, R"( +Recursively remove data on DROP query. 
Avoids 'Directory not empty' error, but may silently remove detached data +)", 0) \ + M(Bool, check_table_dependencies, true, R"( +Check that DDL query (such as DROP TABLE or RENAME) will not break dependencies +)", 0) \ + M(Bool, check_referential_table_dependencies, false, R"( +Check that DDL query (such as DROP TABLE or RENAME) will not break referential dependencies +)", 0) \ + M(Bool, use_local_cache_for_remote_storage, true, R"( +Use local cache for remote storage like HDFS or S3, it's used for remote table engine only +)", 0) \ \ - M(Bool, force_grouping_standard_compatibility, true, "Make GROUPING function to return 1 when argument is not used as an aggregation key", 0) \ + M(Bool, allow_unrestricted_reads_from_keeper, false, R"( +Allow unrestricted (without condition on path) reads from system.zookeeper table, can be handy, but is not safe for zookeeper +)", 0) \ + M(Bool, allow_deprecated_database_ordinary, false, R"( +Allow to create databases with deprecated Ordinary engine +)", 0) \ + M(Bool, allow_deprecated_syntax_for_merge_tree, false, R"( +Allow to create *MergeTree tables with deprecated engine definition syntax +)", 0) \ + M(Bool, allow_asynchronous_read_from_io_pool_for_merge_tree, false, R"( +Use background I/O pool to read from MergeTree tables. This setting may increase performance for I/O bound queries +)", 0) \ + M(UInt64, max_streams_for_merge_tree_reading, 0, R"( +If is not zero, limit the number of reading streams for MergeTree table. +)", 0) \ \ - M(Bool, schema_inference_use_cache_for_file, true, "Use cache in schema inference while using file table function", 0) \ - M(Bool, schema_inference_use_cache_for_s3, true, "Use cache in schema inference while using s3 table function", 0) \ - M(Bool, schema_inference_use_cache_for_azure, true, "Use cache in schema inference while using azure table function", 0) \ - M(Bool, schema_inference_use_cache_for_hdfs, true, "Use cache in schema inference while using hdfs table function", 0) \ - M(Bool, schema_inference_use_cache_for_url, true, "Use cache in schema inference while using url table function", 0) \ - M(Bool, schema_inference_cache_require_modification_time_for_url, true, "Use schema from cache for URL with last modification time validation (for URLs with Last-Modified header)", 0) \ + M(Bool, force_grouping_standard_compatibility, true, R"( +Make GROUPING function to return 1 when argument is not used as an aggregation key +)", 0) \ \ - M(String, compatibility, "", "Changes other settings according to provided ClickHouse version. 
If we know that we changed some behaviour in ClickHouse by changing some settings in some version, this compatibility setting will control these settings", 0) \ + M(Bool, schema_inference_use_cache_for_file, true, R"( +Use cache in schema inference while using file table function +)", 0) \ + M(Bool, schema_inference_use_cache_for_s3, true, R"( +Use cache in schema inference while using s3 table function +)", 0) \ + M(Bool, schema_inference_use_cache_for_azure, true, R"( +Use cache in schema inference while using azure table function +)", 0) \ + M(Bool, schema_inference_use_cache_for_hdfs, true, R"( +Use cache in schema inference while using hdfs table function +)", 0) \ + M(Bool, schema_inference_use_cache_for_url, true, R"( +Use cache in schema inference while using url table function +)", 0) \ + M(Bool, schema_inference_cache_require_modification_time_for_url, true, R"( +Use schema from cache for URL with last modification time validation (for URLs with Last-Modified header) +)", 0) \ \ - M(Map, additional_table_filters, "", "Additional filter expression which would be applied after reading from specified table. Syntax: {'table1': 'expression', 'database.table2': 'expression'}", 0) \ - M(String, additional_result_filter, "", "Additional filter expression which would be applied to query result", 0) \ + M(String, compatibility, "", R"( +The `compatibility` setting causes ClickHouse to use the default settings of a previous version of ClickHouse, where the previous version is provided as the setting. + +If settings are set to non-default values, then those settings are honored (only settings that have not been modified are affected by the `compatibility` setting). + +This setting takes a ClickHouse version number as a string, like `22.3`, `22.8`. An empty value means that this setting is disabled. + +Disabled by default. + +:::note +In ClickHouse Cloud the compatibility setting must be set by ClickHouse Cloud support. Please [open a case](https://clickhouse.cloud/support) to have it set. +::: +)", 0) \ \ - M(String, workload, "default", "Name of workload to be used to access resources", 0) \ - M(Milliseconds, storage_system_stack_trace_pipe_read_timeout_ms, 100, "Maximum time to read from a pipe for receiving information from the threads when querying the `system.stack_trace` table. This setting is used for testing purposes and not meant to be changed by users.", 0) \ + M(Map, additional_table_filters, "", R"( +An additional filter expression that is applied after reading +from the specified table. + +**Example** + +``` sql +INSERT INTO table_1 VALUES (1, 'a'), (2, 'bb'), (3, 'ccc'), (4, 'dddd'); +SELECT * FROM table_1; +``` +```response +┌─x─┬─y────┐ +│ 1 │ a │ +│ 2 │ bb │ +│ 3 │ ccc │ +│ 4 │ dddd │ +└───┴──────┘ +``` +```sql +SELECT * +FROM table_1 +SETTINGS additional_table_filters = {'table_1': 'x != 2'} +``` +```response +┌─x─┬─y────┐ +│ 1 │ a │ +│ 3 │ ccc │ +│ 4 │ dddd │ +└───┴──────┘ +``` +)", 0) \ + M(String, additional_result_filter, "", R"( +An additional filter expression to apply to the result of `SELECT` query. +This setting is not applied to any subquery. 
+ +**Example** + +``` sql +INSERT INTO table_1 VALUES (1, 'a'), (2, 'bb'), (3, 'ccc'), (4, 'dddd'); +SElECT * FROM table_1; +``` +```response +┌─x─┬─y────┐ +│ 1 │ a │ +│ 2 │ bb │ +│ 3 │ ccc │ +│ 4 │ dddd │ +└───┴──────┘ +``` +```sql +SELECT * +FROM table_1 +SETTINGS additional_result_filter = 'x != 2' +``` +```response +┌─x─┬─y────┐ +│ 1 │ a │ +│ 3 │ ccc │ +│ 4 │ dddd │ +└───┴──────┘ +``` +)", 0) \ \ - M(String, rename_files_after_processing, "", "Rename successfully processed files according to the specified pattern; Pattern can include the following placeholders: `%a` (full original file name), `%f` (original filename without extension), `%e` (file extension with dot), `%t` (current timestamp in µs), and `%%` (% sign)", 0) \ + M(String, workload, "default", R"( +Name of workload to be used to access resources +)", 0) \ + M(Milliseconds, storage_system_stack_trace_pipe_read_timeout_ms, 100, R"( +Maximum time to read from a pipe for receiving information from the threads when querying the `system.stack_trace` table. This setting is used for testing purposes and not meant to be changed by users. +)", 0) \ \ - M(Bool, parallelize_output_from_storages, true, "Parallelize output for reading step from storage. It allows parallelization of query processing right after reading from storage if possible", 0) \ - M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ - M(Bool, count_distinct_optimization, false, "Rewrite count distinct to subquery of group by", 0) \ - M(Bool, throw_if_no_data_to_insert, true, "Allows or forbids empty INSERTs, enabled by default (throws an error on an empty insert)", 0) \ - M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ - M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \ - M(Bool, optimize_sorting_by_input_stream_properties, true, "Optimize sorting by sorting properties of input stream", 0) \ - M(UInt64, keeper_max_retries, 10, "Max retries for general keeper operations", 0) \ - M(UInt64, keeper_retry_initial_backoff_ms, 100, "Initial backoff timeout for general keeper operations", 0) \ - M(UInt64, keeper_retry_max_backoff_ms, 5000, "Max backoff timeout for general keeper operations", 0) \ - M(UInt64, insert_keeper_max_retries, 20, "Max retries for keeper operations during insert", 0) \ - M(UInt64, insert_keeper_retry_initial_backoff_ms, 100, "Initial backoff timeout for keeper operations during insert", 0) \ - M(UInt64, insert_keeper_retry_max_backoff_ms, 10000, "Max backoff timeout for keeper operations during insert", 0) \ - M(Float, insert_keeper_fault_injection_probability, 0.0f, "Approximate probability of failure for a keeper request during insert. Valid value is in interval [0.0f, 1.0f]", 0) \ - M(UInt64, insert_keeper_fault_injection_seed, 0, "0 - random seed, otherwise the setting value", 0) \ - M(Bool, force_aggregation_in_order, false, "The setting is used by the server itself to support distributed queries. Do not change it manually, because it will break normal operations. 
(Forces use of aggregation in order on remote nodes during distributed aggregation).", IMPORTANT) \ - M(UInt64, http_max_request_param_data_size, 10_MiB, "Limit on size of request data used as a query parameter in predefined HTTP requests.", 0) \ - M(Bool, function_json_value_return_type_allow_nullable, false, "Allow function JSON_VALUE to return nullable type.", 0) \ - M(Bool, function_json_value_return_type_allow_complex, false, "Allow function JSON_VALUE to return complex type, such as: struct, array, map.", 0) \ - M(Bool, use_with_fill_by_sorting_prefix, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently", 0) \ - M(Bool, optimize_uniq_to_count, true, "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause.", 0) \ - M(Bool, use_variant_as_common_type, false, "Use Variant as a result type for if/multiIf in case when there is no common type for arguments", 0) \ - M(Bool, enable_order_by_all, true, "Enable sorting expression ORDER BY ALL.", 0) \ - M(Float, ignore_drop_queries_probability, 0, "If enabled, server will ignore all DROP table queries with specified probability (for Memory and JOIN engines it will replcase DROP to TRUNCATE). Used for testing purposes", 0) \ - M(Bool, traverse_shadow_remote_data_paths, false, "Traverse shadow directory when query system.remote_data_paths", 0) \ - M(Bool, geo_distance_returns_float64_on_float64_arguments, true, "If all four arguments to `geoDistance`, `greatCircleDistance`, `greatCircleAngle` functions are Float64, return Float64 and use double precision for internal calculations. In previous ClickHouse versions, the functions always returned Float32.", 0) \ - M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \ - M(Bool, cast_string_to_dynamic_use_inference, false, "Use types inference during String to Dynamic conversion", 0) \ - M(Bool, enable_blob_storage_log, true, "Write information about blob storage operations to system.blob_storage_log table", 0) \ - M(Bool, use_json_alias_for_old_object_type, false, "When enabled, JSON type alias will create old experimental Object type instead of a new JSON type", 0) \ - M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0) \ - M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. 
Made for SQL compatibility tests.", 0) \ - M(Bool, print_pretty_type_names, true, "Print pretty type names in the DESCRIBE query and `toTypeName` function, as well as in the `SHOW CREATE TABLE` query and the `formatQuery` function.", 0) \ - M(Bool, create_table_empty_primary_key_by_default, false, "Allow to create *MergeTree tables with empty primary key when ORDER BY and PRIMARY KEY not specified", 0) \ - M(Bool, allow_named_collection_override_by_default, true, "Allow named collections' fields override by default.", 0) \ - M(SQLSecurityType, default_normal_view_sql_security, SQLSecurityType::INVOKER, "Allows to set a default value for SQL SECURITY option when creating a normal view.", 0) \ - M(SQLSecurityType, default_materialized_view_sql_security, SQLSecurityType::DEFINER, "Allows to set a default value for SQL SECURITY option when creating a materialized view.", 0) \ - M(String, default_view_definer, "CURRENT_USER", "Allows to set a default value for DEFINER option when creating view.", 0) \ - M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud. Number of background threads for speculatively downloading new data parts into file cache, when cache_populated_by_fetch is enabled. Zero to disable.", 0) \ - M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud. Exclude new data parts from SELECT queries until they're either pre-warmed (see cache_populated_by_fetch) or this many seconds old. Only for Replicated-/SharedMergeTree.", 0) \ - M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud. If a merged part is less than this many seconds old and is not pre-warmed (see cache_populated_by_fetch), but all its source parts are available and pre-warmed, SELECT queries will read from those parts instead. Only for ReplicatedMergeTree. Note that this only checks whether CacheWarmer processed the part; if the part was fetched into cache by something else, it'll still be considered cold until CacheWarmer gets to it; if it was warmed, then evicted from cache, it'll still be considered warm.", 0) \ - M(Bool, iceberg_engine_ignore_schema_evolution, false, "Ignore schema evolution in Iceberg table engine and read all data using latest schema saved on table creation. Note that it can lead to incorrect result", 0) \ - M(Bool, allow_deprecated_error_prone_window_functions, false, "Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference)", 0) \ - M(Bool, allow_deprecated_snowflake_conversion_functions, false, "Enables deprecated functions snowflakeToDateTime[64] and dateTime[64]ToSnowflake.", 0) \ - M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \ - M(Bool, keeper_map_strict_mode, false, "Enforce additional checks during operations on KeeperMap. E.g. throw an exception on an insert for already existing key", 0) \ - M(UInt64, extract_key_value_pairs_max_pairs_per_row, 1000, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory.", 0) ALIAS(extract_kvp_max_pairs_per_row) \ - M(Bool, restore_replace_external_engines_to_null, false, "Replace all the external table engines to Null on restore. 
Useful for testing purposes", 0) \ - M(Bool, restore_replace_external_table_functions_to_null, false, "Replace all table functions to Null on restore. Useful for testing purposes", 0) \ - M(Bool, restore_replace_external_dictionary_source_to_null, false, "Replace external dictionary sources to Null on restore. Useful for testing purposes", 0) \ - M(Bool, create_if_not_exists, false, "Enable IF NOT EXISTS for CREATE statements by default", 0) \ - M(Bool, enable_secure_identifiers, false, "If enabled, only allow secure identifiers which contain only underscore and alphanumeric characters", 0) \ - M(Bool, mongodb_throw_on_unsupported_query, true, "If enabled, MongoDB tables will return an error when a MongoDB query cannot be built. Otherwise, ClickHouse reads the full table and processes it locally. This option does not apply to the legacy implementation or when 'allow_experimental_analyzer=0'.", 0) \ + M(String, rename_files_after_processing, "", R"( +- **Type:** String + +- **Default value:** Empty string + +This setting allows to specify renaming pattern for files processed by `file` table function. When option is set, all files read by `file` table function will be renamed according to specified pattern with placeholders, only if files processing was successful. + +### Placeholders + +- `%a` — Full original filename (e.g., "sample.csv"). +- `%f` — Original filename without extension (e.g., "sample"). +- `%e` — Original file extension with dot (e.g., ".csv"). +- `%t` — Timestamp (in microseconds). +- `%%` — Percentage sign ("%"). + +### Example +- Option: `--rename_files_after_processing="processed_%f_%t%e"` + +- Query: `SELECT * FROM file('sample.csv')` + + +If reading `sample.csv` is successful, file will be renamed to `processed_sample_1683473210851438.csv` +)", 0) \ + \ + /* CLOUD ONLY */ \ + M(Bool, read_through_distributed_cache, false, R"( +Only in ClickHouse Cloud. Allow reading from distributed cache +)", 0) \ + M(Bool, write_through_distributed_cache, false, R"( +Only in ClickHouse Cloud. Allow writing to distributed cache (writing to s3 will also be done by distributed cache) +)", 0) \ + M(Bool, distributed_cache_throw_on_error, false, R"( +Only in ClickHouse Cloud. Rethrow exception happened during communication with distributed cache or exception received from distributed cache. Otherwise fallback to skipping distributed cache on error +)", 0) \ + M(DistributedCacheLogMode, distributed_cache_log_mode, DistributedCacheLogMode::LOG_ON_ERROR, R"( +Only in ClickHouse Cloud. Mode for writing to system.distributed_cache_log +)", 0) \ + M(Bool, distributed_cache_fetch_metrics_only_from_current_az, true, R"( +Only in ClickHouse Cloud. Fetch metrics only from current availability zone in system.distributed_cache_metrics, system.distributed_cache_events +)", 0) \ + M(UInt64, distributed_cache_connect_max_tries, 100, R"( +Only in ClickHouse Cloud. Number of tries to connect to distributed cache if unsuccessful +)", 0) \ + M(UInt64, distributed_cache_receive_response_wait_milliseconds, 60000, R"( +Only in ClickHouse Cloud. Wait time in milliseconds to receive data for request from distributed cache +)", 0) \ + M(UInt64, distributed_cache_receive_timeout_milliseconds, 10000, R"( +Only in ClickHouse Cloud. Wait time in milliseconds to receive any kind of response from distributed cache +)", 0) \ + M(UInt64, distributed_cache_wait_connection_from_pool_milliseconds, 100, R"( +Only in ClickHouse Cloud. 
Wait time in milliseconds to receive connection from connection pool if distributed_cache_pool_behaviour_on_limit is wait +)", 0) \ + M(Bool, distributed_cache_bypass_connection_pool, false, R"( +Only in ClickHouse Cloud. Allow to bypass distributed cache connection pool +)", 0) \ + M(DistributedCachePoolBehaviourOnLimit, distributed_cache_pool_behaviour_on_limit, DistributedCachePoolBehaviourOnLimit::ALLOCATE_NEW_BYPASSING_POOL, R"( +Only in ClickHouse Cloud. Identifies behaviour of distributed cache connection on pool limit reached +)", 0) \ + M(UInt64, distributed_cache_read_alignment, 0, R"( +Only in ClickHouse Cloud. A setting for testing purposes, do not change it +)", 0) \ + M(UInt64, distributed_cache_max_unacked_inflight_packets, DistributedCache::MAX_UNACKED_INFLIGHT_PACKETS, R"( +Only in ClickHouse Cloud. A maximum number of unacknowledged in-flight packets in a single distributed cache read request +)", 0) \ + M(UInt64, distributed_cache_data_packet_ack_window, DistributedCache::ACK_DATA_PACKET_WINDOW, R"( +Only in ClickHouse Cloud. A window for sending ACK for DataPacket sequence in a single distributed cache read request +)", 0) \ + \ + M(Bool, parallelize_output_from_storages, true, R"( +Parallelize output for reading step from storage. It allows parallelization of query processing right after reading from storage if possible +)", 0) \ + M(String, insert_deduplication_token, "", R"( +The setting allows a user to provide own deduplication semantic in MergeTree/ReplicatedMergeTree +For example, by providing a unique value for the setting in each INSERT statement, +user can avoid the same inserted data being deduplicated. + +Possible values: + +- Any string + +`insert_deduplication_token` is used for deduplication _only_ when not empty. + +For the replicated tables by default the only 100 of the most recent inserts for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)). +For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window). + +:::note +`insert_deduplication_token` works on a partition level (the same as `insert_deduplication` checksum). Multiple partitions can have the same `insert_deduplication_token`. +::: + +Example: + +```sql +CREATE TABLE test_table +( A Int64 ) +ENGINE = MergeTree +ORDER BY A +SETTINGS non_replicated_deduplication_window = 100; + +INSERT INTO test_table SETTINGS insert_deduplication_token = 'test' VALUES (1); + +-- the next insert won't be deduplicated because insert_deduplication_token is different +INSERT INTO test_table SETTINGS insert_deduplication_token = 'test1' VALUES (1); + +-- the next insert will be deduplicated because insert_deduplication_token +-- is the same as one of the previous +INSERT INTO test_table SETTINGS insert_deduplication_token = 'test' VALUES (2); + +SELECT * FROM test_table + +┌─A─┐ +│ 1 │ +└───┘ +┌─A─┐ +│ 1 │ +└───┘ +``` +)", 0) \ + M(Bool, count_distinct_optimization, false, R"( +Rewrite count distinct to subquery of group by +)", 0) \ + M(Bool, throw_if_no_data_to_insert, true, R"( +Allows or forbids empty INSERTs, enabled by default (throws an error on an empty insert) +)", 0) \ + M(Bool, compatibility_ignore_auto_increment_in_create_table, false, R"( +Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. 
It simplifies migration from MySQL +)", 0) \ + M(Bool, multiple_joins_try_to_keep_original_names, false, R"( +Do not add aliases to top level expression list on multiple joins rewrite +)", 0) \ + M(Bool, optimize_sorting_by_input_stream_properties, true, R"( +Optimize sorting by sorting properties of input stream +)", 0) \ + M(UInt64, keeper_max_retries, 10, R"( +Max retries for general keeper operations +)", 0) \ + M(UInt64, keeper_retry_initial_backoff_ms, 100, R"( +Initial backoff timeout for general keeper operations +)", 0) \ + M(UInt64, keeper_retry_max_backoff_ms, 5000, R"( +Max backoff timeout for general keeper operations +)", 0) \ + M(UInt64, insert_keeper_max_retries, 20, R"( +The setting sets the maximum number of retries for ClickHouse Keeper (or ZooKeeper) requests during insert into replicated MergeTree. Only Keeper requests which failed due to network error, Keeper session timeout, or request timeout are considered for retries. + +Possible values: + +- Positive integer. +- 0 — Retries are disabled + +Cloud default value: `20`. + +Keeper request retries are done after some timeout. The timeout is controlled by the following settings: `insert_keeper_retry_initial_backoff_ms`, `insert_keeper_retry_max_backoff_ms`. +The first retry is done after `insert_keeper_retry_initial_backoff_ms` timeout. The consequent timeouts will be calculated as follows: +``` +timeout = min(insert_keeper_retry_max_backoff_ms, latest_timeout * 2) +``` + +For example, if `insert_keeper_retry_initial_backoff_ms=100`, `insert_keeper_retry_max_backoff_ms=10000` and `insert_keeper_max_retries=8` then timeouts will be `100, 200, 400, 800, 1600, 3200, 6400, 10000`. + +Apart from fault tolerance, the retries aim to provide a better user experience - they allow to avoid returning an error during INSERT execution if Keeper is restarted, for example, due to an upgrade. +)", 0) \ + M(UInt64, insert_keeper_retry_initial_backoff_ms, 100, R"( +Initial timeout(in milliseconds) to retry a failed Keeper request during INSERT query execution + +Possible values: + +- Positive integer. +- 0 — No timeout +)", 0) \ + M(UInt64, insert_keeper_retry_max_backoff_ms, 10000, R"( +Maximum timeout (in milliseconds) to retry a failed Keeper request during INSERT query execution + +Possible values: + +- Positive integer. +- 0 — Maximum timeout is not limited +)", 0) \ + M(Float, insert_keeper_fault_injection_probability, 0.0f, R"( +Approximate probability of failure for a keeper request during insert. Valid value is in interval [0.0f, 1.0f] +)", 0) \ + M(UInt64, insert_keeper_fault_injection_seed, 0, R"( +0 - random seed, otherwise the setting value +)", 0) \ + M(Bool, force_aggregation_in_order, false, R"( +The setting is used by the server itself to support distributed queries. Do not change it manually, because it will break normal operations. (Forces use of aggregation in order on remote nodes during distributed aggregation). +)", IMPORTANT) \ + M(UInt64, http_max_request_param_data_size, 10_MiB, R"( +Limit on size of request data used as a query parameter in predefined HTTP requests. +)", 0) \ + M(Bool, function_json_value_return_type_allow_nullable, false, R"( +Control whether allow to return `NULL` when value is not exist for JSON_VALUE function. + +```sql +SELECT JSON_VALUE('{"hello":"world"}', '$.b') settings function_json_value_return_type_allow_nullable=true; + +┌─JSON_VALUE('{"hello":"world"}', '$.b')─┐ +│ ᴺᵁᴸᴸ │ +└────────────────────────────────────────┘ + +1 row in set. Elapsed: 0.001 sec. 
+``` + +Possible values: + +- true — Allow. +- false — Disallow. +)", 0) \ + M(Bool, function_json_value_return_type_allow_complex, false, R"( +Control whether allow to return complex type (such as: struct, array, map) for json_value function. + +```sql +SELECT JSON_VALUE('{"hello":{"world":"!"}}', '$.hello') settings function_json_value_return_type_allow_complex=true + +┌─JSON_VALUE('{"hello":{"world":"!"}}', '$.hello')─┐ +│ {"world":"!"} │ +└──────────────────────────────────────────────────┘ + +1 row in set. Elapsed: 0.001 sec. +``` + +Possible values: + +- true — Allow. +- false — Disallow. +)", 0) \ + M(Bool, use_with_fill_by_sorting_prefix, true, R"( +Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently +)", 0) \ + M(Bool, optimize_uniq_to_count, true, R"( +Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause. +)", 0) \ + M(Bool, use_variant_as_common_type, false, R"( +Allows to use `Variant` type as a result type for [if](../../sql-reference/functions/conditional-functions.md/#if)/[multiIf](../../sql-reference/functions/conditional-functions.md/#multiif)/[array](../../sql-reference/functions/array-functions.md)/[map](../../sql-reference/functions/tuple-map-functions.md) functions when there is no common type for argument types. + +Example: + +```sql +SET use_variant_as_common_type = 1; +SELECT toTypeName(if(number % 2, number, range(number))) as variant_type FROM numbers(1); +SELECT if(number % 2, number, range(number)) as variant FROM numbers(5); +``` + +```text +┌─variant_type───────────────────┐ +│ Variant(Array(UInt64), UInt64) │ +└────────────────────────────────┘ +┌─variant───┐ +│ [] │ +│ 1 │ +│ [0,1] │ +│ 3 │ +│ [0,1,2,3] │ +└───────────┘ +``` + +```sql +SET use_variant_as_common_type = 1; +SELECT toTypeName(multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL)) AS variant_type FROM numbers(1); +SELECT multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL) AS variant FROM numbers(4); +``` + +```text +─variant_type─────────────────────────┐ +│ Variant(Array(UInt8), String, UInt8) │ +└──────────────────────────────────────┘ + +┌─variant───────┐ +│ 42 │ +│ [1,2,3] │ +│ Hello, World! 
│ +│ ᴺᵁᴸᴸ │ +└───────────────┘ +``` + +```sql +SET use_variant_as_common_type = 1; +SELECT toTypeName(array(range(number), number, 'str_' || toString(number))) as array_of_variants_type from numbers(1); +SELECT array(range(number), number, 'str_' || toString(number)) as array_of_variants FROM numbers(3); +``` + +```text +┌─array_of_variants_type────────────────────────┐ +│ Array(Variant(Array(UInt64), String, UInt64)) │ +└───────────────────────────────────────────────┘ + +┌─array_of_variants─┐ +│ [[],0,'str_0'] │ +│ [[0],1,'str_1'] │ +│ [[0,1],2,'str_2'] │ +└───────────────────┘ +``` + +```sql +SET use_variant_as_common_type = 1; +SELECT toTypeName(map('a', range(number), 'b', number, 'c', 'str_' || toString(number))) as map_of_variants_type from numbers(1); +SELECT map('a', range(number), 'b', number, 'c', 'str_' || toString(number)) as map_of_variants FROM numbers(3); +``` + +```text +┌─map_of_variants_type────────────────────────────────┐ +│ Map(String, Variant(Array(UInt64), String, UInt64)) │ +└─────────────────────────────────────────────────────┘ + +┌─map_of_variants───────────────┐ +│ {'a':[],'b':0,'c':'str_0'} │ +│ {'a':[0],'b':1,'c':'str_1'} │ +│ {'a':[0,1],'b':2,'c':'str_2'} │ +└───────────────────────────────┘ +``` +)", 0) \ + M(Bool, enable_order_by_all, true, R"( +Enables or disables sorting with `ORDER BY ALL` syntax, see [ORDER BY](../../sql-reference/statements/select/order-by.md). + +Possible values: + +- 0 — Disable ORDER BY ALL. +- 1 — Enable ORDER BY ALL. + +**Example** + +Query: + +```sql +CREATE TABLE TAB(C1 Int, C2 Int, ALL Int) ENGINE=Memory(); + +INSERT INTO TAB VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20); + +SELECT * FROM TAB ORDER BY ALL; -- returns an error that ALL is ambiguous + +SELECT * FROM TAB ORDER BY ALL SETTINGS enable_order_by_all = 0; +``` + +Result: + +```text +┌─C1─┬─C2─┬─ALL─┐ +│ 20 │ 20 │ 10 │ +│ 30 │ 10 │ 20 │ +│ 10 │ 20 │ 30 │ +└────┴────┴─────┘ +``` +)", 0) \ + M(Float, ignore_drop_queries_probability, 0, R"( +If enabled, server will ignore all DROP table queries with specified probability (for Memory and JOIN engines it will replcase DROP to TRUNCATE). Used for testing purposes +)", 0) \ + M(Bool, traverse_shadow_remote_data_paths, false, R"( +Traverse shadow directory when query system.remote_data_paths +)", 0) \ + M(Bool, geo_distance_returns_float64_on_float64_arguments, true, R"( +If all four arguments to `geoDistance`, `greatCircleDistance`, `greatCircleAngle` functions are Float64, return Float64 and use double precision for internal calculations. In previous ClickHouse versions, the functions always returned Float32. +)", 0) \ + M(Bool, allow_get_client_http_header, false, R"( +Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function. 
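+
+A minimal usage sketch (illustrative only; the header name is an assumption, and the query must arrive over the HTTP interface for a value to be present):
+
+```sql
+SET allow_get_client_http_header = 1;
+SELECT getClientHTTPHeader('User-Agent');
+```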
+)", 0) \ + M(Bool, cast_string_to_dynamic_use_inference, false, R"( +Use types inference during String to Dynamic conversion +)", 0) \ + M(Bool, enable_blob_storage_log, true, R"( +Write information about blob storage operations to system.blob_storage_log table +)", 0) \ + M(Bool, use_json_alias_for_old_object_type, false, R"( +When enabled, `JSON` data type alias will be used to create an old [Object('json')](../../sql-reference/data-types/json.md) type instead of the new [JSON](../../sql-reference/data-types/newjson.md) type. +)", 0) \ + M(Bool, allow_create_index_without_type, false, R"( +Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests. +)", 0) \ + M(Bool, create_index_ignore_unique, false, R"( +Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests. +)", 0) \ + M(Bool, print_pretty_type_names, true, R"( +Allows to print deep-nested type names in a pretty way with indents in `DESCRIBE` query and in `toTypeName()` function. + +Example: + +```sql +CREATE TABLE test (a Tuple(b String, c Tuple(d Nullable(UInt64), e Array(UInt32), f Array(Tuple(g String, h Map(String, Array(Tuple(i String, j UInt64))))), k Date), l Nullable(String))) ENGINE=Memory; +DESCRIBE TABLE test FORMAT TSVRaw SETTINGS print_pretty_type_names=1; +``` + +``` +a Tuple( + b String, + c Tuple( + d Nullable(UInt64), + e Array(UInt32), + f Array(Tuple( + g String, + h Map( + String, + Array(Tuple( + i String, + j UInt64 + )) + ) + )), + k Date + ), + l Nullable(String) +) +``` +)", 0) \ + M(Bool, create_table_empty_primary_key_by_default, false, R"( +Allow to create *MergeTree tables with empty primary key when ORDER BY and PRIMARY KEY not specified +)", 0) \ + M(Bool, allow_named_collection_override_by_default, true, R"( +Allow named collections' fields override by default. +)", 0) \ + M(SQLSecurityType, default_normal_view_sql_security, SQLSecurityType::INVOKER, R"( +Allows to set default `SQL SECURITY` option while creating a normal view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). + +The default value is `INVOKER`. +)", 0) \ + M(SQLSecurityType, default_materialized_view_sql_security, SQLSecurityType::DEFINER, R"( +Allows to set a default value for SQL SECURITY option when creating a materialized view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). + +The default value is `DEFINER`. +)", 0) \ + M(String, default_view_definer, "CURRENT_USER", R"( +Allows to set default `DEFINER` option while creating a view. [More about SQL security](../../sql-reference/statements/create/view.md#sql_security). + +The default value is `CURRENT_USER`. +)", 0) \ + M(UInt64, cache_warmer_threads, 4, R"( +Only available in ClickHouse Cloud. Number of background threads for speculatively downloading new data parts into file cache, when cache_populated_by_fetch is enabled. Zero to disable. +)", 0) \ + M(Int64, ignore_cold_parts_seconds, 0, R"( +Only available in ClickHouse Cloud. Exclude new data parts from SELECT queries until they're either pre-warmed (see cache_populated_by_fetch) or this many seconds old. Only for Replicated-/SharedMergeTree. +)", 0) \ + M(Int64, prefer_warmed_unmerged_parts_seconds, 0, R"( +Only available in ClickHouse Cloud. If a merged part is less than this many seconds old and is not pre-warmed (see cache_populated_by_fetch), but all its source parts are available and pre-warmed, SELECT queries will read from those parts instead. Only for ReplicatedMergeTree. 
Note that this only checks whether CacheWarmer processed the part; if the part was fetched into cache by something else, it'll still be considered cold until CacheWarmer gets to it; if it was warmed, then evicted from cache, it'll still be considered warm. +)", 0) \ + M(Bool, iceberg_engine_ignore_schema_evolution, false, R"( +Allow to ignore schema evolution in Iceberg table engine and read all data using schema specified by the user on table creation or latest schema parsed from metadata on table creation. + +:::note +Enabling this setting can lead to incorrect result as in case of evolved schema all data files will be read using the same schema. +::: +)", 0) \ + M(Bool, allow_deprecated_error_prone_window_functions, false, R"( +Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference) +)", 0) \ + M(Bool, allow_deprecated_snowflake_conversion_functions, false, R"( +Functions `snowflakeToDateTime`, `snowflakeToDateTime64`, `dateTimeToSnowflake`, and `dateTime64ToSnowflake` are deprecated and disabled by default. +Please use functions `snowflakeIDToDateTime`, `snowflakeIDToDateTime64`, `dateTimeToSnowflakeID`, and `dateTime64ToSnowflakeID` instead. + +To re-enable the deprecated functions (e.g., during a transition period), please set this setting to `true`. +)", 0) \ + M(Bool, optimize_distinct_in_order, true, R"( +Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement +)", 0) \ + M(Bool, keeper_map_strict_mode, false, R"( +Enforce additional checks during operations on KeeperMap. E.g. throw an exception on an insert for already existing key +)", 0) \ + M(UInt64, extract_key_value_pairs_max_pairs_per_row, 1000, R"( +Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory. +)", 0) ALIAS(extract_kvp_max_pairs_per_row) \ + M(Bool, restore_replace_external_engines_to_null, false, R"( +For testing purposes. Replaces all external engines to Null to not initiate external connections. +)", 0) \ + M(Bool, restore_replace_external_table_functions_to_null, false, R"( +For testing purposes. Replaces all external table functions to Null to not initiate external connections. +)", 0) \ + M(Bool, restore_replace_external_dictionary_source_to_null, false, R"( +Replace external dictionary sources to Null on restore. Useful for testing purposes +)", 0) \ + M(Bool, create_if_not_exists, false, R"( +Enable `IF NOT EXISTS` for `CREATE` statement by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown. +)", 0) \ + M(Bool, enable_secure_identifiers, false, R"( +If enabled, only allow secure identifiers which contain only underscore and alphanumeric characters +)", 0) \ + M(Bool, mongodb_throw_on_unsupported_query, true, R"( +If enabled, MongoDB tables will return an error when a MongoDB query cannot be built. Otherwise, ClickHouse reads the full table and processes it locally. This option does not apply to the legacy implementation or when 'allow_experimental_analyzer=0'. +)", 0) \ \ /* ###################################### */ \ /* ######## EXPERIMENTAL FEATURES ####### */ \ /* ###################################### */ \ - M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. 
Disabled by default, because this feature is experimental", 0) \ - M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \ - M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ - M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ - M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ - M(Bool, allow_experimental_time_series_table, false, "Allows experimental TimeSeries table engine", 0) \ - M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \ - M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ - M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \ - M(Bool, allow_experimental_json_type, false, "Allow JSON data type", 0) \ - M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \ - M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ - M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ - M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ - M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \ - M(UInt64, grace_hash_join_initial_buckets, 1, "Initial number of grace hash join buckets", 0) \ - M(UInt64, grace_hash_join_max_buckets, 1024, "Limit on the number of grace hash join buckets", 0) \ - M(UInt64, join_to_sort_minimum_perkey_rows, 40, "The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys", 0) \ - M(UInt64, join_to_sort_maximum_table_rows, 10000, "The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join.", 0) \ - M(Bool, allow_experimental_join_right_table_sorting, false, "If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join.", 0) \ - M(Timezone, session_timezone, "", "This setting can be removed in the future due to potential caveats. It is experimental and is not suitable for production usage. The default timezone for current session or query. The server default timezone if empty.", 0) \ - M(Bool, use_hive_partitioning, false, "Allows to use hive partitioning for File, URL, S3, AzureBlobStorage and HDFS engines.", 0)\ + M(Bool, allow_experimental_materialized_postgresql_table, false, R"( +Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental +)", 0) \ + M(Bool, allow_experimental_funnel_functions, false, R"( +Enable experimental functions for funnel analysis. 
+)", 0) \ + M(Bool, allow_experimental_nlp_functions, false, R"( +Enable experimental functions for natural language processing. +)", 0) \ + M(Bool, allow_experimental_hash_functions, false, R"( +Enable experimental hash functions +)", 0) \ + M(Bool, allow_experimental_object_type, false, R"( +Allow Object and JSON data types +)", 0) \ + M(Bool, allow_experimental_time_series_table, false, R"( +Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine. + +Possible values: + +- 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled. +- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled. +)", 0) \ + M(Bool, allow_experimental_vector_similarity_index, false, R"( +Allow experimental vector similarity index +)", 0) \ + M(Bool, allow_experimental_variant_type, false, R"( +Allows creation of experimental [Variant](../../sql-reference/data-types/variant.md). +)", 0) \ + M(Bool, allow_experimental_dynamic_type, false, R"( +Allow Dynamic data type +)", 0) \ + M(Bool, allow_experimental_json_type, false, R"( +Allow JSON data type +)", 0) \ + M(Bool, allow_experimental_codecs, false, R"( +If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing). +)", 0) \ + M(Bool, allow_experimental_shared_set_join, true, R"( +Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin +)", 0) \ + M(UInt64, max_limit_for_ann_queries, 1'000'000, R"( +SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes. +)", 0) \ + M(Bool, throw_on_unsupported_query_inside_transaction, true, R"( +Throw exception if unsupported query is used inside transaction +)", 0) \ + M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, R"( +Wait for committed changes to become actually visible in the latest snapshot +)", 0) \ + M(Bool, implicit_transaction, false, R"( +If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback) +)", 0) \ + M(UInt64, grace_hash_join_initial_buckets, 1, R"( +Initial number of grace hash join buckets +)", 0) \ + M(UInt64, grace_hash_join_max_buckets, 1024, R"( +Limit on the number of grace hash join buckets +)", 0) \ + M(UInt64, join_to_sort_minimum_perkey_rows, 40, R"( +The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys +)", 0) \ + M(UInt64, join_to_sort_maximum_table_rows, 10000, R"( +The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join. +)", 0) \ + M(Bool, allow_experimental_join_right_table_sorting, false, R"( +If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join. +)", 0) \ + M(Timezone, session_timezone, "", R"( +Sets the implicit time zone of the current session or query. +The implicit time zone is the time zone applied to values of type DateTime/DateTime64 which have no explicitly specified time zone. +The setting takes precedence over the globally configured (server-level) implicit time zone. 
+A value of '' (empty string) means that the implicit time zone of the current session or query is equal to the [server time zone](../server-configuration-parameters/settings.md#timezone). + +You can use functions `timeZone()` and `serverTimeZone()` to get the session time zone and server time zone. + +Possible values: + +- Any time zone name from `system.time_zones`, e.g. `Europe/Berlin`, `UTC` or `Zulu` + +Examples: + +```sql +SELECT timeZone(), serverTimeZone() FORMAT CSV + +"Europe/Berlin","Europe/Berlin" +``` + +```sql +SELECT timeZone(), serverTimeZone() SETTINGS session_timezone = 'Asia/Novosibirsk' FORMAT CSV + +"Asia/Novosibirsk","Europe/Berlin" +``` + +Assign session time zone 'America/Denver' to the inner DateTime without explicitly specified time zone: + +```sql +SELECT toDateTime64(toDateTime64('1999-12-12 23:23:23.123', 3), 3, 'Europe/Zurich') SETTINGS session_timezone = 'America/Denver' FORMAT TSV + +1999-12-13 07:23:23.123 +``` + +:::warning +Not all functions that parse DateTime/DateTime64 respect `session_timezone`. This can lead to subtle errors. +See the following example and explanation. +::: + +```sql +CREATE TABLE test_tz (`d` DateTime('UTC')) ENGINE = Memory AS SELECT toDateTime('2000-01-01 00:00:00', 'UTC'); + +SELECT *, timeZone() FROM test_tz WHERE d = toDateTime('2000-01-01 00:00:00') SETTINGS session_timezone = 'Asia/Novosibirsk' +0 rows in set. + +SELECT *, timeZone() FROM test_tz WHERE d = '2000-01-01 00:00:00' SETTINGS session_timezone = 'Asia/Novosibirsk' +┌───────────────────d─┬─timeZone()───────┐ +│ 2000-01-01 00:00:00 │ Asia/Novosibirsk │ +└─────────────────────┴──────────────────┘ +``` + +This happens due to different parsing pipelines: + +- `toDateTime()` without explicitly given time zone used in the first `SELECT` query honors setting `session_timezone` and the global time zone. +- In the second query, a DateTime is parsed from a String, and inherits the type and time zone of the existing column`d`. Thus, setting `session_timezone` and the global time zone are not honored. + +**See also** + +- [timezone](../server-configuration-parameters/settings.md#timezone) +)", 0) \ + M(Bool, use_hive_partitioning, false, R"( +When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. 
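+
+A minimal illustrative sketch (the file layout and column name below are assumptions, not taken from the original description):
+
+```sql
+SET use_hive_partitioning = 1;
+-- for files stored under paths like data/city=Berlin/part-0.parquet,
+-- the partition value is exposed as the virtual column `_city`
+SELECT _city, count() FROM file('data/**.parquet') GROUP BY _city;
+```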
+)", 0)\ \ - M(Bool, allow_statistics_optimize, false, "Allows using statistics to optimize queries", 0) ALIAS(allow_statistic_optimize) \ - M(Bool, allow_experimental_statistics, false, "Allows using statistics", 0) ALIAS(allow_experimental_statistic) \ + M(Bool, allow_statistics_optimize, false, R"( +Allows using statistics to optimize queries +)", 0) ALIAS(allow_statistic_optimize) \ + M(Bool, allow_experimental_statistics, false, R"( +Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulate statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics). +)", 0) ALIAS(allow_experimental_statistic) \ \ /* Parallel replicas */ \ - M(UInt64, allow_experimental_parallel_reading_from_replicas, 0, "Use up to `max_parallel_replicas` the number of replicas from each shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure", 0) ALIAS(enable_parallel_replicas) \ - M(NonZeroUInt64, max_parallel_replicas, 1, "The maximum number of replicas of each shard used when the query is executed. For consistency (to get different parts of the same partition), this option only works for the specified sampling key. The lag of the replicas is not controlled. Should be always greater than 0", 0) \ - M(ParallelReplicasMode, parallel_replicas_mode, ParallelReplicasMode::READ_TASKS, "Type of filter to use with custom key for parallel replicas. default - use modulo operation on the custom key, range - use range filter on custom key using all possible values for the value type of custom key.", 0) \ - M(UInt64, parallel_replicas_count, 0, "This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the number of parallel replicas participating in query processing.", 0) \ - M(UInt64, parallel_replica_offset, 0, "This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the index of the replica participating in query processing among parallel replicas.", 0) \ - M(String, parallel_replicas_custom_key, "", "Custom key assigning work to replicas when parallel replicas are used.", 0) \ - M(UInt64, parallel_replicas_custom_key_range_lower, 0, "Lower bound for the universe that the parallel replicas custom range filter is calculated over", 0) \ - M(UInt64, parallel_replicas_custom_key_range_upper, 0, "Upper bound for the universe that the parallel replicas custom range filter is calculated over. A value of 0 disables the upper bound, setting it to the max value of the custom key expression", 0) \ - M(String, cluster_for_parallel_replicas, "", "Cluster for a shard in which current server is located", 0) \ - M(Bool, parallel_replicas_allow_in_with_subquery, true, "If true, subquery for IN will be executed on every follower replica.", 0) \ - M(Float, parallel_replicas_single_task_marks_count_multiplier, 2, "A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. 
This will be applied only for remote replicas.", 0) \ - M(Bool, parallel_replicas_for_non_replicated_merge_tree, false, "If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables", 0) \ - M(UInt64, parallel_replicas_min_number_of_rows_per_replica, 0, "Limit the number of replicas used in a query to (estimated rows to read / min_number_of_rows_per_replica). The max is still limited by 'max_parallel_replicas'", 0) \ - M(Bool, parallel_replicas_prefer_local_join, true, "If true, and JOIN can be executed with parallel replicas algorithm, and all storages of right JOIN part are *MergeTree, local JOIN will be used instead of GLOBAL JOIN.", 0) \ - M(UInt64, parallel_replicas_mark_segment_size, 0, "Parts virtually divided into segments to be distributed between replicas for parallel reading. This setting controls the size of these segments. Not recommended to change until you're absolutely sure in what you're doing. Value should be in range [128; 16384]", 0) \ - M(Bool, allow_archive_path_syntax, true, "File/S3 engines/table function will parse paths with '::' as ' :: ' if archive has correct extension", 0) \ - M(Bool, parallel_replicas_local_plan, false, "Build local plan for local replica", 0) \ + M(UInt64, allow_experimental_parallel_reading_from_replicas, 0, R"( +Use up to `max_parallel_replicas` replicas from each shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure +)", 0) ALIAS(enable_parallel_replicas) \ + M(NonZeroUInt64, max_parallel_replicas, 1, R"( +The maximum number of replicas for each shard when executing a query. + +Possible values: + +- Positive integer. + +**Additional Info** + +This option will produce different results depending on the settings used. + +:::note +This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md/#max_parallel_replica-subqueries) for more details. +::: + +### Parallel processing using `SAMPLE` key + +A query may be processed faster if it is executed on several servers in parallel. But the query performance may degrade in the following cases: + +- The position of the sampling key in the partitioning key does not allow efficient range scans. +- Adding a sampling key to the table makes filtering by other columns less efficient. +- The sampling key is an expression that is expensive to calculate. +- The cluster latency distribution has a long tail, so that querying more servers increases the query's overall latency. + +### Parallel processing using [parallel_replicas_custom_key](#parallel_replicas_custom_key) + +This setting is useful for any replicated table. +)", 0) \ + M(ParallelReplicasMode, parallel_replicas_mode, ParallelReplicasMode::READ_TASKS, R"( +Type of filter to use with custom key for parallel replicas. default - use modulo operation on the custom key, range - use range filter on custom key using all possible values for the value type of custom key. +)", 0) \ + M(UInt64, parallel_replicas_count, 0, R"( +This is an internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode.
This setting will be automatically set up by the initiator server for distributed queries to the number of parallel replicas participating in query processing. +)", 0) \ + M(UInt64, parallel_replica_offset, 0, R"( +This is an internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the index of the replica participating in query processing among parallel replicas. +)", 0) \ + M(String, parallel_replicas_custom_key, "", R"( +An arbitrary integer expression that can be used to split work between replicas for a specific table. +The value can be any integer expression. + +Simple expressions using primary keys are preferred. + +If the setting is used on a cluster that consists of a single shard with multiple replicas, those replicas will be converted into virtual shards. +Otherwise, it will behave the same as for the `SAMPLE` key; it will use multiple replicas of each shard. +)", 0) \ + M(UInt64, parallel_replicas_custom_key_range_lower, 0, R"( +Allows the filter type `range` to split the work evenly between replicas based on the custom range `[parallel_replicas_custom_key_range_lower, INT_MAX]`. + +When used in conjunction with [parallel_replicas_custom_key_range_upper](#parallel_replicas_custom_key_range_upper), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`. + +Note: This setting will not cause any additional data to be filtered during query processing; rather, it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing. +)", 0) \ + M(UInt64, parallel_replicas_custom_key_range_upper, 0, R"( +Allows the filter type `range` to split the work evenly between replicas based on the custom range `[0, parallel_replicas_custom_key_range_upper]`. A value of 0 disables the upper bound, setting it to the max value of the custom key expression. + +When used in conjunction with [parallel_replicas_custom_key_range_lower](#parallel_replicas_custom_key_range_lower), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`. + +Note: This setting will not cause any additional data to be filtered during query processing; rather, it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing. +)", 0) \ + M(String, cluster_for_parallel_replicas, "", R"( +Cluster for a shard in which the current server is located +)", 0) \ + M(Bool, parallel_replicas_allow_in_with_subquery, true, R"( +If true, subquery for IN will be executed on every follower replica. +)", 0) \ + M(Float, parallel_replicas_single_task_marks_count_multiplier, 2, R"( +A multiplier applied during calculation of the minimal number of marks to retrieve from the coordinator. This will be applied only for remote replicas. +)", 0) \ + M(Bool, parallel_replicas_for_non_replicated_merge_tree, false, R"( +If true, ClickHouse will also use the parallel replicas algorithm for non-replicated MergeTree tables +)", 0) \ + M(UInt64, parallel_replicas_min_number_of_rows_per_replica, 0, R"( +Limit the number of replicas used in a query to (estimated rows to read / min_number_of_rows_per_replica).
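As a rough, illustrative sketch of how the parallel-replicas settings above combine (the `events` table, the `default` cluster and the key expression are hypothetical):

```sql
-- Single-shard cluster with several replicas; work is split between them
-- using an integer expression over `user_id` as the custom key.
SELECT count()
FROM events
SETTINGS
    enable_parallel_replicas = 1,
    max_parallel_replicas = 3,
    cluster_for_parallel_replicas = 'default',
    parallel_replicas_custom_key = 'cityHash64(user_id)';
```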
The max is still limited by 'max_parallel_replicas' +)", 0) \ + M(Bool, parallel_replicas_prefer_local_join, true, R"( +If true, and JOIN can be executed with parallel replicas algorithm, and all storages of right JOIN part are *MergeTree, local JOIN will be used instead of GLOBAL JOIN. +)", 0) \ + M(UInt64, parallel_replicas_mark_segment_size, 0, R"( +Parts are virtually divided into segments to be distributed between replicas for parallel reading. This setting controls the size of these segments. Not recommended to change until you're absolutely sure of what you're doing. Value should be in range [128; 16384] +)", 0) \ + M(Bool, allow_archive_path_syntax, true, R"( +File/S3 engines/table function will parse paths with '::' as '<archive> :: <file>' if the archive has a correct extension +)", 0) \ + M(Bool, parallel_replicas_local_plan, false, R"( +Build local plan for local replica +)", 0) \ \ - M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \ - M(Bool, allow_experimental_full_text_index, false, "If it is set to true, allow to use experimental full-text index.", 0) \ + M(Bool, allow_experimental_inverted_index, false, R"( +If it is set to true, allow to use experimental inverted index. +)", 0) \ + M(Bool, allow_experimental_full_text_index, false, R"( +If it is set to true, allow to use experimental full-text index. +)", 0) \ \ - M(Bool, allow_experimental_join_condition, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y.", 0) \ + M(Bool, allow_experimental_join_condition, false, R"( +Support join with inequal conditions which involve columns from both the left and right table, e.g. t1.y < t2.y. +)", 0) \ \ - M(Bool, allow_experimental_analyzer, true, "Allow new query analyzer.", IMPORTANT) ALIAS(enable_analyzer) \ - M(Bool, analyzer_compatibility_join_using_top_level_identifier, false, "Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather then `t1.b = t2.b`).", 0) \ + M(Bool, allow_experimental_analyzer, true, R"( +Allow new query analyzer. +)", IMPORTANT) ALIAS(enable_analyzer) \ + M(Bool, analyzer_compatibility_join_using_top_level_identifier, false, R"( +Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather than `t1.b = t2.b`). +)", 0) \ \ - M(Bool, allow_experimental_live_view, false, "Enable LIVE VIEW. Not mature enough.", 0) \ - M(Seconds, live_view_heartbeat_interval, 15, "The heartbeat interval in seconds to indicate live query is alive.", 0) \ - M(UInt64, max_live_view_insert_blocks_before_refresh, 64, "Limit maximum number of inserted blocks after which mergeable blocks are dropped and query is re-executed.", 0) \ + M(Bool, allow_experimental_live_view, false, R"( +Allows creation of a deprecated LIVE VIEW. + +Possible values: + +- 0 — Working with live views is disabled. +- 1 — Working with live views is enabled. +)", 0) \ + M(Seconds, live_view_heartbeat_interval, 15, R"( +The heartbeat interval in seconds to indicate live query is alive. +)", 0) \ + M(UInt64, max_live_view_insert_blocks_before_refresh, 64, R"( +Limit maximum number of inserted blocks after which mergeable blocks are dropped and query is re-executed. +)", 0) \ \ - M(Bool, allow_experimental_window_view, false, "Enable WINDOW VIEW. 
Not mature enough.", 0) \ - M(Seconds, window_view_clean_interval, 60, "The clean interval of window view in seconds to free outdated data.", 0) \ - M(Seconds, window_view_heartbeat_interval, 15, "The heartbeat interval in seconds to indicate watch query is alive.", 0) \ - M(Seconds, wait_for_window_view_fire_signal_timeout, 10, "Timeout for waiting for window view fire signal in event time processing", 0) \ + M(Bool, allow_experimental_window_view, false, R"( +Enable WINDOW VIEW. Not mature enough. +)", 0) \ + M(Seconds, window_view_clean_interval, 60, R"( +The clean interval of window view in seconds to free outdated data. +)", 0) \ + M(Seconds, window_view_heartbeat_interval, 15, R"( +The heartbeat interval in seconds to indicate watch query is alive. +)", 0) \ + M(Seconds, wait_for_window_view_fire_signal_timeout, 10, R"( +Timeout for waiting for window view fire signal in event time processing +)", 0) \ \ - M(Bool, allow_experimental_refreshable_materialized_view, false, "Allow refreshable materialized views (CREATE MATERIALIZED VIEW REFRESH ...).", 0) \ - M(Bool, stop_refreshable_materialized_views_on_startup, false, "On server startup, prevent scheduling of refreshable materialized views, as if with SYSTEM STOP VIEWS. You can manually start them with SYSTEM START VIEWS or SYSTEM START VIEW afterwards. Also applies to newly created views. Has no effect on non-refreshable materialized views.", 0) \ + M(Bool, allow_experimental_refreshable_materialized_view, false, R"( +Allow refreshable materialized views (CREATE MATERIALIZED VIEW <name> REFRESH ...). +)", 0) \ + M(Bool, stop_refreshable_materialized_views_on_startup, false, R"( +On server startup, prevent scheduling of refreshable materialized views, as if with SYSTEM STOP VIEWS. You can manually start them with SYSTEM START VIEWS or SYSTEM START VIEW <name> afterwards. Also applies to newly created views. Has no effect on non-refreshable materialized views. +)", 0) \ \ - M(Bool, allow_experimental_database_materialized_mysql, false, "Allow to create database with Engine=MaterializedMySQL(...).", 0) \ - M(Bool, allow_experimental_database_materialized_postgresql, false, "Allow to create database with Engine=MaterializedPostgreSQL(...).", 0) \ + M(Bool, allow_experimental_database_materialized_mysql, false, R"( +Allow to create database with Engine=MaterializedMySQL(...). +)", 0) \ + M(Bool, allow_experimental_database_materialized_postgresql, false, R"( +Allow to create database with Engine=MaterializedPostgreSQL(...). +)", 0) \ \ /** Experimental feature for moving data between shards. */ \ - M(Bool, allow_experimental_query_deduplication, false, "Experimental data deduplication for SELECT queries based on part UUIDs", 0) \ + M(Bool, allow_experimental_query_deduplication, false, R"( +Experimental data deduplication for SELECT queries based on part UUIDs +)", 0) \ /** End of experimental features */ @@ -1404,7 +6214,15 @@ void Settings::dumpToSystemSettingsColumns(MutableColumnsAndConstraints & params { res_columns[1]->insert(setting.getValueString()); res_columns[2]->insert(setting.isValueChanged()); - res_columns[3]->insert(setting.getDescription()); + + /// Trim starting/ending newline. 
+ std::string_view doc = setting.getDescription(); + if (!doc.empty() && doc[0] == '\n') + doc = doc.substr(1); + if (!doc.empty() && doc[doc.length() - 1] == '\n') + doc = doc.substr(0, doc.length() - 1); + + res_columns[3]->insert(doc); Field min, max; SettingConstraintWritability writability = SettingConstraintWritability::WRITABLE; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 5909ab6314c..ecfd4240a59 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -54,6 +54,8 @@ class WriteBuffer; M(CLASS_NAME, DefaultDatabaseEngine) \ M(CLASS_NAME, DefaultTableEngine) \ M(CLASS_NAME, Dialect) \ + M(CLASS_NAME, DistributedCacheLogMode) /* Cloud only */ \ + M(CLASS_NAME, DistributedCachePoolBehaviourOnLimit) /* Cloud only */ \ M(CLASS_NAME, DistributedDDLOutputMode) \ M(CLASS_NAME, DistributedProductMode) \ M(CLASS_NAME, Double) \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 6c0274992ee..c3cb96881f5 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -1,9 +1,10 @@ -#include #include +#include #include #include #include + namespace DB { @@ -79,6 +80,22 @@ static std::initializer_list & getSettingsChangesHistory() +static std::initializer_list> merge_tree_settings_changes_history_initializer = { - static std::map settings_changes_history; + {"24.12", + { + } + }, + {"24.11", + { + } + }, + {"24.10", + { + } + }, + {"24.9", + { + } + }, + {"24.8", + { + {"deduplicate_merge_projection_mode", "ignore", "throw", "Do not allow to create inconsistent projection"} + } + }, +}; - static std::once_flag initialized_flag; - std::call_once(initialized_flag, []() +static void initSettingsChangesHistory( + std::map & settings_changes_history, + std::once_flag & initialized_flag, + std::initializer_list> & initializer +) +{ + std::call_once(initialized_flag, [&]() { - for (const auto & setting_change : settings_changes_history_initializer) + for (const auto & setting_change : initializer) { /// Disallow duplicate keys in the settings changes history. 
Example: /// {"21.2", {{"some_setting_1", false, true, "[...]"}}}, @@ -564,7 +606,24 @@ const std::map & get settings_changes_history[setting_change.first] = setting_change.second; } }); +} + +const std::map & getSettingsChangesHistory() +{ + static std::map settings_changes_history; + static std::once_flag initialized_flag; + initSettingsChangesHistory(settings_changes_history, initialized_flag, settings_changes_history_initializer); return settings_changes_history; } + +const std::map & getMergeTreeSettingsChangesHistory() +{ + static std::map merge_tree_settings_changes_history; + static std::once_flag initialized_flag; + initSettingsChangesHistory(merge_tree_settings_changes_history, initialized_flag, merge_tree_settings_changes_history_initializer); + + return merge_tree_settings_changes_history; +} + } diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index b1a69c3b6d6..0fd03cd5804 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -39,5 +39,6 @@ namespace SettingsChangesHistory } const std::map & getSettingsChangesHistory(); +const std::map & getMergeTreeSettingsChangesHistory(); } diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index e4887a2ec0a..99f9162867b 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -68,6 +68,14 @@ IMPLEMENT_SETTING_ENUM(OverflowMode, ErrorCodes::UNKNOWN_OVERFLOW_MODE, {{"throw", OverflowMode::THROW}, {"break", OverflowMode::BREAK}}) +IMPLEMENT_SETTING_ENUM(DistributedCacheLogMode, ErrorCodes::BAD_ARGUMENTS, + {{"nothing", DistributedCacheLogMode::LOG_NOTHING}, + {"on_error", DistributedCacheLogMode::LOG_ON_ERROR}, + {"all", DistributedCacheLogMode::LOG_ALL}}) + +IMPLEMENT_SETTING_ENUM(DistributedCachePoolBehaviourOnLimit, ErrorCodes::BAD_ARGUMENTS, + {{"wait", DistributedCachePoolBehaviourOnLimit::WAIT}, + {"allocate_bypassing_pool", DistributedCachePoolBehaviourOnLimit::ALLOCATE_NEW_BYPASSING_POOL}}); IMPLEMENT_SETTING_ENUM(OverflowModeGroupBy, ErrorCodes::UNKNOWN_OVERFLOW_MODE, {{"throw", OverflowMode::THROW}, @@ -178,7 +186,8 @@ IMPLEMENT_SETTING_ENUM(LightweightMutationProjectionMode, ErrorCodes::BAD_ARGUME {"rebuild", LightweightMutationProjectionMode::REBUILD}}) IMPLEMENT_SETTING_ENUM(DeduplicateMergeProjectionMode, ErrorCodes::BAD_ARGUMENTS, - {{"throw", DeduplicateMergeProjectionMode::THROW}, + {{"ignore", DeduplicateMergeProjectionMode::IGNORE}, + {"throw", DeduplicateMergeProjectionMode::THROW}, {"drop", DeduplicateMergeProjectionMode::DROP}, {"rebuild", DeduplicateMergeProjectionMode::REBUILD}}) diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 08778ae5a49..0ed0e2aef0e 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -218,6 +218,9 @@ enum class DefaultTableEngine : uint8_t DECLARE_SETTING_ENUM(DefaultTableEngine) +DECLARE_SETTING_ENUM(DistributedCacheLogMode) + +DECLARE_SETTING_ENUM(DistributedCachePoolBehaviourOnLimit) enum class CleanDeletedRows : uint8_t { @@ -314,6 +317,7 @@ DECLARE_SETTING_ENUM(LightweightMutationProjectionMode) enum class DeduplicateMergeProjectionMode : uint8_t { + IGNORE, THROW, DROP, REBUILD, diff --git a/src/DataTypes/Serializations/SerializationObjectDeprecated.cpp b/src/DataTypes/Serializations/SerializationObjectDeprecated.cpp index 4e9ebf6c03d..87dec688528 100644 --- a/src/DataTypes/Serializations/SerializationObjectDeprecated.cpp +++ 
b/src/DataTypes/Serializations/SerializationObjectDeprecated.cpp @@ -294,7 +294,7 @@ void SerializationObjectDeprecated::serializeBinaryBulkWithMultipleStrea } settings.path.push_back(Substream::DeprecatedObjectData); - if (auto * stream = settings.getter(settings.path)) + if (auto * /*stream*/ _ = settings.getter(settings.path)) { state_object->nested_serialization->serializeBinaryBulkWithMultipleStreams( *tuple_column, offset, limit, settings, state_object->nested_state); diff --git a/src/DataTypes/convertMySQLDataType.h b/src/DataTypes/convertMySQLDataType.h index 4b1cbebc01b..3646fc2576f 100644 --- a/src/DataTypes/convertMySQLDataType.h +++ b/src/DataTypes/convertMySQLDataType.h @@ -2,8 +2,7 @@ #include #include -#include -#include "IDataType.h" +#include namespace DB { diff --git a/src/Databases/DatabaseFilesystem.h b/src/Databases/DatabaseFilesystem.h index 4b9db5e574d..ead56618f49 100644 --- a/src/Databases/DatabaseFilesystem.h +++ b/src/Databases/DatabaseFilesystem.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/src/Databases/DatabaseHDFS.h b/src/Databases/DatabaseHDFS.h index d19918000cf..13c589fd410 100644 --- a/src/Databases/DatabaseHDFS.h +++ b/src/Databases/DatabaseHDFS.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 5b2f6aaeb9c..c98f2709116 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -51,6 +51,11 @@ namespace Setting extern const SettingsSetOperationMode union_default_mode; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsString storage_policy; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -154,7 +159,7 @@ void DatabaseOrdinary::convertMergeTreeToReplicatedIfNeeded(ASTPtr ast, const Qu /// Get table's storage policy MergeTreeSettings default_settings = getContext()->getMergeTreeSettings(); - auto policy = getContext()->getStoragePolicy(default_settings.storage_policy); + auto policy = getContext()->getStoragePolicy(default_settings[MergeTreeSetting::storage_policy]); if (auto * query_settings = create_query->storage->settings) if (Field * policy_setting = query_settings->changes.tryGet("storage_policy")) policy = getContext()->getStoragePolicy(policy_setting->safeGet()); diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 1f2d21d7f6a..d5b242180fc 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -1612,7 +1612,7 @@ void DatabaseReplicated::dropTable(ContextPtr local_context, const String & tabl waitDatabaseStarted(); auto txn = local_context->getZooKeeperMetadataTransaction(); - assert(!ddl_worker || !ddl_worker->isCurrentlyActive() || txn || startsWith(table_name, ".inner_id.")); + assert(!ddl_worker || !ddl_worker->isCurrentlyActive() || txn || startsWith(table_name, ".inner_id.") || startsWith(table_name, ".tmp.inner_id.")); if (txn && txn->isInitialQuery() && !txn->isCreateOrReplaceQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index f5a9ccb187b..4b1d4d6d707 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -1,5 +1,7 @@ #include + #include +#include #include #include #include @@ -21,6 +23,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; 
extern const int DATABASE_REPLICATION_FAILED; extern const int NOT_A_LEADER; + extern const int QUERY_WAS_CANCELLED; + extern const int TABLE_IS_DROPPED; extern const int UNFINISHED; } @@ -229,7 +233,7 @@ bool DatabaseReplicatedDDLWorker::waitForReplicaToProcessAllEntries(UInt64 timeo String DatabaseReplicatedDDLWorker::enqueueQueryImpl(const ZooKeeperPtr & zookeeper, DDLLogEntry & entry, - DatabaseReplicated * const database, bool committed) + DatabaseReplicated * const database, bool committed, Coordination::Requests additional_checks) { const String query_path_prefix = database->zookeeper_path + "/log/query-"; @@ -244,15 +248,16 @@ String DatabaseReplicatedDDLWorker::enqueueQueryImpl(const ZooKeeperPtr & zookee Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(counter_lock_path, database->getFullReplicaName(), zkutil::CreateMode::Ephemeral)); ops.emplace_back(zkutil::makeCreateRequest(counter_prefix, "", zkutil::CreateMode::EphemeralSequential)); + ops.insert(ops.end(), additional_checks.begin(), additional_checks.end()); Coordination::Responses res; Coordination::Error code = zookeeper->tryMulti(ops, res); if (code == Coordination::Error::ZOK) { - counter_path = dynamic_cast(*res.back()).path_created; + counter_path = dynamic_cast(*res[1]).path_created; break; } - else if (code != Coordination::Error::ZNODEEXISTS) + else if (res[0]->error != Coordination::Error::ZNODEEXISTS) zkutil::KeeperMultiException::check(code, ops, res); sleepForMilliseconds(50); @@ -305,7 +310,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot enqueue query on this replica, " "because it has replication lag of {} queries. Try other replica.", max_log_ptr - our_log_ptr); - String entry_path = enqueueQuery(entry); + String entry_path = enqueueQueryImpl(zookeeper, entry, database, false, query_context->getDDLAdditionalChecksOnEnqueue()); auto try_node = zkutil::EphemeralNodeHolder::existing(entry_path + "/try", *zookeeper); String entry_name = entry_path.substr(entry_path.rfind('/') + 1); auto task = std::make_unique(entry_name, entry_path, database); @@ -317,12 +322,21 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef()[Setting::database_replicated_initial_query_timeout_sec]; + StopToken cancellation = query_context->getDDLQueryCancellation(); + StopCallback cancellation_callback(cancellation, [&] { wait_current_task_change.notify_all(); }); { std::unique_lock lock{mutex}; bool processed = wait_current_task_change.wait_for(lock, std::chrono::seconds(timeout), [&]() { assert(zookeeper->expired() || current_task <= entry_name); - return zookeeper->expired() || current_task == entry_name || stop_flag; + + if (zookeeper->expired() || stop_flag) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired or replication stopped, try again"); + + if (cancellation.stop_requested()) + throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "DDL query was cancelled"); + + return current_task == entry_name; }); if (!processed) @@ -330,8 +344,8 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr "most likely because replica is busy with previous queue entries"); } - if (zookeeper->expired() || stop_flag) - throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired or replication 
stopped, try again"); + if (entry.parent_table_uuid.has_value() && !checkParentTableExists(entry.parent_table_uuid.value())) + throw Exception(ErrorCodes::TABLE_IS_DROPPED, "Parent table doesn't exist"); processTask(*task, zookeeper); @@ -350,8 +364,9 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr return entry_path; } -DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) +DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper, bool dry_run) { + if (!dry_run) { std::lock_guard lock{mutex}; if (current_task < entry_name) @@ -377,7 +392,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na zkutil::EventPtr wait_committed_or_failed = std::make_shared(); String try_node_path = fs::path(entry_path) / "try"; - if (zookeeper->tryGet(try_node_path, initiator_name, nullptr, wait_committed_or_failed)) + if (!dry_run && zookeeper->tryGet(try_node_path, initiator_name, nullptr, wait_committed_or_failed)) { task->is_initial_query = initiator_name == task->host_id_str; @@ -458,6 +473,12 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na return {}; } + if (task->entry.parent_table_uuid.has_value() && !checkParentTableExists(task->entry.parent_table_uuid.value())) + { + out_reason = fmt::format("Parent table {} doesn't exist", task->entry.parent_table_uuid.value()); + return {}; + } + return task; } @@ -468,6 +489,12 @@ bool DatabaseReplicatedDDLWorker::canRemoveQueueEntry(const String & entry_name, return entry_number + logs_to_keep < max_log_ptr; } +bool DatabaseReplicatedDDLWorker::checkParentTableExists(const UUID & uuid) const +{ + auto [db, table] = DatabaseCatalog::instance().tryGetByUUID(uuid); + return db.get() == database && table != nullptr && !table->is_dropped.load() && !table->is_detached.load(); +} + void DatabaseReplicatedDDLWorker::initializeLogPointer(const String & processed_entry_name) { updateMaxDDLEntryID(processed_entry_name); diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 2309c831839..e741037e702 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -33,7 +33,7 @@ public: bool waitForReplicaToProcessAllEntries(UInt64 timeout_ms); static String enqueueQueryImpl(const ZooKeeperPtr & zookeeper, DDLLogEntry & entry, - DatabaseReplicated * const database, bool committed = false); /// NOLINT + DatabaseReplicated * const database, bool committed = false, Coordination::Requests additional_checks = {}); /// NOLINT UInt32 getLogPointer() const; @@ -43,9 +43,11 @@ private: void initializeReplication(); void initializeLogPointer(const String & processed_entry_name); - DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; + DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper, bool dry_run) override; bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat) override; + bool checkParentTableExists(const UUID & uuid) const; + DatabaseReplicated * const database; mutable std::mutex mutex; std::condition_variable wait_current_task_change; diff --git a/src/Databases/DatabaseS3.h b/src/Databases/DatabaseS3.h index 7e38da0fe63..874e20d196d 100644 --- a/src/Databases/DatabaseS3.h +++ 
b/src/Databases/DatabaseS3.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include diff --git a/src/Databases/DatabasesCommon.h b/src/Databases/DatabasesCommon.h index 1ca49e90c23..c27479571ef 100644 --- a/src/Databases/DatabasesCommon.h +++ b/src/Databases/DatabasesCommon.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include diff --git a/src/Databases/enableAllExperimentalSettings.cpp b/src/Databases/enableAllExperimentalSettings.cpp index 01e989dc10b..f07a661a7bf 100644 --- a/src/Databases/enableAllExperimentalSettings.cpp +++ b/src/Databases/enableAllExperimentalSettings.cpp @@ -46,6 +46,7 @@ void enableAllExperimentalSettings(ContextMutablePtr context) context->setSetting("enable_zstd_qat_codec", 1); context->setSetting("allow_create_index_without_type", 1); context->setSetting("allow_experimental_s3queue", 1); + context->setSetting("allow_experimental_shared_set_join", 1); } } diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 781822533e9..8c9f59cc112 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -585,7 +585,7 @@ private: template void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) const { - return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); + return const_cast *>(this)->getAttributeContainer(attribute_index, std::forward(func)); } template diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index c96f5f0c931..7055a7018ce 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -11,20 +11,6 @@ using namespace DB; - -namespace -{ - bool withFileCache(const ReadSettings & settings) - { - return settings.remote_fs_cache && settings.enable_filesystem_cache; - } - - bool withPageCache(const ReadSettings & settings, bool with_file_cache) - { - return settings.page_cache && !with_file_cache && settings.use_page_cache_for_disks_without_file_cache; - } -} - namespace DB { namespace ErrorCodes @@ -35,7 +21,7 @@ namespace ErrorCodes size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_t file_size) { /// Only when cache is used we could download bigger portions of FileSegments than what we actually gonna read within particular task. - if (!withFileCache(settings)) + if (!settings.enable_filesystem_cache) return settings.remote_fs_buffer_size; /// Buffers used for prefetch and pre-download better to have enough size, but not bigger than the whole file. @@ -45,7 +31,6 @@ size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather( ReadBufferCreator && read_buffer_creator_, const StoredObjects & blobs_to_read_, - const std::string & cache_path_prefix_, const ReadSettings & settings_, std::shared_ptr cache_log_, bool use_external_buffer_) @@ -54,12 +39,10 @@ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather( , settings(settings_) , blobs_to_read(blobs_to_read_) , read_buffer_creator(std::move(read_buffer_creator_)) - , cache_path_prefix(cache_path_prefix_) , cache_log(settings.enable_filesystem_cache_log ? 
cache_log_ : nullptr) , query_id(CurrentThread::getQueryId()) , use_external_buffer(use_external_buffer_) - , with_file_cache(withFileCache(settings)) - , with_page_cache(withPageCache(settings, with_file_cache)) + , with_file_cache(settings.enable_filesystem_cache) , log(getLogger("ReadBufferFromRemoteFSGather")) { if (!blobs_to_read.empty()) @@ -74,47 +57,7 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c } current_object = object; - const auto & object_path = object.remote_path; - - std::unique_ptr buf; - - if (with_file_cache) - { - if (settings.remote_fs_cache->isInitialized()) - { - auto cache_key = settings.remote_fs_cache->createKeyForPath(object_path); - buf = std::make_unique( - object_path, - cache_key, - settings.remote_fs_cache, - FileCache::getCommonUser(), - [=, this]() { return read_buffer_creator(/* restricted_seek */true, object); }, - settings, - query_id, - object.bytes_size, - /* allow_seeks */false, - /* use_external_buffer */true, - /* read_until_position */std::nullopt, - cache_log); - } - else - { - settings.remote_fs_cache->throwInitExceptionIfNeeded(); - } - } - - /// Can't wrap CachedOnDiskReadBufferFromFile in CachedInMemoryReadBufferFromFile because the - /// former doesn't support seeks. - if (with_page_cache && !buf) - { - auto inner = read_buffer_creator(/* restricted_seek */false, object); - auto cache_key = FileChunkAddress { .path = cache_path_prefix + object_path }; - buf = std::make_unique( - cache_key, settings.page_cache, std::move(inner), settings); - } - - if (!buf) - buf = read_buffer_creator(/* restricted_seek */true, object); + auto buf = read_buffer_creator(/* restricted_seek */true, object); if (read_until_position > start_offset && read_until_position < start_offset + object.bytes_size) buf->setReadUntilPosition(read_until_position - start_offset); diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h index 9f1cb681f1a..27f94a3e552 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h @@ -26,7 +26,6 @@ public: ReadBufferFromRemoteFSGather( ReadBufferCreator && read_buffer_creator_, const StoredObjects & blobs_to_read_, - const std::string & cache_path_prefix_, const ReadSettings & settings_, std::shared_ptr cache_log_, bool use_external_buffer_); @@ -71,12 +70,10 @@ private: const ReadSettings settings; const StoredObjects blobs_to_read; const ReadBufferCreator read_buffer_creator; - const std::string cache_path_prefix; const std::shared_ptr cache_log; const String query_id; const bool use_external_buffer; const bool with_file_cache; - const bool with_page_cache; size_t read_until_position = 0; size_t file_offset_of_buffer_end = 0; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index fa48825e1a6..673c82806bd 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -210,63 +210,14 @@ std::unique_ptr AzureObjectStorage::readObject( /// NOLI auto settings_ptr = settings.get(); return std::make_unique( - client.get(), object.remote_path, patchSettings(read_settings), settings_ptr->max_single_read_retries, - settings_ptr->max_single_download_retries); -} - -std::unique_ptr AzureObjectStorage::readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional, - std::optional) 
const -{ - ReadSettings disk_read_settings = patchSettings(read_settings); - auto settings_ptr = settings.get(); - auto global_context = Context::getGlobalContextInstance(); - - auto read_buffer_creator = - [this, settings_ptr, disk_read_settings] - (bool restricted_seek, const StoredObject & object_) -> std::unique_ptr - { - return std::make_unique( - client.get(), - object_.remote_path, - disk_read_settings, - settings_ptr->max_single_read_retries, - settings_ptr->max_single_download_retries, - /* use_external_buffer */true, - restricted_seek); - }; - - switch (read_settings.remote_fs_method) - { - case RemoteFSReadMethod::read: - { - return std::make_unique( - std::move(read_buffer_creator), - objects, - "azure:", - disk_read_settings, - global_context->getFilesystemCacheLog(), - /* use_external_buffer */false); - } - case RemoteFSReadMethod::threadpool: - { - auto impl = std::make_unique( - std::move(read_buffer_creator), - objects, - "azure:", - disk_read_settings, - global_context->getFilesystemCacheLog(), - /* use_external_buffer */true); - - auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - return std::make_unique( - std::move(impl), reader, disk_read_settings, - global_context->getAsyncReadCounters(), - global_context->getFilesystemReadPrefetchesLog()); - } - } + client.get(), + object.remote_path, + patchSettings(read_settings), + settings_ptr->max_single_read_retries, + settings_ptr->max_single_download_retries, + read_settings.remote_read_buffer_use_external_buffer, + read_settings.remote_read_buffer_restrict_seek, + /* read_until_position */0); } /// Open the file for write and return WriteBufferFromFileBase object. diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 15a0bfb9ac1..58225eccd90 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -51,12 +51,6 @@ public: std::optional read_hint = {}, std::optional file_size = {}) const override; - std::unique_ptr readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint = {}, - std::optional file_size = {}) const override; - /// Open the file for write and return WriteBufferFromFileBase object. 
std::unique_ptr writeObject( /// NOLINT const StoredObject & object, diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp index ab0d357119c..a59ee615454 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp @@ -48,9 +48,7 @@ CachedObjectStorage::generateObjectKeyPrefixForDirectoryPath(const std::string & ReadSettings CachedObjectStorage::patchSettings(const ReadSettings & read_settings) const { - ReadSettings modified_settings{read_settings}; - modified_settings.remote_fs_cache = cache; - return object_storage->patchSettings(modified_settings); + return object_storage->patchSettings(read_settings); } void CachedObjectStorage::startup() @@ -63,21 +61,45 @@ bool CachedObjectStorage::exists(const StoredObject & object) const return object_storage->exists(object); } -std::unique_ptr CachedObjectStorage::readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint, - std::optional file_size) const -{ - return object_storage->readObjects(objects, patchSettings(read_settings), read_hint, file_size); -} - std::unique_ptr CachedObjectStorage::readObject( /// NOLINT const StoredObject & object, const ReadSettings & read_settings, std::optional read_hint, std::optional file_size) const { + if (read_settings.enable_filesystem_cache) + { + if (cache->isInitialized()) + { + auto cache_key = cache->createKeyForPath(object.remote_path); + auto global_context = Context::getGlobalContextInstance(); + auto modified_read_settings = read_settings.withNestedBuffer(); + + auto read_buffer_creator = [=, this]() + { + return object_storage->readObject(object, patchSettings(read_settings), read_hint, file_size); + }; + + return std::make_unique( + object.remote_path, + cache_key, + cache, + FileCache::getCommonUser(), + read_buffer_creator, + modified_read_settings, + std::string(CurrentThread::getQueryId()), + object.bytes_size, + /* allow_seeks */!read_settings.remote_read_buffer_restrict_seek, + /* use_external_buffer */read_settings.remote_read_buffer_use_external_buffer, + /* read_until_position */std::nullopt, + global_context->getFilesystemCacheLog()); + } + else + { + cache->throwInitExceptionIfNeeded(); + } + } + return object_storage->readObject(object, patchSettings(read_settings), read_hint, file_size); } diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 03b013c2eed..b77baf21e40 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -37,12 +37,6 @@ public: std::optional read_hint = {}, std::optional file_size = {}) const override; - std::unique_ptr readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint = {}, - std::optional file_size = {}) const override; - /// Open the file for write and return WriteBufferFromFileBase object. 
std::unique_ptr writeObject( /// NOLINT const StoredObject & object, diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 07e2edac129..474851df7d5 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -496,16 +499,60 @@ std::unique_ptr DiskObjectStorage::readFile( std::optional file_size) const { const auto storage_objects = metadata_storage->getStorageObjects(path); + auto global_context = Context::getGlobalContextInstance(); const bool file_can_be_empty = !file_size.has_value() || *file_size == 0; if (storage_objects.empty() && file_can_be_empty) return std::make_unique(); - return object_storage->readObjects( + auto read_settings = updateIOSchedulingSettings(settings, getReadResourceName(), getWriteResourceName()); + /// We wrap read buffer from object storage (read_buf = object_storage->readObject()) + /// inside ReadBufferFromRemoteFSGather, so add nested buffer setting. + read_settings = read_settings.withNestedBuffer(); + + auto read_buffer_creator = + [this, read_settings, read_hint, file_size] + (bool restricted_seek, const StoredObject & object_) mutable -> std::unique_ptr + { + read_settings.remote_read_buffer_restrict_seek = restricted_seek; + auto impl = object_storage->readObject(object_, read_settings, read_hint, file_size); + + if ((!object_storage->supportsCache() || !read_settings.enable_filesystem_cache) + && read_settings.page_cache && read_settings.use_page_cache_for_disks_without_file_cache) + { + /// Can't wrap CachedOnDiskReadBufferFromFile in CachedInMemoryReadBufferFromFile because the + /// former doesn't support seeks. 
+ auto cache_path_prefix = fmt::format("{}:", magic_enum::enum_name(object_storage->getType())); + const auto object_namespace = object_storage->getObjectsNamespace(); + if (!object_namespace.empty()) + cache_path_prefix += object_namespace + "/"; + + const auto cache_key = FileChunkAddress { .path = cache_path_prefix + object_.remote_path }; + + impl = std::make_unique( + cache_key, read_settings.page_cache, std::move(impl), read_settings); + } + return impl; + }; + + const bool use_async_buffer = read_settings.remote_fs_method == RemoteFSReadMethod::threadpool; + auto impl = std::make_unique( + std::move(read_buffer_creator), storage_objects, - updateIOSchedulingSettings(settings, getReadResourceName(), getWriteResourceName()), - read_hint, - file_size); + read_settings, + global_context->getFilesystemCacheLog(), + /* use_external_buffer */use_async_buffer); + + if (use_async_buffer) + { + auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); + return std::make_unique( + std::move(impl), reader, read_settings, + global_context->getAsyncReadCounters(), + global_context->getFilesystemReadPrefetchesLog()); + + } + return impl; } std::unique_ptr DiskObjectStorage::writeFile( diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 512cc34ef44..182534529ea 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -82,28 +82,12 @@ std::unique_ptr HDFSObjectStorage::readObject( /// NOLIN initializeHDFSFS(); auto path = extractObjectKeyFromURL(object); return std::make_unique( - fs::path(url_without_path) / "", fs::path(data_directory) / path, config, patchSettings(read_settings)); -} - -std::unique_ptr HDFSObjectStorage::readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional, - std::optional) const -{ - initializeHDFSFS(); - auto disk_read_settings = patchSettings(read_settings); - auto read_buffer_creator = - [this, disk_read_settings] - (bool /* restricted_seek */, const StoredObject & object_) -> std::unique_ptr - { - auto path = extractObjectKeyFromURL(object_); - return std::make_unique( - fs::path(url_without_path) / "", fs::path(data_directory) / path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true); - }; - - return std::make_unique( - std::move(read_buffer_creator), objects, "hdfs:", disk_read_settings, nullptr, /* use_external_buffer */false); + fs::path(url_without_path) / "", + fs::path(data_directory) / path, + config, + patchSettings(read_settings), + /* read_until_position */0, + read_settings.remote_read_buffer_use_external_buffer); } std::unique_ptr HDFSObjectStorage::writeObject( /// NOLINT diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 21c2b0635ca..b53161beb76 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -69,12 +69,6 @@ public: std::optional read_hint = {}, std::optional file_size = {}) const override; - std::unique_ptr readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint = {}, - std::optional file_size = {}) const override; - /// Open the file for write and return WriteBufferFromFileBase object. 
std::unique_ptr writeObject( /// NOLINT const StoredObject & object, diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 72f6d150d34..8dde96b8b16 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -150,13 +150,6 @@ public: std::optional read_hint = {}, std::optional file_size = {}) const = 0; - /// Read multiple objects with common prefix - virtual std::unique_ptr readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint = {}, - std::optional file_size = {}) const = 0; - /// Open the file for write and return WriteBufferFromFileBase object. virtual std::unique_ptr writeObject( /// NOLINT const StoredObject & object, diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 3b650adb71f..5f1b6aedc72 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -40,47 +40,12 @@ bool LocalObjectStorage::exists(const StoredObject & object) const return fs::exists(object.remote_path); } -std::unique_ptr LocalObjectStorage::readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional, - std::optional) const -{ - auto modified_settings = patchSettings(read_settings); - auto global_context = Context::getGlobalContextInstance(); - auto read_buffer_creator = [=](bool /* restricted_seek */, const StoredObject & object) -> std::unique_ptr - { return std::make_unique(object.remote_path); }; - - return std::make_unique( - std::move(read_buffer_creator), - objects, - "file:", - modified_settings, - global_context->getFilesystemCacheLog(), - /* use_external_buffer */ false); -} - ReadSettings LocalObjectStorage::patchSettings(const ReadSettings & read_settings) const { - if (!read_settings.enable_filesystem_cache) - return IObjectStorage::patchSettings(read_settings); - auto modified_settings{read_settings}; - /// For now we cannot allow asynchronous reader from local filesystem when CachedObjectStorage is used. - switch (modified_settings.local_fs_method) - { - case LocalFSReadMethod::pread_threadpool: - case LocalFSReadMethod::pread_fake_async: - { - modified_settings.local_fs_method = LocalFSReadMethod::pread; - LOG_INFO(log, "Changing local filesystem read method to `pread`"); - break; - } - default: - { - break; - } - } + /// Other options might break assertions in AsynchronousBoundedReadBuffer. + modified_settings.local_fs_method = LocalFSReadMethod::pread; + modified_settings.direct_io_threshold = 0; /// Disable. return IObjectStorage::patchSettings(modified_settings); } diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h index 155359ce663..f1a0391a984 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h @@ -34,12 +34,6 @@ public: std::optional read_hint = {}, std::optional file_size = {}) const override; - std::unique_ptr readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint = {}, - std::optional file_size = {}) const override; - /// Open the file for write and return WriteBufferFromFileBase object. 
std::unique_ptr writeObject( /// NOLINT const StoredObject & object, diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 0df498e1a70..0a7f659ee7b 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -176,65 +176,6 @@ bool S3ObjectStorage::exists(const StoredObject & object) const return S3::objectExists(*client.get(), uri.bucket, object.remote_path, {}); } -std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional, - std::optional) const -{ - ReadSettings disk_read_settings = patchSettings(read_settings); - auto global_context = Context::getGlobalContextInstance(); - - auto settings_ptr = s3_settings.get(); - - auto read_buffer_creator = - [this, settings_ptr, disk_read_settings] - (bool restricted_seek, const StoredObject & object_) -> std::unique_ptr - { - return std::make_unique( - client.get(), - uri.bucket, - object_.remote_path, - uri.version_id, - settings_ptr->request_settings, - disk_read_settings, - /* use_external_buffer */true, - /* offset */0, - /* read_until_position */0, - restricted_seek); - }; - - switch (read_settings.remote_fs_method) - { - case RemoteFSReadMethod::read: - { - return std::make_unique( - std::move(read_buffer_creator), - objects, - "s3:" + uri.bucket + "/", - disk_read_settings, - global_context->getFilesystemCacheLog(), - /* use_external_buffer */false); - } - case RemoteFSReadMethod::threadpool: - { - auto impl = std::make_unique( - std::move(read_buffer_creator), - objects, - "s3:" + uri.bucket + "/", - disk_read_settings, - global_context->getFilesystemCacheLog(), - /* use_external_buffer */true); - - auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - return std::make_unique( - std::move(impl), reader, disk_read_settings, - global_context->getAsyncReadCounters(), - global_context->getFilesystemReadPrefetchesLog()); - } - } -} - std::unique_ptr S3ObjectStorage::readObject( /// NOLINT const StoredObject & object, const ReadSettings & read_settings, @@ -248,7 +189,12 @@ std::unique_ptr S3ObjectStorage::readObject( /// NOLINT object.remote_path, uri.version_id, settings_ptr->request_settings, - patchSettings(read_settings)); + patchSettings(read_settings), + read_settings.remote_read_buffer_use_external_buffer, + /* offset */0, + /* read_until_position */0, + read_settings.remote_read_buffer_restrict_seek, + object.bytes_size ? std::optional(object.bytes_size) : std::nullopt); } std::unique_ptr S3ObjectStorage::writeObject( /// NOLINT diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index b99867d8663..ef9da8a948e 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -89,12 +89,6 @@ public: std::optional read_hint = {}, std::optional file_size = {}) const override; - std::unique_ptr readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint = {}, - std::optional file_size = {}) const override; - /// Open the file for write and return WriteBufferFromFileBase object. 
std::unique_ptr writeObject( /// NOLINT const StoredObject & object, diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp index 7f7a3fe1a62..61ea584c4ad 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp @@ -233,69 +233,18 @@ WebObjectStorage::FileDataPtr WebObjectStorage::tryGetFileInfo(const String & pa } } -std::unique_ptr WebObjectStorage::readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint, - std::optional file_size) const -{ - if (objects.size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "WebObjectStorage support read only from single object"); - - return readObject(objects[0], read_settings, read_hint, file_size); - -} - std::unique_ptr WebObjectStorage::readObject( /// NOLINT const StoredObject & object, const ReadSettings & read_settings, std::optional, std::optional) const { - size_t object_size = object.bytes_size; - auto read_buffer_creator = - [this, read_settings, object_size] - (bool /* restricted_seek */, const StoredObject & object_) -> std::unique_ptr - { - return std::make_unique( - fs::path(url) / object_.remote_path, - getContext(), - object_size, - read_settings, - /* use_external_buffer */true); - }; - - auto global_context = Context::getGlobalContextInstance(); - - switch (read_settings.remote_fs_method) - { - case RemoteFSReadMethod::read: - { - return std::make_unique( - std::move(read_buffer_creator), - StoredObjects{object}, - "url:" + url + "/", - read_settings, - global_context->getFilesystemCacheLog(), - /* use_external_buffer */false); - } - case RemoteFSReadMethod::threadpool: - { - auto impl = std::make_unique( - std::move(read_buffer_creator), - StoredObjects{object}, - "url:" + url + "/", - read_settings, - global_context->getFilesystemCacheLog(), - /* use_external_buffer */true); - - auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - return std::make_unique( - std::move(impl), reader, read_settings, - global_context->getAsyncReadCounters(), - global_context->getFilesystemReadPrefetchesLog()); - } - } + return std::make_unique( + fs::path(url) / object.remote_path, + getContext(), + object.bytes_size, + read_settings, + read_settings.remote_read_buffer_use_external_buffer); } void WebObjectStorage::throwNotAllowed() diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.h b/src/Disks/ObjectStorages/Web/WebObjectStorage.h index 9f7f7c137e2..9b94ae01021 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.h +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.h @@ -39,12 +39,6 @@ public: std::optional read_hint = {}, std::optional file_size = {}) const override; - std::unique_ptr readObjects( /// NOLINT - const StoredObjects & objects, - const ReadSettings & read_settings, - std::optional read_hint = {}, - std::optional file_size = {}) const override; - /// Open the file for write and return WriteBufferFromFileBase object. 
std::unique_ptr writeObject( /// NOLINT const StoredObject & object, diff --git a/src/Disks/TemporaryFileOnDisk.cpp b/src/Disks/TemporaryFileOnDisk.cpp index 88674e068d9..9a6e562ff65 100644 --- a/src/Disks/TemporaryFileOnDisk.cpp +++ b/src/Disks/TemporaryFileOnDisk.cpp @@ -1,3 +1,4 @@ +#include #include #include #include diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index f805149e892..ae421a125ea 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -978,7 +978,7 @@ namespace if (settings.try_infer_integers) { /// If we read from String, we can do it in a more efficient way. - if (auto * string_buf = dynamic_cast(&buf)) + if (auto * /*string_buf*/ _ = dynamic_cast(&buf)) { /// Remember the pointer to the start of the number to rollback to it. /// We can safely get back to the start of the number, because we read from a string and we didn't reach eof. diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index ed13e581759..615589a7d43 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -4608,26 +4608,124 @@ private: return [to_max_types] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr { - const auto & column_dynamic = assert_cast(*arguments[0].column); + const auto & dynamic_column = assert_cast(*arguments[0].column); /// We should use the same limit as already used in column and change only global limit. /// It's needed because shared variant should contain values only when limit is exceeded, /// so if there are already some data, we cannot increase the limit. - return ColumnDynamic::create(column_dynamic.getVariantColumnPtr(), column_dynamic.getVariantInfo(), column_dynamic.getMaxDynamicTypes(), to_max_types); + return ColumnDynamic::create(dynamic_column.getVariantColumnPtr(), dynamic_column.getVariantInfo(), dynamic_column.getMaxDynamicTypes(), to_max_types); }; } return [to_max_types] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr { - const auto & column_dynamic = assert_cast(*arguments[0].column); + const auto & dynamic_column = assert_cast(*arguments[0].column); /// If real limit in the column is not greater than desired, just use the same variant column. - if (column_dynamic.getMaxDynamicTypes() <= to_max_types) - return ColumnDynamic::create(column_dynamic.getVariantColumnPtr(), column_dynamic.getVariantInfo(), column_dynamic.getMaxDynamicTypes(), to_max_types); + if (dynamic_column.getMaxDynamicTypes() <= to_max_types) + return ColumnDynamic::create(dynamic_column.getVariantColumnPtr(), dynamic_column.getVariantInfo(), dynamic_column.getMaxDynamicTypes(), to_max_types); + + /// Otherwise some variants should go to the shared variant. We try to keep the most frequent variants. + const auto & variant_info = dynamic_column.getVariantInfo(); + const auto & variants = assert_cast(*variant_info.variant_type).getVariants(); + const auto & statistics = dynamic_column.getStatistics(); + const auto & variant_column = dynamic_column.getVariantColumn(); + auto shared_variant_discr = dynamic_column.getSharedVariantDiscriminator(); + std::vector> variants_with_sizes; + variants_with_sizes.reserve(variant_info.variant_names.size()); + for (const auto & [name, discr] : variant_info.variant_name_to_discriminator) + { + /// Don't include shared variant. 
+ if (discr == shared_variant_discr) + continue; + + size_t size = variant_column.getVariantByGlobalDiscriminator(discr).size(); + /// If column has statistics from the data part, use size from it for consistency. + /// It's important to keep the same dynamic structure of the result column during ALTER. + if (statistics) + { + auto statistics_it = statistics->variants_statistics.find(name); + if (statistics_it != statistics->variants_statistics.end()) + size = statistics_it->second; + } + variants_with_sizes.emplace_back(size, name, variants[discr]); + } + + std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater()); + DataTypes result_variants; + result_variants.reserve(to_max_types + 1); /// +1 for shared variant. + /// Add new variants from sorted list until we reach to_max_types. + for (const auto & [size, name, type] : variants_with_sizes) + { + if (result_variants.size() < to_max_types) + result_variants.push_back(type); + else + break; + } + + /// Add shared variant. + result_variants.push_back(ColumnDynamic::getSharedVariantDataType()); + /// Create resulting Variant type and Dynamic column. + auto result_variant_type = std::make_shared(result_variants); + auto result_dynamic_column = ColumnDynamic::create(result_variant_type->createColumn(), result_variant_type, to_max_types, to_max_types); + const auto & result_variant_info = result_dynamic_column->getVariantInfo(); + auto & result_variant_column = result_dynamic_column->getVariantColumn(); + auto result_shared_variant_discr = result_dynamic_column->getSharedVariantDiscriminator(); + /// Create mapping from old discriminators to the new ones. + std::vector old_to_new_discriminators; + old_to_new_discriminators.resize(variant_info.variant_name_to_discriminator.size(), result_shared_variant_discr); + for (const auto & [name, discr] : result_variant_info.variant_name_to_discriminator) + { + auto old_discr = variant_info.variant_name_to_discriminator.at(name); + old_to_new_discriminators[old_discr] = discr; + /// Reuse old variant column if it's not shared variant. 
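Editor's note: the selection step above can be read in isolation — variants are ranked by size (preferring part statistics when available), the `to_max_types` most frequent ones keep dedicated columns, and everything else is routed to the shared variant. A condensed, std-only sketch of that ranking follows; plain strings stand in for variant types and the names and sizes are made up.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

int main()
{
    /// (size, name) pairs, as collected from the Variant column or part statistics.
    std::vector<std::tuple<size_t, std::string>> variants_with_sizes
        = {{10, "String"}, {500, "Int64"}, {3, "Array(Int64)"}, {120, "Float64"}};

    const size_t to_max_types = 2;

    /// Largest first, exactly like std::sort(..., std::greater()) in the diff.
    std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater());

    std::vector<std::string> kept;          /// get their own variant column
    std::vector<std::string> to_shared;     /// will be serialized into the shared variant
    for (const auto & [size, name] : variants_with_sizes)
    {
        if (kept.size() < to_max_types)
            kept.push_back(name);
        else
            to_shared.push_back(name);
    }

    for (const auto & name : kept)      std::cout << "keep:   " << name << '\n';
    for (const auto & name : to_shared) std::cout << "shared: " << name << '\n';
    /// Prints Int64 and Float64 as kept, String and Array(Int64) as shared.
    return 0;
}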
+ if (discr != result_shared_variant_discr) + result_variant_column.getVariantPtrByGlobalDiscriminator(discr) = variant_column.getVariantPtrByGlobalDiscriminator(old_discr); + } + + const auto & local_discriminators = variant_column.getLocalDiscriminators(); + const auto & offsets = variant_column.getOffsets(); + const auto & shared_variant = dynamic_column.getSharedVariant(); + auto & result_local_discriminators = result_variant_column.getLocalDiscriminators(); + result_local_discriminators.reserve(local_discriminators.size()); + auto & result_offsets = result_variant_column.getOffsets(); + result_offsets.reserve(offsets.size()); + auto & result_shared_variant = result_dynamic_column->getSharedVariant(); + for (size_t i = 0; i != local_discriminators.size(); ++i) + { + auto global_discr = variant_column.globalDiscriminatorByLocal(local_discriminators[i]); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + result_local_discriminators.push_back(ColumnVariant::NULL_DISCRIMINATOR); + result_offsets.emplace_back(); + } + else if (global_discr == shared_variant_discr) + { + result_local_discriminators.push_back(result_variant_column.localDiscriminatorByGlobal(result_shared_variant_discr)); + result_offsets.push_back(result_shared_variant.size()); + result_shared_variant.insertFrom(shared_variant, offsets[i]); + } + else + { + auto result_global_discr = old_to_new_discriminators[global_discr]; + if (result_global_discr == result_shared_variant_discr) + { + result_local_discriminators.push_back(result_variant_column.localDiscriminatorByGlobal(result_shared_variant_discr)); + result_offsets.push_back(result_shared_variant.size()); + ColumnDynamic::serializeValueIntoSharedVariant( + result_shared_variant, + variant_column.getVariantByGlobalDiscriminator(global_discr), + variants[global_discr], + variants[global_discr]->getDefaultSerialization(), + offsets[i]); + } + else + { + result_local_discriminators.push_back(result_variant_column.localDiscriminatorByGlobal(result_global_discr)); + result_offsets.push_back(offsets[i]); + } + } + } - /// Otherwise some variants should go to the shared variant. In this case we can just insert all - /// the data into resulting column and it will do all the logic with shared variant. 
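Editor's note: the row loop above hinges on one small data structure — a table mapping every old global discriminator either to its new discriminator or to the shared-variant discriminator. A self-contained sketch of that remapping, with small integers in place of ColumnVariant discriminators (the NULL marker value and the names are illustrative only).

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main()
{
    using Discriminator = uint8_t;
    const Discriminator NULL_DISCRIMINATOR = 255;

    /// Old layout: four variants, name -> discriminator.
    std::map<std::string, Discriminator> old_discr = {{"Int64", 0}, {"String", 1}, {"Float64", 2}, {"Array(Int64)", 3}};
    /// New layout after shrinking to two dedicated variants plus the shared one.
    std::map<std::string, Discriminator> new_discr = {{"Int64", 0}, {"Float64", 1}};
    const Discriminator shared_discr = 2;

    /// Default every old discriminator to "goes to shared variant",
    /// then overwrite the ones that survive, as in the diff.
    std::vector<Discriminator> old_to_new(old_discr.size(), shared_discr);
    for (const auto & [name, discr] : new_discr)
        old_to_new[old_discr.at(name)] = discr;

    /// Remap a row stream of old discriminators; NULL rows pass through unchanged.
    std::vector<Discriminator> rows = {0, 1, NULL_DISCRIMINATOR, 3, 2};
    for (Discriminator d : rows)
    {
        if (d == NULL_DISCRIMINATOR)
            std::cout << "NULL\n";
        else
            std::cout << int(d) << " -> " << int(old_to_new[d]) << '\n';
    }
    return 0;
}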
- auto result_dynamic_column = ColumnDynamic::create(to_max_types); - result_dynamic_column->insertRangeFrom(column_dynamic, 0, column_dynamic.size()); return result_dynamic_column; }; } diff --git a/src/Functions/JSONPath/Generator/IGenerator.h b/src/Functions/JSONPath/Generator/IGenerator.h index 323145e07e1..d3a6c32de72 100644 --- a/src/Functions/JSONPath/Generator/IGenerator.h +++ b/src/Functions/JSONPath/Generator/IGenerator.h @@ -2,7 +2,6 @@ #include #include -#include namespace DB { diff --git a/src/Functions/UserDefined/UserDefinedSQLObjectsStorageBase.h b/src/Functions/UserDefined/UserDefinedSQLObjectsStorageBase.h index 0dbc5586f08..41dedb95b9f 100644 --- a/src/Functions/UserDefined/UserDefinedSQLObjectsStorageBase.h +++ b/src/Functions/UserDefined/UserDefinedSQLObjectsStorageBase.h @@ -6,10 +6,11 @@ #include #include -#include +#include namespace DB { +using Strings = std::vector; class UserDefinedSQLObjectsStorageBase : public IUserDefinedSQLObjectsStorage { diff --git a/src/Functions/array/arrayAggregation.cpp b/src/Functions/array/arrayAggregation.cpp index adb1bb707d8..bb2503886f1 100644 --- a/src/Functions/array/arrayAggregation.cpp +++ b/src/Functions/array/arrayAggregation.cpp @@ -104,7 +104,7 @@ struct ArrayAggregateImpl static DataTypePtr getReturnType(const DataTypePtr & expression_return, const DataTypePtr & /*array_element*/) { - if (aggregate_operation == AggregateOperation::max || aggregate_operation == AggregateOperation::min) + if constexpr (aggregate_operation == AggregateOperation::max || aggregate_operation == AggregateOperation::min) { return expression_return; } @@ -152,9 +152,62 @@ struct ArrayAggregateImpl return result; } + template + requires(op == AggregateOperation::min || op == AggregateOperation::max) + static void executeMinOrMax(const ColumnPtr & mapped, const ColumnArray::Offsets & offsets, ColumnPtr & res_ptr) + { + const ColumnConst * const_column = checkAndGetColumn(&*mapped); + if (const_column) + { + MutableColumnPtr res_column = const_column->getDataColumn().cloneEmpty(); + res_column->insertMany(const_column->getField(), offsets.size()); + res_ptr = std::move(res_column); + return; + } + + MutableColumnPtr res_column = mapped->cloneEmpty(); + static constexpr int nan_null_direction_hint = aggregate_operation == AggregateOperation::min ? 
1 : -1; + + /// TODO: Introduce row_begin and row_end to getPermutation or an equivalent function to use that instead + /// (same use case as SingleValueDataBase::getSmallestIndex) + UInt64 start_of_array = 0; + for (auto end_of_array : offsets) + { + /// Array is empty + if (start_of_array == end_of_array) + { + res_column->insertDefault(); + continue; + } + + UInt64 index = start_of_array; + for (UInt64 i = index + 1; i < end_of_array; i++) + { + if constexpr (aggregate_operation == AggregateOperation::min) + { + if ((mapped->compareAt(i, index, *mapped, nan_null_direction_hint) < 0)) + index = i; + } + else + { + if ((mapped->compareAt(i, index, *mapped, nan_null_direction_hint) > 0)) + index = i; + } + } + + res_column->insertFrom(*mapped, index); + start_of_array = end_of_array; + } + + chassert(res_column->size() == offsets.size()); + res_ptr = std::move(res_column); + } + template static NO_SANITIZE_UNDEFINED bool executeType(const ColumnPtr & mapped, const ColumnArray::Offsets & offsets, ColumnPtr & res_ptr) { + /// Min and Max are implemented in a different function + static_assert(aggregate_operation != AggregateOperation::min && aggregate_operation != AggregateOperation::max); using ResultType = ArrayAggregateResult; using ColVecType = ColumnVectorOrDecimal; using ColVecResultType = ColumnVectorOrDecimal; @@ -197,11 +250,6 @@ struct ArrayAggregateImpl /// Just multiply the value by array size. res[i] = x * static_cast(array_size); } - else if constexpr (aggregate_operation == AggregateOperation::min || - aggregate_operation == AggregateOperation::max) - { - res[i] = x; - } else if constexpr (aggregate_operation == AggregateOperation::average) { if constexpr (is_decimal) @@ -292,20 +340,6 @@ struct ArrayAggregateImpl { aggregate_value += element; } - else if constexpr (aggregate_operation == AggregateOperation::min) - { - if (element < aggregate_value) - { - aggregate_value = element; - } - } - else if constexpr (aggregate_operation == AggregateOperation::max) - { - if (element > aggregate_value) - { - aggregate_value = element; - } - } else if constexpr (aggregate_operation == AggregateOperation::product) { if constexpr (is_decimal) @@ -360,74 +394,41 @@ struct ArrayAggregateImpl static ColumnPtr execute(const ColumnArray & array, ColumnPtr mapped) { - if constexpr (aggregate_operation == AggregateOperation::max || aggregate_operation == AggregateOperation::min) - { - MutableColumnPtr res; - const auto & column = array.getDataPtr(); - const ColumnConst * const_column = checkAndGetColumn(&*column); - if (const_column) - { - res = const_column->getDataColumn().cloneEmpty(); - } - else - { - res = column->cloneEmpty(); - } - const IColumn::Offsets & offsets = array.getOffsets(); - size_t pos = 0; - for (const auto & offset : offsets) - { - if (offset == pos) - { - res->insertDefault(); - continue; - } - size_t current_max_or_min_index = pos; - ++pos; - for (; pos < offset; ++pos) - { - int compare_result = column->compareAt(pos, current_max_or_min_index, *column, 1); - if (aggregate_operation == AggregateOperation::max && compare_result > 0) - { - current_max_or_min_index = pos; - } - else if (aggregate_operation == AggregateOperation::min && compare_result < 0) - { - current_max_or_min_index = pos; - } - } - res->insert((*column)[current_max_or_min_index]); - } - return res; - } - const IColumn::Offsets & offsets = array.getOffsets(); ColumnPtr res; - if (executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - 
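Editor's note: executeMinOrMax above works on indices — for every array it scans the mapped column between two offsets, keeps the index of the current best element according to compareAt, and inserts that single element into the result. A plain-vector sketch of the same offset walk, using an ordinary `<` comparison in place of IColumn::compareAt; the data is made up.

#include <cstddef>
#include <iostream>
#include <vector>

/// For each array delimited by `offsets` over the flat `values`, find the minimum element;
/// empty arrays get a default value (0 here), mirroring insertDefault().
std::vector<int> arrayMinByIndex(const std::vector<int> & values, const std::vector<size_t> & offsets)
{
    std::vector<int> result;
    size_t start_of_array = 0;
    for (size_t end_of_array : offsets)
    {
        if (start_of_array == end_of_array)     /// empty array
        {
            result.push_back(0);
            continue;
        }

        size_t index = start_of_array;
        for (size_t i = index + 1; i < end_of_array; ++i)
            if (values[i] < values[index])      /// compareAt(i, index, ...) < 0 analogue
                index = i;

        result.push_back(values[index]);
        start_of_array = end_of_array;
    }
    return result;
}

int main()
{
    /// Two arrays [3, 1, 2] and [7, 5], then an empty one: offsets are cumulative ends.
    std::vector<int> values = {3, 1, 2, 7, 5};
    std::vector<size_t> offsets = {3, 5, 5};
    for (int v : arrayMinByIndex(values, offsets))
        std::cout << v << '\n';                 /// prints 1, 5, 0
    return 0;
}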
executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res) || - executeType(mapped, offsets, res)) + if constexpr (aggregate_operation == AggregateOperation::min || aggregate_operation == AggregateOperation::max) { + executeMinOrMax(mapped, offsets, res); return res; } else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Unexpected column for arraySum: {}", mapped->getName()); + { + if (executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res) || + executeType(mapped, offsets, res)) + { + return res; + } + } + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Unexpected column for arraySum: {}", mapped->getName()); } }; diff --git a/src/Functions/space.cpp b/src/Functions/space.cpp index cf1634e0319..764da8e49f5 100644 --- a/src/Functions/space.cpp +++ b/src/Functions/space.cpp @@ -141,7 +141,7 @@ public: ColumnString::Offsets & res_offsets = col_res->getOffsets(); ColumnString::Chars & res_chars = col_res->getChars(); - if (const ColumnConst * col_num_const = checkAndGetColumn(col_num.get())) + if (const ColumnConst * /*col_num_const*/ _ = checkAndGetColumn(col_num.get())) { if ((executeConstant(col_num, res_offsets, res_chars, input_rows_count)) || (executeConstant(col_num, res_offsets, res_chars, input_rows_count)) diff --git a/src/IO/DistributedCacheSettings.h b/src/IO/DistributedCacheSettings.h new file mode 100644 index 00000000000..f0c9080ed1b --- /dev/null +++ b/src/IO/DistributedCacheSettings.h @@ -0,0 +1,45 @@ +#pragma once + + +#include +#include + +namespace DB +{ + +enum class DistributedCachePoolBehaviourOnLimit +{ + WAIT, + ALLOCATE_NEW_BYPASSING_POOL, +}; + +enum class DistributedCacheLogMode +{ + LOG_NOTHING, + LOG_ON_ERROR, + LOG_ALL, +}; + +struct DistributedCacheSettings +{ + bool throw_on_error = false; + bool bypass_connection_pool = false; + + size_t wait_connection_from_pool_milliseconds = 100; + size_t connect_max_tries = 100; + size_t read_alignment = 0; + size_t max_unacked_inflight_packets = ::DistributedCache::MAX_UNACKED_INFLIGHT_PACKETS; + size_t data_packet_ack_window = ::DistributedCache::ACK_DATA_PACKET_WINDOW; + + DistributedCachePoolBehaviourOnLimit pool_behaviour_on_limit = DistributedCachePoolBehaviourOnLimit::ALLOCATE_NEW_BYPASSING_POOL; + size_t receive_response_wait_milliseconds = 10000; + size_t receive_timeout_milliseconds = 1000; + + DistributedCacheLogMode log_mode = DistributedCacheLogMode::LOG_ON_ERROR; + + 
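Editor's note: the restructured execute() above separates the two code paths at compile time — min/max go to executeMinOrMax, everything else goes through the typed dispatch chain. A minimal sketch of that `if constexpr` dispatch on a template parameter; the enum and function names echo the diff, but the bodies are toy placeholders over plain ints.

#include <iostream>
#include <numeric>
#include <vector>

enum class AggregateOperation { min, max, sum, average };

template <AggregateOperation op>
int executeMinOrMax(const std::vector<int> & values)
{
    int best = values.front();
    for (int v : values)
        if constexpr (op == AggregateOperation::min) { if (v < best) best = v; }
        else                                         { if (v > best) best = v; }
    return best;
}

template <AggregateOperation op>
int executeNumeric(const std::vector<int> & values)
{
    /// Min and Max are handled elsewhere, as the static_assert in the diff enforces.
    static_assert(op != AggregateOperation::min && op != AggregateOperation::max);
    int sum = std::accumulate(values.begin(), values.end(), 0);
    if constexpr (op == AggregateOperation::average)
        return sum / int(values.size());
    else
        return sum;
}

template <AggregateOperation op>
int execute(const std::vector<int> & values)
{
    /// Branch resolved at compile time: only one of the two bodies is instantiated.
    if constexpr (op == AggregateOperation::min || op == AggregateOperation::max)
        return executeMinOrMax<op>(values);
    else
        return executeNumeric<op>(values);
}

int main()
{
    std::vector<int> values = {4, 1, 7, 2};
    std::cout << execute<AggregateOperation::min>(values) << ' '   /// 1
              << execute<AggregateOperation::sum>(values) << '\n'; /// 14
    return 0;
}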
bool operator ==(const DistributedCacheSettings &) const = default; + + void validate() const; +}; + +} diff --git a/src/IO/ReadSettings.h b/src/IO/ReadSettings.h index fb92fd72572..7d6b9f10931 100644 --- a/src/IO/ReadSettings.h +++ b/src/IO/ReadSettings.h @@ -1,12 +1,13 @@ #pragma once #include -#include #include #include #include #include #include +#include +#include namespace DB { @@ -116,7 +117,8 @@ struct ReadSettings size_t remote_read_min_bytes_for_seek = DBMS_DEFAULT_BUFFER_SIZE; - FileCachePtr remote_fs_cache; + bool remote_read_buffer_restrict_seek = false; + bool remote_read_buffer_use_external_buffer = false; /// Bandwidth throttler to use during reading ThrottlerPtr remote_throttler; @@ -130,6 +132,10 @@ struct ReadSettings bool http_skip_not_found_url_for_globs = true; bool http_make_head_request = true; + bool read_through_distributed_cache = false; + DistributedCacheSettings distributed_cache_settings; + std::optional filecache_user_info; + ReadSettings adjustBufferSize(size_t file_size) const { ReadSettings res = *this; @@ -138,6 +144,14 @@ struct ReadSettings res.prefetch_buffer_size = std::min(std::max(1ul, file_size), prefetch_buffer_size); return res; } + + ReadSettings withNestedBuffer() const + { + ReadSettings res = *this; + res.remote_read_buffer_restrict_seek = true; + res.remote_read_buffer_use_external_buffer = true; + return res; + } }; ReadSettings getReadSettings(); diff --git a/src/IO/WriteSettings.h b/src/IO/WriteSettings.h index 6b3d04f4e5c..94410f787f0 100644 --- a/src/IO/WriteSettings.h +++ b/src/IO/WriteSettings.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -27,6 +28,9 @@ struct WriteSettings bool use_adaptive_write_buffer = false; size_t adaptive_write_buffer_initial_size = 16 * 1024; + bool write_through_distributed_cache = false; + DistributedCacheSettings distributed_cache_settings; + bool operator==(const WriteSettings & other) const = default; }; diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index 398a48c790b..981c1052d01 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -131,7 +131,12 @@ bool KeyMetadata::createBaseDirectory(bool throw_if_failed) { created_base_directory = false; - if (!throw_if_failed && e.code() == std::errc::no_space_on_device) + if (!throw_if_failed && + (e.code() == std::errc::no_space_on_device + || e.code() == std::errc::read_only_file_system + || e.code() == std::errc::permission_denied + || e.code() == std::errc::too_many_files_open + || e.code() == std::errc::operation_not_permitted)) { LOG_TRACE(cache_metadata->log, "Failed to create base directory for key {}, " "because no space left on device", key); diff --git a/src/Interpreters/ClusterProxy/executeQuery.h b/src/Interpreters/ClusterProxy/executeQuery.h index 2a21f3e8255..48f8fa34dca 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.h +++ b/src/Interpreters/ClusterProxy/executeQuery.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 3caebeb0ea5..2320af806f0 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -238,6 +238,13 @@ namespace Setting extern const SettingsBool use_page_cache_for_disks_without_file_cache; extern const SettingsUInt64 use_structure_from_insertion_table_in_table_functions; extern const SettingsString workload; + extern const SettingsString compatibility; +} + +namespace MergeTreeSetting +{ + extern const 
MergeTreeSettingsString merge_workload; + extern const MergeTreeSettingsString mutation_workload; } namespace ErrorCodes @@ -2750,7 +2757,7 @@ void Context::makeQueryContextForMerge(const MergeTreeSettings & merge_tree_sett { makeQueryContext(); classifier.reset(); // It is assumed that there are no active queries running using this classifier, otherwise this will lead to crashes - (*settings)[Setting::workload] = merge_tree_settings.merge_workload.value.empty() ? getMergeWorkload() : merge_tree_settings.merge_workload; + (*settings)[Setting::workload] = merge_tree_settings[MergeTreeSetting::merge_workload].value.empty() ? getMergeWorkload() : merge_tree_settings[MergeTreeSetting::merge_workload]; } void Context::makeQueryContextForMutate(const MergeTreeSettings & merge_tree_settings) @@ -2758,7 +2765,7 @@ void Context::makeQueryContextForMutate(const MergeTreeSettings & merge_tree_set makeQueryContext(); classifier.reset(); // It is assumed that there are no active queries running using this classifier, otherwise this will lead to crashes (*settings)[Setting::workload] - = merge_tree_settings.mutation_workload.value.empty() ? getMutationWorkload() : merge_tree_settings.mutation_workload; + = merge_tree_settings[MergeTreeSetting::mutation_workload].value.empty() ? getMutationWorkload() : merge_tree_settings[MergeTreeSetting::mutation_workload]; } void Context::makeSessionContext() @@ -4648,6 +4655,11 @@ const MergeTreeSettings & Context::getMergeTreeSettings() const { const auto & config = shared->getConfigRefWithLock(lock); MergeTreeSettings mt_settings; + + /// Respect compatibility setting from the default profile. + /// First, we apply compatibility values, and only after apply changes from the config. + mt_settings.applyCompatibilitySetting((*settings)[Setting::compatibility]); + mt_settings.loadFromConfig("merge_tree", config); shared->merge_tree_settings.emplace(mt_settings); } @@ -4663,6 +4675,11 @@ const MergeTreeSettings & Context::getReplicatedMergeTreeSettings() const { const auto & config = shared->getConfigRefWithLock(lock); MergeTreeSettings mt_settings; + + /// Respect compatibility setting from the default profile. + /// First, we apply compatibility values, and only after apply changes from the config. 
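Editor's note: the ordering in getMergeTreeSettings() above matters — compatibility defaults are applied first so that explicit values from the server config can still override them. A toy sketch of that two-phase application with a plain map standing in for MergeTreeSettings; applyCompatibility, loadFromConfig, and the setting value are illustrative, not the real API.

#include <iostream>
#include <map>
#include <string>

using SettingsMap = std::map<std::string, std::string>;

/// Phase 1: defaults implied by the `compatibility` setting.
void applyCompatibility(SettingsMap & settings, const std::string & compatibility)
{
    if (compatibility == "24.3")
        settings["some_merge_tree_setting"] = "old_default";   /// hypothetical value
}

/// Phase 2: explicit values from the <merge_tree> config section win.
void loadFromConfig(SettingsMap & settings, const SettingsMap & config)
{
    for (const auto & [key, value] : config)
        settings[key] = value;
}

int main()
{
    SettingsMap settings;
    SettingsMap config = {{"some_merge_tree_setting", "explicit_value"}};

    applyCompatibility(settings, "24.3");   /// first: compatibility defaults
    loadFromConfig(settings, config);       /// then: config overrides them

    std::cout << settings["some_merge_tree_setting"] << '\n';   /// "explicit_value"
    return 0;
}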
+ mt_settings.applyCompatibilitySetting((*settings)[Setting::compatibility]); + mt_settings.loadFromConfig("merge_tree", config); mt_settings.loadFromConfig("replicated_merge_tree", config); shared->replicated_merge_tree_settings.emplace(mt_settings); @@ -5311,6 +5328,38 @@ void Context::resetZooKeeperMetadataTransaction() metadata_transaction = nullptr; } +void Context::setParentTable(UUID uuid) +{ + chassert(!parent_table_uuid.has_value()); + parent_table_uuid = uuid; +} + +std::optional Context::getParentTable() const +{ + return parent_table_uuid; +} + +void Context::setDDLQueryCancellation(StopToken cancel) +{ + chassert(!ddl_query_cancellation.stop_possible()); + ddl_query_cancellation = cancel; +} + +StopToken Context::getDDLQueryCancellation() const +{ + return ddl_query_cancellation; +} + +void Context::setDDLAdditionalChecksOnEnqueue(Coordination::Requests requests) +{ + ddl_additional_checks_on_enqueue = requests; +} + +Coordination::Requests Context::getDDLAdditionalChecksOnEnqueue() const +{ + return ddl_additional_checks_on_enqueue; +} + void Context::checkTransactionsAreAllowed(bool explicit_tcl_query /* = false */) const { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 9b73e48e511..b3af82bcc54 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,12 @@ namespace zkutil class ZooKeeper; using ZooKeeperPtr = std::shared_ptr; } +namespace Coordination +{ + struct Request; + using RequestPtr = std::shared_ptr; + using Requests = std::vector; +} struct OvercommitTracker; @@ -518,6 +525,9 @@ protected: /// to DatabaseOnDisk::commitCreateTable(...) or IStorage::alter(...) without changing /// thousands of signatures. /// And I hope it will be replaced with more common Transaction sometime. + std::optional parent_table_uuid; /// See comment on setParentTable(). + StopToken ddl_query_cancellation; // See comment on setDDLQueryCancellation(). + Coordination::Requests ddl_additional_checks_on_enqueue; // See comment on setDDLAdditionalChecksOnEnqueue(). MergeTreeTransactionPtr merge_tree_transaction; /// Current transaction context. Can be inside session or query context. /// It's shared with all children contexts. @@ -1285,6 +1295,26 @@ public: /// Removes context of current distributed DDL. void resetZooKeeperMetadataTransaction(); + /// Tells DatabaseReplicated to make this query conditional: it'll only succeed if table with the given UUID exists. + /// Used by refreshable materialized views to prevent creating inner tables after the MV is dropped. + /// Doesn't do anything if not in DatabaseReplicated. + void setParentTable(UUID uuid); + std::optional getParentTable() const; + /// Allows cancelling DDL query in DatabaseReplicated. Usage: + /// 1. Call this. + /// 2. Do a query that goes through DatabaseReplicated's DDL queue (e.g. CREATE TABLE). + /// 3. The query will wait to complete all previous queries in DDL queue before running this one. + /// You can interrupt this wait (and cancel the query from step 2) by cancelling the StopToken. + /// (In particular, such cancellation can be done from DDL worker thread itself. + /// We do it when dropping refreshable materialized views.) + /// 4. If the query was interrupted, it'll throw a QUERY_WAS_CANCELLED and will have no effect. + /// If the query already started execution, interruption won't happen, and the query will complete normally. 
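Editor's note: the comment block above describes the cancellation contract for DDL queries queued in DatabaseReplicated. ClickHouse's StopToken is its own class, but the pattern it describes — wait for your turn unless the token is cancelled first — can be sketched with the standard std::stop_source / std::stop_token facilities; this is only an analogue of the described behaviour, not the DDL worker code.

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <stop_token>
#include <thread>

/// Waits until `ready` becomes true or the token is cancelled.
/// Returns true if the "DDL entry" may run, false if the wait was interrupted.
bool waitForTurn(std::stop_token token, std::mutex & mutex, std::condition_variable_any & cv, bool & ready)
{
    std::unique_lock lock(mutex);
    return cv.wait(lock, token, [&] { return ready; });
}

int main()
{
    std::mutex mutex;
    std::condition_variable_any cv;
    bool ready = false;

    std::stop_source source;
    std::jthread waiter([&, token = source.get_token()]
    {
        if (waitForTurn(token, mutex, cv, ready))
            std::cout << "executing DDL entry\n";
        else
            std::cout << "wait cancelled, entry skipped\n";   /// QUERY_WAS_CANCELLED analogue
    });

    /// Simulate dropping the parent table before the entry's turn comes.
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    source.request_stop();
    cv.notify_all();
    return 0;
}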
+ void setDDLQueryCancellation(StopToken cancel); + StopToken getDDLQueryCancellation() const; + /// Allows adding extra zookeeper operations to the transaction that enqueues a DDL query in DatabaseReplicated. + void setDDLAdditionalChecksOnEnqueue(Coordination::Requests requests); + Coordination::Requests getDDLAdditionalChecksOnEnqueue() const; + void checkTransactionsAreAllowed(bool explicit_tcl_query = false) const; void initCurrentTransaction(MergeTreeTransactionPtr txn); void setCurrentTransaction(MergeTreeTransactionPtr txn); diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index f70241373f9..d3b9d01a981 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -85,6 +85,10 @@ void DDLLogEntry::setSettingsIfRequired(ContextPtr context) throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown distributed_ddl_entry_format_version: {}." "Maximum supported version is {}.", version, DDL_ENTRY_FORMAT_MAX_VERSION); + parent_table_uuid = context->getParentTable(); + if (parent_table_uuid.has_value()) + version = std::max(version, PARENT_TABLE_UUID_VERSION); + /// NORMALIZE_CREATE_ON_INITIATOR_VERSION does not affect entry format in ZooKeeper if (version == NORMALIZE_CREATE_ON_INITIATOR_VERSION) version = SETTINGS_IN_ZK_VERSION; @@ -133,6 +137,16 @@ String DDLLogEntry::toString() const if (version >= BACKUP_RESTORE_FLAG_IN_ZK_VERSION) wb << "is_backup_restore: " << is_backup_restore << "\n"; + if (version >= PARENT_TABLE_UUID_VERSION) + { + wb << "parent: "; + if (parent_table_uuid.has_value()) + wb << parent_table_uuid.value(); + else + wb << "-"; + wb << "\n"; + } + return wb.str(); } @@ -193,6 +207,18 @@ void DDLLogEntry::parse(const String & data) checkChar('\n', rb); } + if (version >= PARENT_TABLE_UUID_VERSION) + { + rb >> "parent: "; + if (!checkChar('-', rb)) + { + UUID uuid; + rb >> uuid; + parent_table_uuid = uuid; + } + rb >> "\n"; + } + assertEOF(rb); if (!host_id_strings.empty()) diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 515a35d8671..f0f5d60db6d 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -77,10 +77,11 @@ struct DDLLogEntry static constexpr const UInt64 OPENTELEMETRY_ENABLED_VERSION = 4; static constexpr const UInt64 PRESERVE_INITIAL_QUERY_ID_VERSION = 5; static constexpr const UInt64 BACKUP_RESTORE_FLAG_IN_ZK_VERSION = 6; + static constexpr const UInt64 PARENT_TABLE_UUID_VERSION = 7; /// Add new version here /// Remember to update the value below once new version is added - static constexpr const UInt64 DDL_ENTRY_FORMAT_MAX_VERSION = 6; + static constexpr const UInt64 DDL_ENTRY_FORMAT_MAX_VERSION = 7; UInt64 version = 1; String query; @@ -90,6 +91,9 @@ struct DDLLogEntry OpenTelemetry::TracingContext tracing_context; String initial_query_id; bool is_backup_restore = false; + /// If present, this entry should be executed only if table with this uuid exists. + /// Only for DatabaseReplicated. 
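Editor's note: the DDLLogEntry format change above serializes the optional parent table UUID as a "parent: " line, writing "-" when the value is absent and gating the whole field on the entry version. A compact sketch of the same pattern with std::optional and string streams; the field name and version constant mirror the diff, but a plain string stands in for the UUID.

#include <iostream>
#include <optional>
#include <sstream>
#include <string>

constexpr unsigned PARENT_TABLE_UUID_VERSION = 7;

std::string serialize(unsigned version, const std::optional<std::string> & parent_uuid)
{
    std::ostringstream out;
    if (version >= PARENT_TABLE_UUID_VERSION)
        out << "parent: " << (parent_uuid ? *parent_uuid : "-") << "\n";
    return out.str();
}

std::optional<std::string> parse(unsigned version, std::istringstream & in)
{
    std::optional<std::string> parent_uuid;
    if (version >= PARENT_TABLE_UUID_VERSION)
    {
        std::string key, value;
        in >> key >> value;           /// expects "parent:" followed by the value or "-"
        if (value != "-")
            parent_uuid = value;
    }
    return parent_uuid;
}

int main()
{
    std::string text = serialize(7, std::optional<std::string>{"a1b2c3"});
    std::istringstream in(text);
    auto uuid = parse(7, in);
    std::cout << (uuid ? *uuid : "<none>") << '\n';               /// prints a1b2c3

    std::istringstream absent("parent: -\n");
    std::cout << (parse(7, absent) ? "set" : "<none>") << '\n';   /// prints <none>
    return 0;
}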
+ std::optional parent_table_uuid; void setSettingsIfRequired(ContextPtr context); String toString() const; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index feff049e114..fec8e6c53c1 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -176,7 +176,7 @@ ZooKeeperPtr DDLWorker::getAndSetZooKeeper() } -DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) +DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper, bool /*dry_run*/) { if (entries_to_skip.contains(entry_name)) return {}; @@ -384,7 +384,7 @@ void DDLWorker::scheduleTasks(bool reinitialized) { /// We should return true if some invariants are violated. String reason; - auto task = initAndCheckTask(entry_name, reason, zookeeper); + auto task = initAndCheckTask(entry_name, reason, zookeeper, /*dry_run*/ true); bool maybe_currently_processing = current_tasks.end() != std::find_if(current_tasks.begin(), current_tasks.end(), [&](const auto & t) { return t->entry_name == entry_name; @@ -418,7 +418,7 @@ void DDLWorker::scheduleTasks(bool reinitialized) LOG_TRACE(log, "Checking task {}", entry_name); String reason; - auto task = initAndCheckTask(entry_name, reason, zookeeper); + auto task = initAndCheckTask(entry_name, reason, zookeeper, /*dry_run*/ false); if (task) { queue_fully_loaded_after_initialization_debug_helper = true; diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 8af50bca69b..ac07b086242 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -117,7 +117,8 @@ protected: /// Reads entry and check that the host belongs to host list of the task /// Returns non-empty DDLTaskPtr if entry parsed and the check is passed - virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); + /// If dry_run = false, the task will be processed right after this call. 
+ virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper, bool dry_run); void processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper); void updateMaxDDLEntryID(const String & entry_name); diff --git a/src/Interpreters/GetAggregatesVisitor.cpp b/src/Interpreters/GetAggregatesVisitor.cpp index 718721308b1..60b287130df 100644 --- a/src/Interpreters/GetAggregatesVisitor.cpp +++ b/src/Interpreters/GetAggregatesVisitor.cpp @@ -37,7 +37,7 @@ struct WindowExpressionsCollectorMatcher } // We process every expression manually - if (auto * func = node->as()) + if (auto * /*func*/ _ = node->as()) return false; return true; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index f51da7d4e70..641c69ef5f2 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1067,9 +1067,9 @@ void InterpreterCreateQuery::validateMaterializedViewColumnsAndEngine(const ASTC if (create.refresh_strategy && !create.refresh_strategy->append) { - if (database && database->getEngineName() != "Atomic") + if (database && database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") throw Exception(ErrorCodes::INCORRECT_QUERY, - "Refreshable materialized views (except with APPEND) only support Atomic database engine, but database {} has engine {}", create.getDatabase(), database->getEngineName()); + "Refreshable materialized views (except with APPEND) only support Atomic and Replicated database engines, but database {} has engine {}", create.getDatabase(), database->getEngineName()); std::string message; if (!supportsAtomicRename(&message)) @@ -1362,7 +1362,7 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data bool from_path = create.attach_from_path.has_value(); bool is_on_cluster = getContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; - if (database->getEngineName() == "Replicated" && create.uuid != UUIDHelpers::Nil && !is_replicated_database_internal && !is_on_cluster && !create.attach) + if (database->getEngineName() == "Replicated" && create.uuid != UUIDHelpers::Nil && !is_replicated_database_internal && !internal && !is_on_cluster && !create.attach) { if (getContext()->getSettingsRef()[Setting::database_replicated_allow_explicit_uuid] == 0) { diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index e732bba51e7..c5e44bddd51 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -29,6 +29,11 @@ namespace Setting extern const SettingsSeconds lock_acquire_timeout; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsLightweightMutationProjectionMode lightweight_mutation_projection_mode; +} + namespace ErrorCodes { extern const int TABLE_IS_READ_ONLY; @@ -100,7 +105,7 @@ BlockIO InterpreterDeleteQuery::execute() if (metadata_snapshot->hasProjections()) { if (const auto * merge_tree_data = dynamic_cast(table.get())) - if (merge_tree_data->getSettings()->lightweight_mutation_projection_mode == LightweightMutationProjectionMode::THROW) + if ((*merge_tree_data->getSettings())[MergeTreeSetting::lightweight_mutation_projection_mode] == LightweightMutationProjectionMode::THROW) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "DELETE query is not allowed for table {} because as it has projections and setting " "lightweight_mutation_projection_mode is set to THROW. 
" diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 8c9d9453d79..cb731333aed 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -1135,26 +1135,34 @@ void InterpreterSystemQuery::dropDatabaseReplica(ASTSystemQuery & query) throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid query"); } +bool InterpreterSystemQuery::trySyncReplica(IStorage * table, SyncReplicaMode sync_replica_mode, const std::unordered_set & src_replicas, ContextPtr context_) + { + auto table_id_ = table->getStorageID(); + if (auto * storage_replicated = dynamic_cast(table)) + { + auto log = getLogger("InterpreterSystemQuery"); + LOG_TRACE(log, "Synchronizing entries in replica's queue with table's log and waiting for current last entry to be processed"); + auto sync_timeout = context_->getSettingsRef()[Setting::receive_timeout].totalMilliseconds(); + if (!storage_replicated->waitForProcessingQueue(sync_timeout, sync_replica_mode, src_replicas)) + { + LOG_ERROR(log, "SYNC REPLICA {}: Timed out.", table_id_.getNameForLogs()); + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "SYNC REPLICA {}: command timed out. " \ + "See the 'receive_timeout' setting", table_id_.getNameForLogs()); + } + LOG_TRACE(log, "SYNC REPLICA {}: OK", table_id_.getNameForLogs()); + } + else + return false; + + return true; +} + void InterpreterSystemQuery::syncReplica(ASTSystemQuery & query) { getContext()->checkAccess(AccessType::SYSTEM_SYNC_REPLICA, table_id); StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); - - if (auto * storage_replicated = dynamic_cast(table.get())) - { - LOG_TRACE(log, "Synchronizing entries in replica's queue with table's log and waiting for current last entry to be processed"); - auto sync_timeout = getContext()->getSettingsRef()[Setting::receive_timeout].totalMilliseconds(); - - std::unordered_set replicas(query.src_replicas.begin(), query.src_replicas.end()); - if (!storage_replicated->waitForProcessingQueue(sync_timeout, query.sync_replica_mode, replicas)) - { - LOG_ERROR(log, "SYNC REPLICA {}: Timed out.", table_id.getNameForLogs()); - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "SYNC REPLICA {}: command timed out. " \ - "See the 'receive_timeout' setting", table_id.getNameForLogs()); - } - LOG_TRACE(log, "SYNC REPLICA {}: OK", table_id.getNameForLogs()); - } - else + std::unordered_set replicas(query.src_replicas.begin(), query.src_replicas.end()); + if (!trySyncReplica(table.get(), query.sync_replica_mode, replicas, getContext())) throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs()); } diff --git a/src/Interpreters/InterpreterSystemQuery.h b/src/Interpreters/InterpreterSystemQuery.h index f44fe930b04..6bd2b23ec1c 100644 --- a/src/Interpreters/InterpreterSystemQuery.h +++ b/src/Interpreters/InterpreterSystemQuery.h @@ -2,8 +2,8 @@ #include #include +#include #include -#include #include #include #include @@ -18,8 +18,10 @@ class Context; class AccessRightsElements; class ASTSystemQuery; class IDatabase; - using DatabasePtr = std::shared_ptr; +class RefreshTask; +using RefreshTaskPtr = std::shared_ptr; +using RefreshTaskList = std::list; /** Implement various SYSTEM queries. 
@@ -45,6 +47,8 @@ public: const String & database_name, const DatabasePtr & database, const ContextPtr & local_context, LoggerPtr log); + static bool trySyncReplica(IStorage * table, SyncReplicaMode sync_replica_mode, const std::unordered_set & src_replicas, ContextPtr context_); + private: ASTPtr query_ptr; LoggerPtr log = nullptr; diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 14ec539036a..b0d9e4de0d7 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -54,6 +54,13 @@ namespace Setting extern const SettingsUInt64 max_block_size; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 index_granularity_bytes; + extern const MergeTreeSettingsBool materialize_ttl_recalculate_only; + extern const MergeTreeSettingsBool ttl_only_drop_parts; +} + namespace ErrorCodes { extern const int NOT_IMPLEMENTED; @@ -342,7 +349,7 @@ bool MutationsInterpreter::Source::hasLightweightDeleteMask() const bool MutationsInterpreter::Source::materializeTTLRecalculateOnly() const { - return data && data->getSettings()->materialize_ttl_recalculate_only; + return data && (*data->getSettings())[MergeTreeSetting::materialize_ttl_recalculate_only]; } bool MutationsInterpreter::Source::hasSecondaryIndex(const String & name) const @@ -771,7 +778,7 @@ void MutationsInterpreter::prepare(bool dry_run) /// If the part is compact and adaptive index granularity is enabled, modify data in one column via ALTER UPDATE can change /// the part granularity, so we need to rebuild indexes - if (source.isCompactPart() && source.getMergeTreeData() && source.getMergeTreeData()->getSettings()->index_granularity_bytes > 0) + if (source.isCompactPart() && source.getMergeTreeData() && (*source.getMergeTreeData()->getSettings())[MergeTreeSetting::index_granularity_bytes] > 0) need_rebuild_indexes = true; } else if (command.type == MutationCommand::MATERIALIZE_COLUMN) @@ -857,7 +864,7 @@ void MutationsInterpreter::prepare(bool dry_run) else if (command.type == MutationCommand::MATERIALIZE_TTL) { mutation_kind.set(MutationKind::MUTATE_OTHER); - bool suitable_for_ttl_optimization = source.getMergeTreeData()->getSettings()->ttl_only_drop_parts + bool suitable_for_ttl_optimization = (*source.getMergeTreeData()->getSettings())[MergeTreeSetting::ttl_only_drop_parts] && metadata_snapshot->hasOnlyRowsTTL(); if (materialize_ttl_recalculate_only || suitable_for_ttl_optimization) diff --git a/src/Interpreters/OptimizeDateOrDateTimeConverterWithPreimageVisitor.cpp b/src/Interpreters/OptimizeDateOrDateTimeConverterWithPreimageVisitor.cpp index 913f9900b77..b2419018e18 100644 --- a/src/Interpreters/OptimizeDateOrDateTimeConverterWithPreimageVisitor.cpp +++ b/src/Interpreters/OptimizeDateOrDateTimeConverterWithPreimageVisitor.cpp @@ -127,7 +127,7 @@ void OptimizeDateOrDateTimeConverterWithPreimageMatcher::visit(const ASTFunction size_t func_id = function.arguments->children.size(); for (size_t i = 0; i < function.arguments->children.size(); i++) - if (const auto * func = function.arguments->children[i]->as()) + if (const auto * /*func*/ _ = function.arguments->children[i]->as()) func_id = i; if (func_id == function.arguments->children.size()) diff --git a/src/Interpreters/RewriteArrayExistsFunctionVisitor.cpp b/src/Interpreters/RewriteArrayExistsFunctionVisitor.cpp index 64e03767c49..3de9138491b 100644 --- a/src/Interpreters/RewriteArrayExistsFunctionVisitor.cpp +++ b/src/Interpreters/RewriteArrayExistsFunctionVisitor.cpp @@ -102,7 
+102,7 @@ void RewriteArrayExistsFunctionMatcher::visit(const ASTFunction & func, ASTPtr & bool RewriteArrayExistsFunctionMatcher::needChildVisit(const ASTPtr & ast, const ASTPtr &) { /// Children of ASTTableJoin are handled separately in visit() function - if (auto * join = ast->as()) + if (auto * /*join*/ _ = ast->as()) return false; return true; diff --git a/src/Interpreters/Set.h b/src/Interpreters/Set.h index 8b44f36b278..044de20e163 100644 --- a/src/Interpreters/Set.h +++ b/src/Interpreters/Set.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/src/Interpreters/TransactionLog.cpp b/src/Interpreters/TransactionLog.cpp index fb7d60a7a2c..eb85467236e 100644 --- a/src/Interpreters/TransactionLog.cpp +++ b/src/Interpreters/TransactionLog.cpp @@ -603,7 +603,7 @@ void TransactionLog::assertTIDIsNotOutdated(const TransactionID & tid, const std /// If the second case takes place transaction's commit csn has to be set. /// We should load CSN again to distinguish the second case. if (failback_with_strict_load_csn) - if (CSN maybe_csn = failback_with_strict_load_csn->load()) + if (CSN _ = failback_with_strict_load_csn->load()) return; throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to get CSN for too old TID {}, current tail_ptr is {}, probably it's a bug", tid, tail); diff --git a/src/Interpreters/formatWithPossiblyHidingSecrets.cpp b/src/Interpreters/formatWithPossiblyHidingSecrets.cpp index 9b7a58332a2..a4acbaa1a51 100644 --- a/src/Interpreters/formatWithPossiblyHidingSecrets.cpp +++ b/src/Interpreters/formatWithPossiblyHidingSecrets.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB { diff --git a/src/Parsers/ASTSelectWithUnionQuery.cpp b/src/Parsers/ASTSelectWithUnionQuery.cpp index cf72358dea7..d131a54d9b1 100644 --- a/src/Parsers/ASTSelectWithUnionQuery.cpp +++ b/src/Parsers/ASTSelectWithUnionQuery.cpp @@ -61,7 +61,7 @@ void ASTSelectWithUnionQuery::formatQueryImpl(const FormatSettings & settings, F << mode_to_str((is_normalized) ? union_mode : list_of_modes[it - list_of_selects->children.begin() - 1]) << (settings.hilite ? hilite_none : ""); - if (auto * node = (*it)->as()) + if (auto * /*node*/ _ = (*it)->as()) { if (it != list_of_selects->children.begin()) settings.ostr << settings.nl_or_ws; diff --git a/src/Parsers/CreateQueryUUIDs.cpp b/src/Parsers/CreateQueryUUIDs.cpp index fbdc6161408..c788cc7a025 100644 --- a/src/Parsers/CreateQueryUUIDs.cpp +++ b/src/Parsers/CreateQueryUUIDs.cpp @@ -43,7 +43,8 @@ CreateQueryUUIDs::CreateQueryUUIDs(const ASTCreateQuery & query, bool generate_r /// If destination table (to_table_id) is not specified for materialized view, /// then MV will create inner table. We should generate UUID of inner table here. - if (query.is_materialized_view) + /// An exception is refreshable MV that replaces inner table by renaming, changing UUID on each refresh. 
+ if (query.is_materialized_view && !(query.refresh_strategy && !query.refresh_strategy->append)) generate_target_uuid(ViewTarget::To); if (query.is_time_series_table) diff --git a/src/Parsers/ParserTimeInterval.cpp b/src/Parsers/ParserTimeInterval.cpp index 8454eb27e1d..42375170314 100644 --- a/src/Parsers/ParserTimeInterval.cpp +++ b/src/Parsers/ParserTimeInterval.cpp @@ -38,6 +38,13 @@ bool ParserTimeInterval::parseImpl(Pos & pos, ASTPtr & node, Expected & expected if (intervals.empty()) return false; + std::sort(intervals.begin(), intervals.end()); + for (size_t i = 0; i + 1 < intervals.size(); ++i) + { + if (intervals[i].first == intervals[i + 1].first) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Time interval contains multiple {} components", intervals[i].first.toString()); + } + CalendarTimeInterval interval(intervals); if (!options.allow_zero) diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index c6bce7b38b6..ee74ed45db5 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -825,7 +825,7 @@ void addWithFillStepIfNeeded(QueryPlan & query_plan, /// /// However, INPUT `s` does not exist. Instead, we have a constant with execution name 'Hello'_String. /// To fix this, we prepend a rename : 'Hello'_String -> s - if (const auto * constant_node = interpolate_node_typed.getExpression()->as()) + if (const auto * /*constant_node*/ _ = interpolate_node_typed.getExpression()->as()) { const auto * node = &rename_dag.addInput(alias_node->result_name, alias_node->result_type); node = &rename_dag.addAlias(*node, interpolate_node_typed.getExpressionName()); diff --git a/src/Planner/Utils.cpp b/src/Planner/Utils.cpp index 35b99380ebf..665cf5cadba 100644 --- a/src/Planner/Utils.cpp +++ b/src/Planner/Utils.cpp @@ -144,7 +144,7 @@ ASTPtr queryNodeToSelectQuery(const QueryTreeNodePtr & query_node) while (true) { - if (auto * select_query = result_ast->as()) + if (auto * /*select_query*/ _ = result_ast->as()) break; else if (auto * select_with_union = result_ast->as()) result_ast = select_with_union->list_of_selects->children.at(0); diff --git a/src/Planner/Utils.h b/src/Planner/Utils.h index 254b8f4eae1..795374dc865 100644 --- a/src/Planner/Utils.h +++ b/src/Planner/Utils.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.h b/src/Processors/Formats/Impl/ConstantExpressionTemplate.h index 11165d5c449..41a5a49bb21 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.h +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Processors/QueryPlan/Optimizations/applyOrder.cpp b/src/Processors/QueryPlan/Optimizations/applyOrder.cpp index 1671f641514..8695f29c26b 100644 --- a/src/Processors/QueryPlan/Optimizations/applyOrder.cpp +++ b/src/Processors/QueryPlan/Optimizations/applyOrder.cpp @@ -137,7 +137,7 @@ SortingProperty applyOrder(QueryPlan::Node * parent, SortingProperty * propertie return std::move(*properties); } - if (auto * union_step = typeid_cast(parent->step.get())) + if (auto * /*union_step*/ _ = typeid_cast(parent->step.get())) { SortDescription common_sort_description = std::move(properties->sort_description); auto sort_scope = properties->sort_scope; diff --git a/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp b/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp index 7b2f10a901a..a23ff9fe93a 100644 --- 
a/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp @@ -91,7 +91,7 @@ using StepStack = std::vector; QueryPlan::Node * findReadingStep(QueryPlan::Node & node, bool allow_existing_order) { IQueryPlanStep * step = node.step.get(); - if (auto * reading = checkSupportedReadingStep(step, allow_existing_order)) + if (auto * /*reading*/ _ = checkSupportedReadingStep(step, allow_existing_order)) return &node; if (node.children.size() != 1) diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp index 0b6362d3cd2..ca128a121fb 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp @@ -552,7 +552,7 @@ AggregateProjectionCandidates getAggregateProjectionCandidates( static QueryPlan::Node * findReadingStep(QueryPlan::Node & node) { IQueryPlanStep * step = node.step.get(); - if (auto * reading = typeid_cast(step)) + if (auto * /*reading*/ _ = typeid_cast(step)) return &node; if (node.children.size() != 1) diff --git a/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp b/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp index d943fea785b..fdef4a447f3 100644 --- a/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp +++ b/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp @@ -194,7 +194,7 @@ size_t tryAggregatePartitionsIndependently(QueryPlan::Node * node, QueryPlan::No auto * maybe_reading_step = expression_node->children.front()->step.get(); - if (const auto * filter = typeid_cast(maybe_reading_step)) + if (const auto * /*filter*/ _ = typeid_cast(maybe_reading_step)) { const auto * filter_node = expression_node->children.front(); if (filter_node->children.size() != 1 || !filter_node->children.front()->step) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index ff6e1d92189..da9f18586e0 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -177,6 +177,12 @@ namespace Setting extern const SettingsBool use_uncompressed_cache; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 index_granularity; + extern const MergeTreeSettingsUInt64 index_granularity_bytes; +} + namespace ErrorCodes { extern const int INDEX_NOT_USED; @@ -480,7 +486,7 @@ Pipe ReadFromMergeTree::readFromPool( /// Maybe it will make sense to add settings `max_block_size_bytes` if (block_size.max_block_size_rows && !data.canUseAdaptiveGranularity()) { - size_t fixed_index_granularity = data.getSettings()->index_granularity; + size_t fixed_index_granularity = (*data.getSettings())[MergeTreeSetting::index_granularity]; pool_settings.min_marks_for_concurrent_read = (pool_settings.min_marks_for_concurrent_read * fixed_index_granularity + block_size.max_block_size_rows - 1) / block_size.max_block_size_rows * block_size.max_block_size_rows / fixed_index_granularity; } @@ -767,12 +773,12 @@ struct PartRangesReadInfo } if (adaptive_parts > parts.size() / 2) - index_granularity_bytes = data_settings.index_granularity_bytes; + index_granularity_bytes = data_settings[MergeTreeSetting::index_granularity_bytes]; max_marks_to_use_cache = MergeTreeDataSelectExecutor::roundRowsOrBytesToMarks( settings[Setting::merge_tree_max_rows_to_use_cache], 
settings[Setting::merge_tree_max_bytes_to_use_cache], - data_settings.index_granularity, + data_settings[MergeTreeSetting::index_granularity], index_granularity_bytes); auto all_parts_on_remote_disk = checkAllPartsOnRemoteFS(parts); @@ -792,7 +798,7 @@ struct PartRangesReadInfo min_marks_for_concurrent_read = MergeTreeDataSelectExecutor::minMarksForConcurrentRead( min_rows_for_concurrent_read, min_bytes_for_concurrent_read, - data_settings.index_granularity, index_granularity_bytes, sum_marks); + data_settings[MergeTreeSetting::index_granularity], index_granularity_bytes, sum_marks); use_uncompressed_cache = settings[Setting::use_uncompressed_cache]; if (sum_marks > max_marks_to_use_cache) @@ -970,7 +976,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder( /// Let's split ranges to avoid reading much data. auto split_ranges - = [rows_granularity = data_settings->index_granularity, my_max_block_size = block_size.max_block_size_rows] + = [rows_granularity = (*data_settings)[MergeTreeSetting::index_granularity], my_max_block_size = block_size.max_block_size_rows] (const auto & ranges, int direction) { MarkRanges new_ranges; diff --git a/src/Processors/Transforms/getSourceFromASTInsertQuery.h b/src/Processors/Transforms/getSourceFromASTInsertQuery.h index dc541873972..0665269d66d 100644 --- a/src/Processors/Transforms/getSourceFromASTInsertQuery.h +++ b/src/Processors/Transforms/getSourceFromASTInsertQuery.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include diff --git a/src/Server/ReplicasStatusHandler.cpp b/src/Server/ReplicasStatusHandler.cpp index 419ad635d0d..730fdd50930 100644 --- a/src/Server/ReplicasStatusHandler.cpp +++ b/src/Server/ReplicasStatusHandler.cpp @@ -20,6 +20,12 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 min_absolute_delay_to_close; + extern const MergeTreeSettingsUInt64 min_relative_delay_to_close; +} + ReplicasStatusHandler::ReplicasStatusHandler(IServer & server) : WithContext(server.context()) { } @@ -74,8 +80,8 @@ void ReplicasStatusHandler::handleRequest(HTTPServerRequest & request, HTTPServe { table_replicated->getReplicaDelays(absolute_delay, relative_delay); - if ((settings.min_absolute_delay_to_close && absolute_delay >= static_cast(settings.min_absolute_delay_to_close)) - || (settings.min_relative_delay_to_close && relative_delay >= static_cast(settings.min_relative_delay_to_close))) + if ((settings[MergeTreeSetting::min_absolute_delay_to_close] && absolute_delay >= static_cast(settings[MergeTreeSetting::min_absolute_delay_to_close])) + || (settings[MergeTreeSetting::min_relative_delay_to_close] && relative_delay >= static_cast(settings[MergeTreeSetting::min_relative_delay_to_close]))) ok = false; message << backQuoteIfNeed(db.first) << "." 
<< backQuoteIfNeed(iterator->name()) diff --git a/src/Storages/ColumnDefault.h b/src/Storages/ColumnDefault.h index 0ec486e022f..ffd88391d37 100644 --- a/src/Storages/ColumnDefault.h +++ b/src/Storages/ColumnDefault.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 0de9fec3bb2..0d0689618eb 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -607,7 +607,7 @@ bool ColumnsDescription::hasSubcolumn(const String & column_name) const auto it = columns.get<1>().find(ordinary_column_name); if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) { - if (auto dynamic_subcolumn_type = it->type->tryGetSubcolumnType(dynamic_subcolumn_name)) + if (auto /*dynamic_subcolumn_type*/ _ = it->type->tryGetSubcolumnType(dynamic_subcolumn_name)) return true; } @@ -811,7 +811,7 @@ bool ColumnsDescription::hasColumnOrSubcolumn(GetColumnsOptions::Kind kind, cons it = columns.get<1>().find(ordinary_column_name); if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) { - if (auto dynamic_subcolumn_type = it->type->hasSubcolumn(dynamic_subcolumn_name)) + if (auto /*dynamic_subcolumn_type*/ _ = it->type->hasSubcolumn(dynamic_subcolumn_name)) return true; } diff --git a/src/Storages/MaterializedView/RefreshSchedule.cpp b/src/Storages/MaterializedView/RefreshSchedule.cpp index 87ac489e631..48f3fb57a6d 100644 --- a/src/Storages/MaterializedView/RefreshSchedule.cpp +++ b/src/Storages/MaterializedView/RefreshSchedule.cpp @@ -21,6 +21,18 @@ bool RefreshSchedule::operator!=(const RefreshSchedule & rhs) const return std::tie(kind, period, offset, spread) != std::tie(rhs.kind, rhs.period, rhs.offset, rhs.spread); } +static std::chrono::sys_seconds floorEvery(std::chrono::sys_seconds prev, CalendarTimeInterval period, CalendarTimeInterval offset) +{ + auto period_start = period.floor(prev); + auto t = offset.advance(period_start); + if (t <= prev) + return t; + period_start = period.floor(period_start - std::chrono::seconds(1)); + t = offset.advance(period_start); + chassert(t <= prev); + return t; +} + static std::chrono::sys_seconds advanceEvery(std::chrono::system_clock::time_point prev, CalendarTimeInterval period, CalendarTimeInterval offset) { auto period_start = period.floor(prev); @@ -32,27 +44,31 @@ static std::chrono::sys_seconds advanceEvery(std::chrono::system_clock::time_poi return t; } -std::chrono::sys_seconds RefreshSchedule::prescribeNext( - std::chrono::system_clock::time_point last_prescribed, std::chrono::system_clock::time_point now) const +std::chrono::sys_seconds RefreshSchedule::timeslotForCompletedRefresh(std::chrono::sys_seconds last_completed_timeslot, std::chrono::sys_seconds start_time, std::chrono::sys_seconds end_time, bool out_of_schedule) const { if (kind == RefreshScheduleKind::AFTER) - return period.advance(now); - - /// It's important to use prescribed instead of actual time here, otherwise we would do multiple - /// refreshes instead of one if the generated spread is negative and the the refresh completes - /// faster than the spread. - auto res = advanceEvery(last_prescribed, period, offset); - if (res < now) - res = advanceEvery(now, period, offset); // fell behind by a whole period, skip to current time - + return end_time; + /// Timeslot based on when the refresh actually happened. Useful if we fell behind and missed + /// some timeslots. 
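/// For example, with REFRESH EVERY 1 DAY OFFSET 2 HOUR, a refresh that started at
/// 2024-03-07 13:00 UTC gets floorEvery(start_time, ...) == 2024-03-07 02:00 (the latest
/// scheduled timeslot not after the start time), while advanceEvery(...) of the same instant
/// would return 2024-03-08 02:00, the first timeslot strictly after it.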
+ auto res = floorEvery(start_time, period, offset); + if (out_of_schedule) + return res; + /// Next timeslot after the last completed one. Useful in case we did a refresh a little early + /// because of random spread. + res = std::max(res, advanceEvery(last_completed_timeslot, period, offset)); return res; } -std::chrono::system_clock::time_point RefreshSchedule::addRandomSpread(std::chrono::sys_seconds prescribed_time) const +std::chrono::sys_seconds RefreshSchedule::advance(std::chrono::sys_seconds last_completed_timeslot) const { - Int64 ms = Int64(spread.minSeconds() * 1000 / 2); - auto add = std::uniform_int_distribution(-ms, ms)(thread_local_rng); - return prescribed_time + std::chrono::milliseconds(add); + if (kind == RefreshScheduleKind::AFTER) + return period.advance(last_completed_timeslot); + return advanceEvery(last_completed_timeslot, period, offset); +} + +std::chrono::system_clock::time_point RefreshSchedule::addRandomSpread(std::chrono::sys_seconds timeslot, Int64 randomness) const +{ + return timeslot + std::chrono::milliseconds(Int64(spread.minSeconds() * 1e3 / 2 * randomness / 1e9)); } } diff --git a/src/Storages/MaterializedView/RefreshSchedule.h b/src/Storages/MaterializedView/RefreshSchedule.h index db4ee1b99ce..6d03423d055 100644 --- a/src/Storages/MaterializedView/RefreshSchedule.h +++ b/src/Storages/MaterializedView/RefreshSchedule.h @@ -19,11 +19,13 @@ struct RefreshSchedule explicit RefreshSchedule(const ASTRefreshStrategy & strategy); bool operator!=(const RefreshSchedule & rhs) const; - /// Tells when to do the next refresh (without random spread). - std::chrono::sys_seconds prescribeNext( - std::chrono::system_clock::time_point last_prescribed, std::chrono::system_clock::time_point now) const; + /// What to store as "last completed timeslot" value after a refresh completes. + /// This value is used for scheduling subsequent refreshes. 
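/// For an out-of-schedule (manual) refresh this is simply the timeslot covering the refresh's
/// start time; for scheduled refreshes it is additionally clamped to be no earlier than the
/// timeslot following the previously completed one, so a timeslot is never recorded twice and
/// missed timeslots are skipped rather than replayed one by one.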
+ std::chrono::sys_seconds timeslotForCompletedRefresh(std::chrono::sys_seconds last_completed_timeslot, std::chrono::sys_seconds start_time, std::chrono::sys_seconds end_time, bool out_of_schedule) const; - std::chrono::system_clock::time_point addRandomSpread(std::chrono::sys_seconds prescribed_time) const; + std::chrono::sys_seconds advance(std::chrono::sys_seconds last_completed_timeslot) const; + + std::chrono::system_clock::time_point addRandomSpread(std::chrono::sys_seconds timeslot, Int64 randomness) const; }; } diff --git a/src/Storages/MaterializedView/RefreshSet.cpp b/src/Storages/MaterializedView/RefreshSet.cpp index 7536f59c1e4..4c13bdf1dd0 100644 --- a/src/Storages/MaterializedView/RefreshSet.cpp +++ b/src/Storages/MaterializedView/RefreshSet.cpp @@ -21,8 +21,10 @@ RefreshSet::Handle & RefreshSet::Handle::operator=(Handle && other) noexcept reset(); parent_set = std::exchange(other.parent_set, nullptr); id = std::move(other.id); + inner_table_id = std::move(other.inner_table_id); dependencies = std::move(other.dependencies); iter = std::move(other.iter); + inner_table_iter = std::move(other.inner_table_iter); metric_increment = std::move(other.metric_increment); return *this; } @@ -32,21 +34,26 @@ RefreshSet::Handle::~Handle() reset(); } -void RefreshSet::Handle::rename(StorageID new_id) +void RefreshSet::Handle::rename(StorageID new_id, std::optional new_inner_table_id) { std::lock_guard lock(parent_set->mutex); - RefreshTaskHolder task = *iter; + RefreshTaskPtr task = *iter; parent_set->removeDependenciesLocked(task, dependencies); parent_set->removeTaskLocked(id, iter); + if (inner_table_id) + parent_set->removeInnerTableLocked(*inner_table_id, inner_table_iter); id = new_id; + inner_table_id = new_inner_table_id; iter = parent_set->addTaskLocked(id, task); + if (inner_table_id) + inner_table_iter = parent_set->addInnerTableLocked(*inner_table_id, task); parent_set->addDependenciesLocked(task, dependencies); } void RefreshSet::Handle::changeDependencies(std::vector deps) { std::lock_guard lock(parent_set->mutex); - RefreshTaskHolder task = *iter; + RefreshTaskPtr task = *iter; parent_set->removeDependenciesLocked(task, dependencies); dependencies = std::move(deps); parent_set->addDependenciesLocked(task, dependencies); @@ -61,6 +68,8 @@ void RefreshSet::Handle::reset() std::lock_guard lock(parent_set->mutex); parent_set->removeDependenciesLocked(*iter, dependencies); parent_set->removeTaskLocked(id, iter); + if (inner_table_id) + parent_set->removeInnerTableLocked(*inner_table_id, inner_table_iter); } parent_set = nullptr; @@ -69,16 +78,19 @@ void RefreshSet::Handle::reset() RefreshSet::RefreshSet() = default; -void RefreshSet::emplace(StorageID id, const std::vector & dependencies, RefreshTaskHolder task) +void RefreshSet::emplace(StorageID id, std::optional inner_table_id, const std::vector & dependencies, RefreshTaskPtr task) { std::lock_guard guard(mutex); const auto iter = addTaskLocked(id, task); + RefreshTaskList::iterator inner_table_iter; + if (inner_table_id) + inner_table_iter = addInnerTableLocked(*inner_table_id, task); addDependenciesLocked(task, dependencies); - task->setRefreshSetHandleUnlock(Handle(this, id, iter, dependencies)); + task->setRefreshSetHandleUnlock(Handle(this, id, inner_table_id, iter, inner_table_iter, dependencies)); } -RefreshTaskList::iterator RefreshSet::addTaskLocked(StorageID id, RefreshTaskHolder task) +RefreshTaskList::iterator RefreshSet::addTaskLocked(StorageID id, RefreshTaskPtr task) { RefreshTaskList & list = tasks[id]; 
list.push_back(task); @@ -93,13 +105,28 @@ void RefreshSet::removeTaskLocked(StorageID id, RefreshTaskList::iterator iter) tasks.erase(it); } -void RefreshSet::addDependenciesLocked(RefreshTaskHolder task, const std::vector & dependencies) +RefreshTaskList::iterator RefreshSet::addInnerTableLocked(StorageID inner_table_id, RefreshTaskPtr task) +{ + RefreshTaskList & list = inner_tables[inner_table_id]; + list.push_back(task); + return std::prev(list.end()); +} + +void RefreshSet::removeInnerTableLocked(StorageID inner_table_id, RefreshTaskList::iterator inner_table_iter) +{ + const auto it = inner_tables.find(inner_table_id); + it->second.erase(inner_table_iter); + if (it->second.empty()) + inner_tables.erase(it); +} + +void RefreshSet::addDependenciesLocked(RefreshTaskPtr task, const std::vector & dependencies) { for (const StorageID & dep : dependencies) dependents[dep].insert(task); } -void RefreshSet::removeDependenciesLocked(RefreshTaskHolder task, const std::vector & dependencies) +void RefreshSet::removeDependenciesLocked(RefreshTaskPtr task, const std::vector & dependencies) { for (const StorageID & dep : dependencies) { @@ -118,30 +145,42 @@ RefreshTaskList RefreshSet::findTasks(const StorageID & id) const return {}; } -RefreshSet::InfoContainer RefreshSet::getInfo() const +std::vector RefreshSet::getTasks() const { std::unique_lock lock(mutex); - auto tasks_copy = tasks; - lock.unlock(); - - InfoContainer res; - for (const auto & [id, list] : tasks_copy) + std::vector res; + for (const auto & [_, list] : tasks) for (const auto & task : list) - res.push_back(task->getInfo()); + res.push_back(task); return res; } -std::vector RefreshSet::getDependents(const StorageID & id) const +RefreshTaskPtr RefreshSet::tryGetTaskForInnerTable(const StorageID & inner_table_id) const { - std::lock_guard lock(mutex); - auto it = dependents.find(id); - if (it == dependents.end()) - return {}; - return std::vector(it->second.begin(), it->second.end()); + std::unique_lock lock(mutex); + auto it = inner_tables.find(inner_table_id); + if (it == inner_tables.end()) + return nullptr; + return *it->second.begin(); } -RefreshSet::Handle::Handle(RefreshSet * parent_set_, StorageID id_, RefreshTaskList::iterator iter_, std::vector dependencies_) - : parent_set(parent_set_), id(std::move(id_)), dependencies(std::move(dependencies_)) - , iter(iter_), metric_increment(CurrentMetrics::Increment(CurrentMetrics::RefreshableViews)) {} +void RefreshSet::notifyDependents(const StorageID & id) const +{ + std::vector res; + { + std::lock_guard lock(mutex); + auto it = dependents.find(id); + if (it == dependents.end()) + return; + for (const auto & task : it->second) + res.push_back(task); + } + for (const RefreshTaskPtr & t : res) + t->notifyDependencyProgress(); +} + +RefreshSet::Handle::Handle(RefreshSet * parent_set_, StorageID id_, std::optional inner_table_id_, RefreshTaskList::iterator iter_, RefreshTaskList::iterator inner_table_iter_, std::vector dependencies_) + : parent_set(parent_set_), id(std::move(id_)), inner_table_id(std::move(inner_table_id_)), dependencies(std::move(dependencies_)) + , iter(iter_), inner_table_iter(inner_table_iter_), metric_increment(CurrentMetrics::Increment(CurrentMetrics::RefreshableViews)) {} } diff --git a/src/Storages/MaterializedView/RefreshSet.h b/src/Storages/MaterializedView/RefreshSet.h index 6141a69996a..205a5512ffb 100644 --- a/src/Storages/MaterializedView/RefreshSet.h +++ b/src/Storages/MaterializedView/RefreshSet.h @@ -3,44 +3,15 @@ #include #include #include -#include 
#include #include namespace DB { -enum class RefreshState -{ - Disabled = 0, - Scheduled, - WaitingForDependencies, - Running, -}; - -enum class LastRefreshResult -{ - Unknown = 0, - Cancelled, - Error, - Finished -}; - -struct RefreshInfo -{ - StorageID view_id = StorageID::createEmpty(); - RefreshState state = RefreshState::Scheduled; - LastRefreshResult last_refresh_result = LastRefreshResult::Unknown; - std::optional last_attempt_time; - std::optional last_success_time; - UInt64 last_attempt_duration_ms = 0; - UInt32 next_refresh_time = 0; - UInt64 refresh_count = 0; - UInt64 retry = 0; - String exception_message; // if last_refresh_result is Error - std::vector remaining_dependencies; - ProgressValues progress; -}; +class RefreshTask; +using RefreshTaskPtr = std::shared_ptr; +using RefreshTaskList = std::list; /// Set of refreshable views class RefreshSet @@ -58,7 +29,7 @@ public: ~Handle(); - void rename(StorageID new_id); + void rename(StorageID new_id, std::optional new_inner_table_id); void changeDependencies(std::vector deps); void reset(); @@ -71,31 +42,33 @@ public: private: RefreshSet * parent_set = nullptr; StorageID id = StorageID::createEmpty(); + std::optional inner_table_id; std::vector dependencies; RefreshTaskList::iterator iter; // in parent_set->tasks[id] + RefreshTaskList::iterator inner_table_iter; // in parent_set->inner_tables[inner_table_id] std::optional metric_increment; - Handle(RefreshSet * parent_set_, StorageID id_, RefreshTaskList::iterator iter_, std::vector dependencies_); + Handle(RefreshSet * parent_set_, StorageID id_, std::optional inner_table_id, RefreshTaskList::iterator iter_, RefreshTaskList::iterator inner_table_iter_, std::vector dependencies_); }; - using InfoContainer = std::vector; - RefreshSet(); - void emplace(StorageID id, const std::vector & dependencies, RefreshTaskHolder task); + void emplace(StorageID id, std::optional inner_table_id, const std::vector & dependencies, RefreshTaskPtr task); /// Finds active refreshable view(s) by database and table name. /// Normally there's at most one, but we allow name collisions here, just in case. RefreshTaskList findTasks(const StorageID & id) const; + std::vector getTasks() const; - InfoContainer getInfo() const; + RefreshTaskPtr tryGetTaskForInnerTable(const StorageID & inner_table_id) const; - /// Get tasks that depend on the given one. - std::vector getDependents(const StorageID & id) const; + /// Calls notifyDependencyProgress() on all tasks that depend on `id`. + void notifyDependents(const StorageID & id) const; private: using TaskMap = std::unordered_map; - using DependentsMap = std::unordered_map, StorageID::DatabaseAndTableNameHash, StorageID::DatabaseAndTableNameEqual>; + using DependentsMap = std::unordered_map, StorageID::DatabaseAndTableNameHash, StorageID::DatabaseAndTableNameEqual>; + using InnerTableMap = std::unordered_map; /// Protects the two maps below, not locked for any nontrivial operations (e.g. operations that /// block or lock other mutexes). 
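For orientation, a minimal usage sketch of the reshaped lookup API above; this is illustrative only, and `set`, `view_id` and `inner_id` are placeholder variables rather than names from this patch:

RefreshTaskList tasks = set.findTasks(view_id);                     // task(s) registered under this view's name
RefreshTaskPtr inner_owner = set.tryGetTaskForInnerTable(inner_id); // nullptr if inner_id is not a refreshable view's target table
set.notifyDependents(view_id);                                      // wakes every task whose view has DEPENDS ON view_id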
@@ -103,11 +76,14 @@ private: TaskMap tasks; DependentsMap dependents; + InnerTableMap inner_tables; - RefreshTaskList::iterator addTaskLocked(StorageID id, RefreshTaskHolder task); + RefreshTaskList::iterator addTaskLocked(StorageID id, RefreshTaskPtr task); void removeTaskLocked(StorageID id, RefreshTaskList::iterator iter); - void addDependenciesLocked(RefreshTaskHolder task, const std::vector & dependencies); - void removeDependenciesLocked(RefreshTaskHolder task, const std::vector & dependencies); + RefreshTaskList::iterator addInnerTableLocked(StorageID inner_table_id, RefreshTaskPtr task); + void removeInnerTableLocked(StorageID inner_table_id, RefreshTaskList::iterator inner_table_iter); + void addDependenciesLocked(RefreshTaskPtr task, const std::vector & dependencies); + void removeDependenciesLocked(RefreshTaskPtr task, const std::vector & dependencies); }; } diff --git a/src/Storages/MaterializedView/RefreshSettings.h b/src/Storages/MaterializedView/RefreshSettings.h index 23676538788..20a2991af95 100644 --- a/src/Storages/MaterializedView/RefreshSettings.h +++ b/src/Storages/MaterializedView/RefreshSettings.h @@ -6,10 +6,10 @@ namespace DB { #define LIST_OF_REFRESH_SETTINGS(M, ALIAS) \ - M(Int64, refresh_retries, 0, "How many times to retry refresh query if it fails. If all attempts fail, wait for the next refresh time according to schedule. 0 to disable retries. -1 for infinite retries.", 0) \ + M(Int64, refresh_retries, 2, "How many times to retry refresh query if it fails. If all attempts fail, wait for the next refresh time according to schedule. 0 to disable retries. -1 for infinite retries.", 0) \ M(UInt64, refresh_retry_initial_backoff_ms, 100, "Delay before the first retry if refresh query fails (if refresh_retries setting is not zero). 
Each subsequent retry doubles the delay, up to refresh_retry_max_backoff_ms.", 0) \ M(UInt64, refresh_retry_max_backoff_ms, 60'000, "Limit on the exponential growth of delay between refresh attempts, if they keep failing and refresh_retries is positive.", 0) \ - + M(Bool, all_replicas, /* do not change or existing tables will break */ false, "If the materialized view is in a Replicated database, and APPEND is enabled, this flag controls whether all replicas or one replica will refresh.", 0) \ DECLARE_SETTINGS_TRAITS(RefreshSettingsTraits, LIST_OF_REFRESH_SETTINGS) diff --git a/src/Storages/MaterializedView/RefreshTask.cpp b/src/Storages/MaterializedView/RefreshTask.cpp index 93cab24af7f..3b5e88b82cf 100644 --- a/src/Storages/MaterializedView/RefreshTask.cpp +++ b/src/Storages/MaterializedView/RefreshTask.cpp @@ -2,10 +2,17 @@ #include #include +#include +#include +#include +#include #include +#include #include -#include +#include #include +#include +#include #include #include #include @@ -22,6 +29,7 @@ namespace Setting { extern const SettingsUInt64 log_queries_cut_to_length; extern const SettingsBool stop_refreshable_materialized_views_on_startup; + extern const SettingsSeconds lock_acquire_timeout; } namespace ErrorCodes @@ -29,10 +37,12 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int QUERY_WAS_CANCELLED; extern const int REFRESH_FAILED; + extern const int TABLE_IS_DROPPED; + extern const int NOT_IMPLEMENTED; } RefreshTask::RefreshTask( - StorageMaterializedView * view_, const DB::ASTRefreshStrategy & strategy) + StorageMaterializedView * view_, ContextPtr context, const DB::ASTRefreshStrategy & strategy, bool attach, bool coordinated, bool empty) : log(getLogger("RefreshTask")) , view(view_) , refresh_schedule(strategy) @@ -40,14 +50,54 @@ RefreshTask::RefreshTask( { if (strategy.settings != nullptr) refresh_settings.applyChanges(strategy.settings->changes); + + coordination.root_znode.randomize(); + if (empty) + coordination.root_znode.last_completed_timeslot = std::chrono::floor(currentTime()); + if (coordinated) + { + coordination.coordinated = true; + + const auto & server_settings = context->getServerSettings(); + const auto macros = context->getMacros(); + Macros::MacroExpansionInfo info; + info.table_id = view->getStorageID(); + coordination.path = macros->expand(server_settings.default_replica_path, info); + coordination.replica_name = context->getMacros()->expand(server_settings.default_replica_name, info); + + auto zookeeper = context->getZooKeeper(); + String replica_path = coordination.path + "/replicas/" + coordination.replica_name; + if (attach) + { + /// Check that this replica is registered in keeper. + if (!zookeeper->exists(replica_path)) + { + LOG_ERROR(log, "Attaching refreshable materialized view {} as read-only because znode {} is missing", view->getStorageID().getFullTableName(), replica_path); + coordination.read_only = true; + } + } + else + { + zookeeper->createAncestors(coordination.path); + /// Create coordination znodes if they don't exist. Register this replica, throwing if already exists. 
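/// Resulting layout under the expanded path (illustrative):
///   <path>                      - CoordinationZnode state, serialized via CoordinationZnode::toString()
///   <path>/replicas/<replica>   - one persistent child per registered replica
///   <path>/running              - ephemeral marker naming the replica currently refreshing
///                                 (created later by updateCoordinationState(), not here)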
+ Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(coordination.path, coordination.root_znode.toString(), zkutil::CreateMode::Persistent, true)); + ops.emplace_back(zkutil::makeCreateRequest(coordination.path + "/replicas", "", zkutil::CreateMode::Persistent, true)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path, "", zkutil::CreateMode::Persistent)); + zookeeper->multi(ops); + } + } } OwnedRefreshTask RefreshTask::create( StorageMaterializedView * view, ContextMutablePtr context, - const DB::ASTRefreshStrategy & strategy) + const DB::ASTRefreshStrategy & strategy, + bool attach, + bool coordinated, + bool empty) { - auto task = std::make_shared(view, strategy); + auto task = std::make_shared(view, context, strategy, attach, coordinated, empty); task->refresh_task = context->getSchedulePool().createTask("RefreshTask", [self = task.get()] { self->refreshTask(); }); @@ -59,116 +109,15 @@ OwnedRefreshTask RefreshTask::create( return OwnedRefreshTask(task); } -void RefreshTask::initializeAndStart() +void RefreshTask::startup() { if (view->getContext()->getSettingsRef()[Setting::stop_refreshable_materialized_views_on_startup]) - stop_requested = true; - view->getContext()->getRefreshSet().emplace(view->getStorageID(), initial_dependencies, shared_from_this()); - populateDependencies(); - advanceNextRefreshTime(currentTime()); + scheduling.stop_requested = true; + auto inner_table_id = refresh_append ? std::nullopt : std::make_optional(view->getTargetTableId()); + view->getContext()->getRefreshSet().emplace(view->getStorageID(), inner_table_id, initial_dependencies, shared_from_this()); refresh_task->schedule(); } -void RefreshTask::rename(StorageID new_id) -{ - std::lock_guard guard(mutex); - if (set_handle) - set_handle.rename(new_id); -} - -void RefreshTask::alterRefreshParams(const DB::ASTRefreshStrategy & new_strategy) -{ - std::lock_guard guard(mutex); - - RefreshSchedule new_schedule(new_strategy); - std::vector deps; - if (new_strategy.dependencies) - for (auto && dependency : new_strategy.dependencies->children) - deps.emplace_back(dependency->as()); - - /// Reschedule next refresh. - if (new_schedule != refresh_schedule) - { - refresh_schedule = new_schedule; - next_refresh_prescribed = {}; - advanceNextRefreshTime(currentTime()); - refresh_task->schedule(); - } - - /// Update dependency graph. - set_handle.changeDependencies(deps); - - /// Mark removed dependencies as satisfied. 
- DatabaseAndTableNameSet deps_set(deps.begin(), deps.end()); - std::vector removed_deps; - for (const auto & id : remaining_dependencies) - if (!deps_set.contains(id)) - removed_deps.push_back(id); - for (const auto & id : removed_deps) - if (arriveDependency(id) && !std::exchange(refresh_immediately, true)) - refresh_task->schedule(); - - refresh_settings = {}; - if (new_strategy.settings != nullptr) - refresh_settings.applyChanges(new_strategy.settings->changes); - - refresh_append = new_strategy.append; -} - -RefreshInfo RefreshTask::getInfo() const -{ - std::lock_guard guard(mutex); - auto res = info; - res.view_id = set_handle.getID(); - res.remaining_dependencies.assign(remaining_dependencies.begin(), remaining_dependencies.end()); - if (res.last_refresh_result != LastRefreshResult::Error) - res.exception_message.clear(); - res.progress = progress.getValues(); - return res; -} - -void RefreshTask::start() -{ - std::lock_guard guard(mutex); - if (!std::exchange(stop_requested, false)) - return; - refresh_task->schedule(); -} - -void RefreshTask::stop() -{ - std::lock_guard guard(mutex); - if (std::exchange(stop_requested, true)) - return; - interruptExecution(); - refresh_task->schedule(); -} - -void RefreshTask::run() -{ - std::lock_guard guard(mutex); - if (std::exchange(refresh_immediately, true)) - return; - next_refresh_prescribed = std::chrono::floor(currentTime()); - next_refresh_actual = currentTime(); - refresh_task->schedule(); -} - -void RefreshTask::cancel() -{ - std::lock_guard guard(mutex); - interruptExecution(); - refresh_task->schedule(); -} - -void RefreshTask::wait() -{ - std::unique_lock lock(mutex); - refresh_cv.wait(lock, [&] { return info.state != RefreshState::Running && !refresh_immediately; }); - if (info.last_refresh_result == LastRefreshResult::Error) - throw Exception(ErrorCodes::REFRESH_FAILED, "Refresh failed: {}", info.exception_message); -} - void RefreshTask::shutdown() { { @@ -177,10 +126,14 @@ void RefreshTask::shutdown() if (view == nullptr) return; // already shut down - stop_requested = true; + scheduling.stop_requested = true; interruptExecution(); } + /// If we're in DatabaseReplicated, interrupt replicated CREATE/EXCHANGE/DROP queries in refresh task. + /// Without this we can deadlock waiting for refresh_task because this shutdown happens from the same DDL thread for which CREATE/EXCHANGE/DROP wait. + execution.cancel_ddl_queries.request_stop(); + /// Wait for the task to return and prevent it from being scheduled in future. refresh_task->deactivate(); @@ -194,199 +147,411 @@ void RefreshTask::shutdown() view = nullptr; } -void RefreshTask::notify(const StorageID & parent_id, std::chrono::sys_seconds parent_next_prescribed_time) +void RefreshTask::drop(ContextPtr context) +{ + if (coordination.coordinated) + { + auto zookeeper = context->getZooKeeper(); + + zookeeper->tryRemove(coordination.path + "/replicas/" + coordination.replica_name); + + /// Redundant, refreshTask() is supposed to clean up after itself, but let's be paranoid. + removeRunningZnodeIfMine(zookeeper); + + /// If no replicas left, remove the coordination znode. 
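/// (Best-effort: if other replicas are still registered the first remove fails with ZNOTEMPTY and
/// the multi below is ignored, leaving the znodes for whichever replica is dropped last; ZNONODE
/// is tolerated too in case the znodes are already gone.)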
+ Coordination::Requests ops; + ops.emplace_back(zkutil::makeRemoveRequest(coordination.path + "/replicas", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(coordination.path, -1)); + Coordination::Responses responses; + auto code = zookeeper->tryMulti(ops, responses); + if (responses[0]->error != Coordination::Error::ZNOTEMPTY && responses[0]->error != Coordination::Error::ZNONODE) + zkutil::KeeperMultiException::check(code, ops, responses); + } +} + +void RefreshTask::rename(StorageID new_id, StorageID new_inner_table_id) { std::lock_guard guard(mutex); - if (!set_handle) - return; // we've shut down + if (set_handle) + set_handle.rename(new_id, refresh_append ? std::nullopt : std::make_optional(new_inner_table_id)); +} - /// In the general case, it's not clear what the meaning of dependencies should be. - /// E.g. what behavior would the user want/expect in the following cases?: - /// * REFRESH EVERY 3 HOUR depends on REFRESH EVERY 2 HOUR - /// * REFRESH AFTER 3 HOUR depends on REFRESH AFTER 2 HOUR - /// * REFRESH AFTER 3 HOUR depends on REFRESH EVERY 1 DAY - /// I don't know. - /// - /// Cases that are important to support well include: - /// (1) REFRESH EVERY 1 DAY depends on REFRESH EVERY 1 DAY - /// Here the second refresh should start only after the first refresh completed *for the same day*. - /// Yesterday's refresh of the dependency shouldn't trigger today's refresh of the dependent, - /// even if it completed today. - /// (2) REFRESH EVERY 1 DAY OFFSET 2 HOUR depends on REFRESH EVERY 1 DAY OFFSET 1 HOUR - /// (3) REFRESH EVERY 1 DAY OFFSET 1 HOUR depends on REFRESH EVERY 1 DAY OFFSET 23 HOUR - /// Here the dependency's refresh on day X should trigger dependent's refresh on day X+1. - /// (4) REFRESH EVERY 2 HOUR depends on REFRESH EVERY 1 HOUR - /// The 2 HOUR refresh should happen after the 1 HOUR refresh for every other hour, e.g. - /// after the 2pm refresh, then after the 4pm refresh, etc. - /// - /// We currently don't allow dependencies in REFRESH AFTER case, because its unclear how to define - /// it in a non-confusing way. Consider view y that depends on view x, both with - /// REFRESH AFTER 1 hour. The user's intention is probably to make y always refresh immediately - /// after x. But suppose y takes slightly longer to refresh than x. If we don't do anything - /// special, x's refresh schedule will run ahead, and the DEPENDS ON will have pretty much no - /// effect - confusing! As a dirty way to prevent this, we could just decrease refresh period by, - /// say, 50%, if the view has dependencies at all. But that still sounds more confusing than useful. - /// Or we could say that we only refresh y if x refreshes less than 10% of 1 HOUR ago, so in our - /// scenario y would be refreshing every 2 hours instead of 1 hour sometimes. +void RefreshTask::checkAlterIsPossible(const DB::ASTRefreshStrategy & new_strategy) +{ + RefreshSettings s; + if (new_strategy.settings) + s.applyChanges(new_strategy.settings->changes); + if (s.all_replicas != refresh_settings.all_replicas) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Altering setting 'all_replicas' is not supported."); + if (new_strategy.append != refresh_append) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Adding or removing APPEND is not supported."); +} - /// Only accept the dependency's refresh if its next refresh time is after ours. - /// This takes care of cases (1)-(4). 
- if (parent_next_prescribed_time <= next_refresh_prescribed) - return; +void RefreshTask::alterRefreshParams(const DB::ASTRefreshStrategy & new_strategy) +{ + { + std::lock_guard guard(mutex); + + refresh_schedule = RefreshSchedule(new_strategy); + std::vector deps; + if (new_strategy.dependencies) + for (auto && dependency : new_strategy.dependencies->children) + deps.emplace_back(dependency->as()); + + /// Update dependency graph. + set_handle.changeDependencies(deps); - if (arriveDependency(parent_id) && !std::exchange(refresh_immediately, true)) refresh_task->schedule(); + scheduling.dependencies_satisfied_until = std::chrono::sys_seconds(std::chrono::seconds(-1)); + + refresh_settings = {}; + if (new_strategy.settings != nullptr) + refresh_settings.applyChanges(new_strategy.settings->changes); + } + /// In case refresh period changed. + view->getContext()->getRefreshSet().notifyDependents(view->getStorageID()); +} + +RefreshTask::Info RefreshTask::getInfo() const +{ + std::lock_guard guard(mutex); + return Info {.view_id = set_handle.getID(), .state = state, .next_refresh_time = next_refresh_time, .znode = coordination.root_znode, .refresh_running = coordination.running_znode_exists, .progress = execution.progress.getValues()}; +} + +void RefreshTask::start() +{ + std::lock_guard guard(mutex); + if (!std::exchange(scheduling.stop_requested, false)) + return; + refresh_task->schedule(); +} + +void RefreshTask::stop() +{ + std::lock_guard guard(mutex); + if (std::exchange(scheduling.stop_requested, true)) + return; + interruptExecution(); + refresh_task->schedule(); +} + +void RefreshTask::run() +{ + std::lock_guard guard(mutex); + if (std::exchange(scheduling.out_of_schedule_refresh_requested, true)) + return; + refresh_task->schedule(); +} + +void RefreshTask::cancel() +{ + std::lock_guard guard(mutex); + interruptExecution(); + refresh_task->schedule(); +} + +void RefreshTask::wait() +{ + auto throw_if_error = [&] + { + if (!view) + throw Exception(ErrorCodes::TABLE_IS_DROPPED, "The table was dropped or detached"); + if (!coordination.running_znode_exists && !coordination.root_znode.last_attempt_succeeded && coordination.root_znode.last_attempt_time.time_since_epoch().count() != 0) + throw Exception(ErrorCodes::REFRESH_FAILED, + "Refresh failed{}: {}", coordination.coordinated ? " (on replica " + coordination.root_znode.last_attempt_replica + ")" : "", + coordination.root_znode.last_attempt_error.empty() ? "Replica went away" : coordination.root_znode.last_attempt_error); + }; + + std::unique_lock lock(mutex); + refresh_cv.wait(lock, [&] { + return state != RefreshState::Running && state != RefreshState::Scheduling && + state != RefreshState::RunningOnAnotherReplica && !scheduling.out_of_schedule_refresh_requested; + }); + throw_if_error(); + + if (coordination.coordinated && !refresh_append) + { + /// Wait until we see the table produced by the latest refresh. + while (true) + { + UUID expected_table_uuid = coordination.root_znode.last_success_table_uuid; + StorageID storage_id = view->getTargetTableId(); + ContextPtr context = view->getContext(); + lock.unlock(); + + /// (Can't use `view` here because shutdown() may unset it in parallel with us.) 
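/// (This loop polls every 10 ms until the table visible through DatabaseCatalog carries the UUID
/// recorded by the latest successful refresh, i.e. until the EXCHANGE performed by whichever
/// replica ran the refresh has become visible locally.)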
+ StoragePtr storage = DatabaseCatalog::instance().tryGetTable(storage_id, context); + if (storage && storage->getStorageID().uuid == expected_table_uuid) + return; + + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + lock.lock(); + /// Re-check last_attempt_succeeded in case another refresh EXCHANGEd the table but failed to write its uuid to keeper. + throw_if_error(); + } + } +} + +std::chrono::sys_seconds RefreshTask::getNextRefreshTimeslot() const +{ + std::lock_guard guard(mutex); + return refresh_schedule.advance(coordination.root_znode.last_completed_timeslot); +} + +void RefreshTask::notifyDependencyProgress() +{ + std::lock_guard guard(mutex); + scheduling.dependencies_satisfied_until = std::chrono::sys_seconds(std::chrono::seconds(-1)); + refresh_task->schedule(); } void RefreshTask::setFakeTime(std::optional t) { std::unique_lock lock(mutex); - fake_clock.store(t.value_or(INT64_MIN), std::memory_order_relaxed); + scheduling.fake_clock.store(t.value_or(INT64_MIN), std::memory_order_relaxed); /// Reschedule task with shorter delay if currently scheduled. refresh_task->scheduleAfter(100, /*overwrite*/ true, /*only_if_scheduled*/ true); } void RefreshTask::refreshTask() { + std::unique_lock lock(mutex); + + auto schedule_keeper_retry = [&] { + chassert(lock.owns_lock()); + chassert(state == RefreshState::Scheduling); + coordination.watches->should_reread_znodes.store(true); + refresh_task->scheduleAfter(5000); + }; + try { - std::unique_lock lock(mutex); - - /// Whoever breaks out of this loop should assign info.state first. + bool refreshed_just_now = false; + /// Whoever breaks out of this loop should assign state. while (true) { + setState(RefreshState::Scheduling, lock); + execution.interrupt_execution.store(false); + + updateDependenciesIfNeeded(lock); + + std::shared_ptr zookeeper; + if (coordination.coordinated) + zookeeper = view->getContext()->getZooKeeper(); + readZnodesIfNeeded(zookeeper, lock); chassert(lock.owns_lock()); - interrupt_execution.store(false); - refresh_cv.notify_all(); // we'll assign info.state before unlocking the mutex - - if (stop_requested) + /// Check if another replica is already running a refresh. + if (coordination.running_znode_exists) { - /// Exit the task and wait for the user to start or resume, which will schedule the task again. - info.state = RefreshState::Disabled; - break; - } - - if (!refresh_immediately) - { - auto now = currentTime(); - if (now >= next_refresh_actual) + if (coordination.root_znode.last_attempt_replica == coordination.replica_name) { - if (arriveTime()) - refresh_immediately = true; - else - { - info.state = RefreshState::WaitingForDependencies; - break; - } + LOG_ERROR(log, "Znode {} indicates that this replica is running a refresh, but it isn't. Likely a bug.", coordination.path + "/running"); +#ifdef ABORT_ON_LOGICAL_ERROR + abortOnFailedAssertion("Unexpected refresh lock in keeper"); +#else + coordination.running_znode_exists = false; + if (coordination.coordinated) + removeRunningZnodeIfMine(zookeeper); + schedule_keeper_retry(); + break; +#endif } else { - size_t delay_ms = std::chrono::duration_cast( - next_refresh_actual - now).count(); - - /// If we're in a test that fakes the clock, poll every 100ms. 
- if (fake_clock.load(std::memory_order_relaxed) != INT64_MIN) - delay_ms = 100; - - refresh_task->scheduleAfter(delay_ms); - info.state = RefreshState::Scheduled; + setState(RefreshState::RunningOnAnotherReplica, lock); break; } } + chassert(lock.owns_lock()); + + if (scheduling.stop_requested || coordination.read_only) + { + /// Exit the task and wait for the user to start or resume, which will schedule the task again. + setState(RefreshState::Disabled, lock); + break; + } + + /// Check if it's time to refresh. + auto now = currentTime(); + auto start_time = std::chrono::floor(now); + auto start_time_steady = std::chrono::steady_clock::now(); + auto [when, timeslot, start_znode] = determineNextRefreshTime(start_time); + next_refresh_time = when; + bool out_of_schedule = scheduling.out_of_schedule_refresh_requested; + if (out_of_schedule) + { + chassert(start_znode.attempt_number > 0); + start_znode.attempt_number -= 1; + } + else if (now < when) + { + size_t delay_ms = std::chrono::duration_cast(when - now).count(); + /// If we're in a test that fakes the clock, poll every 100ms. + if (scheduling.fake_clock.load(std::memory_order_relaxed) != INT64_MIN) + delay_ms = 100; + refresh_task->scheduleAfter(delay_ms); + setState(RefreshState::Scheduled, lock); + break; + } + else if (timeslot >= scheduling.dependencies_satisfied_until) + { + setState(RefreshState::WaitingForDependencies, lock); + break; + } + + if (refreshed_just_now) + { + /// If doing two refreshes in a row, go through Scheduled state first, + /// to give wait() a chance to complete. + setState(RefreshState::Scheduled, lock); + refresh_task->schedule(); + break; + } + + /// Write to keeper. + if (!updateCoordinationState(start_znode, true, zookeeper, lock)) + { + schedule_keeper_retry(); + return; + } + chassert(lock.owns_lock()); + /// Perform a refresh. + setState(RefreshState::Running, lock); + scheduling.out_of_schedule_refresh_requested = false; bool append = refresh_append; - refresh_immediately = false; - info.state = RefreshState::Running; + int32_t root_znode_version = coordination.coordinated ? coordination.root_znode.version : -1; CurrentMetrics::Increment metric_inc(CurrentMetrics::RefreshingViews); lock.unlock(); bool refreshed = false; - std::optional exception; - auto start_time = std::chrono::steady_clock::now(); + String error_message; + UUID new_table_uuid; try { - executeRefreshUnlocked(append); + new_table_uuid = executeRefreshUnlocked(append, root_znode_version); refreshed = true; } catch (...) 
{ - if (!interrupt_execution.load()) - exception = getCurrentExceptionMessage(true); + if (execution.interrupt_execution.load()) + { + error_message = "cancelled"; + LOG_INFO(log, "{}: Refresh cancelled", view->getStorageID().getFullTableName()); + } + else + { + error_message = getCurrentExceptionMessage(true); + LOG_ERROR(log, "{}: Refresh failed (attempt {}/{}): {}", view->getStorageID().getFullTableName(), start_znode.attempt_number, refresh_settings.refresh_retries + 1, error_message); + } } lock.lock(); - auto now = currentTime(); - auto secs = std::chrono::floor(now); - info.last_attempt_time = UInt32(secs.time_since_epoch().count()); - info.last_attempt_duration_ms = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_time).count(); + setState(RefreshState::Scheduling, lock); - if (exception) + auto end_time = std::chrono::floor(currentTime()); + auto znode = coordination.root_znode; + znode.last_attempt_time = end_time; + if (refreshed) { - info.last_refresh_result = LastRefreshResult::Error; - info.exception_message = *exception; - Int64 attempt_number = num_retries + 1; - scheduleRetryOrSkipToNextRefresh(now); - LOG_ERROR(log, "Refresh view {} failed (attempt {}/{}): {}", view->getStorageID().getFullTableName(), attempt_number, refresh_settings.refresh_retries + 1, *exception); - } - else if (!refreshed) - { - info.last_refresh_result = LastRefreshResult::Cancelled; - - /// Make sure we don't just start another refresh immediately. - if (!stop_requested) - advanceNextRefreshTime(now); + znode.last_attempt_succeeded = true; + znode.last_completed_timeslot = refresh_schedule.timeslotForCompletedRefresh(znode.last_completed_timeslot, start_time, end_time, out_of_schedule); + znode.last_success_time = start_time; + znode.last_success_duration = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_time_steady); + znode.last_success_table_uuid = new_table_uuid; + znode.previous_attempt_error = ""; + znode.attempt_number = 0; + znode.randomize(); } else { - info.last_refresh_result = LastRefreshResult::Finished; - info.last_success_time = info.last_attempt_time; - info.refresh_count += 1; - advanceNextRefreshTime(now); + znode.last_attempt_error = error_message; + } - auto next_time = next_refresh_prescribed; + bool ok = updateCoordinationState(znode, false, zookeeper, lock); + chassert(ok); + chassert(lock.owns_lock()); + if (refreshed) + { lock.unlock(); - StorageID my_id = view->getStorageID(); - auto dependents = view->getContext()->getRefreshSet().getDependents(my_id); - for (const RefreshTaskHolder & dep_task : dependents) - dep_task->notify(my_id, next_time); + view->getContext()->getRefreshSet().notifyDependents(view->getStorageID()); lock.lock(); } + + refreshed_just_now = true; } } + catch (Coordination::Exception &) + { + tryLogCurrentException(log, "Keeper error"); + if (!lock.owns_lock()) + lock.lock(); + schedule_keeper_retry(); + } catch (...) { - std::unique_lock lock(mutex); - stop_requested = true; + if (!lock.owns_lock()) + lock.lock(); + scheduling.stop_requested = true; + coordination.watches->should_reread_znodes.store(true); + coordination.running_znode_exists = false; + lock.unlock(); + tryLogCurrentException(log, "Unexpected exception in refresh scheduling, please investigate. 
The view will be stopped."); #ifdef DEBUG_OR_SANITIZER_BUILD abortOnFailedAssertion("Unexpected exception in refresh scheduling"); +#else + if (coordination.coordinated) + removeRunningZnodeIfMine(view->getContext()->getZooKeeper()); #endif } } -void RefreshTask::executeRefreshUnlocked(bool append) +UUID RefreshTask::executeRefreshUnlocked(bool append, int32_t root_znode_version) { LOG_DEBUG(log, "Refreshing view {}", view->getStorageID().getFullTableName()); - progress.reset(); + execution.progress.reset(); ContextMutablePtr refresh_context = view->createRefreshContext(); + + if (!append) + { + refresh_context->setParentTable(view->getStorageID().uuid); + refresh_context->setDDLQueryCancellation(execution.cancel_ddl_queries.get_token()); + if (root_znode_version != -1) + refresh_context->setDDLAdditionalChecksOnEnqueue({zkutil::makeCheckRequest(coordination.path, root_znode_version)}); + } + std::optional table_to_drop; + auto new_table_id = StorageID::createEmpty(); try { - /// Create a table. - auto refresh_query = view->prepareRefresh(append, refresh_context, table_to_drop); - - /// Run the query. { - CurrentThread::QueryScope query_scope(refresh_context); // create a thread group for the query + /// Create a table. + auto [refresh_query, query_scope] = view->prepareRefresh(append, refresh_context, table_to_drop); + new_table_id = refresh_query->table_id; + + /// Add the query to system.processes and allow it to be killed with KILL QUERY. + String query_for_logging = refresh_query->formatForLogging( + refresh_context->getSettingsRef()[Setting::log_queries_cut_to_length]); + auto process_list_entry = refresh_context->getProcessList().insert( + query_for_logging, refresh_query.get(), refresh_context, Stopwatch{CLOCK_MONOTONIC}.getStart()); + refresh_context->setProcessListElement(process_list_entry->getQueryStatus()); + refresh_context->setProgressCallback([this](const Progress & prog) + { + execution.progress.incrementPiecewiseAtomically(prog); + }); + + /// Run the query. BlockIO block_io = InterpreterInsertQuery( refresh_query, @@ -397,19 +562,6 @@ void RefreshTask::executeRefreshUnlocked(bool append) /* async_isnert */ false).execute(); QueryPipeline & pipeline = block_io.pipeline; - pipeline.setProgressCallback([this](const Progress & prog) - { - /// TODO: Investigate why most fields are not populated. Change columns in system.view_refreshes as needed, update documentation (docs/en/operations/system-tables/view_refreshes.md). - progress.incrementPiecewiseAtomically(prog); - }); - - /// Add the query to system.processes and allow it to be killed with KILL QUERY. 
- String query_for_logging = refresh_query->formatForLogging(refresh_context->getSettingsRef()[Setting::log_queries_cut_to_length]); - block_io.process_list_entry = refresh_context->getProcessList().insert( - query_for_logging, refresh_query.get(), refresh_context, Stopwatch{CLOCK_MONOTONIC}.getStart()); - pipeline.setProcessListElement(block_io.process_list_entry->getQueryStatus()); - refresh_context->setProcessListElement(block_io.process_list_entry->getQueryStatus()); - if (!pipeline.completed()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Pipeline for view refresh must be completed"); @@ -417,14 +569,14 @@ void RefreshTask::executeRefreshUnlocked(bool append) executor.setReadProgressCallback(pipeline.getReadProgressCallback()); { - std::unique_lock exec_lock(executor_mutex); - if (interrupt_execution.load()) + std::unique_lock exec_lock(execution.executor_mutex); + if (execution.interrupt_execution.load()) throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Refresh cancelled"); - running_executor = &executor; + execution.executor = &executor; } SCOPE_EXIT({ - std::unique_lock exec_lock(executor_mutex); - running_executor = nullptr; + std::unique_lock exec_lock(execution.executor_mutex); + execution.executor = nullptr; }); executor.execute(pipeline.getNumThreads(), pipeline.getConcurrencyControl()); @@ -435,13 +587,13 @@ void RefreshTask::executeRefreshUnlocked(bool append) /// * do it before destroying the QueryPipeline; otherwise it may fail assertions about /// being unexpectedly destroyed before completion and without uncaught exception /// (specifically, the assert in ~WriteBuffer()). - if (interrupt_execution.load()) + if (execution.interrupt_execution.load()) throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Refresh cancelled"); } /// Exchange tables. if (!append) - table_to_drop = view->exchangeTargetTable(refresh_query->table_id, refresh_context); + table_to_drop = view->exchangeTargetTable(new_table_id, refresh_context); } catch (...) { @@ -452,86 +604,279 @@ void RefreshTask::executeRefreshUnlocked(bool append) if (table_to_drop.has_value()) view->dropTempTable(table_to_drop.value(), refresh_context); + + return new_table_id.uuid; } -void RefreshTask::advanceNextRefreshTime(std::chrono::system_clock::time_point now) +void RefreshTask::updateDependenciesIfNeeded(std::unique_lock & lock) { - std::chrono::sys_seconds next = refresh_schedule.prescribeNext(next_refresh_prescribed, now); - next_refresh_prescribed = next; - next_refresh_actual = refresh_schedule.addRandomSpread(next); - - num_retries = 0; - info.retry = num_retries; - - auto secs = std::chrono::floor(next_refresh_actual); - info.next_refresh_time = UInt32(secs.time_since_epoch().count()); -} - -void RefreshTask::scheduleRetryOrSkipToNextRefresh(std::chrono::system_clock::time_point now) -{ - if (refresh_settings.refresh_retries >= 0 && num_retries >= refresh_settings.refresh_retries) + while (true) { - advanceNextRefreshTime(now); + chassert(lock.owns_lock()); + if (scheduling.dependencies_satisfied_until.time_since_epoch().count() >= 0) + return; + auto deps = set_handle.getDependencies(); + if (deps.empty()) + { + scheduling.dependencies_satisfied_until = std::chrono::sys_seconds::max(); + return; + } + scheduling.dependencies_satisfied_until = std::chrono::sys_seconds(std::chrono::seconds(-2)); + lock.unlock(); + + /// Consider a dependency satisfied if its next scheduled refresh time is greater than ours. 
+ /// This seems to produce reasonable behavior in practical cases, e.g.: + /// * REFRESH EVERY 1 DAY depends on REFRESH EVERY 1 DAY + /// The second refresh starts after the first refresh completes *for the same day*. + /// * REFRESH EVERY 1 DAY OFFSET 2 HOUR depends on REFRESH EVERY 1 DAY OFFSET 1 HOUR + /// The second refresh starts after the first refresh completes for the same day as well (scheduled 1 hour earlier). + /// * REFRESH EVERY 1 DAY OFFSET 1 HOUR depends on REFRESH EVERY 1 DAY OFFSET 23 HOUR + /// The dependency's refresh on day X triggers dependent's refresh on day X+1. + /// * REFRESH EVERY 2 HOUR depends on REFRESH EVERY 1 HOUR + /// The 2 HOUR refresh happens after the 1 HOUR refresh for every other hour, e.g. + /// after the 2pm refresh, then after the 4pm refresh, etc. + /// + /// We currently don't allow dependencies in REFRESH AFTER case, because its unclear what their meaning should be. + + const RefreshSet & set = view->getContext()->getRefreshSet(); + auto min_ts = std::chrono::sys_seconds::max(); + for (const StorageID & id : deps) + { + auto tasks = set.findTasks(id); + if (tasks.empty()) + min_ts = {}; // missing table, dependency unsatisfied + else + min_ts = std::min(min_ts, (*tasks.begin())->getNextRefreshTimeslot()); + } + + lock.lock(); + + if (scheduling.dependencies_satisfied_until.time_since_epoch().count() != -2) + { + /// Dependencies changed again after we started looking at them. Have to re-check. + chassert(scheduling.dependencies_satisfied_until.time_since_epoch().count() == -1); + continue; + } + + scheduling.dependencies_satisfied_until = min_ts; return; } +} - num_retries += 1; - info.retry = num_retries; - +static std::chrono::milliseconds backoff(Int64 retry_idx, const RefreshSettings & refresh_settings) +{ UInt64 delay_ms; - UInt64 multiplier = UInt64(1) << std::min(num_retries - 1, Int64(62)); + UInt64 multiplier = UInt64(1) << std::min(retry_idx, Int64(62)); /// Overflow check: a*b <= c iff a <= c/b iff a <= floor(c/b). if (refresh_settings.refresh_retry_initial_backoff_ms <= refresh_settings.refresh_retry_max_backoff_ms / multiplier) delay_ms = refresh_settings.refresh_retry_initial_backoff_ms * multiplier; else delay_ms = refresh_settings.refresh_retry_max_backoff_ms; - - next_refresh_actual = now + std::chrono::milliseconds(delay_ms); + return std::chrono::milliseconds(delay_ms); } -bool RefreshTask::arriveDependency(const StorageID & parent) +std::tuple +RefreshTask::determineNextRefreshTime(std::chrono::sys_seconds now) { - remaining_dependencies.erase(parent); - if (!remaining_dependencies.empty() || !time_arrived) - return false; - populateDependencies(); + auto znode = coordination.root_znode; + if (refresh_settings.refresh_retries >= 0 && znode.attempt_number > refresh_settings.refresh_retries) + { + /// Skip to the next scheduled refresh, as if a refresh succeeded. 
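/// (With the new default refresh_retries = 2 and the default backoff settings, a failing refresh
/// is attempted 3 times in total, with retries scheduled roughly 100 ms and then 200 ms after the
/// preceding failure, before the view gives up on the timeslot and waits for the next scheduled one.)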
+ znode.last_completed_timeslot = refresh_schedule.timeslotForCompletedRefresh(znode.last_completed_timeslot, znode.last_attempt_time, znode.last_attempt_time, false); + znode.attempt_number = 0; + } + auto timeslot = refresh_schedule.advance(znode.last_completed_timeslot); + + std::chrono::system_clock::time_point when; + if (znode.attempt_number == 0) + when = refresh_schedule.addRandomSpread(timeslot, znode.randomness); + else + when = znode.last_attempt_time + backoff(znode.attempt_number - 1, refresh_settings); + + znode.previous_attempt_error = ""; + if (!znode.last_attempt_succeeded && znode.last_attempt_time.time_since_epoch().count() != 0) + { + if (znode.last_attempt_error.empty()) + znode.previous_attempt_error = fmt::format("Replica '{}' went away", znode.last_attempt_replica); + else + znode.previous_attempt_error = znode.last_attempt_error; + } + + znode.attempt_number += 1; + znode.last_attempt_time = now; + znode.last_attempt_replica = coordination.replica_name; + znode.last_attempt_error = ""; + znode.last_attempt_succeeded = false; + + return {when, timeslot, znode}; +} + +void RefreshTask::setState(RefreshState s, std::unique_lock & lock) +{ + chassert(lock.owns_lock()); + state = s; + if (s != RefreshState::Running && s != RefreshState::Scheduling) + refresh_cv.notify_all(); +} + +void RefreshTask::readZnodesIfNeeded(std::shared_ptr zookeeper, std::unique_lock & lock) +{ + chassert(lock.owns_lock()); + if (!coordination.coordinated || !coordination.watches->should_reread_znodes.load()) + return; + + coordination.watches->should_reread_znodes.store(false); + auto prev_last_completed_timeslot = coordination.root_znode.last_completed_timeslot; + + lock.unlock(); + + if (!zookeeper->isFeatureEnabled(KeeperFeatureFlag::MULTI_READ)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Keeper server doesn't support multi-reads. Refreshable materialized views won't work."); + + /// Set watches. (This is a lot of code, is there a better way?) 
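/// (Keeper watches are one-shot: when a watch fires, the lambda clears the corresponding
/// *_watch_active flag and sets should_reread_znodes before waking the task, and the watch is
/// re-armed on the next readZnodesIfNeeded() call.)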
+ if (!coordination.watches->root_watch_active.load()) + { + coordination.watches->root_watch_active.store(true); + zookeeper->existsWatch(coordination.path, nullptr, + [w = coordination.watches, task_waker = refresh_task->getWatchCallback()](const Coordination::WatchResponse & response) + { + w->root_watch_active.store(false); + w->should_reread_znodes.store(true); + task_waker(response); + }); + } + if (!coordination.watches->children_watch_active.load()) + { + coordination.watches->children_watch_active.store(true); + zookeeper->getChildrenWatch(coordination.path, nullptr, + [w = coordination.watches, task_waker = refresh_task->getWatchCallback()](const Coordination::WatchResponse & response) + { + w->children_watch_active.store(false); + w->should_reread_znodes.store(true); + task_waker(response); + }); + } + + Strings paths {coordination.path, coordination.path + "/running"}; + auto responses = zookeeper->tryGet(paths.begin(), paths.end()); + + lock.lock(); + + if (responses[0].error != Coordination::Error::ZOK) + throw Coordination::Exception::fromPath(responses[0].error, paths[0]); + if (responses[1].error != Coordination::Error::ZOK && responses[1].error != Coordination::Error::ZNONODE) + throw Coordination::Exception::fromPath(responses[1].error, paths[1]); + + coordination.root_znode.parse(responses[0].data); + coordination.root_znode.version = responses[0].stat.version; + coordination.running_znode_exists = responses[1].error == Coordination::Error::ZOK; + + if (coordination.root_znode.last_completed_timeslot != prev_last_completed_timeslot) + { + lock.unlock(); + view->getContext()->getRefreshSet().notifyDependents(view->getStorageID()); + lock.lock(); + } +} + +bool RefreshTask::updateCoordinationState(CoordinationZnode root, bool running, std::shared_ptr zookeeper, std::unique_lock & lock) +{ + chassert(lock.owns_lock()); + int32_t version = -1; + if (coordination.coordinated) + { + Coordination::Requests ops; + ops.emplace_back(zkutil::makeSetRequest(coordination.path, root.toString(), root.version)); + if (running) + ops.emplace_back(zkutil::makeCreateRequest(coordination.path + "/running", coordination.replica_name, zkutil::CreateMode::Ephemeral)); + else + ops.emplace_back(zkutil::makeRemoveRequest(coordination.path + "/running", -1)); + + Coordination::Responses responses; + + lock.unlock(); + auto code = zookeeper->tryMulti(ops, responses); + lock.lock(); + + if (running && responses[0]->error == Coordination::Error::ZBADVERSION) + /// Lost the race, this is normal, don't log a stack trace. 
+ return false; + zkutil::KeeperMultiException::check(code, ops, responses); + version = dynamic_cast(*responses[0]).stat.version; + + } + coordination.root_znode = root; + coordination.root_znode.version = version; + coordination.running_znode_exists = running; return true; } -bool RefreshTask::arriveTime() +void RefreshTask::removeRunningZnodeIfMine(std::shared_ptr zookeeper) { - time_arrived = true; - if (!remaining_dependencies.empty() || !time_arrived) - return false; - populateDependencies(); - return true; -} - -void RefreshTask::populateDependencies() -{ - chassert(remaining_dependencies.empty()); - auto deps = set_handle.getDependencies(); - remaining_dependencies.insert(deps.begin(), deps.end()); - time_arrived = false; + Coordination::Stat stat; + String data; + if (zookeeper->tryGet(coordination.path + "/running", data, &stat) && data == coordination.replica_name) + { + LOG_WARNING(log, "Removing unexpectedly lingering znode {}", coordination.path + "/running"); + zookeeper->tryRemove(coordination.path + "/running", stat.version); + } } void RefreshTask::interruptExecution() { chassert(!mutex.try_lock()); - std::unique_lock lock(executor_mutex); - if (interrupt_execution.exchange(true)) + std::unique_lock lock(execution.executor_mutex); + if (execution.interrupt_execution.exchange(true)) return; - if (running_executor) + if (execution.executor) { - running_executor->cancel(); - + execution.executor->cancel(); LOG_DEBUG(log, "Cancelling refresh"); } } +std::tuple RefreshTask::getAndLockTargetTable(const StorageID & storage_id, const ContextPtr & context) +{ + StoragePtr storage; + TableLockHolder storage_lock; + for (int attempt = 0; attempt < 10; ++attempt) + { + StoragePtr prev_storage = std::move(storage); + storage = DatabaseCatalog::instance().getTable(storage_id, context); + if (storage == prev_storage) + { + // Table was dropped but is still accessible in DatabaseCatalog. + // Either ABA problem or something's broken. Don't retry, just in case. + break; + } + storage_lock = storage->tryLockForShare(context->getCurrentQueryId(), context->getSettingsRef()[Setting::lock_acquire_timeout]); + if (storage_lock) + break; + } + if (!storage_lock) + throw Exception(ErrorCodes::TABLE_IS_DROPPED, "Table {} is dropped or detached", storage_id.getFullNameNotQuoted()); + + if (coordination.coordinated) + { + UUID uuid = storage->getStorageID().uuid; + + std::lock_guard lock(replica_sync_mutex); + if (uuid != last_synced_inner_uuid) + { + InterpreterSystemQuery::trySyncReplica(storage.get(), SyncReplicaMode::DEFAULT, {}, context); + + /// (Race condition: this may revert from a newer uuid to an older one. This doesn't break + /// anything, just causes an unnecessary sync. Should be rare.) 
+ last_synced_inner_uuid = uuid; + } + } + + return {storage, storage_lock}; +} + std::chrono::system_clock::time_point RefreshTask::currentTime() const { - Int64 fake = fake_clock.load(std::memory_order::relaxed); + Int64 fake = scheduling.fake_clock.load(std::memory_order::relaxed); if (fake == INT64_MIN) return std::chrono::system_clock::now(); else @@ -543,4 +888,49 @@ void RefreshTask::setRefreshSetHandleUnlock(RefreshSet::Handle && set_handle_) set_handle = std::move(set_handle_); } +void RefreshTask::CoordinationZnode::randomize() +{ + randomness = std::uniform_int_distribution(Int64(-1e-9), Int64(1e9))(thread_local_rng); +} + +String RefreshTask::CoordinationZnode::toString() const +{ + WriteBufferFromOwnString out; + out << "format version: 1\n" + << "last_completed_timeslot: " << Int64(last_completed_timeslot.time_since_epoch().count()) << "\n" + << "last_success_time: " << Int64(last_success_time.time_since_epoch().count()) << "\n" + << "last_success_duration_ms: " << Int64(last_success_duration.count()) << "\n" + << "last_success_table_uuid: " << last_success_table_uuid << "\n" + << "last_attempt_time: " << Int64(last_attempt_time.time_since_epoch().count()) << "\n" + << "last_attempt_replica: " << escape << last_attempt_replica << "\n" + << "last_attempt_error: " << escape << last_attempt_error << "\n" + << "last_attempt_succeeded: " << last_attempt_succeeded << "\n" + << "previous_attempt_error: " << escape << previous_attempt_error << "\n" + << "attempt_number: " << attempt_number << "\n" + << "randomness: " << randomness << "\n"; + return out.str(); +} + +void RefreshTask::CoordinationZnode::parse(const String & data) +{ + ReadBufferFromString in(data); + Int64 last_completed_timeslot_int, last_success_time_int, last_success_duration_int, last_attempt_time_int; + in >> "format version: 1\n" + >> "last_completed_timeslot: " >> last_completed_timeslot_int >> "\n" + >> "last_success_time: " >> last_success_time_int >> "\n" + >> "last_success_duration_ms: " >> last_success_duration_int >> "\n" + >> "last_success_table_uuid: " >> last_success_table_uuid >> "\n" + >> "last_attempt_time: " >> last_attempt_time_int >> "\n" + >> "last_attempt_replica: " >> escape >> last_attempt_replica >> "\n" + >> "last_attempt_error: " >> escape >> last_attempt_error >> "\n" + >> "last_attempt_succeeded: " >> last_attempt_succeeded >> "\n" + >> "previous_attempt_error: " >> escape >> previous_attempt_error >> "\n" + >> "attempt_number: " >> attempt_number >> "\n" + >> "randomness: " >> randomness >> "\n"; + last_completed_timeslot = std::chrono::sys_seconds(std::chrono::seconds(last_completed_timeslot_int)); + last_success_time = std::chrono::sys_seconds(std::chrono::seconds(last_success_time_int)); + last_success_duration = std::chrono::milliseconds(last_success_duration_int); + last_attempt_time = std::chrono::sys_seconds(std::chrono::seconds(last_attempt_time_int)); +} + } diff --git a/src/Storages/MaterializedView/RefreshTask.h b/src/Storages/MaterializedView/RefreshTask.h index ad9d949e18e..ceb073c8313 100644 --- a/src/Storages/MaterializedView/RefreshTask.h +++ b/src/Storages/MaterializedView/RefreshTask.h @@ -1,15 +1,19 @@ #pragma once #include -#include #include #include - +#include #include #include +namespace zkutil +{ + class ZooKeeper; +} + namespace DB { @@ -19,37 +23,55 @@ class StorageMaterializedView; class ASTRefreshStrategy; struct OwnedRefreshTask; +enum class RefreshState +{ + Disabled = 0, + Scheduling, + Scheduled, + WaitingForDependencies, + Running, + 
RunningOnAnotherReplica, +}; + class RefreshTask : public std::enable_shared_from_this { public: - /// Never call it manually, public for shared_ptr construction only - RefreshTask(StorageMaterializedView * view_, const ASTRefreshStrategy & strategy); + struct Info; - /// The only proper way to construct task + /// Never call it manually, public for shared_ptr construction only + RefreshTask(StorageMaterializedView * view_, ContextPtr context, const ASTRefreshStrategy & strategy, bool attach, bool coordinated, bool empty); + + /// If !attach, creates coordination znodes if needed. static OwnedRefreshTask create( StorageMaterializedView * view, ContextMutablePtr context, - const DB::ASTRefreshStrategy & strategy); - - void initializeAndStart(); // called at most once + const DB::ASTRefreshStrategy & strategy, + bool attach, + bool coordinated, + bool empty); + /// Called at most once. + void startup(); + /// Permanently disable task scheduling and remove this table from RefreshSet. + /// Ok to call multiple times, but not in parallel. + /// Ok to call even if startup() wasn't called or failed. + void shutdown(); + /// Call when dropping the table, after shutdown(). Removes coordination znodes if needed. + void drop(ContextPtr context); /// Call when renaming the materialized view. - void rename(StorageID new_id); - + void rename(StorageID new_id, StorageID new_inner_table_id); /// Call when changing refresh params (ALTER MODIFY REFRESH). + void checkAlterIsPossible(const DB::ASTRefreshStrategy & new_strategy); void alterRefreshParams(const DB::ASTRefreshStrategy & new_strategy); - RefreshInfo getInfo() const; + Info getInfo() const; /// Enable task scheduling void start(); - /// Disable task scheduling void stop(); - /// Schedule task immediately void run(); - /// Cancel task execution void cancel(); @@ -58,13 +80,11 @@ public: /// If no refresh is running, completes immediately, throwing an exception if previous refresh failed. void wait(); - /// Permanently disable task scheduling and remove this table from RefreshSet. - /// Ok to call multiple times, but not in parallel. - /// Ok to call even if initializeAndStart() wasn't called or failed. - void shutdown(); + /// A measure of how far this view has progressed. Used by dependent views. + std::chrono::sys_seconds getNextRefreshTimeslot() const; - /// Notify dependent task - void notify(const StorageID & parent_id, std::chrono::sys_seconds parent_next_prescribed_time); + /// Called when progress is made (i.e. getNextRefreshTimeslot() changes) in any task that this task depends on. + void notifyDependencyProgress(); /// For tests void setFakeTime(std::optional t); @@ -72,19 +92,131 @@ public: /// RefreshSet will set handle for refresh tasks, to avoid race condition. void setRefreshSetHandleUnlock(RefreshSet::Handle && set_handle_); + /// Looks up the table, does lockForShare() on it. Handles corner cases: + /// * If the table was EXCHANGEd+dropped between the lookup and the lockForShare(), try again. + /// * If the target table is replicated, and another replica did a refresh, do an equivalent of + /// SYSTEM SYNC REPLICA before first read from this table, to make sure we see the data. + std::tuple getAndLockTargetTable(const StorageID & storage_id, const ContextPtr & context); + + struct CoordinationZnode + { + /// "Official" time of the latest successful refresh, i.e. time according to schedule rather than wall clock, + /// and without randomization. E.g. 
for REFRESH EVERY 1 DAY this timestamp is always the first second of a + /// calendar day. 0 if no successful refresh happened yet. + /// (We store last rather than next timeslot because it behaves better when ALTER MODIFY REFRESH reduces refresh period.) + std::chrono::sys_seconds last_completed_timeslot; + + /// Time when the latest successful refresh started. + std::chrono::sys_seconds last_success_time; + std::chrono::milliseconds last_success_duration; + /// Note that this may not match the DB if a refresh managed to EXCHANGE tables, then failed to write to keeper. + /// That can only happen if last_attempt_succeeded = false. + UUID last_success_table_uuid; + + /// Information about the last started attempt. (I.e. current attempt if a refresh is running, previous attempt if not running.) + std::chrono::sys_seconds last_attempt_time; // when the attempt started or ended + std::string last_attempt_replica; + std::string last_attempt_error; + bool last_attempt_succeeded = false; + /// If an attempt is in progress, this contains error from the previous attempt. + /// Useful if we keep retrying and failing, and each attempt takes a while - we want to see an error message + /// without having to catch the brief time window between attempts. + std::string previous_attempt_error; + + /// Incremented when a refresh attempt starts. Set to 0 when refresh succeeds or when we skip a timeslot. + /// Used for exponential backoff on errors. + Int64 attempt_number = 0; + + /// Random number in [-1e9, 1e9], for RANDOMIZE FOR. Re-rolled after every refresh attempt. + /// (Why write it to keeper instead of letting each replica toss its own coin? Because then refresh would happen earlier + /// on average, on the replica that generated the shortest delay. We could use nonuniform distribution to complensate, but this is easier.) + Int64 randomness = 0; + + /// Znode version. Not serialized. + int32_t version = -1; + + void randomize(); // assigns `randomness` + + String toString() const; + void parse(const String & data); + }; + + /// Just for observability. + struct Info + { + StorageID view_id = StorageID::createEmpty(); + RefreshState state; + std::chrono::system_clock::time_point next_refresh_time; + CoordinationZnode znode; + bool refresh_running; + ProgressValues progress; + }; + private: + struct CoordinationState + { + /// When coordination is enabled, we have these znodes in Keeper: + /// + /// keeper_path (CoordinationZnode) + /// ├── "replicas" + /// │ ├── name1 + /// │ ├── name2 + /// │ └── name3 + /// └── ["running"] (RunningZnode, ephemeral) + + struct WatchState + { + std::atomic_bool should_reread_znodes {true}; + std::atomic_bool root_watch_active {false}; + std::atomic_bool children_watch_active {false}; + }; + + CoordinationZnode root_znode; + bool running_znode_exists = false; + std::shared_ptr watches = std::make_shared(); + + /// Whether we use Keeper to coordinate refresh across replicas. If false, we don't write to Keeper, + /// but we still use the same in-memory structs (CoordinationZnode etc), as if it's coordinated (with one replica). + bool coordinated = false; + bool read_only = false; + String path; + String replica_name; + }; + + struct ExecutionState + { + /// Protects interrupt_execution and executor. + /// Can be locked while holding `mutex`. + std::mutex executor_mutex; + /// If there's a refresh in progress, it can be aborted by setting this flag and cancel()ling + /// this executor. 
Refresh task will then reconsider what to do, re-checking `stop_requested`, + /// `cancel_requested`, etc. + std::atomic_bool interrupt_execution {false}; + PipelineExecutor * executor = nullptr; + /// Interrupts internal CREATE/EXCHANGE/DROP queries that refresh does. Only used during shutdown. + StopSource cancel_ddl_queries; + Progress progress; + }; + + struct SchedulingState + { + /// Refreshes are stopped, e.g. by SYSTEM STOP VIEW. + bool stop_requested = false; + /// An out-of-schedule refresh was requested, e.g. by SYSTEM REFRESH VIEW. + bool out_of_schedule_refresh_requested = false; + + /// Timestamp representing the progress of refreshable views we depend on. We're allowed to do + /// refreshes for timeslots <= dependencies_satisfied_until without waiting for dependencies. + /// If negative, we should recalculate this value. + std::chrono::sys_seconds dependencies_satisfied_until {std::chrono::seconds(-1)}; + + /// Used in tests. If not INT64_MIN, we pretend that this is the current time, instead of calling system_clock::now(). + std::atomic fake_clock {INT64_MIN}; + }; + LoggerPtr log = nullptr; StorageMaterializedView * view; - /// Protects interrupt_execution and running_executor. - /// Can be locked while holding `mutex`. - std::mutex executor_mutex; - /// If there's a refresh in progress, it can be aborted by setting this flag and cancel()ling - /// this executor. Refresh task will then reconsider what to do, re-checking `stop_requested`, - /// `cancel_requested`, etc. - std::atomic_bool interrupt_execution {false}; - PipelineExecutor * running_executor = nullptr; - /// Protects all fields below. /// Never locked for blocking operations (e.g. creating or dropping the internal table). /// Can't be locked while holding `executor_mutex`. @@ -94,48 +226,26 @@ private: RefreshSettings refresh_settings; std::vector initial_dependencies; bool refresh_append; + bool in_database_replicated; RefreshSet::Handle set_handle; - /// StorageIDs of our dependencies that we're waiting for. - using DatabaseAndTableNameSet = std::unordered_set; - DatabaseAndTableNameSet remaining_dependencies; - bool time_arrived = false; - - /// Refreshes are stopped (e.g. by SYSTEM STOP VIEW). - bool stop_requested = false; - - /// If true, we should start a refresh right away. All refreshes go through this flag. - bool refresh_immediately = false; - - /// When to refresh next. Updated when a refresh is finished or cancelled. - /// We maintain the distinction between: - /// * The "prescribed" time of the refresh, dictated by the refresh schedule. - /// E.g. for REFERSH EVERY 1 DAY, the prescribed time is always at the exact start of a day. - /// * Actual wall clock timestamps, e.g. when the refresh is scheduled to happen - /// (including random spread) or when a refresh completed. - /// The prescribed time is required for: - /// * Doing REFRESH EVERY correctly if the random spread came up negative, and a refresh completed - /// before the prescribed time. E.g. suppose a refresh was prescribed at 05:00, which was randomly - /// adjusted to 4:50, and the refresh completed at 4:55; we shouldn't schedule another refresh - /// at 5:00, so we should remember that the 4:50-4:55 refresh actually had prescribed time 5:00. - /// * Similarly, for dependencies between REFRESH EVERY tables, using actual time would be unreliable. - /// E.g. for REFRESH EVERY 1 DAY, yesterday's refresh of the dependency shouldn't trigger today's - /// refresh of the dependent even if it happened today (e.g. 
it was slow or had random spread > 1 day). - std::chrono::sys_seconds next_refresh_prescribed; - std::chrono::system_clock::time_point next_refresh_actual; - Int64 num_retries = 0; - /// Calls refreshTask() from background thread. BackgroundSchedulePool::TaskHolder refresh_task; - /// Used in tests. If not INT64_MIN, we pretend that this is the current time, instead of calling system_clock::now(). - std::atomic fake_clock {INT64_MIN}; + CoordinationState coordination; + ExecutionState execution; + SchedulingState scheduling; - /// Just for observability. - RefreshInfo info; - Progress progress; - std::condition_variable refresh_cv; // notified when info.state changes + RefreshState state = RefreshState::Scheduling; + /// Notified when `state` changes away from Running/Scheduling. + std::condition_variable refresh_cv; + std::chrono::system_clock::time_point next_refresh_time {}; // just for observability + + /// If we're in a Replicated database, and another replica performed a refresh, we have to do an + /// equivalent of SYSTEM SYNC REPLICA on the new table to make sure we see the full data. + std::mutex replica_sync_mutex; + UUID last_synced_inner_uuid = UUIDHelpers::Nil; /// The main loop of the refresh task. It examines the state, sees what needs to be /// done and does it. If there's nothing to do at the moment, returns; it's then scheduled again, @@ -147,31 +257,32 @@ private: /// Perform an actual refresh: create new table, run INSERT SELECT, exchange tables, drop old table. /// Mutex must be unlocked. Called only from refresh_task. - void executeRefreshUnlocked(bool append); + UUID executeRefreshUnlocked(bool append, int32_t root_znode_version); - /// Assigns next_refresh_* - void advanceNextRefreshTime(std::chrono::system_clock::time_point now); + /// Assigns dependencies_satisfied_until. + void updateDependenciesIfNeeded(std::unique_lock & lock); - /// Either advances next_refresh_actual using exponential backoff or does advanceNextRefreshTime(). - void scheduleRetryOrSkipToNextRefresh(std::chrono::system_clock::time_point now); + std::tuple + determineNextRefreshTime(std::chrono::sys_seconds now); - /// Returns true if all dependencies are fulfilled now. Refills remaining_dependencies in this case. - bool arriveDependency(const StorageID & parent); - bool arriveTime(); - void populateDependencies(); + void readZnodesIfNeeded(std::shared_ptr zookeeper, std::unique_lock & lock); + bool updateCoordinationState(CoordinationZnode root, bool running, std::shared_ptr zookeeper, std::unique_lock & lock); + void removeRunningZnodeIfMine(std::shared_ptr zookeeper); + void setState(RefreshState s, std::unique_lock & lock); void interruptExecution(); - std::chrono::system_clock::time_point currentTime() const; }; +using RefreshTaskPtr = std::shared_ptr; + /// Wrapper around shared_ptr, calls shutdown() in destructor. 
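The coordination logic in the RefreshTask.cpp hunks above (readZnodesIfNeeded / updateCoordinationState) is an optimistic-concurrency scheme: each replica reads the root znode together with its version, then writes back through a single tryMulti that sets the root znode conditioned on that version and creates the ephemeral "running" marker, treating ZBADVERSION as a normally lost race. The sketch below shows the shape of that version-guarded update with a hypothetical in-memory stand-in (FakeKeeper / setAndMarkRunning are invented names, not the zkutil API), not a definitive implementation.

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

/// In-memory stand-in for the two znodes used by the refresh task: the versioned root znode
/// and the ephemeral "running" marker. The real code goes through zkutil::ZooKeeper::tryMulti
/// with makeSetRequest / makeCreateRequest.
struct FakeKeeper
{
    std::string root_data;
    int32_t root_version = 0;
    std::optional<std::string> running;  // name of the replica holding the "running" marker

    /// Analog of the multi-op: update the root znode only if the caller saw the latest version,
    /// and claim the refresh by creating the marker. Returns false on a version conflict
    /// (the ZBADVERSION case above) or if some replica is already refreshing.
    bool setAndMarkRunning(const std::string & new_data, int32_t expected_version, const std::string & replica)
    {
        if (expected_version != root_version)
            return false;            // another replica updated the znode first: a lost race, not an error
        if (running)
            return false;            // a refresh is already in progress elsewhere
        root_data = new_data;
        ++root_version;
        running = replica;
        return true;
    }
};

int main()
{
    FakeKeeper keeper;
    keeper.root_data = "last_completed_timeslot: 0";

    /// Two replicas read the same version, then both try to claim the next refresh.
    int32_t seen_by_r1 = keeper.root_version;
    int32_t seen_by_r2 = keeper.root_version;

    bool r1 = keeper.setAndMarkRunning("last_completed_timeslot: 3600", seen_by_r1, "r1");
    bool r2 = keeper.setAndMarkRunning("last_completed_timeslot: 3600", seen_by_r2, "r2");

    std::cout << "r1 claimed refresh: " << r1 << "\n";   // 1
    std::cout << "r2 claimed refresh: " << r2 << "\n";   // 0
    std::cout << "root version is now " << keeper.root_version << "\n";
}
```

The losing replica simply returns false and picks up the new state the next time it reads the znodes, which is why the real code avoids logging a stack trace for ZBADVERSION.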
struct OwnedRefreshTask { - RefreshTaskHolder ptr; + RefreshTaskPtr ptr; OwnedRefreshTask() = default; - explicit OwnedRefreshTask(RefreshTaskHolder p) : ptr(std::move(p)) {} + explicit OwnedRefreshTask(RefreshTaskPtr p) : ptr(std::move(p)) {} OwnedRefreshTask(OwnedRefreshTask &&) = default; OwnedRefreshTask & operator=(OwnedRefreshTask &&) = default; diff --git a/src/Storages/MaterializedView/RefreshTask_fwd.h b/src/Storages/MaterializedView/RefreshTask_fwd.h deleted file mode 100644 index ff17c839dc5..00000000000 --- a/src/Storages/MaterializedView/RefreshTask_fwd.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -class RefreshTask; - -using RefreshTaskHolder = std::shared_ptr; -using RefreshTaskList = std::list; - -} diff --git a/src/Storages/MergeTree/AsyncBlockIDsCache.cpp b/src/Storages/MergeTree/AsyncBlockIDsCache.cpp index 6606b00e287..45cf383bd09 100644 --- a/src/Storages/MergeTree/AsyncBlockIDsCache.cpp +++ b/src/Storages/MergeTree/AsyncBlockIDsCache.cpp @@ -19,6 +19,12 @@ namespace CurrentMetrics namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsMilliseconds async_block_ids_cache_update_wait_ms; + extern const MergeTreeSettingsBool use_async_block_ids_cache; +} + static constexpr int FAILURE_RETRY_MS = 3000; template @@ -58,7 +64,7 @@ catch (...) template AsyncBlockIDsCache::AsyncBlockIDsCache(TStorage & storage_) : storage(storage_) - , update_wait(storage.getSettings()->async_block_ids_cache_update_wait_ms) + , update_wait((*storage.getSettings())[MergeTreeSetting::async_block_ids_cache_update_wait_ms]) , path(storage.getZooKeeperPath() + "/async_blocks") , log_name(storage.getStorageID().getFullTableName() + " (AsyncBlockIDsCache)") , log(getLogger(log_name)) @@ -69,7 +75,7 @@ AsyncBlockIDsCache::AsyncBlockIDsCache(TStorage & storage_) template void AsyncBlockIDsCache::start() { - if (storage.getSettings()->use_async_block_ids_cache) + if ((*storage.getSettings())[MergeTreeSetting::use_async_block_ids_cache]) task->activateAndSchedule(); } @@ -87,7 +93,7 @@ void AsyncBlockIDsCache::triggerCacheUpdate() template Strings AsyncBlockIDsCache::detectConflicts(const Strings & paths, UInt64 & last_version) { - if (!storage.getSettings()->use_async_block_ids_cache) + if (!(*storage.getSettings())[MergeTreeSetting::use_async_block_ids_cache]) return {}; CachePtr cur_cache; diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 061ee356203..e13ec5a7515 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -40,6 +40,14 @@ namespace CurrentMetrics namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsBool enable_the_endpoint_id_with_zookeeper_name_prefix; + extern const MergeTreeSettingsBool fsync_part_directory; + extern const MergeTreeSettingsUInt64 min_compressed_bytes_to_fsync_after_fetch; +} + namespace ErrorCodes { extern const int NO_SUCH_DATA_PART; @@ -202,7 +210,7 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write writeBinary(projections.size(), out); } - if (data_settings->allow_remote_fs_zero_copy_replication && + if ((*data_settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] && client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_ZERO_COPY) { auto disk_type = part->getDataPartStorage().getDiskType(); @@ -380,7 +388,7 @@ 
MergeTreeData::DataPartPtr Service::findPart(const String & name) if (!part) throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No part {} in table", name); - bool zero_copy_enabled = data.getSettings()->allow_remote_fs_zero_copy_replication; + bool zero_copy_enabled = (*data.getSettings())[MergeTreeSetting::allow_remote_fs_zero_copy_replication]; if (!zero_copy_enabled) return part; @@ -448,7 +456,7 @@ std::pair Fetcher::fetchSelected auto part_info = MergeTreePartInfo::fromPartName(part_name, data.format_version); String endpoint_id = getEndpointId( - data_settings->enable_the_endpoint_id_with_zookeeper_name_prefix ? + (*data_settings)[MergeTreeSetting::enable_the_endpoint_id_with_zookeeper_name_prefix] ? zookeeper_name + ":" + replica_path : replica_path); @@ -473,7 +481,7 @@ std::pair Fetcher::fetchSelected } Strings capability; - if (try_zero_copy && data_settings->allow_remote_fs_zero_copy_replication) + if (try_zero_copy && (*data_settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { if (!disk) { @@ -610,8 +618,8 @@ std::pair Fetcher::fetchSelected if (revision) disk->syncRevision(revision); - bool sync = (data_settings->min_compressed_bytes_to_fsync_after_fetch - && sum_files_size >= data_settings->min_compressed_bytes_to_fsync_after_fetch); + bool sync = ((*data_settings)[MergeTreeSetting::min_compressed_bytes_to_fsync_after_fetch] + && sum_files_size >= (*data_settings)[MergeTreeSetting::min_compressed_bytes_to_fsync_after_fetch]); using PartType = MergeTreeDataPartType; PartType part_type = PartType::Wide; @@ -849,14 +857,14 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( /// /// We don't control the amount of refs for temporary parts so we cannot decide can we remove blobs /// or not. So we are not doing it - bool keep_shared = part_storage_for_loading->supportZeroCopyReplication() && data_settings->allow_remote_fs_zero_copy_replication; + bool keep_shared = part_storage_for_loading->supportZeroCopyReplication() && (*data_settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]; part_storage_for_loading->removeSharedRecursive(keep_shared); } part_storage_for_loading->createDirectories(); SyncGuardPtr sync_guard; - if (data.getSettings()->fsync_part_directory) + if ((*data.getSettings())[MergeTreeSetting::fsync_part_directory]) sync_guard = part_storage_for_loading->getDirectorySyncGuard(); CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch}; diff --git a/src/Storages/MergeTree/GinIndexStore.cpp b/src/Storages/MergeTree/GinIndexStore.cpp index 6e0273701ad..e92460ff498 100644 --- a/src/Storages/MergeTree/GinIndexStore.cpp +++ b/src/Storages/MergeTree/GinIndexStore.cpp @@ -24,6 +24,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int UNKNOWN_FORMAT_VERSION; + extern const int NOT_IMPLEMENTED; }; GinIndexPostingsBuilder::GinIndexPostingsBuilder(UInt64 limit) @@ -153,13 +154,18 @@ GinIndexStore::GinIndexStore(const String & name_, DataPartStoragePtr storage_) : name(name_) , storage(storage_) { + if (storage->getType() != MergeTreeDataPartStorageType::Full) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "INDEX {} with 'full_text' type supports only full storage", name); } + GinIndexStore::GinIndexStore(const String & name_, DataPartStoragePtr storage_, MutableDataPartStoragePtr data_part_storage_builder_, UInt64 max_digestion_size_) : name(name_) , storage(storage_) , data_part_storage_builder(data_part_storage_builder_) , max_digestion_size(max_digestion_size_) { + if (storage->getType() != 
MergeTreeDataPartStorageType::Full) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "INDEX {} with 'full_text' type supports only full storage", name); } bool GinIndexStore::exists() const diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 8c60988e7e8..484e544f921 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -60,6 +60,17 @@ namespace CurrentMetrics namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsBool exclude_deleted_rows_for_part_size_in_merge; + extern const MergeTreeSettingsBool fsync_part_directory; + extern const MergeTreeSettingsBool load_existing_rows_count_for_old_parts; + extern const MergeTreeSettingsBool primary_key_lazy_load; + extern const MergeTreeSettingsFloat primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns; + extern const MergeTreeSettingsFloat ratio_of_defaults_for_sparse_serialization; +} + namespace ErrorCodes { extern const int CANNOT_READ_ALL_DATA; @@ -637,7 +648,7 @@ UInt64 IMergeTreeDataPart::getMarksCount() const UInt64 IMergeTreeDataPart::getExistingBytesOnDisk() const { - if (storage.getSettings()->exclude_deleted_rows_for_part_size_in_merge && supportLightweightDeleteMutate() && hasLightweightDelete() + if ((*storage.getSettings())[MergeTreeSetting::exclude_deleted_rows_for_part_size_in_merge] && supportLightweightDeleteMutate() && hasLightweightDelete() && existing_rows_count.has_value() && existing_rows_count.value() < rows_count && rows_count > 0) return bytes_on_disk * existing_rows_count.value() / rows_count; else @@ -718,7 +729,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks loadChecksums(require_columns_checksums); loadIndexGranularity(); - if (!storage.getSettings()->primary_key_lazy_load) + if (!(*storage.getSettings())[MergeTreeSetting::primary_key_lazy_load]) getIndex(); calculateColumnsAndSecondaryIndicesSizesOnDisk(); @@ -923,7 +934,7 @@ void IMergeTreeDataPart::loadIndex() const key_serializations[j]->deserializeBinary(*loaded_index[j], *index_file, {}); /// Cut useless suffix columns, if necessary. 
- Float64 ratio_to_drop_suffix_columns = storage.getSettings()->primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns; + Float64 ratio_to_drop_suffix_columns = (*storage.getSettings())[MergeTreeSetting::primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns]; if (key_size > 1 && ratio_to_drop_suffix_columns > 0 && ratio_to_drop_suffix_columns < 1) { chassert(marks_count > 0); @@ -1453,8 +1464,8 @@ void IMergeTreeDataPart::loadExistingRowsCount() return; if (!rows_count || !supportLightweightDeleteMutate() || !hasLightweightDelete() - || !storage.getSettings()->exclude_deleted_rows_for_part_size_in_merge - || !storage.getSettings()->load_existing_rows_count_for_old_parts) + || !(*storage.getSettings())[MergeTreeSetting::exclude_deleted_rows_for_part_size_in_merge] + || !(*storage.getSettings())[MergeTreeSetting::load_existing_rows_count_for_old_parts]) existing_rows_count = rows_count; else existing_rows_count = readExistingRowsCount(); @@ -1623,7 +1634,7 @@ void IMergeTreeDataPart::loadColumns(bool require) SerializationInfo::Settings settings = { - .ratio_of_defaults_for_sparse = storage.getSettings()->ratio_of_defaults_for_sparse_serialization, + .ratio_of_defaults_for_sparse = (*storage.getSettings())[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization], .choose_kind = false, }; @@ -1687,7 +1698,7 @@ void IMergeTreeDataPart::storeVersionMetadata(bool force) const throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for in-memory parts (table: {}, part: {})", storage.getStorageID().getNameForLogs(), name); - writeVersionMetadata(version, storage.getSettings()->fsync_part_directory); + writeVersionMetadata(version, (*storage.getSettings())[MergeTreeSetting::fsync_part_directory]); } void IMergeTreeDataPart::appendCSNToVersionMetadata(VersionMetadata::WhichCSN which_csn) const @@ -1894,7 +1905,7 @@ try assertOnDisk(); std::string relative_path = storage.relative_data_path; - bool fsync_dir = storage.getSettings()->fsync_part_directory; + bool fsync_dir = (*storage.getSettings())[MergeTreeSetting::fsync_part_directory]; if (parent_part) { @@ -2056,7 +2067,7 @@ DataPartStoragePtr IMergeTreeDataPart::makeCloneInDetached(const String & prefix auto storage_settings = storage.getSettings(); IDataPartStorage::ClonePartParams params { - .copy_instead_of_hardlink = isStoredOnRemoteDiskWithZeroCopySupport() && storage.supportsReplication() && storage_settings->allow_remote_fs_zero_copy_replication, + .copy_instead_of_hardlink = isStoredOnRemoteDiskWithZeroCopySupport() && storage.supportsReplication() && (*storage_settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication], .keep_metadata_version = prefix == "covered-by-broken", .make_source_readonly = true, .external_transaction = disk_transaction diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp index 70e838e666a..eb904a8e2ef 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp @@ -7,6 +7,11 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsFloat ratio_of_defaults_for_sparse_serialization; +} + IMergedBlockOutputStream::IMergedBlockOutputStream( const MergeTreeSettingsPtr & storage_settings_, MutableDataPartStoragePtr data_part_storage_, @@ -22,7 +27,7 @@ IMergedBlockOutputStream::IMergedBlockOutputStream( { SerializationInfo::Settings info_settings = { - .ratio_of_defaults_for_sparse = 
storage_settings->ratio_of_defaults_for_sparse_serialization, + .ratio_of_defaults_for_sparse = (*storage_settings)[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization], .choose_kind = false, }; diff --git a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp index 4c947487d21..463349f5b00 100644 --- a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp @@ -18,6 +18,18 @@ namespace ProfileEvents namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsBool always_fetch_merged_part; + extern const MergeTreeSettingsBool detach_not_byte_identical_parts; + extern const MergeTreeSettingsSeconds lock_acquire_timeout_for_background_operations; + extern const MergeTreeSettingsUInt64 prefer_fetch_merged_part_size_threshold; + extern const MergeTreeSettingsSeconds prefer_fetch_merged_part_time_threshold; + extern const MergeTreeSettingsSeconds try_fetch_recompressed_part_timeout; + extern const MergeTreeSettingsUInt64 zero_copy_merge_mutation_min_parts_size_sleep_before_lock; +} + namespace ErrorCodes { extern const int BAD_DATA_PART_NAME; @@ -57,7 +69,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() entry.new_part_name, part, parts, merge_mutate_entry.get(), std::move(profile_counters_snapshot)); }; - if (storage_settings_ptr->always_fetch_merged_part) + if ((*storage_settings_ptr)[MergeTreeSetting::always_fetch_merged_part]) { LOG_INFO(log, "Will fetch part {} because setting 'always_fetch_merged_part' is true", entry.new_part_name); return PrepareResult{ @@ -68,12 +80,12 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() } if (entry.merge_type == MergeType::TTLRecompress && - (time(nullptr) - entry.create_time) <= storage_settings_ptr->try_fetch_recompressed_part_timeout.totalSeconds() && + (time(nullptr) - entry.create_time) <= (*storage_settings_ptr)[MergeTreeSetting::try_fetch_recompressed_part_timeout].totalSeconds() && entry.source_replica != storage.replica_name) { LOG_INFO(log, "Will try to fetch part {} until '{}' because this part assigned to recompression merge. " "Source replica {} will try to merge this part first", entry.new_part_name, - DateLUT::serverTimezoneInstance().timeToString(entry.create_time + storage_settings_ptr->try_fetch_recompressed_part_timeout.totalSeconds()), entry.source_replica); + DateLUT::serverTimezoneInstance().timeToString(entry.create_time + (*storage_settings_ptr)[MergeTreeSetting::try_fetch_recompressed_part_timeout].totalSeconds()), entry.source_replica); /// Waiting other replica to recompress part. No need to check it. return PrepareResult{ .prepared_successfully = false, @@ -158,7 +170,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() /// All source parts are found locally, we can execute merge - if (entry.create_time + storage_settings_ptr->prefer_fetch_merged_part_time_threshold.totalSeconds() <= time(nullptr)) + if (entry.create_time + (*storage_settings_ptr)[MergeTreeSetting::prefer_fetch_merged_part_time_threshold].totalSeconds() <= time(nullptr)) { /// If entry is old enough, and have enough size, and part are exists in any replica, /// then prefer fetching of merged part from replica. 
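Two things happen in the MergeFromLogEntryTask hunks around here: setting reads move from direct member access (storage_settings_ptr->x) to indexing with strongly typed tags ((*storage_settings_ptr)[MergeTreeSetting::x], backed by the per-file `extern const MergeTreeSettings...` declarations added at the top of the file), and prepare() prefers fetching an already merged part when the log entry is old and the source parts are large. The sketch below is a hypothetical, self-contained stand-in for both ideas; FakeMergeTreeSetting, FakeSettings, preferFetchOverMerge and the default values are assumptions, not the real MergeTreeSettings implementation or its defaults.

```cpp
#include <cstdint>
#include <ctime>
#include <iostream>

/// Hypothetical tags, one per setting, mirroring the per-file extern declarations.
namespace FakeMergeTreeSetting
{
    enum class UInt64Setting  { prefer_fetch_merged_part_size_threshold };
    enum class SecondsSetting { prefer_fetch_merged_part_time_threshold };
}

/// Minimal settings holder indexed with operator[], giving the same call shape as
/// (*storage_settings_ptr)[MergeTreeSetting::prefer_fetch_merged_part_size_threshold].
struct FakeSettings
{
    uint64_t size_threshold = 10ULL * 1024 * 1024 * 1024;  // assumed default, 10 GiB
    int64_t  time_threshold = 3600;                        // assumed default, 1 hour

    uint64_t operator[](FakeMergeTreeSetting::UInt64Setting) const { return size_threshold; }
    int64_t  operator[](FakeMergeTreeSetting::SecondsSetting) const { return time_threshold; }
};

/// Simplified version of the decision in prepare(): fetch the merged part from another replica
/// only when the entry is old enough and the source parts are big enough.
bool preferFetchOverMerge(const FakeSettings & settings, std::time_t entry_create_time, uint64_t sum_parts_bytes)
{
    using namespace FakeMergeTreeSetting;
    bool old_enough = entry_create_time + settings[SecondsSetting::prefer_fetch_merged_part_time_threshold] <= std::time(nullptr);
    bool big_enough = sum_parts_bytes >= settings[UInt64Setting::prefer_fetch_merged_part_size_threshold];
    return old_enough && big_enough;
}

int main()
{
    FakeSettings settings;
    std::time_t two_hours_ago = std::time(nullptr) - 2 * 3600;
    std::cout << preferFetchOverMerge(settings, two_hours_ago, 20ULL * 1024 * 1024 * 1024) << "\n"; // 1: old and big, fetch
    std::cout << preferFetchOverMerge(settings, std::time(nullptr), 1024) << "\n";                  // 0: merge locally
}
```

One plausible reading of the typed-tag pattern is that the extern list makes each translation unit's setting dependencies explicit without pulling the full settings struct definition into every file, though the diff itself does not state the motivation.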
@@ -167,7 +179,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() for (const auto & item : parts) sum_parts_bytes_on_disk += item->getBytesOnDisk(); - if (sum_parts_bytes_on_disk >= storage_settings_ptr->prefer_fetch_merged_part_size_threshold) + if (sum_parts_bytes_on_disk >= (*storage_settings_ptr)[MergeTreeSetting::prefer_fetch_merged_part_size_threshold]) { String replica = storage.findReplicaHavingPart(entry.new_part_name, true); /// NOTE excessive ZK requests for same data later, may remove. if (!replica.empty()) @@ -198,7 +210,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() } /// It will live until the whole task is being destroyed - table_lock_holder = storage.lockForShare(RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations); + table_lock_holder = storage.lockForShare(RWLockImpl::NO_QUERY, (*storage_settings_ptr)[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); auto future_merged_part = std::make_shared(parts, entry.new_part_format); if (future_merged_part->name != entry.new_part_name) @@ -226,7 +238,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() future_merged_part->updatePath(storage, reserved_space.get()); future_merged_part->merge_type = entry.merge_type; - if (storage_settings_ptr->allow_remote_fs_zero_copy_replication) + if ((*storage_settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { if (auto disk = reserved_space->getDisk(); disk->supportZeroCopyReplication()) { @@ -241,8 +253,8 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() }; } - if (storage_settings_ptr->zero_copy_merge_mutation_min_parts_size_sleep_before_lock != 0 && - estimated_space_for_merge >= storage_settings_ptr->zero_copy_merge_mutation_min_parts_size_sleep_before_lock) + if ((*storage_settings_ptr)[MergeTreeSetting::zero_copy_merge_mutation_min_parts_size_sleep_before_lock] != 0 && + estimated_space_for_merge >= (*storage_settings_ptr)[MergeTreeSetting::zero_copy_merge_mutation_min_parts_size_sleep_before_lock]) { /// In zero copy replication only one replica execute merge/mutation, others just download merged parts metadata. /// Here we are trying to mitigate the skew of merges execution because of faster/slower replicas. @@ -252,12 +264,12 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() /// /// So here we trying to solve it with the simplest solution -- sleep random time up to 500ms for 1GB part and up to 7 seconds for 300GB part. /// It can sound too much, but we are trying to acquire these locks in background tasks which can be scheduled each 5 seconds or so. 
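The comment above promises roughly 500 ms of sleep for a ~1 GB merge and up to ~7 s for a ~300 GB one; the hunk that follows gets there by taking natural logarithms, right_border_ms = (ln(part_size) - ln(threshold) + 0.5) * 1000, and then sleeping a random duration up to that bound, capped at 10 s. Below is a small worked sketch of the same arithmetic, assuming the threshold setting is at a 1 GiB default (an assumption; check MergeTreeSettings for the actual value).

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>

int main()
{
    /// Assumed value of zero_copy_merge_mutation_min_parts_size_sleep_before_lock (1 GiB).
    const double threshold_bytes = 1024.0 * 1024 * 1024;
    const double start_to_sleep_seconds = std::log(threshold_bytes);

    std::mt19937_64 rng(std::random_device{}());

    for (double part_bytes : {1.0 * 1024 * 1024 * 1024, 300.0 * 1024 * 1024 * 1024})
    {
        /// Same formula as in the hunk below: the bound grows with the logarithm of the part size.
        uint64_t right_border_ms = static_cast<uint64_t>((std::log(part_bytes) - start_to_sleep_seconds + 0.5) * 1000);
        uint64_t sleep_ms = std::min<uint64_t>(10000, std::uniform_int_distribution<uint64_t>(1, 1 + right_border_ms)(rng));
        std::cout << part_bytes / (1024 * 1024 * 1024) << " GiB part: bound " << right_border_ms
                  << " ms, chosen sleep " << sleep_ms << " ms\n";
    }
}
```

This prints a bound of 500 ms for a 1 GiB part and about 6200 ms for a 300 GiB part, which matches the "up to 500ms / up to 7 seconds" figures quoted in the comment.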
- double start_to_sleep_seconds = std::logf(storage_settings_ptr->zero_copy_merge_mutation_min_parts_size_sleep_before_lock.value); + double start_to_sleep_seconds = std::logf((*storage_settings_ptr)[MergeTreeSetting::zero_copy_merge_mutation_min_parts_size_sleep_before_lock].value); uint64_t right_border_to_sleep_ms = static_cast((std::log(estimated_space_for_merge) - start_to_sleep_seconds + 0.5) * 1000); uint64_t time_to_sleep_milliseconds = std::min(10000UL, std::uniform_int_distribution(1, 1 + right_border_to_sleep_ms)(rng)); LOG_INFO(log, "Merge size is {} bytes (it's more than sleep threshold {}) so will intentionally sleep for {} ms to allow other replicas to took this big merge", - estimated_space_for_merge, storage_settings_ptr->zero_copy_merge_mutation_min_parts_size_sleep_before_lock, time_to_sleep_milliseconds); + estimated_space_for_merge, (*storage_settings_ptr)[MergeTreeSetting::zero_copy_merge_mutation_min_parts_size_sleep_before_lock], time_to_sleep_milliseconds); std::this_thread::sleep_for(std::chrono::milliseconds(time_to_sleep_milliseconds)); } @@ -401,7 +413,7 @@ bool MergeFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrite write_part_log(ExecutionStatus::fromCurrentException("", true)); - if (storage.getSettings()->detach_not_byte_identical_parts) + if ((*storage.getSettings())[MergeTreeSetting::detach_not_byte_identical_parts]) storage.forcefullyMovePartToDetachedAndRemoveFromMemory(std::move(part), "merge-not-byte-identical"); else storage.tryRemovePartImmediately(std::move(part)); diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index f0447e71539..9c37f205174 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -71,6 +71,25 @@ namespace Setting extern const SettingsUInt64 min_insert_block_size_rows; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_experimental_replacing_merge_with_cleanup; + extern const MergeTreeSettingsBool allow_vertical_merges_from_compact_to_wide_parts; + extern const MergeTreeSettingsMilliseconds background_task_preferred_step_execution_time_ms; + extern const MergeTreeSettingsDeduplicateMergeProjectionMode deduplicate_merge_projection_mode; + extern const MergeTreeSettingsBool enable_block_number_column; + extern const MergeTreeSettingsBool enable_block_offset_column; + extern const MergeTreeSettingsUInt64 enable_vertical_merge_algorithm; + extern const MergeTreeSettingsUInt64 merge_max_block_size_bytes; + extern const MergeTreeSettingsUInt64 merge_max_block_size; + extern const MergeTreeSettingsUInt64 min_merge_bytes_to_use_direct_io; + extern const MergeTreeSettingsFloat ratio_of_defaults_for_sparse_serialization; + extern const MergeTreeSettingsUInt64 vertical_merge_algorithm_min_bytes_to_activate; + extern const MergeTreeSettingsUInt64 vertical_merge_algorithm_min_columns_to_activate; + extern const MergeTreeSettingsUInt64 vertical_merge_algorithm_min_rows_to_activate; + extern const MergeTreeSettingsBool vertical_merge_remote_filesystem_prefetch; +} + namespace ErrorCodes { extern const int ABORTED; @@ -386,7 +405,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() const SerializationInfo::Settings info_settings = { - .ratio_of_defaults_for_sparse = global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization, + .ratio_of_defaults_for_sparse = (*global_ctx->data->getSettings())[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization], .choose_kind = true, }; @@ -553,12 
+572,12 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() const bool MergeTask::enabledBlockNumberColumn(GlobalRuntimeContextPtr global_ctx) { - return global_ctx->data->getSettings()->enable_block_number_column && global_ctx->metadata_snapshot->getGroupByTTLs().empty(); + return (*global_ctx->data->getSettings())[MergeTreeSetting::enable_block_number_column] && global_ctx->metadata_snapshot->getGroupByTTLs().empty(); } bool MergeTask::enabledBlockOffsetColumn(GlobalRuntimeContextPtr global_ctx) { - return global_ctx->data->getSettings()->enable_block_offset_column && global_ctx->metadata_snapshot->getGroupByTTLs().empty(); + return (*global_ctx->data->getSettings())[MergeTreeSetting::enable_block_offset_column] && global_ctx->metadata_snapshot->getGroupByTTLs().empty(); } void MergeTask::addGatheringColumn(GlobalRuntimeContextPtr global_ctx, const String & name, const DataTypePtr & type) @@ -629,7 +648,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::execute() void MergeTask::ExecuteAndFinalizeHorizontalPart::prepareProjectionsToMergeAndRebuild() const { - const auto mode = global_ctx->data->getSettings()->deduplicate_merge_projection_mode; + const auto mode = (*global_ctx->data->getSettings())[MergeTreeSetting::deduplicate_merge_projection_mode]; /// Under throw mode, we still choose to drop projections due to backward compatibility since some /// users might have projections before this change. if (global_ctx->data->merging_params.mode != MergeTreeData::MergingParams::Ordinary @@ -648,7 +667,8 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::prepareProjectionsToMergeAndRe for (const auto & projection : projections) { - if (merge_may_reduce_rows) + /// Checking IGNORE here is just for compatibility. + if (merge_may_reduce_rows && mode != DeduplicateMergeProjectionMode::IGNORE) { global_ctx->projections_to_rebuild.push_back(&projection); continue; @@ -772,7 +792,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::executeMergeProjections() cons bool MergeTask::ExecuteAndFinalizeHorizontalPart::executeImpl() const { Stopwatch watch(CLOCK_MONOTONIC_COARSE); - UInt64 step_time_ms = global_ctx->data->getSettings()->background_task_preferred_step_execution_time_ms.totalMilliseconds(); + UInt64 step_time_ms = (*global_ctx->data->getSettings())[MergeTreeSetting::background_task_preferred_step_execution_time_ms].totalMilliseconds(); do { @@ -825,7 +845,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::finalize() const throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts with expired TTL"); const size_t sum_compressed_bytes_upper_bound = global_ctx->merge_list_element_ptr->total_size_bytes_compressed; - ctx->need_sync = needSyncPart(ctx->sum_input_rows_upper_bound, sum_compressed_bytes_upper_bound, *global_ctx->data->getSettings()); + ctx->need_sync = global_ctx->data->getSettings()->needSyncPart(ctx->sum_input_rows_upper_bound, sum_compressed_bytes_upper_bound); } bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const @@ -867,7 +887,7 @@ bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const ctx->max_delayed_streams = max_delayed_streams; bool all_parts_on_remote_disks = std::ranges::all_of(global_ctx->future_part->parts, [](const auto & part) { return part->isStoredOnRemoteDisk(); }); - ctx->use_prefetch = all_parts_on_remote_disks && global_ctx->data->getSettings()->vertical_merge_remote_filesystem_prefetch; + ctx->use_prefetch = all_parts_on_remote_disks && 
(*global_ctx->data->getSettings())[MergeTreeSetting::vertical_merge_remote_filesystem_prefetch]; if (ctx->use_prefetch && ctx->it_name_and_type != global_ctx->gathering_columns.end()) ctx->prepared_pipeline = createPipelineForReadingOneColumn(ctx->it_name_and_type->name); @@ -989,8 +1009,8 @@ MergeTask::VerticalMergeRuntimeContext::PreparedColumnPipeline MergeTask::Vertic auto merge_step = std::make_unique( merge_column_query_plan.getCurrentDataStream(), RowsSourcesTemporaryFile::FILE_ID, - data_settings->merge_max_block_size, - data_settings->merge_max_block_size_bytes, + (*data_settings)[MergeTreeSetting::merge_max_block_size], + (*data_settings)[MergeTreeSetting::merge_max_block_size_bytes], is_result_sparse); merge_step->setStepDescription("Gather column"); merge_column_query_plan.addStep(std::move(merge_step)); @@ -1076,7 +1096,7 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const bool MergeTask::VerticalMergeStage::executeVerticalMergeForOneColumn() const { Stopwatch watch(CLOCK_MONOTONIC_COARSE); - UInt64 step_time_ms = global_ctx->data->getSettings()->background_task_preferred_step_execution_time_ms.totalMilliseconds(); + UInt64 step_time_ms = (*global_ctx->data->getSettings())[MergeTreeSetting::background_task_preferred_step_execution_time_ms].totalMilliseconds(); do { @@ -1580,13 +1600,13 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() const /// and use direct_io + aio if there is more than min_merge_bytes_to_use_direct_io ctx->read_with_direct_io = false; const auto data_settings = global_ctx->data->getSettings(); - if (data_settings->min_merge_bytes_to_use_direct_io != 0) + if ((*data_settings)[MergeTreeSetting::min_merge_bytes_to_use_direct_io] != 0) { size_t total_size = 0; for (const auto & part : global_ctx->future_part->parts) { total_size += part->getBytesOnDisk(); - if (total_size >= data_settings->min_merge_bytes_to_use_direct_io) + if (total_size >= (*data_settings)[MergeTreeSetting::min_merge_bytes_to_use_direct_io]) { LOG_DEBUG(ctx->log, "Will merge parts reading files in O_DIRECT"); ctx->read_with_direct_io = true; @@ -1669,7 +1689,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() const /// If merge is vertical we cannot calculate it ctx->blocks_are_granules_size = is_vertical_merge; - if (global_ctx->cleanup && !data_settings->allow_experimental_replacing_merge_with_cleanup) + if (global_ctx->cleanup && !(*data_settings)[MergeTreeSetting::allow_experimental_replacing_merge_with_cleanup]) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Experimental merges with CLEANUP are not allowed"); auto merge_step = std::make_unique( @@ -1678,8 +1698,8 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() const partition_key_columns, global_ctx->merging_params, (is_vertical_merge ? 
RowsSourcesTemporaryFile::FILE_ID : ""), /// rows_sources temporaty file is used only for vertical merge - data_settings->merge_max_block_size, - data_settings->merge_max_block_size_bytes, + (*data_settings)[MergeTreeSetting::merge_max_block_size], + (*data_settings)[MergeTreeSetting::merge_max_block_size_bytes], ctx->blocks_are_granules_size, global_ctx->cleanup, global_ctx->time_of_merge); @@ -1766,7 +1786,7 @@ MergeAlgorithm MergeTask::ExecuteAndFinalizeHorizontalPart::chooseMergeAlgorithm if (global_ctx->deduplicate) return MergeAlgorithm::Horizontal; - if (data_settings->enable_vertical_merge_algorithm == 0) + if ((*data_settings)[MergeTreeSetting::enable_vertical_merge_algorithm] == 0) return MergeAlgorithm::Horizontal; if (ctx->need_remove_expired_values) return MergeAlgorithm::Horizontal; @@ -1777,7 +1797,7 @@ MergeAlgorithm MergeTask::ExecuteAndFinalizeHorizontalPart::chooseMergeAlgorithm if (global_ctx->cleanup) return MergeAlgorithm::Horizontal; - if (!data_settings->allow_vertical_merges_from_compact_to_wide_parts) + if (!(*data_settings)[MergeTreeSetting::allow_vertical_merges_from_compact_to_wide_parts]) { for (const auto & part : global_ctx->future_part->parts) { @@ -1792,11 +1812,11 @@ MergeAlgorithm MergeTask::ExecuteAndFinalizeHorizontalPart::chooseMergeAlgorithm global_ctx->merging_params.mode == MergeTreeData::MergingParams::Replacing || global_ctx->merging_params.mode == MergeTreeData::MergingParams::VersionedCollapsing; - bool enough_ordinary_cols = global_ctx->gathering_columns.size() >= data_settings->vertical_merge_algorithm_min_columns_to_activate; + bool enough_ordinary_cols = global_ctx->gathering_columns.size() >= (*data_settings)[MergeTreeSetting::vertical_merge_algorithm_min_columns_to_activate]; - bool enough_total_rows = total_rows_count >= data_settings->vertical_merge_algorithm_min_rows_to_activate; + bool enough_total_rows = total_rows_count >= (*data_settings)[MergeTreeSetting::vertical_merge_algorithm_min_rows_to_activate]; - bool enough_total_bytes = total_size_bytes_uncompressed >= data_settings->vertical_merge_algorithm_min_bytes_to_activate; + bool enough_total_bytes = total_size_bytes_uncompressed >= (*data_settings)[MergeTreeSetting::vertical_merge_algorithm_min_bytes_to_activate]; bool no_parts_overflow = global_ctx->future_part->parts.size() <= RowSourcePart::MAX_PARTS; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index cff381a3429..58847817ea9 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -177,6 +177,58 @@ namespace Setting extern const SettingsUInt64 parts_to_throw_insert; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_nullable_key; + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsBool allow_suspicious_indices; + extern const MergeTreeSettingsBool assign_part_uuids; + extern const MergeTreeSettingsBool async_insert; + extern const MergeTreeSettingsBool check_sample_column_is_correct; + extern const MergeTreeSettingsBool compatibility_allow_sampling_expression_not_in_primary_key; + extern const MergeTreeSettingsUInt64 concurrent_part_removal_threshold; + extern const MergeTreeSettingsDeduplicateMergeProjectionMode deduplicate_merge_projection_mode; + extern const MergeTreeSettingsBool disable_freeze_partition_for_zero_copy_replication; + extern const MergeTreeSettingsString disk; + extern const MergeTreeSettingsBool enable_mixed_granularity_parts; + 
extern const MergeTreeSettingsBool fsync_after_insert; + extern const MergeTreeSettingsBool fsync_part_directory; + extern const MergeTreeSettingsUInt64 inactive_parts_to_delay_insert; + extern const MergeTreeSettingsUInt64 inactive_parts_to_throw_insert; + extern const MergeTreeSettingsUInt64 index_granularity_bytes; + extern const MergeTreeSettingsSeconds lock_acquire_timeout_for_background_operations; + extern const MergeTreeSettingsUInt64 max_avg_part_size_for_too_many_parts; + extern const MergeTreeSettingsUInt64 max_delay_to_insert; + extern const MergeTreeSettingsUInt64 max_delay_to_mutate_ms; + extern const MergeTreeSettingsUInt64 max_file_name_length; + extern const MergeTreeSettingsUInt64 max_parts_in_total; + extern const MergeTreeSettingsUInt64 max_projections; + extern const MergeTreeSettingsUInt64 max_suspicious_broken_parts_bytes; + extern const MergeTreeSettingsUInt64 max_suspicious_broken_parts; + extern const MergeTreeSettingsUInt64 min_bytes_for_compact_part; + extern const MergeTreeSettingsUInt64 min_bytes_for_wide_part; + extern const MergeTreeSettingsUInt64 min_bytes_to_rebalance_partition_over_jbod; + extern const MergeTreeSettingsUInt64 min_delay_to_insert_ms; + extern const MergeTreeSettingsUInt64 min_delay_to_mutate_ms; + extern const MergeTreeSettingsUInt64 min_rows_for_compact_part; + extern const MergeTreeSettingsUInt64 min_rows_for_wide_part; + extern const MergeTreeSettingsUInt64 number_of_mutations_to_delay; + extern const MergeTreeSettingsUInt64 number_of_mutations_to_throw; + extern const MergeTreeSettingsSeconds old_parts_lifetime; + extern const MergeTreeSettingsUInt64 part_moves_between_shards_enable; + extern const MergeTreeSettingsUInt64 parts_to_delay_insert; + extern const MergeTreeSettingsUInt64 parts_to_throw_insert; + extern const MergeTreeSettingsFloat ratio_of_defaults_for_sparse_serialization; + extern const MergeTreeSettingsBool remove_empty_parts; + extern const MergeTreeSettingsBool remove_rolled_back_parts_immediately; + extern const MergeTreeSettingsBool replace_long_file_name_to_hash; + extern const MergeTreeSettingsUInt64 simultaneous_parts_removal_limit; + extern const MergeTreeSettingsUInt64 sleep_before_loading_outdated_parts_ms; + extern const MergeTreeSettingsString storage_policy; + extern const MergeTreeSettingsFloat zero_copy_concurrent_part_removal_max_postpone_ratio; + extern const MergeTreeSettingsUInt64 zero_copy_concurrent_part_removal_max_split_times; +} + namespace ErrorCodes { extern const int NO_SUCH_DATA_PART; @@ -393,7 +445,7 @@ MergeTreeData::MergeTreeData( bool sanity_checks = mode <= LoadingStrictnessLevel::CREATE; - allow_nullable_key = !sanity_checks || settings->allow_nullable_key; + allow_nullable_key = !sanity_checks || (*settings)[MergeTreeSetting::allow_nullable_key]; /// Check sanity of MergeTreeSettings. Only when table is created. if (sanity_checks) @@ -428,8 +480,8 @@ MergeTreeData::MergeTreeData( if (metadata_.sampling_key.definition_ast != nullptr) { /// This is for backward compatibility. 
- checkSampleExpression(metadata_, !sanity_checks || settings->compatibility_allow_sampling_expression_not_in_primary_key, - settings->check_sample_column_is_correct && sanity_checks); + checkSampleExpression(metadata_, !sanity_checks || (*settings)[MergeTreeSetting::compatibility_allow_sampling_expression_not_in_primary_key], + (*settings)[MergeTreeSetting::check_sample_column_is_correct] && sanity_checks); } checkColumnFilenamesForCollision(metadata_.getColumns(), *settings, sanity_checks); @@ -488,10 +540,10 @@ StoragePolicyPtr MergeTreeData::getStoragePolicy() const StoragePolicyPtr storage_policy; - if (settings->disk.changed) - storage_policy = context->getStoragePolicyFromDisk(settings->disk); + if ((*settings)[MergeTreeSetting::disk].changed) + storage_policy = context->getStoragePolicyFromDisk((*settings)[MergeTreeSetting::disk]); else - storage_policy = context->getStoragePolicy(settings->storage_policy); + storage_policy = context->getStoragePolicy((*settings)[MergeTreeSetting::storage_policy]); return storage_policy; } @@ -610,7 +662,7 @@ void MergeTreeData::checkProperties( throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key must be a prefix of the sorting key, but its length: " "{} is greater than the sorting key length: {}", primary_key_size, sorting_key_size); - bool allow_suspicious_indices = getSettings()->allow_suspicious_indices; + bool allow_suspicious_indices = (*getSettings())[MergeTreeSetting::allow_suspicious_indices]; if (local_context) allow_suspicious_indices = local_context->getSettingsRef()[Setting::allow_suspicious_indices]; @@ -714,8 +766,8 @@ void MergeTreeData::checkProperties( throw Exception(ErrorCodes::LOGICAL_ERROR, "Projection with name {} already exists", backQuote(projection.name)); const auto settings = getSettings(); - if (projections_names.size() >= settings->max_projections) - throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Maximum limit of {} projection(s) exceeded", settings->max_projections); + if (projections_names.size() >= (*settings)[MergeTreeSetting::max_projections]) + throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Maximum limit of {} projection(s) exceeded", (*settings)[MergeTreeSetting::max_projections]); /// We cannot alter a projection so far. So here we do not try to find a projection in old metadata. 
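In the checkProperties hunk above, allow_suspicious_indices is first taken from the table's MergeTree settings and then overridden by the query-level setting whenever a local context is supplied. A minimal sketch of that precedence follows; TableSettings and QuerySettings are hypothetical stand-ins, not ClickHouse classes.

```cpp
#include <iostream>
#include <optional>

/// Table-level default versus an optional query-level override, mirroring how checkProperties()
/// reads the storage setting and then replaces it with the query context's value when present.
struct TableSettings { bool allow_suspicious_indices = false; };
struct QuerySettings { bool allow_suspicious_indices = true; };

bool effectiveAllowSuspiciousIndices(const TableSettings & table, const std::optional<QuerySettings> & query_context)
{
    bool allow = table.allow_suspicious_indices;              // table-level default
    if (query_context)
        allow = query_context->allow_suspicious_indices;      // query-level value wins when a context is passed
    return allow;
}

int main()
{
    TableSettings table;
    std::cout << effectiveAllowSuspiciousIndices(table, std::nullopt) << "\n";     // 0: table default applies
    std::cout << effectiveAllowSuspiciousIndices(table, QuerySettings{}) << "\n";  // 1: query override applies
}
```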
bool is_aggregate = projection.type == ProjectionDescription::Type::Aggregate; @@ -905,7 +957,15 @@ void checkSpecialColumn(const std::string_view column_meta_name, const AlterComm { if (command.type == AlterCommand::MODIFY_COLUMN) { - if (!typeid_cast(command.data_type.get())) + if (!command.data_type) + { + throw Exception( + ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, + "Trying to modify settings for column {} ({}) ", + column_meta_name, + command.column_name); + } + else if (!typeid_cast(command.data_type.get())) { throw Exception( ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, @@ -1908,7 +1968,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optionalenable_mixed_granularity_parts) + if (have_non_adaptive_parts && have_adaptive_parts && !(*settings)[MergeTreeSetting::enable_mixed_granularity_parts]) throw Exception(ErrorCodes::LOGICAL_ERROR, "Table contains parts with adaptive and non adaptive marks, " "but `setting enable_mixed_granularity_parts` is disabled"); @@ -1919,7 +1979,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional settings->max_suspicious_broken_parts) + if (suspicious_broken_parts > (*settings)[MergeTreeSetting::max_suspicious_broken_parts]) throw Exception( ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS, "Suspiciously many ({} parts, {} in total) broken parts " @@ -1928,9 +1988,9 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optionalmax_suspicious_broken_parts); + (*settings)[MergeTreeSetting::max_suspicious_broken_parts]); - if (suspicious_broken_parts_bytes > settings->max_suspicious_broken_parts_bytes) + if (suspicious_broken_parts_bytes > (*settings)[MergeTreeSetting::max_suspicious_broken_parts_bytes]) throw Exception( ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS, "Suspiciously big size ({} parts, {} in total) of all broken " @@ -1939,7 +1999,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optionalmax_suspicious_broken_parts_bytes)); + formatReadableSizeWithBinarySuffix((*settings)[MergeTreeSetting::max_suspicious_broken_parts_bytes])); } if (suspicious_broken_unexpected_parts != 0) @@ -2086,13 +2146,13 @@ try is_async ? "asynchronously" : "synchronously"); } - std::this_thread::sleep_for(std::chrono::milliseconds(static_cast(getSettings()->sleep_before_loading_outdated_parts_ms))); + std::this_thread::sleep_for(std::chrono::milliseconds(static_cast((*getSettings())[MergeTreeSetting::sleep_before_loading_outdated_parts_ms]))); ThreadFuzzer::maybeInjectSleep(); /// Acquire shared lock because 'relative_data_path' is used while loading parts. TableLockHolder shared_lock; if (is_async) - shared_lock = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + shared_lock = lockForShare(RWLockImpl::NO_QUERY, (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); std::atomic_size_t num_loaded_parts = 0; @@ -2129,6 +2189,8 @@ try runner([&, my_part = part]() { + auto blocker_for_runner_thread = CannotAllocateThreadFaultInjector::blockFaultInjections(); + auto res = loadDataPartWithRetries( my_part->info, my_part->name, my_part->disk, DataPartState::Outdated, data_parts_mutex, loading_parts_initial_backoff_ms, @@ -2374,7 +2436,7 @@ size_t MergeTreeData::clearOldTemporaryDirectories(const String & root_path, siz /// We don't control the amount of refs for temporary parts so we cannot decide can we remove blobs /// or not. 
So we are not doing it bool keep_shared = false; - if (disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication && supportsReplication()) + if (disk->supportZeroCopyReplication() && (*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] && supportsReplication()) { LOG_WARNING(log, "Since zero-copy replication is enabled we are not going to remove blobs from shared storage for {}", full_path); keep_shared = true; @@ -2431,7 +2493,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) /// Please don't use "zero-copy replication" (a non-production feature) in production. /// It is not ready for production usage. Don't use it. - bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication; + bool need_remove_parts_in_order = supportsReplication() && (*getSettings())[MergeTreeSetting::allow_remote_fs_zero_copy_replication]; if (need_remove_parts_in_order) { @@ -2465,7 +2527,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) auto time_now = time(nullptr); { - auto removal_limit = getSettings()->simultaneous_parts_removal_limit; + auto removal_limit = (*getSettings())[MergeTreeSetting::simultaneous_parts_removal_limit]; size_t current_removal_limit = removal_limit == 0 ? std::numeric_limits::max() : static_cast(removal_limit); auto parts_lock = lockParts(); @@ -2509,10 +2571,10 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) } auto part_remove_time = part->remove_time.load(std::memory_order_relaxed); - bool reached_removal_time = part_remove_time <= time_now && time_now - part_remove_time >= getSettings()->old_parts_lifetime.totalSeconds(); + bool reached_removal_time = part_remove_time <= time_now && time_now - part_remove_time >= (*getSettings())[MergeTreeSetting::old_parts_lifetime].totalSeconds(); if ((reached_removal_time && !has_skipped_mutation_parent(part)) || force - || (part->version.creation_csn == Tx::RolledBackCSN && getSettings()->remove_rolled_back_parts_immediately)) + || (part->version.creation_csn == Tx::RolledBackCSN && (*getSettings())[MergeTreeSetting::remove_rolled_back_parts_immediately])) { part->removal_state.store(DataPartRemovalState::REMOVED, std::memory_order_relaxed); parts_to_delete.emplace_back(it); @@ -2687,7 +2749,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t } }; - if (parts_to_remove.size() <= settings->concurrent_part_removal_threshold) + if (parts_to_remove.size() <= (*settings)[MergeTreeSetting::concurrent_part_removal_threshold]) { remove_single_thread(); return; @@ -2699,7 +2761,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t /// This flag disallow straightforward concurrent parts removal. It's required only in case /// when we have parts on zero-copy disk + at least some of them were mutated. bool remove_parts_in_order = false; - if (settings->allow_remote_fs_zero_copy_replication && dynamic_cast(this) != nullptr) + if ((*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] && dynamic_cast(this) != nullptr) { remove_parts_in_order = std::any_of( parts_to_remove.begin(), parts_to_remove.end(), @@ -2837,8 +2899,8 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t /// It may happen that we have a huge part covering thousands small parts. /// In this case, we will get a huge range that will be process by only one thread causing really long tail latency. 
/// Let's try to exclude such parts in order to get smaller tasks for thread pool and more uniform distribution. - if (settings->concurrent_part_removal_threshold < parts_in_range.size() && - split_times < settings->zero_copy_concurrent_part_removal_max_split_times) + if ((*settings)[MergeTreeSetting::concurrent_part_removal_threshold] < parts_in_range.size() && + split_times < (*settings)[MergeTreeSetting::zero_copy_concurrent_part_removal_max_split_times]) { auto smaller_parts_pred = [&range](const DataPartPtr & part) { @@ -2849,7 +2911,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t size_t top_level_count = parts_in_range.size() - covered_parts_count; chassert(top_level_count); Float32 parts_to_exclude_ratio = static_cast(top_level_count) / parts_in_range.size(); - if (settings->zero_copy_concurrent_part_removal_max_postpone_ratio < parts_to_exclude_ratio) + if ((*settings)[MergeTreeSetting::zero_copy_concurrent_part_removal_max_postpone_ratio] < parts_to_exclude_ratio) { /// Most likely we have a long mutations chain here LOG_DEBUG(log, "Block range {} contains {} parts including {} top-level parts, will not try to split it", @@ -2905,7 +2967,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t size_t MergeTreeData::clearEmptyParts() { - if (!getSettings()->remove_empty_parts) + if (!(*getSettings())[MergeTreeSetting::remove_empty_parts]) return 0; std::vector parts_names_to_drop; @@ -3037,7 +3099,7 @@ void MergeTreeData::dropAllData() for (const auto & part : detached_parts) { bool is_zero_copy = supportsReplication() && part.disk->supportZeroCopyReplication() - && settings_ptr->allow_remote_fs_zero_copy_replication; + && (*settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]; try { bool keep_shared = removeDetachedPart(part.disk, fs::path(relative_data_path) / DETACHED_DIR_NAME / part.dir_name / "", part.dir_name); @@ -3078,7 +3140,7 @@ void MergeTreeData::dropAllData() { if (!disk->isDirectoryEmpty(relative_data_path) && supportsReplication() && disk->supportZeroCopyReplication() - && settings_ptr->allow_remote_fs_zero_copy_replication) + && (*settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { std::vector files_left; disk->listFiles(relative_data_path, files_left); @@ -3240,7 +3302,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context if (std::any_of(commands.begin(), commands.end(), [](const AlterCommand & c) { return c.type == AlterCommand::ADD_PROJECTION; })) { if (merging_params.mode != MergingParams::Mode::Ordinary - && settings_from_storage->deduplicate_merge_projection_mode == DeduplicateMergeProjectionMode::THROW) + && (*settings_from_storage)[MergeTreeSetting::deduplicate_merge_projection_mode] == DeduplicateMergeProjectionMode::THROW) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Projection is fully supported in {} with deduplicate_merge_projection_mode = throw. 
" "Use 'drop' or 'rebuild' option of deduplicate_merge_projection_mode.", @@ -3410,8 +3472,8 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context throw Exception(ErrorCodes::BAD_ARGUMENTS, "ALTER MODIFY SAMPLE BY is not supported for default-partitioned tables created with the old syntax"); - checkSampleExpression(new_metadata, getSettings()->compatibility_allow_sampling_expression_not_in_primary_key, - getSettings()->check_sample_column_is_correct); + checkSampleExpression(new_metadata, (*getSettings())[MergeTreeSetting::compatibility_allow_sampling_expression_not_in_primary_key], + (*getSettings())[MergeTreeSetting::check_sample_column_is_correct]); } if (command.type == AlterCommand::ADD_INDEX && !is_custom_partitioned) { @@ -3671,7 +3733,7 @@ MergeTreeDataPartFormat MergeTreeData::choosePartFormat(size_t bytes_uncompresse }; auto part_type = PartType::Wide; - if (satisfies(settings->min_bytes_for_wide_part, settings->min_rows_for_wide_part)) + if (satisfies((*settings)[MergeTreeSetting::min_bytes_for_wide_part], (*settings)[MergeTreeSetting::min_rows_for_wide_part])) part_type = PartType::Compact; return {part_type, PartStorageType::Full}; @@ -4490,7 +4552,7 @@ size_t MergeTreeData::getNumberOfOutdatedPartsWithExpiredRemovalTime() const for (const auto & part : outdated_parts_range) { auto part_remove_time = part->remove_time.load(std::memory_order_relaxed); - if (part_remove_time <= time_now && time_now - part_remove_time >= getSettings()->old_parts_lifetime.totalSeconds() && isSharedPtrUnique(part)) + if (part_remove_time <= time_now && time_now - part_remove_time >= (*getSettings())[MergeTreeSetting::old_parts_lifetime].totalSeconds() && isSharedPtrUnique(part)) ++res; } @@ -4565,7 +4627,7 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const Contex const size_t parts_count_in_total = getActivePartsCount(); /// Check if we have too many parts in total - if (allow_throw && parts_count_in_total >= settings->max_parts_in_total) + if (allow_throw && parts_count_in_total >= (*settings)[MergeTreeSetting::max_parts_in_total]) { ProfileEvents::increment(ProfileEvents::RejectedInserts); throw Exception( @@ -4578,10 +4640,10 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const Contex size_t outdated_parts_over_threshold = 0; { size_t outdated_parts_count_in_partition = 0; - if (settings->inactive_parts_to_throw_insert > 0 || settings->inactive_parts_to_delay_insert > 0) + if ((*settings)[MergeTreeSetting::inactive_parts_to_throw_insert] > 0 || (*settings)[MergeTreeSetting::inactive_parts_to_delay_insert] > 0) outdated_parts_count_in_partition = getMaxOutdatedPartsCountForPartition(); - if (allow_throw && settings->inactive_parts_to_throw_insert > 0 && outdated_parts_count_in_partition >= settings->inactive_parts_to_throw_insert) + if (allow_throw && (*settings)[MergeTreeSetting::inactive_parts_to_throw_insert] > 0 && outdated_parts_count_in_partition >= (*settings)[MergeTreeSetting::inactive_parts_to_throw_insert]) { ProfileEvents::increment(ProfileEvents::RejectedInserts); throw Exception( @@ -4589,21 +4651,21 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const Contex "Too many inactive parts ({}) in table '{}'. 
Parts cleaning are processing significantly slower than inserts", outdated_parts_count_in_partition, getLogName()); } - if (settings->inactive_parts_to_delay_insert > 0 && outdated_parts_count_in_partition >= settings->inactive_parts_to_delay_insert) - outdated_parts_over_threshold = outdated_parts_count_in_partition - settings->inactive_parts_to_delay_insert + 1; + if ((*settings)[MergeTreeSetting::inactive_parts_to_delay_insert] > 0 && outdated_parts_count_in_partition >= (*settings)[MergeTreeSetting::inactive_parts_to_delay_insert]) + outdated_parts_over_threshold = outdated_parts_count_in_partition - (*settings)[MergeTreeSetting::inactive_parts_to_delay_insert] + 1; } auto [parts_count_in_partition, size_of_partition] = getMaxPartsCountAndSizeForPartition(); size_t average_part_size = parts_count_in_partition ? size_of_partition / parts_count_in_partition : 0; const auto active_parts_to_delay_insert - = query_settings[Setting::parts_to_delay_insert] ? query_settings[Setting::parts_to_delay_insert] : settings->parts_to_delay_insert; + = query_settings[Setting::parts_to_delay_insert] ? query_settings[Setting::parts_to_delay_insert] : (*settings)[MergeTreeSetting::parts_to_delay_insert]; const auto active_parts_to_throw_insert - = query_settings[Setting::parts_to_throw_insert] ? query_settings[Setting::parts_to_throw_insert] : settings->parts_to_throw_insert; + = query_settings[Setting::parts_to_throw_insert] ? query_settings[Setting::parts_to_throw_insert] : (*settings)[MergeTreeSetting::parts_to_throw_insert]; size_t active_parts_over_threshold = 0; { bool parts_are_large_enough_in_average - = settings->max_avg_part_size_for_too_many_parts && average_part_size > settings->max_avg_part_size_for_too_many_parts; + = (*settings)[MergeTreeSetting::max_avg_part_size_for_too_many_parts] && average_part_size > (*settings)[MergeTreeSetting::max_avg_part_size_for_too_many_parts]; if (allow_throw && parts_count_in_partition >= active_parts_to_throw_insert && !parts_are_large_enough_in_average) { @@ -4639,11 +4701,11 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const Contex { parts_over_threshold = outdated_parts_over_threshold; allowed_parts_over_threshold = outdated_parts_over_threshold; /// if throw threshold is not set, will use max delay - if (settings->inactive_parts_to_throw_insert > 0) - allowed_parts_over_threshold = settings->inactive_parts_to_throw_insert - settings->inactive_parts_to_delay_insert; + if ((*settings)[MergeTreeSetting::inactive_parts_to_throw_insert] > 0) + allowed_parts_over_threshold = (*settings)[MergeTreeSetting::inactive_parts_to_throw_insert] - (*settings)[MergeTreeSetting::inactive_parts_to_delay_insert]; } - const UInt64 max_delay_milliseconds = (settings->max_delay_to_insert > 0 ? settings->max_delay_to_insert * 1000 : 1000); + const UInt64 max_delay_milliseconds = ((*settings)[MergeTreeSetting::max_delay_to_insert] > 0 ? 
(*settings)[MergeTreeSetting::max_delay_to_insert] * 1000 : 1000); if (allowed_parts_over_threshold == 0 || parts_over_threshold > allowed_parts_over_threshold) { delay_milliseconds = max_delay_milliseconds; @@ -4651,7 +4713,7 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, const Contex else { double delay_factor = static_cast(parts_over_threshold) / allowed_parts_over_threshold; - const UInt64 min_delay_milliseconds = settings->min_delay_to_insert_ms; + const UInt64 min_delay_milliseconds = (*settings)[MergeTreeSetting::min_delay_to_insert_ms]; delay_milliseconds = std::max(min_delay_milliseconds, static_cast(max_delay_milliseconds * delay_factor)); } } @@ -4676,10 +4738,10 @@ void MergeTreeData::delayMutationOrThrowIfNeeded(Poco::Event * until, const Cont const auto & query_settings = query_context->getSettingsRef(); size_t num_mutations_to_delay = query_settings[Setting::number_of_mutations_to_delay] ? query_settings[Setting::number_of_mutations_to_delay] - : settings->number_of_mutations_to_delay; + : (*settings)[MergeTreeSetting::number_of_mutations_to_delay]; size_t num_mutations_to_throw = query_settings[Setting::number_of_mutations_to_throw] ? query_settings[Setting::number_of_mutations_to_throw] - : settings->number_of_mutations_to_throw; + : (*settings)[MergeTreeSetting::number_of_mutations_to_throw]; if (!num_mutations_to_delay && !num_mutations_to_throw) return; @@ -4702,7 +4764,7 @@ void MergeTreeData::delayMutationOrThrowIfNeeded(Poco::Event * until, const Cont size_t allowed_mutations_over_threshold = num_mutations_to_throw - num_mutations_to_delay; double delay_factor = std::min(static_cast(mutations_over_threshold) / allowed_mutations_over_threshold, 1.0); - size_t delay_milliseconds = static_cast(interpolateLinear(settings->min_delay_to_mutate_ms, settings->max_delay_to_mutate_ms, delay_factor)); + size_t delay_milliseconds = static_cast(interpolateLinear((*settings)[MergeTreeSetting::min_delay_to_mutate_ms], (*settings)[MergeTreeSetting::max_delay_to_mutate_ms], delay_factor)); ProfileEvents::increment(ProfileEvents::DelayedMutations); ProfileEvents::increment(ProfileEvents::DelayedMutationsMilliseconds, delay_milliseconds); @@ -4963,7 +5025,8 @@ void MergeTreeData::addPartContributionToColumnAndSecondaryIndexSizes(const Data total_column_size.add(part_column_size); } - auto indexes_descriptions = getInMemoryMetadataPtr()->secondary_indices; + const auto metadata_snapshot = getInMemoryMetadataPtr(); + auto indexes_descriptions = metadata_snapshot->secondary_indices; for (const auto & index : indexes_descriptions) { IndexSize & total_secondary_index_size = secondary_index_sizes[index.name]; @@ -5373,7 +5436,7 @@ Pipe MergeTreeData::alterPartition( case PartitionCommand::MoveDestinationType::SHARD: { - if (!getSettings()->part_moves_between_shards_enable) + if (!(*getSettings())[MergeTreeSetting::part_moves_between_shards_enable]) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Moving parts between shards is experimental and work in progress" ", see part_moves_between_shards_enable setting"); @@ -5476,7 +5539,7 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( make_temporary_hard_links = false; hold_storage_and_part_ptrs = true; } - else if (supportsReplication() && part->getDataPartStorage().supportZeroCopyReplication() && getSettings()->allow_remote_fs_zero_copy_replication) + else if (supportsReplication() && part->getDataPartStorage().supportZeroCopyReplication() && 
(*getSettings())[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { /// Hard links don't work correctly with zero copy replication. make_temporary_hard_links = false; @@ -6251,7 +6314,7 @@ bool MergeTreeData::hasProjection() const bool MergeTreeData::areAsynchronousInsertsEnabled() const { - return getSettings()->async_insert; + return (*getSettings())[MergeTreeSetting::async_insert]; } MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states) const @@ -6431,7 +6494,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const .withPartFormatFromDisk() .build(); - loadPartAndFixMetadataImpl(part, local_context, getInMemoryMetadataPtr()->getMetadataVersion(), getSettings()->fsync_after_insert); + loadPartAndFixMetadataImpl(part, local_context, getInMemoryMetadataPtr()->getMetadataVersion(), (*getSettings())[MergeTreeSetting::fsync_after_insert]); loaded_parts.push_back(part); } @@ -6984,7 +7047,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( size_t align_of_state = func->alignOfData(); auto * place = arena.alignedAlloc(size_of_state, align_of_state); func->create(place); - if (const AggregateFunctionCount * agg_count = typeid_cast(func.get())) + if (const AggregateFunctionCount * /*agg_count*/ _ = typeid_cast(func.get())) AggregateFunctionCount::set(place, value.safeGet()); else { @@ -7294,7 +7357,7 @@ void MergeTreeData::checkColumnFilenamesForCollision(const ColumnsDescription & String stream_name; auto full_stream_name = ISerialization::getFileNameForStream(column, substream_path); - if (settings.replace_long_file_name_to_hash && full_stream_name.size() > settings.max_file_name_length) + if (settings[MergeTreeSetting::replace_long_file_name_to_hash] && full_stream_name.size() > settings[MergeTreeSetting::max_file_name_length]) stream_name = sipHash128String(full_stream_name); else stream_name = full_stream_name; @@ -7305,7 +7368,7 @@ void MergeTreeData::checkColumnFilenamesForCollision(const ColumnsDescription & auto serialization = column.type->getDefaultSerialization(); serialization->enumerateStreams(callback); - if (column.type->supportsSparseSerialization() && settings.ratio_of_defaults_for_sparse_serialization < 1.0) + if (column.type->supportsSparseSerialization() && settings[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization] < 1.0) { auto sparse_serialization = column.type->getSparseSerialization(); sparse_serialization->enumerateStreams(callback); @@ -7323,7 +7386,7 @@ void MergeTreeData::checkColumnFilenamesForCollision(const ColumnsDescription & "Columns '{} {}' and '{} {}' have streams ({} and {}) with collision in file name {}", column.name, column.type->getName(), other_column_name, other_type->getName(), full_stream_name, other_full_name, stream_name); - if (settings.replace_long_file_name_to_hash) + if (settings[MergeTreeSetting::replace_long_file_name_to_hash]) message += ". 
It may be a collision between a filename for one column and a hash of filename for another column (see setting 'replace_long_file_name_to_hash')"; if (throw_on_error) @@ -7471,7 +7534,7 @@ std::pair MergeTreeData::cloneAn auto out_metadata = dst_part_storage->writeFile(IMergeTreeDataPart::METADATA_VERSION_FILE_NAME, 4096, getContext()->getWriteSettings()); writeText(metadata_snapshot->getMetadataVersion(), *out_metadata); out_metadata->finalize(); - if (getSettings()->fsync_after_insert) + if ((*getSettings())[MergeTreeSetting::fsync_after_insert]) out_metadata->sync(); } @@ -7532,8 +7595,8 @@ std::pair MergeTreeData::cloneAn bool MergeTreeData::canUseAdaptiveGranularity() const { const auto settings = getSettings(); - return settings->index_granularity_bytes != 0 - && (settings->enable_mixed_granularity_parts || !has_non_adaptive_index_granularity_parts); + return (*settings)[MergeTreeSetting::index_granularity_bytes] != 0 + && ((*settings)[MergeTreeSetting::enable_mixed_granularity_parts] || !has_non_adaptive_index_granularity_parts); } String MergeTreeData::getFullPathOnDisk(const DiskPtr & disk) const @@ -7691,8 +7754,8 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( } } - if (supportsReplication() && settings->disable_freeze_partition_for_zero_copy_replication - && settings->allow_remote_fs_zero_copy_replication && has_zero_copy_part) + if (supportsReplication() && (*settings)[MergeTreeSetting::disable_freeze_partition_for_zero_copy_replication] + && (*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] && has_zero_copy_part) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "FREEZE PARTITION queries are disabled."); String backup_name = (!with_name.empty() ? escapeForFileName(with_name) : toString(increment)); @@ -7796,7 +7859,7 @@ bool MergeTreeData::canReplacePartition(const DataPartPtr & src_part) const { const auto settings = getSettings(); - if (!settings->enable_mixed_granularity_parts || settings->index_granularity_bytes == 0) + if (!(*settings)[MergeTreeSetting::enable_mixed_granularity_parts] || (*settings)[MergeTreeSetting::index_granularity_bytes] == 0) { if (!canUseAdaptiveGranularity() && src_part->index_granularity_info.mark_type.adaptive) return false; @@ -8076,7 +8139,7 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & /// common for ordinary merge tree. So it's a bad design and should /// be fixed. auto disk = moving_part.reserved_space->getDisk(); - if (supportsReplication() && disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication) + if (supportsReplication() && disk->supportZeroCopyReplication() && (*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { /// This loop is not endless, if shutdown called/connection failed/replica became readonly /// we will return true from waitZeroCopyLock and createZeroCopyLock will return nullopt. 
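As context for the delayInsertOrThrowIfNeeded changes above: the delay grows linearly with how far the partition is over the delay threshold and saturates at the configured maximum. The sketch below reproduces that arithmetic as a standalone function with illustrative numbers; it models the formula only, not the production code path.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

// parts_over_threshold: how many parts the partition is above the delay threshold.
// allowed_parts_over_threshold: throw threshold minus delay threshold (the width of the linear ramp).
uint64_t computeDelayMs(size_t parts_over_threshold, size_t allowed_parts_over_threshold,
                        uint64_t max_delay_ms, uint64_t min_delay_ms)
{
    if (allowed_parts_over_threshold == 0 || parts_over_threshold > allowed_parts_over_threshold)
        return max_delay_ms;  // saturate at the maximum delay
    double delay_factor = static_cast<double>(parts_over_threshold) / allowed_parts_over_threshold;
    return std::max(min_delay_ms, static_cast<uint64_t>(max_delay_ms * delay_factor));
}

int main()
{
    // E.g. 21 parts over the delay threshold with a ramp width of 150 parts -> 14% of the max delay.
    std::cout << computeDelayMs(21, 150, /*max*/ 1000, /*min*/ 10) << " ms\n";  // prints 140 ms
}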
@@ -8257,15 +8320,15 @@ bool MergeTreeData::canUsePolymorphicParts(const MergeTreeSettings & settings, S { if (!canUseAdaptiveGranularity()) { - if ((settings.min_rows_for_wide_part != 0 || settings.min_bytes_for_wide_part != 0 - || settings.min_rows_for_compact_part != 0 || settings.min_bytes_for_compact_part != 0)) + if ((settings[MergeTreeSetting::min_rows_for_wide_part] != 0 || settings[MergeTreeSetting::min_bytes_for_wide_part] != 0 + || settings[MergeTreeSetting::min_rows_for_compact_part] != 0 || settings[MergeTreeSetting::min_bytes_for_compact_part] != 0)) { out_reason = fmt::format( "Table can't create parts with adaptive granularity, but settings" " min_rows_for_wide_part = {}" ", min_bytes_for_wide_part = {}" ". Parts with non-adaptive granularity can be stored only in Wide (default) format.", - settings.min_rows_for_wide_part, settings.min_bytes_for_wide_part); + settings[MergeTreeSetting::min_rows_for_wide_part], settings[MergeTreeSetting::min_bytes_for_wide_part]); } return false; @@ -8383,7 +8446,7 @@ ReservationPtr MergeTreeData::balancedReservation( bool is_insert) { ReservationPtr reserved_space; - auto min_bytes_to_rebalance_partition_over_jbod = getSettings()->min_bytes_to_rebalance_partition_over_jbod; + auto min_bytes_to_rebalance_partition_over_jbod = (*getSettings())[MergeTreeSetting::min_bytes_to_rebalance_partition_over_jbod]; if (tagger_ptr && min_bytes_to_rebalance_partition_over_jbod > 0 && part_size >= min_bytes_to_rebalance_partition_over_jbod) { try @@ -8601,11 +8664,13 @@ void MergeTreeData::resetSerializationHints(const DataPartsLock & /*lock*/) { SerializationInfo::Settings settings = { - .ratio_of_defaults_for_sparse = getSettings()->ratio_of_defaults_for_sparse_serialization, + .ratio_of_defaults_for_sparse = (*getSettings())[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization], .choose_kind = true, }; - const auto & storage_columns = getInMemoryMetadataPtr()->getColumns(); + const auto metadata_snapshot = getInMemoryMetadataPtr(); + const auto & storage_columns = metadata_snapshot->getColumns(); + serialization_hints = SerializationInfoByName(storage_columns.getAllPhysical(), settings); auto range = getDataPartsStateRange(DataPartState::Active); @@ -8616,7 +8681,8 @@ void MergeTreeData::resetSerializationHints(const DataPartsLock & /*lock*/) template void MergeTreeData::updateSerializationHints(const AddedParts & added_parts, const RemovedParts & removed_parts, const DataPartsLock & /*lock*/) { - const auto & storage_columns = getInMemoryMetadataPtr()->getColumns(); + const auto metadata_snapshot = getInMemoryMetadataPtr(); + const auto & storage_columns = metadata_snapshot->getColumns(); for (const auto & part : added_parts) updateSerializationHintsForPart(part, storage_columns, serialization_hints, false); @@ -8741,7 +8807,7 @@ std::pair MergeTreeData::createE .withPartInfo(new_part_info) .build(); - if (settings->assign_part_uuids) + if ((*settings)[MergeTreeSetting::assign_part_uuids]) new_data_part->uuid = UUIDHelpers::generateV4(); new_data_part->setColumns(columns, {}, metadata_snapshot->getMetadataVersion()); @@ -8777,7 +8843,7 @@ std::pair MergeTreeData::createE new_data_part_storage->createDirectories(); - if (getSettings()->fsync_part_directory) + if ((*getSettings())[MergeTreeSetting::fsync_part_directory]) sync_guard = new_data_part_storage->getDirectorySyncGuard(); } @@ -8791,7 +8857,7 @@ std::pair MergeTreeData::createE ColumnsStatistics{}, compression_codec, txn ? 
txn->tid : Tx::PrehistoricTID); - bool sync_on_insert = settings->fsync_after_insert; + bool sync_on_insert = (*settings)[MergeTreeSetting::fsync_after_insert]; out.write(block); /// Here is no projections as no data inside diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 2855ca01253..b05466ae9b9 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -51,6 +51,22 @@ namespace CurrentMetrics namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 max_bytes_to_merge_at_max_space_in_pool; + extern const MergeTreeSettingsUInt64 max_bytes_to_merge_at_min_space_in_pool; + extern const MergeTreeSettingsUInt64 max_number_of_mutations_for_replica; + extern const MergeTreeSettingsUInt64 max_parts_to_merge_at_once; + extern const MergeTreeSettingsInt64 merge_with_recompression_ttl_timeout; + extern const MergeTreeSettingsInt64 merge_with_ttl_timeout; + extern const MergeTreeSettingsBool min_age_to_force_merge_on_partition_only; + extern const MergeTreeSettingsUInt64 min_age_to_force_merge_seconds; + extern const MergeTreeSettingsUInt64 number_of_free_entries_in_pool_to_execute_optimize_entire_partition; + extern const MergeTreeSettingsUInt64 number_of_free_entries_in_pool_to_execute_mutation; + extern const MergeTreeSettingsUInt64 number_of_free_entries_in_pool_to_lower_max_size_of_merge; + extern const MergeTreeSettingsBool ttl_only_drop_parts; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -97,13 +113,13 @@ UInt64 MergeTreeDataMergerMutator::getMaxSourcePartsSizeForMerge(size_t max_coun /// One entry is probably the entry where this function is executed. /// This will protect from bad settings. 
UInt64 max_size = 0; - if (scheduled_tasks_count <= 1 || free_entries >= data_settings->number_of_free_entries_in_pool_to_lower_max_size_of_merge) - max_size = data_settings->max_bytes_to_merge_at_max_space_in_pool; + if (scheduled_tasks_count <= 1 || free_entries >= (*data_settings)[MergeTreeSetting::number_of_free_entries_in_pool_to_lower_max_size_of_merge]) + max_size = (*data_settings)[MergeTreeSetting::max_bytes_to_merge_at_max_space_in_pool]; else max_size = static_cast(interpolateExponential( - data_settings->max_bytes_to_merge_at_min_space_in_pool, - data_settings->max_bytes_to_merge_at_max_space_in_pool, - static_cast(free_entries) / data_settings->number_of_free_entries_in_pool_to_lower_max_size_of_merge)); + (*data_settings)[MergeTreeSetting::max_bytes_to_merge_at_min_space_in_pool], + (*data_settings)[MergeTreeSetting::max_bytes_to_merge_at_max_space_in_pool], + static_cast(free_entries) / (*data_settings)[MergeTreeSetting::number_of_free_entries_in_pool_to_lower_max_size_of_merge])); return std::min(max_size, static_cast(data.getStoragePolicy()->getMaxUnreservedFreeSpace() / DISK_USAGE_COEFFICIENT_TO_SELECT)); } @@ -114,8 +130,8 @@ UInt64 MergeTreeDataMergerMutator::getMaxSourcePartSizeForMutation() const const auto data_settings = data.getSettings(); size_t occupied = CurrentMetrics::values[CurrentMetrics::BackgroundMergesAndMutationsPoolTask].load(std::memory_order_relaxed); - if (data_settings->max_number_of_mutations_for_replica > 0 && - occupied >= data_settings->max_number_of_mutations_for_replica) + if ((*data_settings)[MergeTreeSetting::max_number_of_mutations_for_replica] > 0 && + occupied >= (*data_settings)[MergeTreeSetting::max_number_of_mutations_for_replica]) return 0; /// A DataPart can be stored only at a single disk. Get the maximum reservable free space at all disks. @@ -124,7 +140,7 @@ UInt64 MergeTreeDataMergerMutator::getMaxSourcePartSizeForMutation() const /// Allow mutations only if there are enough threads, otherwise, leave free threads for merges. 
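The getMaxSourcePartsSizeForMerge hunk above shrinks the allowed merge size as the background pool fills up. The self-contained approximation below assumes interpolateExponential is a geometric blend between the two bounds, which is consistent with how it is used here; the byte values are illustrative.

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>

// Assumed definition: geometric interpolation between lo (coef = 0) and hi (coef = 1).
double interpolateExponential(double lo, double hi, double coef)
{
    return lo * std::pow(hi / lo, coef);
}

uint64_t maxSourcePartsSize(size_t scheduled_tasks, size_t pool_size,
                            uint64_t max_at_min_space, uint64_t max_at_max_space,
                            size_t entries_to_lower_max_size)
{
    size_t free_entries = pool_size > scheduled_tasks ? pool_size - scheduled_tasks : 0;
    if (scheduled_tasks <= 1 || free_entries >= entries_to_lower_max_size)
        return max_at_max_space;
    double coef = static_cast<double>(free_entries) / entries_to_lower_max_size;
    return static_cast<uint64_t>(interpolateExponential(max_at_min_space, max_at_max_space, coef));
}

int main()
{
    std::cout << maxSourcePartsSize(14, 16, 1'000'000, 150'000'000'000, 8) << '\n';  // ~20 MB: pool nearly full, merges throttled
    std::cout << maxSourcePartsSize(2, 16, 1'000'000, 150'000'000'000, 8) << '\n';   // 150 GB: plenty of free slots
}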
if (occupied <= 1 - || max_tasks_count - occupied >= data_settings->number_of_free_entries_in_pool_to_execute_mutation) + || max_tasks_count - occupied >= (*data_settings)[MergeTreeSetting::number_of_free_entries_in_pool_to_execute_mutation]) return static_cast(disk_space / DISK_USAGE_COEFFICIENT_TO_RESERVE); return 0; @@ -457,22 +473,22 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMergeFromRanges( TTLDeleteMergeSelector drop_ttl_selector( next_delete_ttl_merge_times_by_partition, current_time, - data_settings->merge_with_ttl_timeout, + (*data_settings)[MergeTreeSetting::merge_with_ttl_timeout], /*only_drop_parts*/ true, dry_run); /// The size of the completely expired part of TTL drop is not affected by the merge pressure and the size of the storage space - parts_to_merge = drop_ttl_selector.select(parts_ranges, data_settings->max_bytes_to_merge_at_max_space_in_pool); + parts_to_merge = drop_ttl_selector.select(parts_ranges, (*data_settings)[MergeTreeSetting::max_bytes_to_merge_at_max_space_in_pool]); if (!parts_to_merge.empty()) { future_part->merge_type = MergeType::TTLDelete; } - else if (!data_settings->ttl_only_drop_parts) + else if (!(*data_settings)[MergeTreeSetting::ttl_only_drop_parts]) { TTLDeleteMergeSelector delete_ttl_selector( next_delete_ttl_merge_times_by_partition, current_time, - data_settings->merge_with_ttl_timeout, + (*data_settings)[MergeTreeSetting::merge_with_ttl_timeout], /*only_drop_parts*/ false, dry_run); @@ -486,7 +502,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMergeFromRanges( TTLRecompressMergeSelector recompress_ttl_selector( next_recompress_ttl_merge_times_by_partition, current_time, - data_settings->merge_with_recompression_ttl_timeout, + (*data_settings)[MergeTreeSetting::merge_with_recompression_ttl_timeout], metadata_snapshot->getRecompressionTTLs(), dry_run); @@ -500,9 +516,9 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMergeFromRanges( { SimpleMergeSelector::Settings merge_settings; /// Override value from table settings - merge_settings.max_parts_to_merge_at_once = data_settings->max_parts_to_merge_at_once; - if (!data_settings->min_age_to_force_merge_on_partition_only) - merge_settings.min_age_to_force_merge = data_settings->min_age_to_force_merge_seconds; + merge_settings.max_parts_to_merge_at_once = (*data_settings)[MergeTreeSetting::max_parts_to_merge_at_once]; + if (!(*data_settings)[MergeTreeSetting::min_age_to_force_merge_on_partition_only]) + merge_settings.min_age_to_force_merge = (*data_settings)[MergeTreeSetting::min_age_to_force_merge_seconds]; if (aggressive) merge_settings.base = 1; @@ -538,13 +554,13 @@ String MergeTreeDataMergerMutator::getBestPartitionToOptimizeEntire( const PartitionsInfo & partitions_info) const { const auto & data_settings = data.getSettings(); - if (!data_settings->min_age_to_force_merge_on_partition_only) + if (!(*data_settings)[MergeTreeSetting::min_age_to_force_merge_on_partition_only]) return {}; - if (!data_settings->min_age_to_force_merge_seconds) + if (!(*data_settings)[MergeTreeSetting::min_age_to_force_merge_seconds]) return {}; size_t occupied = CurrentMetrics::values[CurrentMetrics::BackgroundMergesAndMutationsPoolTask].load(std::memory_order_relaxed); size_t max_tasks_count = data.getContext()->getMergeMutateExecutor()->getMaxTasksCount(); - if (occupied > 1 && max_tasks_count - occupied < data_settings->number_of_free_entries_in_pool_to_execute_optimize_entire_partition) + if (occupied > 1 && max_tasks_count - occupied < 
(*data_settings)[MergeTreeSetting::number_of_free_entries_in_pool_to_execute_optimize_entire_partition]) { LOG_INFO( log, @@ -561,7 +577,7 @@ String MergeTreeDataMergerMutator::getBestPartitionToOptimizeEntire( assert(best_partition_it != partitions_info.end()); - if (static_cast(best_partition_it->second.min_age) < data_settings->min_age_to_force_merge_seconds) + if (static_cast(best_partition_it->second.min_age) < (*data_settings)[MergeTreeSetting::min_age_to_force_merge_seconds]) return {}; return best_partition_it->first; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 35914d8c50a..58a67fc4ba2 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -13,6 +13,13 @@ extern const Event MergeTreeDataWriterStatisticsCalculationMicroseconds; namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 index_granularity; + extern const MergeTreeSettingsUInt64 index_granularity_bytes; + extern const MergeTreeSettingsUInt64 max_digestion_size_per_segment; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -235,8 +242,8 @@ size_t MergeTreeDataPartWriterOnDisk::computeIndexGranularity(const Block & bloc { return computeIndexGranularityImpl( block, - storage_settings->index_granularity_bytes, - storage_settings->index_granularity, + (*storage_settings)[MergeTreeSetting::index_granularity_bytes], + (*storage_settings)[MergeTreeSetting::index_granularity], settings.blocks_are_granules_size, settings.can_use_adaptive_granularity); } @@ -303,7 +310,7 @@ void MergeTreeDataPartWriterOnDisk::initSkipIndices() GinIndexStorePtr store = nullptr; if (typeid_cast(&*skip_index) != nullptr) { - store = std::make_shared(stream_name, data_part_storage, data_part_storage, storage_settings->max_digestion_size_per_segment); + store = std::make_shared(stream_name, data_part_storage, data_part_storage, (*storage_settings)[MergeTreeSetting::max_digestion_size_per_segment]); gin_index_stores[stream_name] = store; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 04e07a0588a..3e1643152b8 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -12,6 +12,12 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 max_file_name_length; + extern const MergeTreeSettingsBool replace_long_file_name_to_hash; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -139,7 +145,7 @@ void MergeTreeDataPartWriterWide::addStreams( auto full_stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); String stream_name; - if (storage_settings->replace_long_file_name_to_hash && full_stream_name.size() > storage_settings->max_file_name_length) + if ((*storage_settings)[MergeTreeSetting::replace_long_file_name_to_hash] && full_stream_name.size() > (*storage_settings)[MergeTreeSetting::max_file_name_length]) stream_name = sipHash128String(full_stream_name); else stream_name = full_stream_name; diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 99346af1ff1..5c9f872b134 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -85,6 +85,13 @@ namespace Setting extern const 
SettingsParallelReplicasMode parallel_replicas_mode; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 max_concurrent_queries; + extern const MergeTreeSettingsInt64 max_partitions_to_read; + extern const MergeTreeSettingsUInt64 min_marks_to_honor_max_concurrent_queries; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -904,7 +911,7 @@ std::shared_ptr MergeTreeDataSelectExecutor::checkLimits( const auto & settings = context->getSettingsRef(); const auto data_settings = data.getSettings(); auto max_partitions_to_read - = settings[Setting::max_partitions_to_read].changed ? settings[Setting::max_partitions_to_read] : data_settings->max_partitions_to_read; + = settings[Setting::max_partitions_to_read].changed ? settings[Setting::max_partitions_to_read] : (*data_settings)[MergeTreeSetting::max_partitions_to_read]; if (max_partitions_to_read > 0) { std::set partitions; @@ -918,12 +925,12 @@ std::shared_ptr MergeTreeDataSelectExecutor::checkLimits( max_partitions_to_read); } - if (data_settings->max_concurrent_queries > 0 && data_settings->min_marks_to_honor_max_concurrent_queries > 0 - && result.selected_marks >= data_settings->min_marks_to_honor_max_concurrent_queries) + if ((*data_settings)[MergeTreeSetting::max_concurrent_queries] > 0 && (*data_settings)[MergeTreeSetting::min_marks_to_honor_max_concurrent_queries] > 0 + && result.selected_marks >= (*data_settings)[MergeTreeSetting::min_marks_to_honor_max_concurrent_queries]) { auto query_id = context->getCurrentQueryId(); if (!query_id.empty()) - return data.getQueryIdHolder(query_id, data_settings->max_concurrent_queries); + return data.getQueryIdHolder(query_id, (*data_settings)[MergeTreeSetting::max_concurrent_queries]); } return nullptr; } @@ -1440,42 +1447,48 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( if (index_helper->isVectorSimilarityIndex()) { - /// An array of indices of useful ranges. - auto result = condition->getUsefulRanges(granule); + auto rows = condition->calculateApproximateNearestNeighbors(granule); - for (auto range : result) + for (auto row : rows) { - /// The range for the corresponding index. - MarkRange data_range( - std::max(ranges[i].begin, index_mark * index_granularity + range), - std::min(ranges[i].end, index_mark * index_granularity + range + 1)); + const MergeTreeIndexGranularity & merge_tree_index_granularity = part->index_granularity; + size_t num_marks = merge_tree_index_granularity.countMarksForRows(index_mark * index_granularity, row); - if (res.empty() || res.back().end - data_range.begin > min_marks_for_seek) + MarkRange data_range( + std::max(ranges[i].begin, (index_mark * index_granularity) + num_marks), + std::min(ranges[i].end, (index_mark * index_granularity) + num_marks + 1)); + + if (!res.empty() && data_range.end == res.back().end) + /// Vector search may return >1 hit within the same granule/mark. Don't add to the result twice. + continue; + + if (res.empty() || data_range.begin - res.back().end > min_marks_for_seek) res.push_back(data_range); else res.back().end = data_range.end; } - continue; } - - bool result = false; - const auto * gin_filter_condition = dynamic_cast(&*condition); - if (!gin_filter_condition) - result = condition->mayBeTrueOnGranule(granule); else - result = cache_in_store.store ? 
gin_filter_condition->mayBeTrueOnGranuleInPart(granule, cache_in_store) : true; + { + bool result = false; + const auto * gin_filter_condition = dynamic_cast(&*condition); + if (!gin_filter_condition) + result = condition->mayBeTrueOnGranule(granule); + else + result = cache_in_store.store ? gin_filter_condition->mayBeTrueOnGranuleInPart(granule, cache_in_store) : true; - if (!result) - continue; + if (!result) + continue; - MarkRange data_range( - std::max(ranges[i].begin, index_mark * index_granularity), - std::min(ranges[i].end, (index_mark + 1) * index_granularity)); + MarkRange data_range( + std::max(ranges[i].begin, index_mark * index_granularity), + std::min(ranges[i].end, (index_mark + 1) * index_granularity)); - if (res.empty() || data_range.begin - res.back().end > min_marks_for_seek) - res.push_back(data_range); - else - res.back().end = data_range.end; + if (res.empty() || data_range.begin - res.back().end > min_marks_for_seek) + res.push_back(data_range); + else + res.back().end = data_range.end; + } } last_index_mark = index_range.end - 1; diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index e3a8d211e9c..67fef759ed4 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -61,7 +61,18 @@ namespace Setting extern const SettingsBool optimize_on_insert; extern const SettingsBool throw_on_max_partitions_per_insert_block; extern const SettingsUInt64 min_free_disk_bytes_to_perform_insert; - extern const SettingsDouble min_free_disk_ratio_to_perform_insert; + extern const SettingsFloat min_free_disk_ratio_to_perform_insert; +} + +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool assign_part_uuids; + extern const MergeTreeSettingsBool fsync_after_insert; + extern const MergeTreeSettingsBool fsync_part_directory; + extern const MergeTreeSettingsUInt64 min_free_disk_bytes_to_perform_insert; + extern const MergeTreeSettingsFloat min_free_disk_ratio_to_perform_insert; + extern const MergeTreeSettingsBool optimize_row_order; + extern const MergeTreeSettingsFloat ratio_of_defaults_for_sparse_serialization; } namespace ErrorCodes @@ -531,7 +542,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( ProfileEvents::increment(ProfileEvents::MergeTreeDataWriterBlocksAlreadySorted); } - if (data.getSettings()->optimize_row_order + if ((*data.getSettings())[MergeTreeSetting::optimize_row_order] && data.merging_params.mode == MergeTreeData::MergingParams::Mode::Ordinary) /// Nobody knows if this optimization messes up specialized MergeTree engines. { RowOrderOptimizer::optimize(block, sort_description, perm); @@ -567,13 +578,13 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( const auto & data_settings = data.getSettings(); const UInt64 & min_bytes_to_perform_insert = - data_settings->min_free_disk_bytes_to_perform_insert.changed - ? data_settings->min_free_disk_bytes_to_perform_insert - : global_settings[Setting::min_free_disk_bytes_to_perform_insert]; + (*data_settings)[MergeTreeSetting::min_free_disk_bytes_to_perform_insert].changed + ? (*data_settings)[MergeTreeSetting::min_free_disk_bytes_to_perform_insert] + : global_settings[Setting::min_free_disk_bytes_to_perform_insert]; - const Float64 & min_ratio_to_perform_insert = - data_settings->min_free_disk_ratio_to_perform_insert.changed - ? 
data_settings->min_free_disk_ratio_to_perform_insert + const Float32 & min_ratio_to_perform_insert = + (*data_settings)[MergeTreeSetting::min_free_disk_ratio_to_perform_insert].changed + ? (*data_settings)[MergeTreeSetting::min_free_disk_ratio_to_perform_insert] : global_settings[Setting::min_free_disk_ratio_to_perform_insert]; if (min_bytes_to_perform_insert > 0 || min_ratio_to_perform_insert > 0.0) @@ -606,10 +617,10 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( auto data_part_storage = new_data_part->getDataPartStoragePtr(); data_part_storage->beginTransaction(); - if (data.storage_settings.get()->assign_part_uuids) + if ((*data.storage_settings.get())[MergeTreeSetting::assign_part_uuids]) new_data_part->uuid = UUIDHelpers::generateV4(); - SerializationInfo::Settings settings{data_settings->ratio_of_defaults_for_sparse_serialization, true}; + SerializationInfo::Settings settings{(*data_settings)[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization], true}; SerializationInfoByName infos(columns, settings); infos.add(block); @@ -645,7 +656,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( data_part_storage->createDirectories(); - if (data.getSettings()->fsync_part_directory) + if ((*data_settings)[MergeTreeSetting::fsync_part_directory]) { const auto disk = data_part_volume->getDisk(); sync_guard = disk->getDirectorySyncGuard(full_path); @@ -709,7 +720,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( auto finalizer = out->finalizePartAsync( new_data_part, - data_settings->fsync_after_insert, + (*data_settings)[MergeTreeSetting::fsync_after_insert], nullptr, nullptr); temp_part.part = new_data_part; @@ -751,7 +762,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( new_data_part->is_temp = is_temp; NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); - SerializationInfo::Settings settings{data.getSettings()->ratio_of_defaults_for_sparse_serialization, true}; + SerializationInfo::Settings settings{(*data.getSettings())[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization], true}; SerializationInfoByName infos(columns, settings); infos.add(block); @@ -799,7 +810,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( ProfileEvents::increment(ProfileEvents::MergeTreeDataProjectionWriterBlocksAlreadySorted); } - if (data.getSettings()->optimize_row_order + if ((*data.getSettings())[MergeTreeSetting::optimize_row_order] && data.merging_params.mode == MergeTreeData::MergingParams::Mode::Ordinary) /// Nobody knows if this optimization messes up specialized MergeTree engines. 
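The writeTempPartImpl hunk above keeps the existing precedence rule: a value explicitly changed at the MergeTree table level wins over the query/global setting, otherwise the global value applies. The snippet below is a minimal model of that ".changed" fallback; SettingField here is a made-up stand-in, not ClickHouse's real setting classes.

#include <cstdint>
#include <iostream>

template <typename T>
struct SettingField
{
    T value{};
    bool changed = false;  // true only if the user set this explicitly
};

template <typename T>
T effectiveValue(const SettingField<T> & table_level, T global_level)
{
    return table_level.changed ? table_level.value : global_level;
}

int main()
{
    SettingField<uint64_t> table_min_free_bytes;  // not set in the table's SETTINGS clause
    std::cout << effectiveValue(table_min_free_bytes, uint64_t{1'000'000}) << '\n';  // 1000000: global value applies

    table_min_free_bytes = {5'000'000, true};     // explicitly set on the table
    std::cout << effectiveValue(table_min_free_bytes, uint64_t{1'000'000}) << '\n';  // 5000000: table value wins
}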
{ RowOrderOptimizer::optimize(block, sort_description, perm); diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.cpp b/src/Storages/MergeTree/MergeTreeIOSettings.cpp index e953dda4f21..8b87c35b4e6 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeIOSettings.cpp @@ -14,6 +14,20 @@ namespace Setting extern const SettingsUInt64 max_compress_block_size; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 adaptive_write_buffer_initial_size; + extern const MergeTreeSettingsBool compress_primary_key; + extern const MergeTreeSettingsUInt64 marks_compress_block_size; + extern const MergeTreeSettingsString marks_compression_codec; + extern const MergeTreeSettingsUInt64 max_compress_block_size; + extern const MergeTreeSettingsUInt64 min_compress_block_size; + extern const MergeTreeSettingsUInt64 primary_key_compress_block_size; + extern const MergeTreeSettingsString primary_key_compression_codec; + extern const MergeTreeSettingsBool use_adaptive_write_buffer_for_dynamic_subcolumns; + extern const MergeTreeSettingsBool use_compact_variant_discriminators_serialization; +} + MergeTreeWriterSettings::MergeTreeWriterSettings( const Settings & global_settings, const WriteSettings & query_write_settings_, @@ -22,23 +36,23 @@ MergeTreeWriterSettings::MergeTreeWriterSettings( bool rewrite_primary_key_, bool blocks_are_granules_size_) : min_compress_block_size( - storage_settings->min_compress_block_size ? storage_settings->min_compress_block_size : global_settings[Setting::min_compress_block_size]) + (*storage_settings)[MergeTreeSetting::min_compress_block_size] ? (*storage_settings)[MergeTreeSetting::min_compress_block_size] : global_settings[Setting::min_compress_block_size]) , max_compress_block_size( - storage_settings->max_compress_block_size ? storage_settings->max_compress_block_size : global_settings[Setting::max_compress_block_size]) - , marks_compression_codec(storage_settings->marks_compression_codec) - , marks_compress_block_size(storage_settings->marks_compress_block_size) - , compress_primary_key(storage_settings->compress_primary_key) - , primary_key_compression_codec(storage_settings->primary_key_compression_codec) - , primary_key_compress_block_size(storage_settings->primary_key_compress_block_size) + (*storage_settings)[MergeTreeSetting::max_compress_block_size] ? 
(*storage_settings)[MergeTreeSetting::max_compress_block_size] : global_settings[Setting::max_compress_block_size]) + , marks_compression_codec((*storage_settings)[MergeTreeSetting::marks_compression_codec]) + , marks_compress_block_size((*storage_settings)[MergeTreeSetting::marks_compress_block_size]) + , compress_primary_key((*storage_settings)[MergeTreeSetting::compress_primary_key]) + , primary_key_compression_codec((*storage_settings)[MergeTreeSetting::primary_key_compression_codec]) + , primary_key_compress_block_size((*storage_settings)[MergeTreeSetting::primary_key_compress_block_size]) , can_use_adaptive_granularity(can_use_adaptive_granularity_) , rewrite_primary_key(rewrite_primary_key_) , blocks_are_granules_size(blocks_are_granules_size_) , query_write_settings(query_write_settings_) , low_cardinality_max_dictionary_size(global_settings[Setting::low_cardinality_max_dictionary_size]) , low_cardinality_use_single_dictionary_for_part(global_settings[Setting::low_cardinality_use_single_dictionary_for_part] != 0) - , use_compact_variant_discriminators_serialization(storage_settings->use_compact_variant_discriminators_serialization) - , use_adaptive_write_buffer_for_dynamic_subcolumns(storage_settings->use_adaptive_write_buffer_for_dynamic_subcolumns) - , adaptive_write_buffer_initial_size(storage_settings->adaptive_write_buffer_initial_size) + , use_compact_variant_discriminators_serialization((*storage_settings)[MergeTreeSetting::use_compact_variant_discriminators_serialization]) + , use_adaptive_write_buffer_for_dynamic_subcolumns((*storage_settings)[MergeTreeSetting::use_adaptive_write_buffer_for_dynamic_subcolumns]) + , adaptive_write_buffer_initial_size((*storage_settings)[MergeTreeSetting::adaptive_write_buffer_initial_size]) { } diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp b/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp index 2b924284857..16902540738 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp @@ -85,21 +85,24 @@ size_t MergeTreeIndexGranularity::getRowsCountInRanges(const MarkRanges & ranges size_t total = 0; for (const auto & range : ranges) total += getRowsCountInRange(range); - return total; } +size_t MergeTreeIndexGranularity::countMarksForRows(size_t from_mark, size_t number_of_rows) const +{ + size_t rows_before_mark = getMarkStartingRow(from_mark); + size_t last_row_pos = rows_before_mark + number_of_rows; + auto it = std::upper_bound(marks_rows_partial_sums.begin(), marks_rows_partial_sums.end(), last_row_pos); + size_t to_mark = it - marks_rows_partial_sums.begin(); + return to_mark - from_mark; +} -size_t MergeTreeIndexGranularity::countMarksForRows(size_t from_mark, size_t number_of_rows, size_t offset_in_rows, size_t min_marks_to_read) const +size_t MergeTreeIndexGranularity::countRowsForRows(size_t from_mark, size_t number_of_rows, size_t offset_in_rows, size_t min_marks_to_read) const { size_t rows_before_mark = getMarkStartingRow(from_mark); size_t last_row_pos = rows_before_mark + offset_in_rows + number_of_rows; - auto position = std::upper_bound(marks_rows_partial_sums.begin(), marks_rows_partial_sums.end(), last_row_pos); - size_t to_mark; - if (position == marks_rows_partial_sums.end()) - to_mark = marks_rows_partial_sums.size(); - else - to_mark = position - marks_rows_partial_sums.begin(); + auto it = std::upper_bound(marks_rows_partial_sums.begin(), marks_rows_partial_sums.end(), last_row_pos); + size_t to_mark = it - 
marks_rows_partial_sums.begin(); /// This is a heuristic to respect min_marks_to_read which is ignored by MergeTreeReadPool in case of remote disk. /// See comment in IMergeTreeSelectAlgorithm. diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularity.h b/src/Storages/MergeTree/MergeTreeIndexGranularity.h index d67762f7293..78a1423ad7e 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularity.h +++ b/src/Storages/MergeTree/MergeTreeIndexGranularity.h @@ -9,7 +9,7 @@ namespace DB /// Inside it contains vector of partial sums of rows after mark: /// |-----|---|----|----| /// | 5 | 8 | 12 | 16 | -/// If user doesn't specify setting adaptive_index_granularity_bytes for MergeTree* table +/// If user doesn't specify setting index_granularity_bytes for MergeTree* table /// all values in inner vector would have constant stride (default 8192). class MergeTreeIndexGranularity { @@ -28,13 +28,16 @@ public: /// Return sum of rows between all ranges size_t getRowsCountInRanges(const MarkRanges & ranges) const; - /// Return amount of marks that contains amount of `number_of_rows` starting from - /// `from_mark` and possible some offset_in_rows from `from_mark` + /// Return the number of marks, starting from `from_mark`, that contain `number_of_rows` rows + size_t countMarksForRows(size_t from_mark, size_t number_of_rows) const; + + /// Return the number of rows, starting from `from_mark`, that covers `number_of_rows` rows + /// and possibly some offset_in_rows from `from_mark` /// 1 2 <- answer /// |-----|---------------------------|----|----| /// ^------------------------^-----------^ //// from_mark offset_in_rows number_of_rows - size_t countMarksForRows(size_t from_mark, size_t number_of_rows, size_t offset_in_rows, size_t min_marks_to_read) const; + size_t countRowsForRows(size_t from_mark, size_t number_of_rows, size_t offset_in_rows, size_t min_marks_to_read) const; /// Total marks size_t getMarksCount() const; diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp index 067a692a3b5..f79aa0b146d 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp @@ -8,6 +8,12 @@ namespace fs = std::filesystem; namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool compress_marks; + extern const MergeTreeSettingsUInt64 index_granularity; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -103,14 +109,14 @@ std::optional MergeTreeIndexGranularityInfo::getMarksTypeFromFilesystem } MergeTreeIndexGranularityInfo::MergeTreeIndexGranularityInfo(const MergeTreeData & storage, MergeTreeDataPartType type_) - : MergeTreeIndexGranularityInfo(storage, {storage.canUseAdaptiveGranularity(), storage.getSettings()->compress_marks, type_.getValue()}) + : MergeTreeIndexGranularityInfo(storage, {storage.canUseAdaptiveGranularity(), (*storage.getSettings())[MergeTreeSetting::compress_marks], type_.getValue()}) { } MergeTreeIndexGranularityInfo::MergeTreeIndexGranularityInfo(const MergeTreeData & storage, MarkType mark_type_) : mark_type(mark_type_) { - fixed_index_granularity = storage.getSettings()->index_granularity; + fixed_index_granularity = (*storage.getSettings())[MergeTreeSetting::index_granularity]; } void MergeTreeIndexGranularityInfo::changeGranularityIfRequired(const IDataPartStorage & data_part_storage) diff --git a/src/Storages/MergeTree/MergeTreeIndexUtils.h b/src/Storages/MergeTree/MergeTreeIndexUtils.h index
6ba9725b564..a87365f050f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexUtils.h +++ b/src/Storages/MergeTree/MergeTreeIndexUtils.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB { diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index bc36343ac93..8d9eebd1d70 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -400,7 +400,7 @@ MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const { - throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for vector similarity indexes"); } bool MergeTreeIndexConditionVectorSimilarity::alwaysUnknownOrTrue() const @@ -415,10 +415,9 @@ bool MergeTreeIndexConditionVectorSimilarity::alwaysUnknownOrTrue() const return vector_similarity_condition.alwaysUnknownOrTrue(index_distance_function); } -std::vector MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(MergeTreeIndexGranulePtr granule_) const +std::vector MergeTreeIndexConditionVectorSimilarity::calculateApproximateNearestNeighbors(MergeTreeIndexGranulePtr granule_) const { const UInt64 limit = vector_similarity_condition.getLimit(); - const UInt64 index_granularity = vector_similarity_condition.getIndexGranularity(); const auto granule = std::dynamic_pointer_cast(granule_); if (granule == nullptr) @@ -437,23 +436,25 @@ std::vector MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(Mer if (!search_result) throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release())); + std::vector neighbors(search_result.size()); /// indexes of vectors which were closest to the reference vector + search_result.dump_to(neighbors.data()); + + std::sort(neighbors.begin(), neighbors.end()); + + /// Duplicates should in theory not be possible but who knows ... 
+ const bool has_duplicates = std::adjacent_find(neighbors.begin(), neighbors.end()) != neighbors.end(); + if (has_duplicates) +#ifndef NDEBUG + throw Exception(ErrorCodes::INCORRECT_DATA, "Usearch returned duplicate row numbers"); +#else + neighbors.erase(std::unique(neighbors.begin(), neighbors.end()), neighbors.end()); +#endif + ProfileEvents::increment(ProfileEvents::USearchSearchCount); ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, search_result.visited_members); ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, search_result.computed_distances); - std::vector neighbors(search_result.size()); /// indexes of vectors which were closest to the reference vector - search_result.dump_to(neighbors.data()); - - std::vector granules; - granules.reserve(neighbors.size()); - for (auto neighbor : neighbors) - granules.push_back(neighbor / index_granularity); - - /// make unique - std::sort(granules.begin(), granules.end()); - granules.erase(std::unique(granules.begin(), granules.end()), granules.end()); - - return granules; + return neighbors; } MergeTreeIndexVectorSimilarity::MergeTreeIndexVectorSimilarity( @@ -485,7 +486,7 @@ MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition( MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const ActionsDAG *, ContextPtr) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Vector similarity index cannot be created with ActionsDAG"); } MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index) diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index c4c03254d2d..b77473e7c2b 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -137,7 +137,7 @@ public: bool alwaysUnknownOrTrue() const override; bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const override; - std::vector getUsefulRanges(MergeTreeIndexGranulePtr granule) const override; + std::vector calculateApproximateNearestNeighbors(MergeTreeIndexGranulePtr granule) const override; private: const VectorSimilarityCondition vector_similarity_condition; diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index 55edc682a1b..b36425c0405 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -25,6 +25,7 @@ struct MergeTreeWriterSettings; namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; } @@ -95,11 +96,12 @@ public: virtual bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const = 0; - /// Special stuff for vector similarity indexes - /// - Returns vector of indexes of ranges in granule which are useful for query. - virtual std::vector getUsefulRanges(MergeTreeIndexGranulePtr) const + /// Special method for vector similarity indexes: + /// Returns the row positions of the N nearest neighbors in the index granule + /// The returned row numbers are guaranteed to be sorted and unique. 
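For illustration, before the virtual declaration that follows: since the interface now hands back sorted, unique row positions rather than precomputed ranges, mapping rows to granules is left to the caller. A minimal sketch of that mapping, mirroring the `neighbor / index_granularity` division removed from getUsefulRanges above; the function name and sample values are illustrative only:

#include <cstddef>
#include <cstdint>
#include <vector>

/// Map sorted, unique row positions to the (sorted, unique) granule numbers that contain them.
std::vector<std::uint64_t> rowsToGranules(const std::vector<std::uint64_t> & rows, std::size_t index_granularity)
{
    std::vector<std::uint64_t> granules;
    granules.reserve(rows.size());
    for (std::uint64_t row : rows)
    {
        std::uint64_t granule = row / index_granularity;
        /// The input is sorted, so equal granule numbers are adjacent and can be skipped on the fly.
        if (granules.empty() || granules.back() != granule)
            granules.push_back(granule);
    }
    return granules;
}

int main()
{
    /// With the default index_granularity of 8192, rows 7 and 42 fall into granule 0 and row 8200 into granule 1.
    auto granules = rowsToGranules({7, 42, 8200}, 8192);
    return granules.size() == 2 ? 0 : 1;
}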
+ virtual std::vector calculateApproximateNearestNeighbors(MergeTreeIndexGranulePtr) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for non-vector-similarity indexes."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "calculateApproximateNearestNeighbors is not implemented for non-vector-similarity indexes"); } }; diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index d81300da738..221da9f97a0 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -10,6 +10,11 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; +} + namespace ErrorCodes { extern const int ABORTED; @@ -233,7 +238,7 @@ MergeTreePartsMover::TemporaryClonedPart MergeTreePartsMover::clonePart(const Me MutableDataPartStoragePtr cloned_part_storage; bool preserve_blobs = false; - if (disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication) + if (disk->supportZeroCopyReplication() && (*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { /// Try zero-copy replication and fallback to default copy if it's not possible moving_part.part->assertOnDisk(); diff --git a/src/Storages/MergeTree/MergeTreeReadTask.cpp b/src/Storages/MergeTree/MergeTreeReadTask.cpp index 177a325ea5a..dd057dc9984 100644 --- a/src/Storages/MergeTree/MergeTreeReadTask.cpp +++ b/src/Storages/MergeTree/MergeTreeReadTask.cpp @@ -148,7 +148,7 @@ UInt64 MergeTreeReadTask::estimateNumRows(const BlockSizeParams & params) const return rows_to_read; const auto & index_granularity = info->data_part->index_granularity; - return index_granularity.countMarksForRows(range_readers.main.currentMark(), rows_to_read, range_readers.main.numReadRowsInCurrentGranule(), params.min_marks_to_read); + return index_granularity.countRowsForRows(range_readers.main.currentMark(), rows_to_read, range_readers.main.numReadRowsInCurrentGranule(), params.min_marks_to_read); } MergeTreeReadTask::BlockAndProgress MergeTreeReadTask::read(const BlockSizeParams & params) diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 78ba02aa7ac..3a8d48262cc 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -20,6 +20,11 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool force_read_through_cache_for_merges; +} + /// Lightweight (in terms of logic) stream for reading single part from /// MergeTree, used for merges and mutations. 
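For illustration, a toy, self-contained model of the settings-access pattern used by the hunks in MergeTreeIndexGranularityInfo.cpp and MergeTreePartsMover.cpp above, and by the hunk just below in this file: an extern typed handle in a dedicated namespace selects a field of an otherwise opaque settings object, and operator[] resolves it. All names here are invented; in the real patch the handles are the per-setting MergeTreeSettingsBool / MergeTreeSettingsUInt64 objects and the subscript operators are generated further down in MergeTreeSettings.cpp.

#include <iostream>
#include <memory>

struct SettingsImpl
{
    bool compress_marks = true;
    unsigned long index_granularity = 8192;
};

struct Settings
{
    std::unique_ptr<SettingsImpl> impl = std::make_unique<SettingsImpl>();

    /// A pointer-to-member stands in for the generated typed handle;
    /// `impl.get()->*member` mirrors the generated subscript operators.
    template <typename T>
    const T & operator[](T SettingsImpl::* member) const { return impl.get()->*member; }
};

namespace Setting
{
    /// Analogue of `extern const MergeTreeSettingsUInt64 index_granularity;` etc.
    constexpr auto compress_marks = &SettingsImpl::compress_marks;
    constexpr auto index_granularity = &SettingsImpl::index_granularity;
}

/// Stand-in for storage.getSettings().
std::shared_ptr<const Settings> getSettings() { return std::make_shared<const Settings>(); }

int main()
{
    auto settings = getSettings();
    /// Mirrors the shape of `(*storage.getSettings())[MergeTreeSetting::index_granularity]`.
    std::cout << (*settings)[Setting::compress_marks] << ' ' << (*settings)[Setting::index_granularity] << '\n';
}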
/// @@ -135,7 +140,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( const auto & context = storage.getContext(); ReadSettings read_settings = context->getReadSettings(); - read_settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache = !storage.getSettings()->force_read_through_cache_for_merges; + read_settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache = !(*storage.getSettings())[MergeTreeSetting::force_read_through_cache_for_merges]; /// It does not make sense to use pthread_threadpool for background merges/mutations /// And also to preserve backward compatibility diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp index 6beb0927cbf..77cff4ca527 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -1,15 +1,22 @@ -#include -#include +#include +#include +#include #include #include -#include #include +#include #include #include -#include +#include +#include #include +#include #include -#include + + +#include +#include +#include namespace DB @@ -21,30 +28,265 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +/** These settings represent fine tunes for internal details of MergeTree storages + * and should not be changed by the user without a reason. + */ + +#define MERGE_TREE_SETTINGS(M, ALIAS) \ + M(UInt64, min_compress_block_size, 0, "When granule is written, compress the data in buffer if the size of pending uncompressed data is larger or equal than the specified threshold. If this setting is not set, the corresponding global setting is used.", 0) \ + M(UInt64, max_compress_block_size, 0, "Compress the pending uncompressed data in buffer if its size is larger or equal than the specified threshold. Block of data will be compressed even if the current granule is not finished. If this setting is not set, the corresponding global setting is used.", 0) \ + M(UInt64, index_granularity, 8192, "How many rows correspond to one primary key value.", 0) \ + M(UInt64, max_digestion_size_per_segment, 256_MiB, "Max number of bytes to digest per segment to build GIN index.", 0) \ + \ + /** Data storing format settings. */ \ + M(UInt64, min_bytes_for_wide_part, 10485760, "Minimal uncompressed size in bytes to create part in wide format instead of compact", 0) \ + M(UInt64, min_rows_for_wide_part, 0, "Minimal number of rows to create part in wide format instead of compact", 0) \ + M(Float, ratio_of_defaults_for_sparse_serialization, 0.9375f, "Minimal ratio of number of default values to number of all values in column to store it in sparse serializations. 
If >= 1, columns will be always written in full serialization.", 0) \ + M(Bool, replace_long_file_name_to_hash, true, "If the file name for column is too long (more than 'max_file_name_length' bytes) replace it to SipHash128", 0) \ + M(UInt64, max_file_name_length, 127, "The maximal length of the file name to keep it as is without hashing", 0) \ + M(UInt64, min_bytes_for_full_part_storage, 0, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, min_rows_for_full_part_storage, 0, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, compact_parts_max_bytes_to_buffer, 128 * 1024 * 1024, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, compact_parts_max_granules_to_buffer, 128, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, compact_parts_merge_max_bytes_to_prefetch_part, 16 * 1024 * 1024, "Only available in ClickHouse Cloud", 0) \ + M(Bool, load_existing_rows_count_for_old_parts, false, "Whether to load existing_rows_count for existing parts. If false, existing_rows_count will be equal to rows_count for existing parts.", 0) \ + M(Bool, use_compact_variant_discriminators_serialization, true, "Use compact version of Variant discriminators serialization.", 0) \ + \ + /** Merge settings. */ \ + M(UInt64, merge_max_block_size, 8192, "How many rows in blocks should be formed for merge operations. By default has the same value as `index_granularity`.", 0) \ + M(UInt64, merge_max_block_size_bytes, 10 * 1024 * 1024, "How many bytes in blocks should be formed for merge operations. By default has the same value as `index_granularity_bytes`.", 0) \ + M(UInt64, max_bytes_to_merge_at_max_space_in_pool, 150ULL * 1024 * 1024 * 1024, "Maximum in total size of parts to merge, when there are maximum free threads in background pool (or entries in replication queue).", 0) \ + M(UInt64, max_bytes_to_merge_at_min_space_in_pool, 1024 * 1024, "Maximum in total size of parts to merge, when there are minimum free threads in background pool (or entries in replication queue).", 0) \ + M(UInt64, max_replicated_merges_in_queue, 1000, "How many tasks of merging and mutating parts are allowed simultaneously in ReplicatedMergeTree queue.", 0) \ + M(UInt64, max_replicated_mutations_in_queue, 8, "How many tasks of mutating parts are allowed simultaneously in ReplicatedMergeTree queue.", 0) \ + M(UInt64, max_replicated_merges_with_ttl_in_queue, 1, "How many tasks of merging parts with TTL are allowed simultaneously in ReplicatedMergeTree queue.", 0) \ + M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \ + M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ + M(UInt64, max_number_of_mutations_for_replica, 0, "Limit the number of part mutations per replica to the specified amount. Zero means no limit on the number of mutations per replica (the execution can still be constrained by other settings).", 0) \ + M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. 
This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ + M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \ + M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \ + M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \ + M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \ + M(UInt64, min_compressed_bytes_to_fsync_after_merge, 0, "Minimal number of compressed bytes to do fsync for part after merge (0 - disabled)", 0) \ + M(UInt64, min_compressed_bytes_to_fsync_after_fetch, 0, "Minimal number of compressed bytes to do fsync for part after fetch (0 - disabled)", 0) \ + M(Bool, fsync_after_insert, false, "Do fsync for every inserted part. Significantly decreases performance of inserts, not recommended to use with wide parts.", 0) \ + M(Bool, fsync_part_directory, false, "Do fsync for part directory after all part operations (writes, renames, etc.).", 0) \ + M(UInt64, non_replicated_deduplication_window, 0, "How many last blocks of hashes should be kept on disk (0 - disabled).", 0) \ + M(UInt64, max_parts_to_merge_at_once, 100, "Max amount of parts which can be merged at once (0 - disabled). Doesn't affect OPTIMIZE FINAL query.", 0) \ + M(UInt64, merge_selecting_sleep_ms, 5000, "Minimum time to wait before trying to select parts to merge again after no parts were selected. A lower setting will trigger selecting tasks in background_schedule_pool frequently which result in large amount of requests to zookeeper in large-scale clusters", 0) \ + M(UInt64, max_merge_selecting_sleep_ms, 60000, "Maximum time to wait before trying to select parts to merge again after no parts were selected. A lower setting will trigger selecting tasks in background_schedule_pool frequently which result in large amount of requests to zookeeper in large-scale clusters", 0) \ + M(Float, merge_selecting_sleep_slowdown_factor, 1.2f, "The sleep time for merge selecting task is multiplied by this factor when there's nothing to merge and divided when a merge was assigned", 0) \ + M(UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60, "The period of executing the clear old temporary directories operation in background.", 0) \ + M(UInt64, merge_tree_clear_old_parts_interval_seconds, 1, "The period of executing the clear old parts operation in background.", 0) \ + M(UInt64, min_age_to_force_merge_seconds, 0, "If all parts in a certain range are older than this value, range will be always eligible for merging. Set to 0 to disable.", 0) \ + M(Bool, min_age_to_force_merge_on_partition_only, false, "Whether min_age_to_force_merge_seconds should be applied only on the entire partition and not on subset.", false) \ + M(UInt64, number_of_free_entries_in_pool_to_execute_optimize_entire_partition, 25, "When there is less than specified number of free entries in pool, do not try to execute optimize entire partition with a merge (this merge is created when set min_age_to_force_merge_seconds > 0 and min_age_to_force_merge_on_partition_only = true). 
This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ + M(Bool, remove_rolled_back_parts_immediately, 1, "Setting for an incomplete experimental feature.", 0) \ + M(UInt64, replicated_max_mutations_in_one_entry, 10000, "Max number of mutation commands that can be merged together and executed in one MUTATE_PART entry (0 means unlimited)", 0) \ + M(UInt64, number_of_mutations_to_delay, 500, "If table has at least that many unfinished mutations, artificially slow down mutations of table. Disabled if set to 0", 0) \ + M(UInt64, number_of_mutations_to_throw, 1000, "If table has at least that many unfinished mutations, throw 'Too many mutations' exception. Disabled if set to 0", 0) \ + M(UInt64, min_delay_to_mutate_ms, 10, "Min delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ + M(UInt64, max_delay_to_mutate_ms, 1000, "Max delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ + M(Bool, exclude_deleted_rows_for_part_size_in_merge, false, "Use an estimated source part size (excluding lightweight deleted rows) when selecting parts to merge", 0) \ + M(String, merge_workload, "", "Name of workload to be used to access resources for merges", 0) \ + M(String, mutation_workload, "", "Name of workload to be used to access resources for mutations", 0) \ + M(Milliseconds, background_task_preferred_step_execution_time_ms, 50, "Target time to execution of one step of merge or mutation. Can be exceeded if one step takes longer time", 0) \ + \ + /** Inserts settings. */ \ + M(UInt64, parts_to_delay_insert, 1000, "If table contains at least that many active parts in single partition, artificially slow down insert into table. Disabled if set to 0", 0) \ + M(UInt64, inactive_parts_to_delay_insert, 0, "If table contains at least that many inactive parts in single partition, artificially slow down insert into table.", 0) \ + M(UInt64, parts_to_throw_insert, 3000, "If more than this number active parts in single partition, throw 'Too many parts ...' exception.", 0) \ + M(UInt64, inactive_parts_to_throw_insert, 0, "If more than this number inactive parts in single partition, throw 'Too many inactive parts ...' exception.", 0) \ + M(UInt64, max_avg_part_size_for_too_many_parts, 1ULL * 1024 * 1024 * 1024, "The 'too many parts' check according to 'parts_to_delay_insert' and 'parts_to_throw_insert' will be active only if the average part size (in the relevant partition) is not larger than the specified threshold. If it is larger than the specified threshold, the INSERTs will be neither delayed or rejected. This allows to have hundreds of terabytes in a single table on a single server if the parts are successfully merged to larger parts. This does not affect the thresholds on inactive parts or total parts.", 0) \ + M(UInt64, max_delay_to_insert, 1, "Max delay of inserting data into MergeTree table in seconds, if there are a lot of unmerged parts in single partition.", 0) \ + M(UInt64, min_delay_to_insert_ms, 10, "Min delay of inserting data into MergeTree table in milliseconds, if there are a lot of unmerged parts in single partition.", 0) \ + M(UInt64, max_parts_in_total, 100000, "If more than this number active parts in all partitions in total, throw 'Too many parts ...' 
exception.", 0) \ + M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background.", 0) \ + M(Bool, add_implicit_sign_column_constraint_for_collapsing_engine, false, "If true, add implicit constraint for sign column for CollapsingMergeTree engine.", 0) \ + M(Milliseconds, sleep_before_commit_local_part_in_replicated_table_ms, 0, "For testing. Do not change it.", 0) \ + M(Bool, optimize_row_order, false, "Allow reshuffling of rows during part inserts and merges to improve the compressibility of the new part", 0) \ + M(Bool, use_adaptive_write_buffer_for_dynamic_subcolumns, true, "Allow to use adaptive writer buffers during writing dynamic subcolumns to reduce memory usage", 0) \ + M(UInt64, adaptive_write_buffer_initial_size, 16 * 1024, "Initial size of an adaptive write buffer", 0) \ + M(UInt64, min_free_disk_bytes_to_perform_insert, 0, "Minimum free disk space bytes to perform an insert.", 0) \ + M(Float, min_free_disk_ratio_to_perform_insert, 0.0, "Minimum free disk space ratio to perform an insert.", 0) \ + \ + /* Part removal settings. */ \ + M(UInt64, simultaneous_parts_removal_limit, 0, "Maximum number of parts to remove during one CleanupThread iteration (0 means unlimited).", 0) \ + \ + /** Replication settings. */ \ + M(UInt64, replicated_deduplication_window, 1000, "How many last blocks of hashes should be kept in ZooKeeper (old blocks will be deleted).", 0) \ + M(UInt64, replicated_deduplication_window_seconds, 7 * 24 * 60 * 60 /* one week */, "Similar to \"replicated_deduplication_window\", but determines old blocks by their lifetime. Hash of an inserted block will be deleted (and the block will not be deduplicated after) if it outside of one \"window\". You can set very big replicated_deduplication_window to avoid duplicating INSERTs during that period of time.", 0) \ + M(UInt64, replicated_deduplication_window_for_async_inserts, 10000, "How many last hash values of async_insert blocks should be kept in ZooKeeper (old blocks will be deleted).", 0) \ + M(UInt64, replicated_deduplication_window_seconds_for_async_inserts, 7 * 24 * 60 * 60 /* one week */, "Similar to \"replicated_deduplication_window_for_async_inserts\", but determines old blocks by their lifetime. Hash of an inserted block will be deleted (and the block will not be deduplicated after) if it outside of one \"window\". You can set very big replicated_deduplication_window to avoid duplicating INSERTs during that period of time.", 0) \ + M(Milliseconds, async_block_ids_cache_update_wait_ms, 100, "How long each insert iteration will wait for async_block_ids_cache update", 0) \ + M(Bool, use_async_block_ids_cache, true, "Use in-memory cache to filter duplicated async inserts based on block ids", 0) \ + M(UInt64, max_replicated_logs_to_keep, 1000, "How many records may be in log, if there is inactive replica. Inactive replica becomes lost when when this number exceed.", 0) \ + M(UInt64, min_replicated_logs_to_keep, 10, "Keep about this number of last records in ZooKeeper log, even if they are obsolete. It doesn't affect work of tables: used only to diagnose ZooKeeper log before cleaning.", 0) \ + M(Seconds, prefer_fetch_merged_part_time_threshold, 3600, "If time passed after replication log entry creation exceeds this threshold and sum size of parts is greater than \"prefer_fetch_merged_part_size_threshold\", prefer fetching merged part from replica instead of doing merge locally. 
To speed up very long merges.", 0) \ + M(UInt64, prefer_fetch_merged_part_size_threshold, 10ULL * 1024 * 1024 * 1024, "If sum size of parts exceeds this threshold and time passed after replication log entry creation is greater than \"prefer_fetch_merged_part_time_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \ + M(Seconds, execute_merges_on_single_replica_time_threshold, 0, "When greater than zero only a single replica starts the merge immediately, others wait up to that amount of time to download the result instead of doing merges locally. If the chosen replica doesn't finish the merge during that amount of time, fallback to standard behavior happens.", 0) \ + M(Seconds, remote_fs_execute_merges_on_single_replica_time_threshold, 3 * 60 * 60, "When greater than zero only a single replica starts the merge immediately if merged part on shared storage and 'allow_remote_fs_zero_copy_replication' is enabled.", 0) \ + M(Seconds, try_fetch_recompressed_part_timeout, 7200, "Recompression works slow in most cases, so we don't start merge with recompression until this timeout and trying to fetch recompressed part from replica which assigned this merge with recompression.", 0) \ + M(Bool, always_fetch_merged_part, false, "If true, replica never merge parts and always download merged parts from other replicas.", 0) \ + M(UInt64, max_suspicious_broken_parts, 100, "Max broken parts, if more - deny automatic deletion.", 0) \ + M(UInt64, max_suspicious_broken_parts_bytes, 1ULL * 1024 * 1024 * 1024, "Max size of all broken parts, if more - deny automatic deletion.", 0) \ + M(UInt64, max_files_to_modify_in_alter_columns, 75, "Not apply ALTER if number of files for modification(deletion, addition) more than this.", 0) \ + M(UInt64, max_files_to_remove_in_alter_columns, 50, "Not apply ALTER, if number of files for deletion more than this.", 0) \ + M(Float, replicated_max_ratio_of_wrong_parts, 0.5, "If ratio of wrong parts to total number of parts is less than this - allow to start.", 0) \ + M(Bool, replicated_can_become_leader, true, "If true, Replicated tables replicas on this node will try to acquire leadership.", 0) \ + M(Seconds, zookeeper_session_expiration_check_period, 60, "ZooKeeper session expiration check period, in seconds.", 0) \ + M(Seconds, initialization_retry_period, 60, "Retry period for table initialization, in seconds.", 0) \ + M(Bool, detach_old_local_parts_when_cloning_replica, true, "Do not remove old local parts when repairing lost replica.", 0) \ + M(Bool, detach_not_byte_identical_parts, false, "Do not remove non byte-idential parts for ReplicatedMergeTree, instead detach them (maybe useful for further analysis).", 0) \ + M(UInt64, max_replicated_fetches_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited.", 0) \ + M(UInt64, max_replicated_sends_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for replicated sends. Zero means unlimited.", 0) \ + M(Milliseconds, wait_for_unique_parts_send_before_shutdown_ms, 0, "Before shutdown table will wait for required amount time for unique parts (exist only on current replica) to be fetched by other replicas (0 means disabled).", 0) \ + M(Float, fault_probability_before_part_commit, 0, "For testing. Do not change it.", 0) \ + M(Float, fault_probability_after_part_commit, 0, "For testing. 
Do not change it.", 0) \ + M(Bool, shared_merge_tree_disable_merges_and_mutations_assignment, false, "Only available in ClickHouse Cloud", 0) \ + M(Float, shared_merge_tree_partitions_hint_ratio_to_reload_merge_pred_for_mutations, 0.5, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, shared_merge_tree_parts_load_batch_size, 32, "Only available in ClickHouse Cloud", 0) \ + \ + /** Check delay of replicas settings. */ \ + M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \ + M(UInt64, cleanup_delay_period, 30, "Minimum period to clean old queue logs, blocks hashes and parts.", 0) \ + M(UInt64, max_cleanup_delay_period, 300, "Maximum period to clean old queue logs, blocks hashes and parts.", 0) \ + M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ + M(UInt64, cleanup_thread_preferred_points_per_iteration, 150, "Preferred batch size for background cleanup (points are abstract but 1 point is approximately equivalent to 1 inserted block).", 0) \ + M(UInt64, cleanup_threads, 128, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, kill_delay_period, 30, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, kill_delay_period_random_add, 10, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, kill_threads, 128, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \ + M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \ + M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \ + M(UInt64, vertical_merge_algorithm_min_rows_to_activate, 16 * 8192, "Minimal (approximate) sum of rows in merging parts to activate Vertical merge algorithm.", 0) \ + M(UInt64, vertical_merge_algorithm_min_bytes_to_activate, 0, "Minimal (approximate) uncompressed size in bytes in merging parts to activate Vertical merge algorithm.", 0) \ + M(UInt64, vertical_merge_algorithm_min_columns_to_activate, 11, "Minimal amount of non-PK columns to activate Vertical merge algorithm.", 0) \ + M(Bool, vertical_merge_remote_filesystem_prefetch, true, "If true prefetching of data from remote filesystem is used for the next column during merge", 0) \ + M(UInt64, max_postpone_time_for_failed_mutations_ms, 5ULL * 60 * 1000, "The maximum postpone time for failed mutations.", 0) \ + \ + /** Compatibility settings */ \ + M(Bool, allow_suspicious_indices, false, "Reject primary/secondary indexes and sorting keys with identical expressions", 0) \ + M(Bool, compatibility_allow_sampling_expression_not_in_primary_key, false, "Allow to create a table with sampling expression not in primary key. This is needed only to temporarily allow to run the server with wrong tables for backward compatibility.", 0) \ + M(Bool, use_minimalistic_checksums_in_zookeeper, true, "Use small format (dozens bytes) for part checksums in ZooKeeper instead of ordinary ones (dozens KB). 
Before enabling check that all replicas support new format.", 0) \ + M(Bool, use_minimalistic_part_header_in_zookeeper, true, "Store part header (checksums and columns) in a compact format and a single part znode instead of separate znodes (/columns and /checksums). This can dramatically reduce snapshot size in ZooKeeper. Before enabling check that all replicas support new format.", 0) \ + M(UInt64, finished_mutations_to_keep, 100, "How many records about mutations that are done to keep. If zero, then keep all of them.", 0) \ + M(UInt64, min_merge_bytes_to_use_direct_io, 10ULL * 1024 * 1024 * 1024, "Minimal amount of bytes to enable O_DIRECT in merge (0 - disabled).", 0) \ + M(UInt64, index_granularity_bytes, 10 * 1024 * 1024, "Approximate amount of bytes in single granule (0 - disabled).", 0) \ + M(UInt64, min_index_granularity_bytes, 1024, "Minimum amount of bytes in single granule.", 1024) \ + M(Int64, merge_with_ttl_timeout, 3600 * 4, "Minimal time in seconds, when merge with delete TTL can be repeated.", 0) \ + M(Int64, merge_with_recompression_ttl_timeout, 3600 * 4, "Minimal time in seconds, when merge with recompression TTL can be repeated.", 0) \ + M(Bool, ttl_only_drop_parts, false, "Only drop altogether the expired parts and not partially prune them.", 0) \ + M(Bool, materialize_ttl_recalculate_only, false, "Only recalculate ttl info when MATERIALIZE TTL", 0) \ + M(Bool, enable_mixed_granularity_parts, true, "Enable parts with adaptive and non adaptive granularity", 0) \ + M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ + M(UInt64, zero_copy_concurrent_part_removal_max_split_times, 5, "Max recursion depth for splitting independent Outdated parts ranges into smaller subranges (highly not recommended to change)", 0) \ + M(Float, zero_copy_concurrent_part_removal_max_postpone_ratio, static_cast(0.05), "Max percentage of top level parts to postpone removal in order to get smaller independent ranges (highly not recommended to change)", 0) \ + M(String, storage_policy, "default", "Name of storage disk policy", 0) \ + M(String, disk, "", "Name of storage disk. Can be specified instead of storage policy.", 0) \ + M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \ + M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm.", 0) \ + M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \ + M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. This setting is the default that can be overridden by the query-level setting with the same name.", 0) \ + M(UInt64, max_concurrent_queries, 0, "Max number of concurrently executed queries related to the MergeTree table (0 - disabled). Queries will still be limited by other max_concurrent_queries settings.", 0) \ + M(UInt64, min_marks_to_honor_max_concurrent_queries, 0, "Minimal number of marks to honor the MergeTree-level's max_concurrent_queries (0 - disabled). 
Queries will still be limited by other max_concurrent_queries settings.", 0) \ + M(UInt64, min_bytes_to_rebalance_partition_over_jbod, 0, "Minimal amount of bytes to enable part rebalance over JBOD array (0 - disabled).", 0) \ + M(Bool, check_sample_column_is_correct, true, "Check columns or columns by hash for sampling are unsigned integer.", 0) \ + M(Bool, allow_vertical_merges_from_compact_to_wide_parts, true, "Allows vertical merges from compact to wide parts. This settings must have the same value on all replicas", 0) \ + M(Bool, enable_the_endpoint_id_with_zookeeper_name_prefix, false, "Enable the endpoint id with zookeeper name prefix for the replicated merge tree table", 0) \ + M(UInt64, zero_copy_merge_mutation_min_parts_size_sleep_before_lock, 1ULL * 1024 * 1024 * 1024, "If zero copy replication is enabled sleep random amount of time before trying to lock depending on parts size for merge or mutation", 0) \ + M(Bool, allow_floating_point_partition_key, false, "Allow floating point as partition key", 0) \ + M(UInt64, sleep_before_loading_outdated_parts_ms, 0, "For testing. Do not change it.", 0) \ + M(Bool, always_use_copy_instead_of_hardlinks, false, "Always copy data instead of hardlinking during mutations/replaces/detaches and so on.", 0) \ + M(Bool, disable_freeze_partition_for_zero_copy_replication, true, "Disable FREEZE PARTITION query for zero copy replication.", 0) \ + M(Bool, disable_detach_partition_for_zero_copy_replication, true, "Disable DETACH PARTITION query for zero copy replication.", 0) \ + M(Bool, disable_fetch_partition_for_zero_copy_replication, true, "Disable FETCH PARTITION query for zero copy replication.", 0) \ + M(Bool, enable_block_number_column, false, "Enable persisting column _block_number for each row.", 0) ALIAS(allow_experimental_block_number_column) \ + M(Bool, enable_block_offset_column, false, "Enable persisting column _block_offset for each row.", 0) \ + \ + /** Experimental/work in progress feature. Unsafe for production. */ \ + M(UInt64, part_moves_between_shards_enable, 0, "Experimental/Incomplete feature to move parts between shards. Does not take into account sharding expressions.", 0) \ + M(UInt64, part_moves_between_shards_delay_seconds, 30, "Time to wait before/after moving parts between shards.", 0) \ + M(Bool, allow_remote_fs_zero_copy_replication, false, "Don't use this setting in production, because it is not ready.", 0) \ + M(String, remote_fs_zero_copy_zookeeper_path, "/clickhouse/zero_copy", "ZooKeeper path for zero-copy table-independent info.", 0) \ + M(Bool, remote_fs_zero_copy_path_compatible_mode, false, "Run zero-copy in compatible mode during conversion process.", 0) \ + M(Bool, cache_populated_by_fetch, false, "Only available in ClickHouse Cloud", 0) \ + M(Bool, force_read_through_cache_for_merges, false, "Force read-through filesystem cache for merges", 0) \ + M(Bool, allow_experimental_replacing_merge_with_cleanup, false, "Allow experimental CLEANUP merges for ReplacingMergeTree with is_deleted column.", 0) \ + \ + /** Compress marks and primary key. 
*/ \ + M(Bool, compress_marks, true, "Marks support compression, reduce mark file size and speed up network transmission.", 0) \ + M(Bool, compress_primary_key, true, "Primary key support compression, reduce primary key file size and speed up network transmission.", 0) \ + M(String, marks_compression_codec, "ZSTD(3)", "Compression encoding used by marks, marks are small enough and cached, so the default compression is ZSTD(3).", 0) \ + M(String, primary_key_compression_codec, "ZSTD(3)", "Compression encoding used by primary, primary key is small enough and cached, so the default compression is ZSTD(3).", 0) \ + M(UInt64, marks_compress_block_size, 65536, "Mark compress block size, the actual size of the block to compress.", 0) \ + M(UInt64, primary_key_compress_block_size, 65536, "Primary compress block size, the actual size of the block to compress.", 0) \ + M(Bool, primary_key_lazy_load, true, "Load primary key in memory on first use instead of on table initialization. This can save memory in the presence of a large number of tables.", 0) \ + M(Float, primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns, 0.9f, "If the value of a column of the primary key in data part changes at least in this ratio of times, skip loading next columns in memory. This allows to save memory usage by not loading useless columns of the primary key.", 0) \ + /** Projection settings. */ \ + M(UInt64, max_projections, 25, "The maximum number of merge tree projections.", 0) \ + M(LightweightMutationProjectionMode, lightweight_mutation_projection_mode, LightweightMutationProjectionMode::THROW, "When lightweight delete happens on a table with projection(s), the possible operations include throw the exception as projection exists, or drop projections of this table's relevant parts, or rebuild the projections.", 0) \ + M(DeduplicateMergeProjectionMode, deduplicate_merge_projection_mode, DeduplicateMergeProjectionMode::THROW, "Whether to allow create projection for the table with non-classic MergeTree. Ignore option is purely for compatibility which might result in incorrect answer. Otherwise, if allowed, what is the action when merge, drop or rebuild.", 0) \ + +#define MAKE_OBSOLETE_MERGE_TREE_SETTING(M, TYPE, NAME, DEFAULT) \ + M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", BaseSettingsHelpers::Flags::OBSOLETE) + +#define OBSOLETE_MERGE_TREE_SETTINGS(M, ALIAS) \ + /** Obsolete settings that do nothing but left for compatibility reasons. 
*/ \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, min_relative_delay_to_yield_leadership, 120) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, check_delay_period, 60) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_sends, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_sends_for_table, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_fetches, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_fetches_for_table, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Bool, write_final_mark, true) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, min_bytes_for_compact_part, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, min_rows_for_compact_part, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Bool, in_memory_parts_enable_wal, true) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, write_ahead_log_max_bytes, 1024 * 1024 * 1024) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, write_ahead_log_bytes_to_fsync, 100ULL * 1024 * 1024) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, write_ahead_log_interval_ms_to_fsync, 100) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Bool, in_memory_parts_insert_sync, false) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, MaxThreads, max_part_loading_threads, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, MaxThreads, max_part_removal_threads, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Bool, use_metadata_cache, false) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, merge_tree_enable_clear_old_broken_detached, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, merge_tree_clear_old_broken_detached_parts_ttl_timeout_seconds, 1ULL * 3600 * 24 * 30) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Seconds, replicated_fetches_http_connection_timeout, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Seconds, replicated_fetches_http_send_timeout, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Seconds, replicated_fetches_http_receive_timeout, 0) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_fetches_for_host, DEFAULT_COUNT_OF_HTTP_CONNECTIONS_PER_ENDPOINT) \ + MAKE_OBSOLETE_MERGE_TREE_SETTING(M, CleanDeletedRows, clean_deleted_rows, CleanDeletedRows::Never) \ + + /// Settings that should not change after the creation of a table. + /// NOLINTNEXTLINE +#define APPLY_FOR_IMMUTABLE_MERGE_TREE_SETTINGS(MACRO) \ + MACRO(index_granularity) + +#define LIST_OF_MERGE_TREE_SETTINGS(M, ALIAS) \ + MERGE_TREE_SETTINGS(M, ALIAS) \ + OBSOLETE_MERGE_TREE_SETTINGS(M, ALIAS) + +DECLARE_SETTINGS_TRAITS(MergeTreeSettingsTraits, LIST_OF_MERGE_TREE_SETTINGS) + + +/** Settings for the MergeTree family of engines. + * Could be loaded from config or from a CREATE TABLE query (SETTINGS clause). + */ +struct MergeTreeSettingsImpl : public BaseSettings +{ + /// NOTE: will rewrite the AST to add immutable settings. + void loadFromQuery(ASTStorage & storage_def, ContextPtr context, bool is_attach); + + /// Check that the values are sane taking also query-level settings into account. + void sanityCheck(size_t background_pool_tasks) const; +}; + IMPLEMENT_SETTINGS_TRAITS(MergeTreeSettingsTraits, LIST_OF_MERGE_TREE_SETTINGS) -void MergeTreeSettings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config) -{ - if (!config.has(config_elem)) - return; - - Poco::Util::AbstractConfiguration::Keys config_keys; - config.keys(config_elem, config_keys); - - try - { - for (const String & key : config_keys) - set(key, config.getString(config_elem + "." 
+ key)); - } - catch (Exception & e) - { - if (e.code() == ErrorCodes::UNKNOWN_SETTING) - e.addMessage("in MergeTree config"); - throw; - } -} - -void MergeTreeSettings::loadFromQuery(ASTStorage & storage_def, ContextPtr context, bool is_attach) +void MergeTreeSettingsImpl::loadFromQuery(ASTStorage & storage_def, ContextPtr context, bool is_attach) { if (storage_def.settings) { @@ -119,17 +361,7 @@ void MergeTreeSettings::loadFromQuery(ASTStorage & storage_def, ContextPtr conte #undef ADD_IF_ABSENT } -bool MergeTreeSettings::isReadonlySetting(const String & name) -{ - return name == "index_granularity" || name == "index_granularity_bytes" || name == "enable_mixed_granularity_parts"; -} - -bool MergeTreeSettings::isPartFormatSetting(const String & name) -{ - return name == "min_bytes_for_wide_part" || name == "min_rows_for_wide_part"; -} - -void MergeTreeSettings::sanityCheck(size_t background_pool_tasks) const +void MergeTreeSettingsImpl::sanityCheck(size_t background_pool_tasks) const { if (number_of_free_entries_in_pool_to_execute_mutation > background_pool_tasks) { @@ -244,21 +476,274 @@ void MergeTreeColumnSettings::validate(const SettingsChanges & changes) "Setting {} is unknown or not supported at column level, supported settings: {}", change.name, fmt::join(allowed_column_level_settings, ", ")); - MergeTreeSettings::checkCanSet(change.name, change.value); + MergeTreeSettingsImpl::checkCanSet(change.name, change.value); + } +} + +#define INITIALIZE_SETTING_EXTERN(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \ + MergeTreeSettings ## TYPE NAME = & MergeTreeSettings ## Impl :: NAME; + +namespace MergeTreeSetting +{ + LIST_OF_MERGE_TREE_SETTINGS(INITIALIZE_SETTING_EXTERN, SKIP_ALIAS) +} + +#undef INITIALIZE_SETTING_EXTERN + +MergeTreeSettings::MergeTreeSettings() : impl(std::make_unique()) +{ +} + +MergeTreeSettings::MergeTreeSettings(const MergeTreeSettings & settings) : impl(std::make_unique(*settings.impl)) +{ +} + +MergeTreeSettings::MergeTreeSettings(MergeTreeSettings && settings) noexcept + : impl(std::make_unique(std::move(*settings.impl))) +{ +} + +MergeTreeSettings::~MergeTreeSettings() = default; + +#define IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR(CLASS_NAME, TYPE) \ + const SettingField##TYPE & MergeTreeSettings::operator[](CLASS_NAME##TYPE t) const \ + { \ + return impl.get()->*t; \ + } \ +SettingField##TYPE & MergeTreeSettings::operator[](CLASS_NAME##TYPE t) \ + { \ + return impl.get()->*t; \ + } + +MERGETREE_SETTINGS_SUPPORTED_TYPES(MergeTreeSettings, IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR) +#undef IMPLEMENT_SETTING_SUBSCRIPT_OPERATOR + +bool MergeTreeSettings::has(std::string_view name) const +{ + return impl->has(name); +} + +bool MergeTreeSettings::tryGet(std::string_view name, Field & value) const +{ + return impl->tryGet(name, value); +} + +Field MergeTreeSettings::get(std::string_view name) const +{ + return impl->get(name); +} + +void MergeTreeSettings::set(std::string_view name, const Field & value) +{ + impl->set(name, value); +} + +SettingsChanges MergeTreeSettings::changes() const +{ + return impl->changes(); +} + +void MergeTreeSettings::applyChanges(const SettingsChanges & changes) +{ + impl->applyChanges(changes); +} + +void MergeTreeSettings::applyChange(const SettingChange & change) +{ + impl->applyChange(change); +} + +void MergeTreeSettings::applyCompatibilitySetting(const String & compatibility_value) +{ + /// If setting value is empty, we don't need to change settings + if (compatibility_value.empty()) + return; + + ClickHouseVersion version(compatibility_value); 
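For illustration, the loop that follows walks the settings-changes history from the newest release down to the requested compatibility version and restores each changed setting's previous value. A small self-contained model of that backward walk, with an invented history entry and setting name:

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Change { std::string name; std::string previous_value; };

/// Roll settings back to how they looked at `target_version`: for every release newer than
/// the target, re-apply the previous values of the settings that release had changed.
std::map<std::string, std::string> rollBack(
    std::map<std::string, std::string> current,
    const std::map<int, std::vector<Change>> & history, /// version -> changes introduced in it
    int target_version)
{
    for (auto it = history.rbegin(); it != history.rend(); ++it)
    {
        if (target_version >= it->first)
            break;
        for (const auto & change : it->second)
            current[change.name] = change.previous_value;
    }
    return current;
}

int main()
{
    std::map<std::string, std::string> settings{{"example_setting", "1"}};
    std::map<int, std::vector<Change>> history{{2408, {{"example_setting", "0"}}}};

    /// Asking for compatibility with an older release (here 2403) undoes the 2408 change.
    std::cout << rollBack(settings, history, 2403).at("example_setting") << '\n'; /// prints 0
}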
+ const auto & settings_changes_history = getMergeTreeSettingsChangesHistory(); + /// Iterate through ClickHouse versions in descending order and apply reversed + /// changes for each version that is higher than the version from the compatibility setting + for (auto it = settings_changes_history.rbegin(); it != settings_changes_history.rend(); ++it) + { + if (version >= it->first) + break; + + /// Apply reversed changes from this version. + for (const auto & change : it->second) + { + /// In case the alias is being used (e.g. use enable_analyzer) we must change the original setting + auto final_name = MergeTreeSettingsTraits::resolveName(change.name); + set(final_name, change.previous_value); + } + } +} + +std::vector MergeTreeSettings::getAllRegisteredNames() const +{ + std::vector setting_names; + for (const auto & setting : impl->all()) + { + setting_names.emplace_back(setting.getName()); + } + return setting_names; +} + +void MergeTreeSettings::loadFromQuery(ASTStorage & storage_def, ContextPtr context, bool is_attach) +{ + impl->loadFromQuery(storage_def, context, is_attach); +} + +void MergeTreeSettings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config) +{ + if (!config.has(config_elem)) + return; + + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(config_elem, config_keys); + + try + { + for (const String & key : config_keys) + impl->set(key, config.getString(config_elem + "." + key)); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_SETTING) + e.addMessage("in MergeTree config"); + throw; + } +} + +bool MergeTreeSettings::needSyncPart(size_t input_rows, size_t input_bytes) const +{ + return ( + (impl->min_rows_to_fsync_after_merge && input_rows >= impl->min_rows_to_fsync_after_merge) + || (impl->min_compressed_bytes_to_fsync_after_merge && input_bytes >= impl->min_compressed_bytes_to_fsync_after_merge)); +} + +void MergeTreeSettings::sanityCheck(size_t background_pool_tasks) const +{ + impl->sanityCheck(background_pool_tasks); +} + +void MergeTreeSettings::dumpToSystemMergeTreeSettingsColumns(MutableColumnsAndConstraints & params) const +{ + const auto & constraints = params.constraints; + MutableColumns & res_columns = params.res_columns; + + for (const auto & setting : impl->all()) + { + const auto & setting_name = setting.getName(); + res_columns[0]->insert(setting_name); + res_columns[1]->insert(setting.getValueString()); + res_columns[2]->insert(setting.isValueChanged()); + res_columns[3]->insert(setting.getDescription()); + + Field min, max; + SettingConstraintWritability writability = SettingConstraintWritability::WRITABLE; + constraints.get(*this, setting_name, min, max, writability); + + /// These two columns can accept strings only.
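As an illustration of a technique used a bit further down in this file, the TransparentStringHash enables heterogeneous lookup: containers keyed by std::string can be probed with a std::string_view without materialising a temporary string per lookup. A minimal standalone example of the same technique (requires C++20 heterogeneous lookup for unordered containers; the option names are just sample data):

#include <cstddef>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_set>

struct TransparentStringHash
{
    using is_transparent = void;
    std::size_t operator()(std::string_view txt) const { return std::hash<std::string_view>{}(txt); }
};

int main()
{
    /// Both the hash and std::equal_to<> are transparent, so find() accepts a string_view directly.
    std::unordered_set<std::string, TransparentStringHash, std::equal_to<>> option_names{"index_granularity", "compress_marks"};
    std::string_view key = "compress_marks";
    return option_names.find(key) != option_names.end() ? 0 : 1;
}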
+ if (!min.isNull()) + min = MergeTreeSettings::valueToStringUtil(setting_name, min); + if (!max.isNull()) + max = MergeTreeSettings::valueToStringUtil(setting_name, max); + + res_columns[4]->insert(min); + res_columns[5]->insert(max); + res_columns[6]->insert(writability == SettingConstraintWritability::CONST); + res_columns[7]->insert(setting.getTypeName()); + res_columns[8]->insert(setting.isObsolete()); } } -std::vector MergeTreeSettings::getAllRegisteredNames() const +namespace { - std::vector all_settings; - for (const auto & setting_field : all()) - all_settings.push_back(setting_field.getName()); - return all_settings; +/// Define transparent hash so we can use +/// std::string_view with the containers +struct TransparentStringHash +{ + using is_transparent = void; + size_t operator()(std::string_view txt) const { return std::hash{}(txt); } +}; +} + +void MergeTreeSettings::addToProgramOptionsIfNotPresent( + boost::program_options::options_description & main_options, bool allow_repeated_settings) +{ + /// Add merge tree settings manually, because names of some settings + /// may clash. Query settings have higher priority and we just + /// skip ambiguous merge tree settings. + + std::unordered_set> main_option_names; + for (const auto & option : main_options.options()) + main_option_names.insert(option->long_name()); + + const auto & settings_to_aliases = MergeTreeSettingsImpl::Traits::settingsToAliases(); + for (const auto & setting : impl->all()) + { + const auto add_setting = [&](const std::string_view name) + { + if (auto it = main_option_names.find(name); it != main_option_names.end()) + return; + + if (allow_repeated_settings) + addProgramOptionAsMultitoken(*impl, main_options, name, setting); + else + addProgramOption(*impl, main_options, name, setting); + }; + + const auto & setting_name = setting.getName(); + add_setting(setting_name); + + if (auto it = settings_to_aliases.find(setting_name); it != settings_to_aliases.end()) + { + for (const auto alias : it->second) + { + add_setting(alias); + } + } + } +} + +Field MergeTreeSettings::castValueUtil(std::string_view name, const Field & value) +{ + return MergeTreeSettingsImpl::castValueUtil(name, value); +} + +String MergeTreeSettings::valueToStringUtil(std::string_view name, const Field & value) +{ + return MergeTreeSettingsImpl::valueToStringUtil(name, value); +} + +Field MergeTreeSettings::stringToValueUtil(std::string_view name, const String & str) +{ + return MergeTreeSettingsImpl::stringToValueUtil(name, str); +} + +bool MergeTreeSettings::hasBuiltin(std::string_view name) +{ + return MergeTreeSettingsImpl::hasBuiltin(name); } std::string_view MergeTreeSettings::resolveName(std::string_view name) { - return MergeTreeSettings::Traits::resolveName(name); + return MergeTreeSettingsImpl::Traits::resolveName(name); +} + +bool MergeTreeSettings::isReadonlySetting(const String & name) +{ + return name == "index_granularity" || name == "index_granularity_bytes" || name == "enable_mixed_granularity_parts"; +} + +void MergeTreeSettings::checkCanSet(std::string_view name, const Field & value) +{ + MergeTreeSettingsImpl::checkCanSet(name, value); +} + +bool MergeTreeSettings::isPartFormatSetting(const String & name) +{ + return name == "min_bytes_for_wide_part" || name == "min_rows_for_wide_part"; } } diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index add20b7cf75..794a79f89cf 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++
b/src/Storages/MergeTree/MergeTreeSettings.h @@ -1,304 +1,103 @@ #pragma once -#include -#include -#include +#include +#include #include -#include -#include -#include +#include +#include +#include - -namespace Poco::Util +namespace boost { - class AbstractConfiguration; +namespace program_options +{ +class options_description; +} } +namespace Poco +{ +namespace Util +{ +class AbstractConfiguration; +} +} namespace DB { class ASTStorage; -struct Settings; +class Context; +using ContextPtr = std::shared_ptr; +struct MergeTreeSettingsImpl; +struct MergeTreeSettings; +using MergeTreeSettingsPtr = std::shared_ptr; +struct MutableColumnsAndConstraints; +/// List of available types supported in MergeTreeSettings object +#define MERGETREE_SETTINGS_SUPPORTED_TYPES(CLASS_NAME, M) \ + M(CLASS_NAME, Bool) \ + M(CLASS_NAME, CleanDeletedRows) \ + M(CLASS_NAME, DeduplicateMergeProjectionMode) \ + M(CLASS_NAME, Float) \ + M(CLASS_NAME, Int64) \ + M(CLASS_NAME, LightweightMutationProjectionMode) \ + M(CLASS_NAME, MaxThreads) \ + M(CLASS_NAME, Milliseconds) \ + M(CLASS_NAME, Seconds) \ + M(CLASS_NAME, String) \ + M(CLASS_NAME, UInt64) -/** These settings represent fine tunes for internal details of MergeTree storages - * and should not be changed by the user without a reason. - */ +MERGETREE_SETTINGS_SUPPORTED_TYPES(MergeTreeSettings, DECLARE_SETTING_TRAIT) -#define MERGE_TREE_SETTINGS(M, ALIAS) \ - M(UInt64, min_compress_block_size, 0, "When granule is written, compress the data in buffer if the size of pending uncompressed data is larger or equal than the specified threshold. If this setting is not set, the corresponding global setting is used.", 0) \ - M(UInt64, max_compress_block_size, 0, "Compress the pending uncompressed data in buffer if its size is larger or equal than the specified threshold. Block of data will be compressed even if the current granule is not finished. If this setting is not set, the corresponding global setting is used.", 0) \ - M(UInt64, index_granularity, 8192, "How many rows correspond to one primary key value.", 0) \ - M(UInt64, max_digestion_size_per_segment, 256_MiB, "Max number of bytes to digest per segment to build GIN index.", 0) \ - \ - /** Data storing format settings. */ \ - M(UInt64, min_bytes_for_wide_part, 10485760, "Minimal uncompressed size in bytes to create part in wide format instead of compact", 0) \ - M(UInt64, min_rows_for_wide_part, 0, "Minimal number of rows to create part in wide format instead of compact", 0) \ - M(Float, ratio_of_defaults_for_sparse_serialization, 0.9375f, "Minimal ratio of number of default values to number of all values in column to store it in sparse serializations. 
If >= 1, columns will be always written in full serialization.", 0) \ - M(Bool, replace_long_file_name_to_hash, true, "If the file name for column is too long (more than 'max_file_name_length' bytes) replace it to SipHash128", 0) \ - M(UInt64, max_file_name_length, 127, "The maximal length of the file name to keep it as is without hashing", 0) \ - M(UInt64, min_bytes_for_full_part_storage, 0, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, min_rows_for_full_part_storage, 0, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, compact_parts_max_bytes_to_buffer, 128 * 1024 * 1024, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, compact_parts_max_granules_to_buffer, 128, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, compact_parts_merge_max_bytes_to_prefetch_part, 16 * 1024 * 1024, "Only available in ClickHouse Cloud", 0) \ - M(Bool, load_existing_rows_count_for_old_parts, false, "Whether to load existing_rows_count for existing parts. If false, existing_rows_count will be equal to rows_count for existing parts.", 0) \ - M(Bool, use_compact_variant_discriminators_serialization, true, "Use compact version of Variant discriminators serialization.", 0) \ - \ - /** Merge settings. */ \ - M(UInt64, merge_max_block_size, 8192, "How many rows in blocks should be formed for merge operations. By default has the same value as `index_granularity`.", 0) \ - M(UInt64, merge_max_block_size_bytes, 10 * 1024 * 1024, "How many bytes in blocks should be formed for merge operations. By default has the same value as `index_granularity_bytes`.", 0) \ - M(UInt64, max_bytes_to_merge_at_max_space_in_pool, 150ULL * 1024 * 1024 * 1024, "Maximum in total size of parts to merge, when there are maximum free threads in background pool (or entries in replication queue).", 0) \ - M(UInt64, max_bytes_to_merge_at_min_space_in_pool, 1024 * 1024, "Maximum in total size of parts to merge, when there are minimum free threads in background pool (or entries in replication queue).", 0) \ - M(UInt64, max_replicated_merges_in_queue, 1000, "How many tasks of merging and mutating parts are allowed simultaneously in ReplicatedMergeTree queue.", 0) \ - M(UInt64, max_replicated_mutations_in_queue, 8, "How many tasks of mutating parts are allowed simultaneously in ReplicatedMergeTree queue.", 0) \ - M(UInt64, max_replicated_merges_with_ttl_in_queue, 1, "How many tasks of merging parts with TTL are allowed simultaneously in ReplicatedMergeTree queue.", 0) \ - M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \ - M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ - M(UInt64, max_number_of_mutations_for_replica, 0, "Limit the number of part mutations per replica to the specified amount. Zero means no limit on the number of mutations per replica (the execution can still be constrained by other settings).", 0) \ - M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. 
This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ - M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \ - M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \ - M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \ - M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \ - M(UInt64, min_compressed_bytes_to_fsync_after_merge, 0, "Minimal number of compressed bytes to do fsync for part after merge (0 - disabled)", 0) \ - M(UInt64, min_compressed_bytes_to_fsync_after_fetch, 0, "Minimal number of compressed bytes to do fsync for part after fetch (0 - disabled)", 0) \ - M(Bool, fsync_after_insert, false, "Do fsync for every inserted part. Significantly decreases performance of inserts, not recommended to use with wide parts.", 0) \ - M(Bool, fsync_part_directory, false, "Do fsync for part directory after all part operations (writes, renames, etc.).", 0) \ - M(UInt64, non_replicated_deduplication_window, 0, "How many last blocks of hashes should be kept on disk (0 - disabled).", 0) \ - M(UInt64, max_parts_to_merge_at_once, 100, "Max amount of parts which can be merged at once (0 - disabled). Doesn't affect OPTIMIZE FINAL query.", 0) \ - M(UInt64, merge_selecting_sleep_ms, 5000, "Minimum time to wait before trying to select parts to merge again after no parts were selected. A lower setting will trigger selecting tasks in background_schedule_pool frequently which result in large amount of requests to zookeeper in large-scale clusters", 0) \ - M(UInt64, max_merge_selecting_sleep_ms, 60000, "Maximum time to wait before trying to select parts to merge again after no parts were selected. A lower setting will trigger selecting tasks in background_schedule_pool frequently which result in large amount of requests to zookeeper in large-scale clusters", 0) \ - M(Float, merge_selecting_sleep_slowdown_factor, 1.2f, "The sleep time for merge selecting task is multiplied by this factor when there's nothing to merge and divided when a merge was assigned", 0) \ - M(UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60, "The period of executing the clear old temporary directories operation in background.", 0) \ - M(UInt64, merge_tree_clear_old_parts_interval_seconds, 1, "The period of executing the clear old parts operation in background.", 0) \ - M(UInt64, min_age_to_force_merge_seconds, 0, "If all parts in a certain range are older than this value, range will be always eligible for merging. Set to 0 to disable.", 0) \ - M(Bool, min_age_to_force_merge_on_partition_only, false, "Whether min_age_to_force_merge_seconds should be applied only on the entire partition and not on subset.", false) \ - M(UInt64, number_of_free_entries_in_pool_to_execute_optimize_entire_partition, 25, "When there is less than specified number of free entries in pool, do not try to execute optimize entire partition with a merge (this merge is created when set min_age_to_force_merge_seconds > 0 and min_age_to_force_merge_on_partition_only = true). 
This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ - M(Bool, remove_rolled_back_parts_immediately, 1, "Setting for an incomplete experimental feature.", 0) \ - M(UInt64, replicated_max_mutations_in_one_entry, 10000, "Max number of mutation commands that can be merged together and executed in one MUTATE_PART entry (0 means unlimited)", 0) \ - M(UInt64, number_of_mutations_to_delay, 500, "If table has at least that many unfinished mutations, artificially slow down mutations of table. Disabled if set to 0", 0) \ - M(UInt64, number_of_mutations_to_throw, 1000, "If table has at least that many unfinished mutations, throw 'Too many mutations' exception. Disabled if set to 0", 0) \ - M(UInt64, min_delay_to_mutate_ms, 10, "Min delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ - M(UInt64, max_delay_to_mutate_ms, 1000, "Max delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ - M(Bool, exclude_deleted_rows_for_part_size_in_merge, false, "Use an estimated source part size (excluding lightweight deleted rows) when selecting parts to merge", 0) \ - M(String, merge_workload, "", "Name of workload to be used to access resources for merges", 0) \ - M(String, mutation_workload, "", "Name of workload to be used to access resources for mutations", 0) \ - M(Milliseconds, background_task_preferred_step_execution_time_ms, 50, "Target time to execution of one step of merge or mutation. Can be exceeded if one step takes longer time", 0) \ - \ - /** Inserts settings. */ \ - M(UInt64, parts_to_delay_insert, 1000, "If table contains at least that many active parts in single partition, artificially slow down insert into table. Disabled if set to 0", 0) \ - M(UInt64, inactive_parts_to_delay_insert, 0, "If table contains at least that many inactive parts in single partition, artificially slow down insert into table.", 0) \ - M(UInt64, parts_to_throw_insert, 3000, "If more than this number active parts in single partition, throw 'Too many parts ...' exception.", 0) \ - M(UInt64, inactive_parts_to_throw_insert, 0, "If more than this number inactive parts in single partition, throw 'Too many inactive parts ...' exception.", 0) \ - M(UInt64, max_avg_part_size_for_too_many_parts, 1ULL * 1024 * 1024 * 1024, "The 'too many parts' check according to 'parts_to_delay_insert' and 'parts_to_throw_insert' will be active only if the average part size (in the relevant partition) is not larger than the specified threshold. If it is larger than the specified threshold, the INSERTs will be neither delayed or rejected. This allows to have hundreds of terabytes in a single table on a single server if the parts are successfully merged to larger parts. This does not affect the thresholds on inactive parts or total parts.", 0) \ - M(UInt64, max_delay_to_insert, 1, "Max delay of inserting data into MergeTree table in seconds, if there are a lot of unmerged parts in single partition.", 0) \ - M(UInt64, min_delay_to_insert_ms, 10, "Min delay of inserting data into MergeTree table in milliseconds, if there are a lot of unmerged parts in single partition.", 0) \ - M(UInt64, max_parts_in_total, 100000, "If more than this number active parts in all partitions in total, throw 'Too many parts ...' 
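The insert-throttling settings described above (parts_to_delay_insert, parts_to_throw_insert, min_delay_to_insert_ms, max_delay_to_insert) combine into an artificial delay that grows as a partition accumulates unmerged parts. The sketch below only illustrates that shape; the linear interpolation between the two thresholds is an assumption, the actual delay curve is not part of this diff.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <stdexcept>

// Hypothetical helper: how long to delay an INSERT given the number of active
// parts in a partition. The thresholds mirror the settings described above; the
// linear ramp between min and max delay is an assumption for illustration only.
uint64_t insertDelayMs(size_t active_parts,
                       size_t parts_to_delay_insert,   // e.g. 1000
                       size_t parts_to_throw_insert,   // e.g. 3000
                       uint64_t min_delay_ms,          // e.g. 10
                       uint64_t max_delay_sec)         // e.g. 1
{
    if (active_parts >= parts_to_throw_insert)
        throw std::runtime_error("Too many parts");
    if (active_parts < parts_to_delay_insert)
        return 0;

    double progress = double(active_parts - parts_to_delay_insert)
                    / double(parts_to_throw_insert - parts_to_delay_insert);
    uint64_t max_delay_ms = max_delay_sec * 1000;
    return std::clamp<uint64_t>(uint64_t(min_delay_ms + progress * (max_delay_ms - min_delay_ms)),
                                min_delay_ms, max_delay_ms);
}

int main()
{
    std::cout << insertDelayMs(2000, 1000, 3000, 10, 1) << " ms\n";
}
```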
exception.", 0) \ - M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background.", 0) \ - M(Bool, add_implicit_sign_column_constraint_for_collapsing_engine, false, "If true, add implicit constraint for sign column for CollapsingMergeTree engine.", 0) \ - M(Milliseconds, sleep_before_commit_local_part_in_replicated_table_ms, 0, "For testing. Do not change it.", 0) \ - M(Bool, optimize_row_order, false, "Allow reshuffling of rows during part inserts and merges to improve the compressibility of the new part", 0) \ - M(Bool, use_adaptive_write_buffer_for_dynamic_subcolumns, true, "Allow to use adaptive writer buffers during writing dynamic subcolumns to reduce memory usage", 0) \ - M(UInt64, adaptive_write_buffer_initial_size, 16 * 1024, "Initial size of an adaptive write buffer", 0) \ - M(UInt64, min_free_disk_bytes_to_perform_insert, 0, "Minimum free disk space bytes to perform an insert.", 0) \ - M(Double, min_free_disk_ratio_to_perform_insert, 0.0, "Minimum free disk space ratio to perform an insert.", 0) \ - \ - /* Part removal settings. */ \ - M(UInt64, simultaneous_parts_removal_limit, 0, "Maximum number of parts to remove during one CleanupThread iteration (0 means unlimited).", 0) \ - \ - /** Replication settings. */ \ - M(UInt64, replicated_deduplication_window, 1000, "How many last blocks of hashes should be kept in ZooKeeper (old blocks will be deleted).", 0) \ - M(UInt64, replicated_deduplication_window_seconds, 7 * 24 * 60 * 60 /* one week */, "Similar to \"replicated_deduplication_window\", but determines old blocks by their lifetime. Hash of an inserted block will be deleted (and the block will not be deduplicated after) if it outside of one \"window\". You can set very big replicated_deduplication_window to avoid duplicating INSERTs during that period of time.", 0) \ - M(UInt64, replicated_deduplication_window_for_async_inserts, 10000, "How many last hash values of async_insert blocks should be kept in ZooKeeper (old blocks will be deleted).", 0) \ - M(UInt64, replicated_deduplication_window_seconds_for_async_inserts, 7 * 24 * 60 * 60 /* one week */, "Similar to \"replicated_deduplication_window_for_async_inserts\", but determines old blocks by their lifetime. Hash of an inserted block will be deleted (and the block will not be deduplicated after) if it outside of one \"window\". You can set very big replicated_deduplication_window to avoid duplicating INSERTs during that period of time.", 0) \ - M(Milliseconds, async_block_ids_cache_update_wait_ms, 100, "How long each insert iteration will wait for async_block_ids_cache update", 0) \ - M(Bool, use_async_block_ids_cache, true, "Use in-memory cache to filter duplicated async inserts based on block ids", 0) \ - M(UInt64, max_replicated_logs_to_keep, 1000, "How many records may be in log, if there is inactive replica. Inactive replica becomes lost when when this number exceed.", 0) \ - M(UInt64, min_replicated_logs_to_keep, 10, "Keep about this number of last records in ZooKeeper log, even if they are obsolete. It doesn't affect work of tables: used only to diagnose ZooKeeper log before cleaning.", 0) \ - M(Seconds, prefer_fetch_merged_part_time_threshold, 3600, "If time passed after replication log entry creation exceeds this threshold and sum size of parts is greater than \"prefer_fetch_merged_part_size_threshold\", prefer fetching merged part from replica instead of doing merge locally. 
To speed up very long merges.", 0) \ - M(UInt64, prefer_fetch_merged_part_size_threshold, 10ULL * 1024 * 1024 * 1024, "If sum size of parts exceeds this threshold and time passed after replication log entry creation is greater than \"prefer_fetch_merged_part_time_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \ - M(Seconds, execute_merges_on_single_replica_time_threshold, 0, "When greater than zero only a single replica starts the merge immediately, others wait up to that amount of time to download the result instead of doing merges locally. If the chosen replica doesn't finish the merge during that amount of time, fallback to standard behavior happens.", 0) \ - M(Seconds, remote_fs_execute_merges_on_single_replica_time_threshold, 3 * 60 * 60, "When greater than zero only a single replica starts the merge immediately if merged part on shared storage and 'allow_remote_fs_zero_copy_replication' is enabled.", 0) \ - M(Seconds, try_fetch_recompressed_part_timeout, 7200, "Recompression works slow in most cases, so we don't start merge with recompression until this timeout and trying to fetch recompressed part from replica which assigned this merge with recompression.", 0) \ - M(Bool, always_fetch_merged_part, false, "If true, replica never merge parts and always download merged parts from other replicas.", 0) \ - M(UInt64, max_suspicious_broken_parts, 100, "Max broken parts, if more - deny automatic deletion.", 0) \ - M(UInt64, max_suspicious_broken_parts_bytes, 1ULL * 1024 * 1024 * 1024, "Max size of all broken parts, if more - deny automatic deletion.", 0) \ - M(UInt64, max_files_to_modify_in_alter_columns, 75, "Not apply ALTER if number of files for modification(deletion, addition) more than this.", 0) \ - M(UInt64, max_files_to_remove_in_alter_columns, 50, "Not apply ALTER, if number of files for deletion more than this.", 0) \ - M(Float, replicated_max_ratio_of_wrong_parts, 0.5, "If ratio of wrong parts to total number of parts is less than this - allow to start.", 0) \ - M(Bool, replicated_can_become_leader, true, "If true, Replicated tables replicas on this node will try to acquire leadership.", 0) \ - M(Seconds, zookeeper_session_expiration_check_period, 60, "ZooKeeper session expiration check period, in seconds.", 0) \ - M(Seconds, initialization_retry_period, 60, "Retry period for table initialization, in seconds.", 0) \ - M(Bool, detach_old_local_parts_when_cloning_replica, true, "Do not remove old local parts when repairing lost replica.", 0) \ - M(Bool, detach_not_byte_identical_parts, false, "Do not remove non byte-idential parts for ReplicatedMergeTree, instead detach them (maybe useful for further analysis).", 0) \ - M(UInt64, max_replicated_fetches_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited.", 0) \ - M(UInt64, max_replicated_sends_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for replicated sends. Zero means unlimited.", 0) \ - M(Milliseconds, wait_for_unique_parts_send_before_shutdown_ms, 0, "Before shutdown table will wait for required amount time for unique parts (exist only on current replica) to be fetched by other replicas (0 means disabled).", 0) \ - M(Float, fault_probability_before_part_commit, 0, "For testing. Do not change it.", 0) \ - M(Float, fault_probability_after_part_commit, 0, "For testing. 
Do not change it.", 0) \ - M(Bool, shared_merge_tree_disable_merges_and_mutations_assignment, false, "Only available in ClickHouse Cloud", 0) \ - M(Float, shared_merge_tree_partitions_hint_ratio_to_reload_merge_pred_for_mutations, 0.5, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, shared_merge_tree_parts_load_batch_size, 32, "Only available in ClickHouse Cloud", 0) \ - \ - /** Check delay of replicas settings. */ \ - M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \ - M(UInt64, cleanup_delay_period, 30, "Minimum period to clean old queue logs, blocks hashes and parts.", 0) \ - M(UInt64, max_cleanup_delay_period, 300, "Maximum period to clean old queue logs, blocks hashes and parts.", 0) \ - M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ - M(UInt64, cleanup_thread_preferred_points_per_iteration, 150, "Preferred batch size for background cleanup (points are abstract but 1 point is approximately equivalent to 1 inserted block).", 0) \ - M(UInt64, cleanup_threads, 128, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, kill_delay_period, 30, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, kill_delay_period_random_add, 10, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, kill_threads, 128, "Only available in ClickHouse Cloud", 0) \ - M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \ - M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \ - M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \ - M(UInt64, vertical_merge_algorithm_min_rows_to_activate, 16 * 8192, "Minimal (approximate) sum of rows in merging parts to activate Vertical merge algorithm.", 0) \ - M(UInt64, vertical_merge_algorithm_min_bytes_to_activate, 0, "Minimal (approximate) uncompressed size in bytes in merging parts to activate Vertical merge algorithm.", 0) \ - M(UInt64, vertical_merge_algorithm_min_columns_to_activate, 11, "Minimal amount of non-PK columns to activate Vertical merge algorithm.", 0) \ - M(Bool, vertical_merge_remote_filesystem_prefetch, true, "If true prefetching of data from remote filesystem is used for the next column during merge", 0) \ - M(UInt64, max_postpone_time_for_failed_mutations_ms, 5ULL * 60 * 1000, "The maximum postpone time for failed mutations.", 0) \ - \ - /** Compatibility settings */ \ - M(Bool, allow_suspicious_indices, false, "Reject primary/secondary indexes and sorting keys with identical expressions", 0) \ - M(Bool, compatibility_allow_sampling_expression_not_in_primary_key, false, "Allow to create a table with sampling expression not in primary key. This is needed only to temporarily allow to run the server with wrong tables for backward compatibility.", 0) \ - M(Bool, use_minimalistic_checksums_in_zookeeper, true, "Use small format (dozens bytes) for part checksums in ZooKeeper instead of ordinary ones (dozens KB). 
Before enabling check that all replicas support new format.", 0) \ - M(Bool, use_minimalistic_part_header_in_zookeeper, true, "Store part header (checksums and columns) in a compact format and a single part znode instead of separate znodes (/columns and /checksums). This can dramatically reduce snapshot size in ZooKeeper. Before enabling check that all replicas support new format.", 0) \ - M(UInt64, finished_mutations_to_keep, 100, "How many records about mutations that are done to keep. If zero, then keep all of them.", 0) \ - M(UInt64, min_merge_bytes_to_use_direct_io, 10ULL * 1024 * 1024 * 1024, "Minimal amount of bytes to enable O_DIRECT in merge (0 - disabled).", 0) \ - M(UInt64, index_granularity_bytes, 10 * 1024 * 1024, "Approximate amount of bytes in single granule (0 - disabled).", 0) \ - M(UInt64, min_index_granularity_bytes, 1024, "Minimum amount of bytes in single granule.", 1024) \ - M(Int64, merge_with_ttl_timeout, 3600 * 4, "Minimal time in seconds, when merge with delete TTL can be repeated.", 0) \ - M(Int64, merge_with_recompression_ttl_timeout, 3600 * 4, "Minimal time in seconds, when merge with recompression TTL can be repeated.", 0) \ - M(Bool, ttl_only_drop_parts, false, "Only drop altogether the expired parts and not partially prune them.", 0) \ - M(Bool, materialize_ttl_recalculate_only, false, "Only recalculate ttl info when MATERIALIZE TTL", 0) \ - M(Bool, enable_mixed_granularity_parts, true, "Enable parts with adaptive and non adaptive granularity", 0) \ - M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ - M(UInt64, zero_copy_concurrent_part_removal_max_split_times, 5, "Max recursion depth for splitting independent Outdated parts ranges into smaller subranges (highly not recommended to change)", 0) \ - M(Float, zero_copy_concurrent_part_removal_max_postpone_ratio, static_cast(0.05), "Max percentage of top level parts to postpone removal in order to get smaller independent ranges (highly not recommended to change)", 0) \ - M(String, storage_policy, "default", "Name of storage disk policy", 0) \ - M(String, disk, "", "Name of storage disk. Can be specified instead of storage policy.", 0) \ - M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \ - M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm.", 0) \ - M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \ - M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. This setting is the default that can be overridden by the query-level setting with the same name.", 0) \ - M(UInt64, max_concurrent_queries, 0, "Max number of concurrently executed queries related to the MergeTree table (0 - disabled). Queries will still be limited by other max_concurrent_queries settings.", 0) \ - M(UInt64, min_marks_to_honor_max_concurrent_queries, 0, "Minimal number of marks to honor the MergeTree-level's max_concurrent_queries (0 - disabled). 
Queries will still be limited by other max_concurrent_queries settings.", 0) \ - M(UInt64, min_bytes_to_rebalance_partition_over_jbod, 0, "Minimal amount of bytes to enable part rebalance over JBOD array (0 - disabled).", 0) \ - M(Bool, check_sample_column_is_correct, true, "Check columns or columns by hash for sampling are unsigned integer.", 0) \ - M(Bool, allow_vertical_merges_from_compact_to_wide_parts, true, "Allows vertical merges from compact to wide parts. This settings must have the same value on all replicas", 0) \ - M(Bool, enable_the_endpoint_id_with_zookeeper_name_prefix, false, "Enable the endpoint id with zookeeper name prefix for the replicated merge tree table", 0) \ - M(UInt64, zero_copy_merge_mutation_min_parts_size_sleep_before_lock, 1ULL * 1024 * 1024 * 1024, "If zero copy replication is enabled sleep random amount of time before trying to lock depending on parts size for merge or mutation", 0) \ - M(Bool, allow_floating_point_partition_key, false, "Allow floating point as partition key", 0) \ - M(UInt64, sleep_before_loading_outdated_parts_ms, 0, "For testing. Do not change it.", 0) \ - M(Bool, always_use_copy_instead_of_hardlinks, false, "Always copy data instead of hardlinking during mutations/replaces/detaches and so on.", 0) \ - M(Bool, disable_freeze_partition_for_zero_copy_replication, true, "Disable FREEZE PARTITION query for zero copy replication.", 0) \ - M(Bool, disable_detach_partition_for_zero_copy_replication, true, "Disable DETACH PARTITION query for zero copy replication.", 0) \ - M(Bool, disable_fetch_partition_for_zero_copy_replication, true, "Disable FETCH PARTITION query for zero copy replication.", 0) \ - M(Bool, enable_block_number_column, false, "Enable persisting column _block_number for each row.", 0) ALIAS(allow_experimental_block_number_column) \ - M(Bool, enable_block_offset_column, false, "Enable persisting column _block_offset for each row.", 0) \ - \ - /** Experimental/work in progress feature. Unsafe for production. */ \ - M(UInt64, part_moves_between_shards_enable, 0, "Experimental/Incomplete feature to move parts between shards. Does not take into account sharding expressions.", 0) \ - M(UInt64, part_moves_between_shards_delay_seconds, 30, "Time to wait before/after moving parts between shards.", 0) \ - M(Bool, allow_remote_fs_zero_copy_replication, false, "Don't use this setting in production, because it is not ready.", 0) \ - M(String, remote_fs_zero_copy_zookeeper_path, "/clickhouse/zero_copy", "ZooKeeper path for zero-copy table-independent info.", 0) \ - M(Bool, remote_fs_zero_copy_path_compatible_mode, false, "Run zero-copy in compatible mode during conversion process.", 0) \ - M(Bool, cache_populated_by_fetch, false, "Only available in ClickHouse Cloud", 0) \ - M(Bool, force_read_through_cache_for_merges, false, "Force read-through filesystem cache for merges", 0) \ - M(Bool, allow_experimental_replacing_merge_with_cleanup, false, "Allow experimental CLEANUP merges for ReplacingMergeTree with is_deleted column.", 0) \ - \ - /** Compress marks and primary key. 
*/ \ - M(Bool, compress_marks, true, "Marks support compression, reduce mark file size and speed up network transmission.", 0) \ - M(Bool, compress_primary_key, true, "Primary key support compression, reduce primary key file size and speed up network transmission.", 0) \ - M(String, marks_compression_codec, "ZSTD(3)", "Compression encoding used by marks, marks are small enough and cached, so the default compression is ZSTD(3).", 0) \ - M(String, primary_key_compression_codec, "ZSTD(3)", "Compression encoding used by primary, primary key is small enough and cached, so the default compression is ZSTD(3).", 0) \ - M(UInt64, marks_compress_block_size, 65536, "Mark compress block size, the actual size of the block to compress.", 0) \ - M(UInt64, primary_key_compress_block_size, 65536, "Primary compress block size, the actual size of the block to compress.", 0) \ - M(Bool, primary_key_lazy_load, true, "Load primary key in memory on first use instead of on table initialization. This can save memory in the presence of a large number of tables.", 0) \ - M(Float, primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns, 0.9f, "If the value of a column of the primary key in data part changes at least in this ratio of times, skip loading next columns in memory. This allows to save memory usage by not loading useless columns of the primary key.", 0) \ - /** Projection settings. */ \ - M(UInt64, max_projections, 25, "The maximum number of merge tree projections.", 0) \ - M(LightweightMutationProjectionMode, lightweight_mutation_projection_mode, LightweightMutationProjectionMode::THROW, "When lightweight delete happens on a table with projection(s), the possible operations include throw the exception as projection exists, or drop projections of this table's relevant parts, or rebuild the projections.", 0) \ - M(DeduplicateMergeProjectionMode, deduplicate_merge_projection_mode, DeduplicateMergeProjectionMode::THROW, "Whether to allow create projection for the table with non-classic MergeTree, if allowed, what is the action when merge, drop or rebuild.", 0) \ - -#define MAKE_OBSOLETE_MERGE_TREE_SETTING(M, TYPE, NAME, DEFAULT) \ - M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", BaseSettingsHelpers::Flags::OBSOLETE) - -#define OBSOLETE_MERGE_TREE_SETTINGS(M, ALIAS) \ - /** Obsolete settings that do nothing but left for compatibility reasons. 
*/ \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, min_relative_delay_to_yield_leadership, 120) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, check_delay_period, 60) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_sends, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_sends_for_table, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_fetches, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_fetches_for_table, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Bool, write_final_mark, true) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, min_bytes_for_compact_part, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, min_rows_for_compact_part, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Bool, in_memory_parts_enable_wal, true) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, write_ahead_log_max_bytes, 1024 * 1024 * 1024) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, write_ahead_log_bytes_to_fsync, 100ULL * 1024 * 1024) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, write_ahead_log_interval_ms_to_fsync, 100) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Bool, in_memory_parts_insert_sync, false) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, MaxThreads, max_part_loading_threads, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, MaxThreads, max_part_removal_threads, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Bool, use_metadata_cache, false) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, merge_tree_enable_clear_old_broken_detached, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, merge_tree_clear_old_broken_detached_parts_ttl_timeout_seconds, 1ULL * 3600 * 24 * 30) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Seconds, replicated_fetches_http_connection_timeout, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Seconds, replicated_fetches_http_send_timeout, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, Seconds, replicated_fetches_http_receive_timeout, 0) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, UInt64, replicated_max_parallel_fetches_for_host, DEFAULT_COUNT_OF_HTTP_CONNECTIONS_PER_ENDPOINT) \ - MAKE_OBSOLETE_MERGE_TREE_SETTING(M, CleanDeletedRows, clean_deleted_rows, CleanDeletedRows::Never) \ - - /// Settings that should not change after the creation of a table. - /// NOLINTNEXTLINE -#define APPLY_FOR_IMMUTABLE_MERGE_TREE_SETTINGS(M) \ - M(index_granularity) - -#define LIST_OF_MERGE_TREE_SETTINGS(M, ALIAS) \ - MERGE_TREE_SETTINGS(M, ALIAS) \ - OBSOLETE_MERGE_TREE_SETTINGS(M, ALIAS) - -DECLARE_SETTINGS_TRAITS(MergeTreeSettingsTraits, LIST_OF_MERGE_TREE_SETTINGS) - - -/** Settings for the MergeTree family of engines. - * Could be loaded from config or from a CREATE TABLE query (SETTINGS clause). 
- */ -struct MergeTreeSettings : public BaseSettings, public IHints<2> +struct MergeTreeSettings { - void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config); + MergeTreeSettings(); + MergeTreeSettings(const MergeTreeSettings & settings); + MergeTreeSettings(MergeTreeSettings && settings) noexcept; + ~MergeTreeSettings(); + + MERGETREE_SETTINGS_SUPPORTED_TYPES(MergeTreeSettings, DECLARE_SETTING_SUBSCRIPT_OPERATOR) + + bool has(std::string_view name) const; + + bool tryGet(std::string_view name, Field & value) const; + Field get(std::string_view name) const; + + void set(std::string_view name, const Field & value); + + SettingsChanges changes() const; + void applyChanges(const SettingsChanges & changes); + void applyChange(const SettingChange & change); + std::vector getAllRegisteredNames() const; + void applyCompatibilitySetting(const String & compatibility_value); /// NOTE: will rewrite the AST to add immutable settings. void loadFromQuery(ASTStorage & storage_def, ContextPtr context, bool is_attach); + void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config); - static bool isReadonlySetting(const String & name); - static bool isPartFormatSetting(const String & name); - - /// Check that the values are sane taking also query-level settings into account. + bool needSyncPart(size_t input_rows, size_t input_bytes) const; void sanityCheck(size_t background_pool_tasks) const; - std::vector getAllRegisteredNames() const override; + void dumpToSystemMergeTreeSettingsColumns(MutableColumnsAndConstraints & params) const; + void addToProgramOptionsIfNotPresent(boost::program_options::options_description & main_options, bool allow_repeated_settings); + + static Field castValueUtil(std::string_view name, const Field & value); + static String valueToStringUtil(std::string_view name, const Field & value); + static Field stringToValueUtil(std::string_view name, const String & str); + static bool hasBuiltin(std::string_view name); static std::string_view resolveName(std::string_view name); + static bool isReadonlySetting(const String & name); + static void checkCanSet(std::string_view name, const Field & value); + static bool isPartFormatSetting(const String & name); + +private: + std::unique_ptr impl; }; -using MergeTreeSettingsPtr = std::shared_ptr; - - /// Column-level Merge-Tree settings which overwrite MergeTree settings namespace MergeTreeColumnSettings { void validate(const SettingsChanges & changes); } - -[[maybe_unused]] static bool needSyncPart(size_t input_rows, size_t input_bytes, const MergeTreeSettings & settings) -{ - return ( - (settings.min_rows_to_fsync_after_merge && input_rows >= settings.min_rows_to_fsync_after_merge) - || (settings.min_compressed_bytes_to_fsync_after_merge && input_bytes >= settings.min_compressed_bytes_to_fsync_after_merge)); -} } diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index 9942e890b56..c78e37cbe74 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -15,6 +15,16 @@ namespace ProfileEvents namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsBool detach_not_byte_identical_parts; + extern const MergeTreeSettingsSeconds lock_acquire_timeout_for_background_operations; + extern const MergeTreeSettingsUInt64 prefer_fetch_merged_part_size_threshold; 
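The new header hides the full settings list behind a unique_ptr to MergeTreeSettingsImpl (pimpl) and exposes typed access through subscript operators generated by DECLARE_SETTING_SUBSCRIPT_OPERATOR, so call sites write `(*settings)[MergeTreeSetting::name]` instead of `settings->name`. A minimal sketch of that shape is below; the tag types and the internals are invented stand-ins, since the real MergeTreeSettingsImpl and macro expansions are not in this diff.

```cpp
#include <cstdint>
#include <iostream>
#include <memory>

// Hypothetical per-setting tag constants, mirroring the
// `namespace MergeTreeSetting { extern const ... }` declarations in the .cpp files.
namespace Setting
{
    struct UInt64Tag { const char * name; };
    inline constexpr UInt64Tag cleanup_delay_period{"cleanup_delay_period"};
}

struct SettingsImpl;  // the full settings list lives only in the .cpp

class Settings
{
public:
    Settings();
    ~Settings();
    uint64_t operator[](Setting::UInt64Tag tag) const;  // typed subscript access
private:
    std::unique_ptr<SettingsImpl> impl;
};

// --- what would normally sit in the .cpp, after the impl is complete ---
struct SettingsImpl { uint64_t cleanup_delay_period = 30; };
Settings::Settings() : impl(std::make_unique<SettingsImpl>()) {}
Settings::~Settings() = default;
uint64_t Settings::operator[](Setting::UInt64Tag) const { return impl->cleanup_delay_period; }

int main()
{
    auto settings = std::make_shared<Settings>();
    // Call-site shape used throughout this diff: (*settings)[Setting::name]
    std::cout << (*settings)[Setting::cleanup_delay_period] * 1000 << " ms\n";
}
```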
+ extern const MergeTreeSettingsSeconds prefer_fetch_merged_part_time_threshold; + extern const MergeTreeSettingsUInt64 zero_copy_merge_mutation_min_parts_size_sleep_before_lock; +} + ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() { const String & source_part_name = entry.source_parts.at(0); @@ -71,8 +81,8 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() /// TODO - some better heuristic? size_t estimated_space_for_result = MergeTreeDataMergerMutator::estimateNeededDiskSpace({source_part}, false); - if (entry.create_time + storage_settings_ptr->prefer_fetch_merged_part_time_threshold.totalSeconds() <= time(nullptr) - && estimated_space_for_result >= storage_settings_ptr->prefer_fetch_merged_part_size_threshold) + if (entry.create_time + (*storage_settings_ptr)[MergeTreeSetting::prefer_fetch_merged_part_time_threshold].totalSeconds() <= time(nullptr) + && estimated_space_for_result >= (*storage_settings_ptr)[MergeTreeSetting::prefer_fetch_merged_part_size_threshold]) { /// If entry is old enough, and have enough size, and some replica has the desired part, /// then prefer fetching from replica. @@ -121,12 +131,12 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() future_mutated_part->updatePath(storage, reserved_space.get()); table_lock_holder = storage.lockForShare( - RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations); + RWLockImpl::NO_QUERY, (*storage_settings_ptr)[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); StorageMetadataPtr metadata_snapshot = storage.getInMemoryMetadataPtr(); transaction_ptr = std::make_unique(storage, NO_TRANSACTION_RAW); - if (storage_settings_ptr->allow_remote_fs_zero_copy_replication) + if ((*storage_settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { if (auto disk = reserved_space->getDisk(); disk->supportZeroCopyReplication()) { @@ -140,8 +150,8 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() }; } - if (storage_settings_ptr->zero_copy_merge_mutation_min_parts_size_sleep_before_lock != 0 && - estimated_space_for_result >= storage_settings_ptr->zero_copy_merge_mutation_min_parts_size_sleep_before_lock) + if ((*storage_settings_ptr)[MergeTreeSetting::zero_copy_merge_mutation_min_parts_size_sleep_before_lock] != 0 && + estimated_space_for_result >= (*storage_settings_ptr)[MergeTreeSetting::zero_copy_merge_mutation_min_parts_size_sleep_before_lock]) { /// In zero copy replication only one replica execute merge/mutation, others just download merged parts metadata. /// Here we are trying to metigate the skew of merges execution because of faster/slower replicas. @@ -151,12 +161,12 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() /// /// So here we trying to solve it with the simplest solution -- sleep random time up to 500ms for 1GB part and up to 7 seconds for 300GB part. /// It can sound too much, but we are trying to acquire these locks in background tasks which can be scheduled each 5 seconds or so. 
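The prepare() hunk above decides between mutating locally and fetching the result from another replica: if the log entry is old enough and the expected result is large enough, fetching wins. A small sketch of that check, with simplified standalone types:

```cpp
#include <cstdint>
#include <ctime>
#include <iostream>

// Sketch of the decision shown above: prefer fetching the merged/mutated part
// from another replica instead of executing the work locally when the entry is
// old enough and the estimated result is large enough.
bool preferFetchFromReplica(std::time_t entry_create_time,
                            uint64_t estimated_space_for_result,
                            uint64_t time_threshold_sec,   // prefer_fetch_merged_part_time_threshold
                            uint64_t size_threshold_bytes) // prefer_fetch_merged_part_size_threshold
{
    return entry_create_time + static_cast<std::time_t>(time_threshold_sec) <= std::time(nullptr)
        && estimated_space_for_result >= size_threshold_bytes;
}

int main()
{
    std::time_t two_hours_ago = std::time(nullptr) - 7200;
    std::cout << std::boolalpha
              << preferFetchFromReplica(two_hours_ago, 20ULL << 30, 3600, 10ULL << 30) << '\n';
}
```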
- double start_to_sleep_seconds = std::logf(storage_settings_ptr->zero_copy_merge_mutation_min_parts_size_sleep_before_lock.value); + double start_to_sleep_seconds = std::logf((*storage_settings_ptr)[MergeTreeSetting::zero_copy_merge_mutation_min_parts_size_sleep_before_lock].value); uint64_t right_border_to_sleep_ms = static_cast((std::log(estimated_space_for_result) - start_to_sleep_seconds + 0.5) * 1000); uint64_t time_to_sleep_milliseconds = std::min(10000UL, std::uniform_int_distribution(1, 1 + right_border_to_sleep_ms)(rng)); LOG_INFO(log, "Mutation size is {} bytes (it's more than sleep threshold {}) so will intentionally sleep for {} ms to allow other replicas to took this big mutation", - estimated_space_for_result, storage_settings_ptr->zero_copy_merge_mutation_min_parts_size_sleep_before_lock, time_to_sleep_milliseconds); + estimated_space_for_result, (*storage_settings_ptr)[MergeTreeSetting::zero_copy_merge_mutation_min_parts_size_sleep_before_lock], time_to_sleep_milliseconds); std::this_thread::sleep_for(std::chrono::milliseconds(time_to_sleep_milliseconds)); } @@ -258,7 +268,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit mutate_task->updateProfileEvents(); write_part_log(ExecutionStatus::fromCurrentException("", true)); - if (storage.getSettings()->detach_not_byte_identical_parts) + if ((*storage.getSettings())[MergeTreeSetting::detach_not_byte_identical_parts]) storage.forcefullyMovePartToDetachedAndRemoveFromMemory(std::move(new_part), "mutate-not-byte-identical"); else storage.tryRemovePartImmediately(std::move(new_part)); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 9b5d3176323..c16c445f6ae 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -62,6 +62,20 @@ namespace Setting extern const SettingsUInt64 min_insert_block_size_rows; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsBool always_use_copy_instead_of_hardlinks; + extern const MergeTreeSettingsMilliseconds background_task_preferred_step_execution_time_ms; + extern const MergeTreeSettingsBool exclude_deleted_rows_for_part_size_in_merge; + extern const MergeTreeSettingsLightweightMutationProjectionMode lightweight_mutation_projection_mode; + extern const MergeTreeSettingsBool materialize_ttl_recalculate_only; + extern const MergeTreeSettingsUInt64 max_file_name_length; + extern const MergeTreeSettingsFloat ratio_of_defaults_for_sparse_serialization; + extern const MergeTreeSettingsBool replace_long_file_name_to_hash; + extern const MergeTreeSettingsBool ttl_only_drop_parts; +} + namespace ErrorCodes { extern const int ABORTED; @@ -421,7 +435,7 @@ getColumnsForNewDataPart( SerializationInfo::Settings settings { - .ratio_of_defaults_for_sparse = source_part->storage.getSettings()->ratio_of_defaults_for_sparse_serialization, + .ratio_of_defaults_for_sparse = (*source_part->storage.getSettings())[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization], .choose_kind = false }; @@ -837,7 +851,7 @@ static NameToNameVector collectFilesForRenames( String stream_to; auto storage_settings = source_part->storage.getSettings(); - if (storage_settings->replace_long_file_name_to_hash && full_stream_to.size() > storage_settings->max_file_name_length) + if ((*storage_settings)[MergeTreeSetting::replace_long_file_name_to_hash] && full_stream_to.size() > 
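The zero-copy back-off shown above sleeps for a random time that grows logarithmically with the estimated result size, capped at 10 seconds, so replicas racing for the same lock spread out. A standalone sketch of the same computation (it assumes, like the original guard, that the estimated size is at least the threshold):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>

// Sketch of the randomized sleep above: roughly up to 0.5 s for a part at the
// threshold and a few seconds for very large parts, never more than 10 s.
// Assumes estimated_result_bytes >= threshold_bytes, as in the original guard.
uint64_t sleepBeforeZeroCopyLockMs(uint64_t threshold_bytes, uint64_t estimated_result_bytes, std::mt19937 & rng)
{
    double start_to_sleep_seconds = std::log(static_cast<double>(threshold_bytes));
    uint64_t right_border_ms = static_cast<uint64_t>(
        (std::log(static_cast<double>(estimated_result_bytes)) - start_to_sleep_seconds + 0.5) * 1000);
    return std::min<uint64_t>(10000, std::uniform_int_distribution<uint64_t>(1, 1 + right_border_ms)(rng));
}

int main()
{
    std::mt19937 rng{std::random_device{}()};
    std::cout << sleepBeforeZeroCopyLockMs(1ULL << 30, 300ULL << 30, rng) << " ms\n";
}
```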
(*storage_settings)[MergeTreeSetting::max_file_name_length]) stream_to = sipHash128String(full_stream_to); else stream_to = full_stream_to; @@ -1198,7 +1212,7 @@ void PartMergerWriter::prepare() bool PartMergerWriter::mutateOriginalPartAndPrepareProjections() { Stopwatch watch(CLOCK_MONOTONIC_COARSE); - UInt64 step_time_ms = ctx->data->getSettings()->background_task_preferred_step_execution_time_ms.totalMilliseconds(); + UInt64 step_time_ms = (*ctx->data->getSettings())[MergeTreeSetting::background_task_preferred_step_execution_time_ms].totalMilliseconds(); do { @@ -1475,7 +1489,7 @@ private: bool lightweight_delete_mode = ctx->updated_header.has(RowExistsColumn::name); bool lightweight_delete_drop = lightweight_delete_mode - && ctx->data->getSettings()->lightweight_mutation_projection_mode == LightweightMutationProjectionMode::DROP; + && (*ctx->data->getSettings())[MergeTreeSetting::lightweight_mutation_projection_mode] == LightweightMutationProjectionMode::DROP; const auto & projections = ctx->metadata_snapshot->getProjections(); for (const auto & projection : projections) @@ -1753,7 +1767,7 @@ private: if (it->isFile()) { - if (settings->always_use_copy_instead_of_hardlinks) + if ((*settings)[MergeTreeSetting::always_use_copy_instead_of_hardlinks]) { ctx->new_data_part->getDataPartStorage().copyFileFrom( ctx->source_part->getDataPartStorage(), it->name(), destination); @@ -1776,7 +1790,7 @@ private: for (auto p_it = projection_data_part_storage_src->iterate(); p_it->isValid(); p_it->next()) { - if (settings->always_use_copy_instead_of_hardlinks) + if ((*settings)[MergeTreeSetting::always_use_copy_instead_of_hardlinks]) { projection_data_part_storage_dst->copyFileFrom( *projection_data_part_storage_src, p_it->name(), p_it->name()); @@ -2170,7 +2184,7 @@ bool MutateTask::prepare() /// part: all_0_0_0_1/checksums.txt -> /s3/blobs/shjfgsaasdasdasdasdasdas /// locks path in zk: /zero_copy/tbl_id/s3_blobs_shjfgsaasdasdasdasdasdas/replica_name /// So we need to copy to have a new name - bool copy_checksumns = ctx->data->supportsReplication() && settings_ptr->allow_remote_fs_zero_copy_replication && ctx->source_part->isStoredOnRemoteDiskWithZeroCopySupport(); + bool copy_checksumns = ctx->data->supportsReplication() && (*settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] && ctx->source_part->isStoredOnRemoteDiskWithZeroCopySupport(); if (copy_checksumns) files_to_copy_instead_of_hardlinks.insert(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK); @@ -2182,7 +2196,7 @@ bool MutateTask::prepare() IDataPartStorage::ClonePartParams clone_params { .txn = ctx->txn, .hardlinked_files = &ctx->hardlinked_files, - .copy_instead_of_hardlink = settings_ptr->always_use_copy_instead_of_hardlinks, + .copy_instead_of_hardlink = (*settings_ptr)[MergeTreeSetting::always_use_copy_instead_of_hardlinks], .files_to_copy_instead_of_hardlinks = std::move(files_to_copy_instead_of_hardlinks), .keep_metadata_version = true, }; @@ -2215,7 +2229,7 @@ bool MutateTask::prepare() context_for_reading->setSetting("max_streams_for_merge_tree_reading", Field(0)); context_for_reading->setSetting("read_from_filesystem_cache_if_exists_otherwise_bypass_cache", 1); - bool suitable_for_ttl_optimization = ctx->metadata_snapshot->hasOnlyRowsTTL() && ctx->data->getSettings()->ttl_only_drop_parts; + bool suitable_for_ttl_optimization = ctx->metadata_snapshot->hasOnlyRowsTTL() && (*ctx->data->getSettings())[MergeTreeSetting::ttl_only_drop_parts]; MutationHelpers::splitAndModifyMutationCommands( ctx->source_part, 
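The collectFilesForRenames() hunk above keeps a column stream's file name as is unless it exceeds max_file_name_length, in which case the name is replaced by a hash. The sketch below mirrors that branch; std::hash is only a stand-in for the SipHash128-based sipHash128String used in the real code.

```cpp
#include <cstddef>
#include <functional>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

// Sketch of the rename logic above: if a stream's file name would be too long,
// store it under a hash of the name instead. std::hash is a stand-in here.
std::string streamFileName(const std::string & full_stream_name,
                           bool replace_long_file_name_to_hash, size_t max_file_name_length)
{
    if (replace_long_file_name_to_hash && full_stream_name.size() > max_file_name_length)
    {
        std::ostringstream hex;
        hex << std::hex << std::setfill('0') << std::setw(16) << std::hash<std::string>{}(full_stream_name);
        return hex.str();
    }
    return full_stream_name;
}

int main()
{
    std::string long_name(200, 'x');
    std::cout << streamFileName(long_name, /*replace=*/true, /*max_len=*/127) << '\n';
}
```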
ctx->metadata_snapshot, @@ -2253,7 +2267,7 @@ bool MutateTask::prepare() /// If under the condition of lightweight delete mode with rebuild option, add projections again here as we can only know /// the condition as early as from here. if (lightweight_delete_mode - && ctx->data->getSettings()->lightweight_mutation_projection_mode == LightweightMutationProjectionMode::REBUILD) + && (*ctx->data->getSettings())[MergeTreeSetting::lightweight_mutation_projection_mode] == LightweightMutationProjectionMode::REBUILD) { for (const auto & projection : ctx->metadata_snapshot->getProjections()) { @@ -2298,13 +2312,13 @@ bool MutateTask::prepare() ctx->mrk_extension = ctx->source_part->index_granularity_info.mark_type.getFileExtension(); const auto data_settings = ctx->data->getSettings(); - ctx->need_sync = needSyncPart(ctx->source_part->rows_count, ctx->source_part->getBytesOnDisk(), *data_settings); + ctx->need_sync = data_settings->needSyncPart(ctx->source_part->rows_count, ctx->source_part->getBytesOnDisk()); ctx->execute_ttl_type = ExecuteTTLType::NONE; if (ctx->mutating_pipeline_builder.initialized()) ctx->execute_ttl_type = MutationHelpers::shouldExecuteTTL(ctx->metadata_snapshot, ctx->interpreter->getColumnDependencies()); - if (ctx->data->getSettings()->exclude_deleted_rows_for_part_size_in_merge && lightweight_delete_mode) + if ((*ctx->data->getSettings())[MergeTreeSetting::exclude_deleted_rows_for_part_size_in_merge] && lightweight_delete_mode) { /// This mutation contains lightweight delete and we need to count the deleted rows, /// Reset existing_rows_count of new data part to 0 and it will be updated while writing _row_exists column @@ -2330,7 +2344,7 @@ bool MutateTask::prepare() /// The blobs have to be removed along with the part, this temporary part owns them and does not share them yet. 
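The call `data_settings->needSyncPart(...)` above replaces the free needSyncPart() helper that the old header defined inline; its logic (visible in the removed header code earlier in this diff) is reproduced in the sketch below on a simplified settings struct.

```cpp
#include <cstdint>
#include <iostream>

// Sketch of the needSyncPart() member used above: fsync the new part when
// either the row or the compressed-byte threshold is configured and reached,
// mirroring the free function removed from the old MergeTreeSettings.h.
struct MergeFsyncSettings
{
    uint64_t min_rows_to_fsync_after_merge = 0;
    uint64_t min_compressed_bytes_to_fsync_after_merge = 0;

    bool needSyncPart(size_t input_rows, size_t input_bytes) const
    {
        return (min_rows_to_fsync_after_merge && input_rows >= min_rows_to_fsync_after_merge)
            || (min_compressed_bytes_to_fsync_after_merge && input_bytes >= min_compressed_bytes_to_fsync_after_merge);
    }
};

int main()
{
    MergeFsyncSettings s{.min_rows_to_fsync_after_merge = 1'000'000};
    std::cout << std::boolalpha << s.needSyncPart(2'000'000, 0) << '\n';
}
```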
ctx->new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::REMOVE_BLOBS; - bool drop_expired_parts = suitable_for_ttl_optimization && !ctx->data->getSettings()->materialize_ttl_recalculate_only; + bool drop_expired_parts = suitable_for_ttl_optimization && !(*ctx->data->getSettings())[MergeTreeSetting::materialize_ttl_recalculate_only]; if (drop_expired_parts) task = std::make_unique(std::make_unique(ctx), ctx); else @@ -2347,7 +2361,7 @@ bool MutateTask::prepare() ctx->context, ctx->materialized_indices); - auto lightweight_mutation_projection_mode = ctx->data->getSettings()->lightweight_mutation_projection_mode; + auto lightweight_mutation_projection_mode = (*ctx->data->getSettings())[MergeTreeSetting::lightweight_mutation_projection_mode]; bool lightweight_delete_drops_projections = lightweight_mutation_projection_mode == LightweightMutationProjectionMode::DROP || lightweight_mutation_projection_mode == LightweightMutationProjectionMode::THROW; @@ -2395,7 +2409,7 @@ bool MutateTask::prepare() /// Keeper has to be asked with unlock request to release the references to the blobs ctx->new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::ASK_KEEPER; - bool drop_expired_parts = suitable_for_ttl_optimization && !ctx->data->getSettings()->materialize_ttl_recalculate_only; + bool drop_expired_parts = suitable_for_ttl_optimization && !(*ctx->data->getSettings())[MergeTreeSetting::materialize_ttl_recalculate_only]; if (drop_expired_parts) task = std::make_unique(std::make_unique(ctx), ctx); else diff --git a/src/Storages/MergeTree/PartMetadataManagerOrdinary.cpp b/src/Storages/MergeTree/PartMetadataManagerOrdinary.cpp index 149ad6b4a10..6eac71eeaf7 100644 --- a/src/Storages/MergeTree/PartMetadataManagerOrdinary.cpp +++ b/src/Storages/MergeTree/PartMetadataManagerOrdinary.cpp @@ -11,7 +11,11 @@ namespace DB std::unique_ptr PartMetadataManagerOrdinary::read(const String & file_name) const { size_t file_size = part->getDataPartStorage().getFileSize(file_name); - auto res = part->getDataPartStorage().readFile(file_name, getReadSettings().adjustBufferSize(file_size), file_size, std::nullopt); + auto read_settings = getReadSettings().adjustBufferSize(file_size); + /// Default read method is pread_threadpool, but there is not much point in it here. 
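The lightweight_mutation_projection_mode checks scattered through the hunks above boil down to three behaviors: THROW and DROP both mean the mutated part does not keep its projections (the boolean `lightweight_delete_drops_projections` treats them the same), while REBUILD re-adds the projections after the lightweight delete. A compact sketch of that classification:

```cpp
#include <iostream>

// Sketch of how the three lightweight_mutation_projection_mode values seen in
// the hunks above steer the mutation: DROP and THROW both count as "drops
// projections" for the mutated part, REBUILD re-adds them afterwards.
enum class LightweightMutationProjectionMode { THROW, DROP, REBUILD };

bool dropsProjections(LightweightMutationProjectionMode mode)
{
    return mode == LightweightMutationProjectionMode::DROP
        || mode == LightweightMutationProjectionMode::THROW;
}

bool rebuildsProjections(LightweightMutationProjectionMode mode)
{
    return mode == LightweightMutationProjectionMode::REBUILD;
}

int main()
{
    auto mode = LightweightMutationProjectionMode::REBUILD;
    std::cout << std::boolalpha << dropsProjections(mode) << ' ' << rebuildsProjections(mode) << '\n';
}
```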
+ read_settings.local_fs_method = LocalFSReadMethod::pread; + + auto res = part->getDataPartStorage().readFile(file_name, read_settings, file_size, std::nullopt); if (isCompressedFromFileName(file_name)) return std::make_unique(std::move(res)); diff --git a/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp b/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp index 10c7bef72fc..7a46132ef1a 100644 --- a/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp +++ b/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp @@ -10,6 +10,12 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 part_moves_between_shards_delay_seconds; + extern const MergeTreeSettingsUInt64 part_moves_between_shards_enable; +} + namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -30,7 +36,7 @@ PartMovesBetweenShardsOrchestrator::PartMovesBetweenShardsOrchestrator(StorageRe void PartMovesBetweenShardsOrchestrator::run() { - if (!storage.getSettings()->part_moves_between_shards_enable) + if (!(*storage.getSettings())[MergeTreeSetting::part_moves_between_shards_enable]) return; if (need_stop) @@ -526,7 +532,7 @@ PartMovesBetweenShardsOrchestrator::Entry PartMovesBetweenShardsOrchestrator::st } else { - std::this_thread::sleep_for(std::chrono::seconds(storage.getSettings()->part_moves_between_shards_delay_seconds)); + std::this_thread::sleep_for(std::chrono::seconds((*storage.getSettings())[MergeTreeSetting::part_moves_between_shards_delay_seconds])); entry.state = EntryState::SOURCE_DROP; return entry; } @@ -598,7 +604,7 @@ PartMovesBetweenShardsOrchestrator::Entry PartMovesBetweenShardsOrchestrator::st throw Exception(ErrorCodes::LOGICAL_ERROR, "It is not possible to rollback from this state. This is a bug."); else { - std::this_thread::sleep_for(std::chrono::seconds(storage.getSettings()->part_moves_between_shards_delay_seconds)); + std::this_thread::sleep_for(std::chrono::seconds((*storage.getSettings())[MergeTreeSetting::part_moves_between_shards_delay_seconds])); entry.state = EntryState::REMOVE_UUID_PIN; return entry; } diff --git a/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp b/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp index 24929365b72..bd7221c29c8 100644 --- a/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp @@ -9,6 +9,10 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 max_postpone_time_for_failed_mutations_ms; +} namespace ErrorCodes { @@ -119,7 +123,7 @@ bool ReplicatedMergeMutateTaskBase::executeStep() status.latest_fail_time = time(nullptr); status.latest_fail_reason = getExceptionMessage(saved_exception, false); if (result_data_version == it->first) - storage.mutation_backoff_policy.addPartMutationFailure(src_part, storage.getSettings()->max_postpone_time_for_failed_mutations_ms); + storage.mutation_backoff_policy.addPartMutationFailure(src_part, (*storage.getSettings())[MergeTreeSetting::max_postpone_time_for_failed_mutations_ms]); } } } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp index 67570d78366..22b8ccca151 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp @@ -11,6 +11,11 @@ namespace CurrentMetrics namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsSeconds initialization_retry_period; +} + 
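The PartMetadataManagerOrdinary change above forces plain pread for metadata files because they are tiny and a thread-pool dispatch buys nothing. The sketch below illustrates that idea with raw POSIX calls; it is not ClickHouse's ReadBuffer API, just the underlying single positional read.

```cpp
#include <fcntl.h>
#include <unistd.h>
#include <iostream>
#include <string>

// Illustration of why plain pread is enough here: part metadata files are tiny,
// so one synchronous positional read is cheaper than going through a thread pool.
std::string readSmallFile(const std::string & path, size_t max_size)
{
    int fd = ::open(path.c_str(), O_RDONLY);
    if (fd < 0)
        return {};
    std::string data(max_size, '\0');
    ssize_t n = ::pread(fd, data.data(), max_size, /*offset=*/0);
    ::close(fd);
    data.resize(n > 0 ? static_cast<size_t>(n) : 0);
    return data;
}

int main()
{
    std::cout << readSmallFile("/etc/hostname", 256) << '\n';
}
```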
namespace ErrorCodes { extern const int SUPPORT_IS_DISABLED; @@ -25,7 +30,7 @@ ReplicatedMergeTreeAttachThread::ReplicatedMergeTreeAttachThread(StorageReplicat { task = storage.getContext()->getSchedulePool().createTask(log_name, [this] { run(); }); const auto storage_settings = storage.getSettings(); - retry_period = storage_settings->initialization_retry_period.totalSeconds(); + retry_period = (*storage_settings)[MergeTreeSetting::initialization_retry_period].totalSeconds(); } ReplicatedMergeTreeAttachThread::~ReplicatedMergeTreeAttachThread() diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp index 7aef249c366..bb7d683023b 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp @@ -14,6 +14,23 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 cleanup_delay_period; + extern const MergeTreeSettingsUInt64 cleanup_delay_period_random_add; + extern const MergeTreeSettingsUInt64 cleanup_thread_preferred_points_per_iteration; + extern const MergeTreeSettingsUInt64 finished_mutations_to_keep; + extern const MergeTreeSettingsSeconds lock_acquire_timeout_for_background_operations; + extern const MergeTreeSettingsUInt64 max_cleanup_delay_period; + extern const MergeTreeSettingsUInt64 max_replicated_logs_to_keep; + extern const MergeTreeSettingsUInt64 min_replicated_logs_to_keep; + extern const MergeTreeSettingsUInt64 replicated_deduplication_window; + extern const MergeTreeSettingsUInt64 replicated_deduplication_window_for_async_inserts; + extern const MergeTreeSettingsUInt64 replicated_deduplication_window_seconds; + extern const MergeTreeSettingsUInt64 replicated_deduplication_window_seconds_for_async_inserts; + extern const MergeTreeSettingsSeconds temporary_directories_lifetime; +} + namespace ErrorCodes { extern const int NOT_FOUND_NODE; @@ -26,7 +43,7 @@ ReplicatedMergeTreeCleanupThread::ReplicatedMergeTreeCleanupThread(StorageReplic : storage(storage_) , log_name(storage.getStorageID().getFullTableName() + " (ReplicatedMergeTreeCleanupThread)") , log(getLogger(log_name)) - , sleep_ms(storage.getSettings()->cleanup_delay_period * 1000) + , sleep_ms((*storage.getSettings())[MergeTreeSetting::cleanup_delay_period] * 1000) { task = storage.getContext()->getSchedulePool().createTask(log_name, [this]{ run(); }); } @@ -65,22 +82,22 @@ void ReplicatedMergeTreeCleanupThread::run() UInt64 now_ms = clock_gettime_ns_adjusted(prev_timestamp * 1'000'000) / 1'000'000; /// Do not adjust sleep_ms on the first run after starting the server - if (prev_timestamp && storage_settings->cleanup_thread_preferred_points_per_iteration) + if (prev_timestamp && (*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration]) { /// We don't want to run the task too often when the table was barely changed and there's almost nothing to cleanup. /// But we cannot simply sleep max_cleanup_delay_period (300s) when nothing was cleaned up and cleanup_delay_period (30s) /// when we removed something, because inserting one part per 30s will lead to running cleanup each 30s just to remove one part. /// So we need some interpolation based on preferred batch size. 
- auto expected_cleanup_points = storage_settings->cleanup_thread_preferred_points_per_iteration; + auto expected_cleanup_points = (*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration]; /// How long should we sleep to remove cleanup_thread_preferred_points_per_iteration on the next iteration? Float32 ratio = cleanup_points / expected_cleanup_points; if (ratio == 0) - sleep_ms = storage_settings->max_cleanup_delay_period * 1000; + sleep_ms = (*storage_settings)[MergeTreeSetting::max_cleanup_delay_period] * 1000; else sleep_ms = static_cast(sleep_ms / ratio); - sleep_ms = std::clamp(sleep_ms, storage_settings->cleanup_delay_period * 1000, storage_settings->max_cleanup_delay_period * 1000); + sleep_ms = std::clamp(sleep_ms, (*storage_settings)[MergeTreeSetting::cleanup_delay_period] * 1000, (*storage_settings)[MergeTreeSetting::max_cleanup_delay_period] * 1000); UInt64 interval_ms = now_ms - prev_timestamp; LOG_TRACE(log, "Scheduling next cleanup after {}ms (points: {}, interval: {}ms, ratio: {}, points per minute: {})", @@ -88,7 +105,7 @@ void ReplicatedMergeTreeCleanupThread::run() } prev_cleanup_timestamp_ms.store(now_ms, std::memory_order_relaxed); - sleep_ms += std::uniform_int_distribution(0, storage_settings->cleanup_delay_period_random_add * 1000)(rng); + sleep_ms += std::uniform_int_distribution(0, (*storage_settings)[MergeTreeSetting::cleanup_delay_period_random_add] * 1000)(rng); task->scheduleAfter(sleep_ms); } @@ -98,13 +115,13 @@ void ReplicatedMergeTreeCleanupThread::wakeupEarlierIfNeeded() /// In this case, sleep_ms was set to the highest possible value, the task is not going to wake up soon, /// but the number of objects to clean up is growing. We need to wakeup the task earlier. auto storage_settings = storage.getSettings(); - if (!storage_settings->cleanup_thread_preferred_points_per_iteration) + if (!(*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration]) return; /// The number of other objects (logs, blocks, etc) is usually correlated with the number of Outdated parts. /// Do not wake up unless we have too many. size_t number_of_outdated_objects = storage.getOutdatedPartsCount(); - if (number_of_outdated_objects < storage_settings->cleanup_thread_preferred_points_per_iteration * 2) + if (number_of_outdated_objects < (*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration] * 2) return; /// A race condition is possible here, but it's okay @@ -112,7 +129,7 @@ void ReplicatedMergeTreeCleanupThread::wakeupEarlierIfNeeded() return; /// Do not re-check all parts too often (avoid constantly calling getNumberOfOutdatedPartsWithExpiredRemovalTime()) - if (!wakeup_check_timer.compareAndRestart(storage_settings->cleanup_delay_period / 4.0)) + if (!wakeup_check_timer.compareAndRestart((*storage_settings)[MergeTreeSetting::cleanup_delay_period] / 4.0)) return; UInt64 prev_run_timestamp_ms = prev_cleanup_timestamp_ms.load(std::memory_order_relaxed); @@ -122,12 +139,12 @@ void ReplicatedMergeTreeCleanupThread::wakeupEarlierIfNeeded() /// Don't run it more often than cleanup_delay_period UInt64 seconds_passed = (now_ms - prev_run_timestamp_ms) / 1000; - if (seconds_passed < storage_settings->cleanup_delay_period) + if (seconds_passed < (*storage_settings)[MergeTreeSetting::cleanup_delay_period]) return; /// Do not count parts that cannot be removed anyway. Do not wake up unless we have too many. 
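The run() hunk above interpolates the cleanup thread's next sleep: it scales the previous sleep by how far the last iteration fell short of (or exceeded) the preferred number of cleanup points, clamps the result between the two delay-period settings, and adds random jitter. A standalone sketch of exactly that arithmetic:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <random>

// Sketch of the scheduling logic above: aim for `preferred_points` removed per
// iteration, scale the previous sleep accordingly, clamp, then add jitter.
uint64_t nextCleanupSleepMs(uint64_t prev_sleep_ms, float cleanup_points,
                            uint64_t preferred_points_per_iteration,   // e.g. 150
                            uint64_t cleanup_delay_period_sec,         // e.g. 30
                            uint64_t max_cleanup_delay_period_sec,     // e.g. 300
                            uint64_t random_add_sec,                   // e.g. 10
                            std::mt19937 & rng)
{
    float ratio = cleanup_points / static_cast<float>(preferred_points_per_iteration);
    uint64_t sleep_ms = (ratio == 0)
        ? max_cleanup_delay_period_sec * 1000
        : static_cast<uint64_t>(prev_sleep_ms / ratio);
    sleep_ms = std::clamp(sleep_ms, cleanup_delay_period_sec * 1000, max_cleanup_delay_period_sec * 1000);
    sleep_ms += std::uniform_int_distribution<uint64_t>(0, random_add_sec * 1000)(rng);
    return sleep_ms;
}

int main()
{
    std::mt19937 rng{42};
    std::cout << nextCleanupSleepMs(30000, 75, 150, 30, 300, 10, rng) << " ms\n";
}
```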
number_of_outdated_objects = storage.getNumberOfOutdatedPartsWithExpiredRemovalTime(); - if (number_of_outdated_objects < storage_settings->cleanup_thread_preferred_points_per_iteration * 2) + if (number_of_outdated_objects < (*storage_settings)[MergeTreeSetting::cleanup_thread_preferred_points_per_iteration] * 2) return; LOG_TRACE(log, "Waking up cleanup thread because there are {} outdated objects and previous cleanup finished {}s ago", @@ -148,10 +165,10 @@ Float32 ReplicatedMergeTreeCleanupThread::iterate() auto storage_settings = storage.getSettings(); { - auto lock = storage.lockForShare(RWLockImpl::NO_QUERY, storage.getSettings()->lock_acquire_timeout_for_background_operations); + auto lock = storage.lockForShare(RWLockImpl::NO_QUERY, (*storage.getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); /// Both use relative_data_path which changes during rename, so we /// do it under share lock - cleaned_part_like += storage.clearOldTemporaryDirectories(storage.getSettings()->temporary_directories_lifetime.totalSeconds()); + cleaned_part_like += storage.clearOldTemporaryDirectories((*storage.getSettings())[MergeTreeSetting::temporary_directories_lifetime].totalSeconds()); } /// This is loose condition: no problem if we actually had lost leadership at this moment @@ -159,17 +176,17 @@ Float32 ReplicatedMergeTreeCleanupThread::iterate() if (storage.is_leader) { cleaned_logs = clearOldLogs(); - size_t normal_blocks = clearOldBlocks("blocks", storage_settings->replicated_deduplication_window_seconds, - storage_settings->replicated_deduplication_window, cached_block_stats_for_sync_inserts); + size_t normal_blocks = clearOldBlocks("blocks", (*storage_settings)[MergeTreeSetting::replicated_deduplication_window_seconds], + (*storage_settings)[MergeTreeSetting::replicated_deduplication_window], cached_block_stats_for_sync_inserts); size_t async_blocks = clearOldBlocks("async_blocks", - storage_settings->replicated_deduplication_window_seconds_for_async_inserts, - storage_settings->replicated_deduplication_window_for_async_inserts, + (*storage_settings)[MergeTreeSetting::replicated_deduplication_window_seconds_for_async_inserts], + (*storage_settings)[MergeTreeSetting::replicated_deduplication_window_for_async_inserts], cached_block_stats_for_async_inserts); /// Many async blocks are transformed into one ordinary block - Float32 async_blocks_per_block = static_cast(storage_settings->replicated_deduplication_window) / - (storage_settings->replicated_deduplication_window_for_async_inserts + 1); + Float32 async_blocks_per_block = static_cast((*storage_settings)[MergeTreeSetting::replicated_deduplication_window]) / + ((*storage_settings)[MergeTreeSetting::replicated_deduplication_window_for_async_inserts] + 1); cleaned_blocks = (normal_blocks + async_blocks * async_blocks_per_block) / 2; cleaned_other += clearOldMutations(); @@ -210,7 +227,7 @@ size_t ReplicatedMergeTreeCleanupThread::clearOldLogs() /// Numbers are arbitrary. 
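The iterate() hunk above weights removed async-insert block hashes by the ratio of the two deduplication windows before folding them into the cleanup-point count, since many async blocks correspond to one ordinary block. A small sketch of that accounting:

```cpp
#include <cstddef>
#include <iostream>

// Sketch of the weighting above: removed async blocks count for less than
// removed ordinary blocks, proportional to the two deduplication windows.
float cleanedBlockPoints(size_t normal_blocks_removed, size_t async_blocks_removed,
                         size_t replicated_deduplication_window,                    // e.g. 1000
                         size_t replicated_deduplication_window_for_async_inserts)  // e.g. 10000
{
    float async_blocks_per_block = static_cast<float>(replicated_deduplication_window)
        / (replicated_deduplication_window_for_async_inserts + 1);
    return (normal_blocks_removed + async_blocks_removed * async_blocks_per_block) / 2;
}

int main()
{
    std::cout << cleanedBlockPoints(100, 5000, 1000, 10000) << '\n';
}
```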
std::uniform_real_distribution<double> distr(1.05, 1.15); double ratio = distr(rng); - size_t min_replicated_logs_to_keep = static_cast<size_t>(storage_settings->min_replicated_logs_to_keep * ratio); + size_t min_replicated_logs_to_keep = static_cast<size_t>((*storage_settings)[MergeTreeSetting::min_replicated_logs_to_keep] * ratio); if (static_cast<size_t>(children_count) < min_replicated_logs_to_keep) return 0; @@ -230,8 +247,8 @@ size_t ReplicatedMergeTreeCleanupThread::clearOldLogs() ::sort(entries.begin(), entries.end()); String min_saved_record_log_str = entries[ - entries.size() > storage_settings->max_replicated_logs_to_keep - ? entries.size() - storage_settings->max_replicated_logs_to_keep + entries.size() > (*storage_settings)[MergeTreeSetting::max_replicated_logs_to_keep] + ? entries.size() - (*storage_settings)[MergeTreeSetting::max_replicated_logs_to_keep] : 0]; /// Replicas that were marked is_lost but are active. @@ -333,7 +350,7 @@ size_t ReplicatedMergeTreeCleanupThread::clearOldLogs() min_saved_log_pointer = std::min(min_saved_log_pointer, min_log_pointer_lost_candidate); /// We will not touch the last `min_replicated_logs_to_keep` records. - entries.erase(entries.end() - std::min(entries.size(), storage_settings->min_replicated_logs_to_keep), entries.end()); + entries.erase(entries.end() - std::min(entries.size(), (*storage_settings)[MergeTreeSetting::min_replicated_logs_to_keep]), entries.end()); /// We will not touch records that are no less than `min_saved_log_pointer`. entries.erase(std::lower_bound(entries.begin(), entries.end(), "log-" + padIndex(min_saved_log_pointer)), entries.end()); @@ -573,10 +590,10 @@ void ReplicatedMergeTreeCleanupThread::getBlocksSortedByTime(const String & bloc size_t ReplicatedMergeTreeCleanupThread::clearOldMutations() { auto storage_settings = storage.getSettings(); - if (!storage_settings->finished_mutations_to_keep) + if (!(*storage_settings)[MergeTreeSetting::finished_mutations_to_keep]) return 0; - if (storage.queue.countFinishedMutations() <= storage_settings->finished_mutations_to_keep) + if (storage.queue.countFinishedMutations() <= (*storage_settings)[MergeTreeSetting::finished_mutations_to_keep]) { /// Not strictly necessary, but helps to avoid unnecessary ZooKeeper requests. /// If even this replica hasn't finished enough mutations yet, then we don't need to clean anything. @@ -604,10 +621,10 @@ size_t ReplicatedMergeTreeCleanupThread::clearOldMutations() /// Do not remove entries that are greater than `min_pointer` (they are not done yet). entries.erase(std::upper_bound(entries.begin(), entries.end(), padIndex(min_pointer)), entries.end()); - /// Do not remove last `storage_settings->finished_mutations_to_keep` entries. + /// Do not remove last `(*storage_settings)[MergeTreeSetting::finished_mutations_to_keep]` entries.
+ if (entries.size() <= (*storage_settings)[MergeTreeSetting::finished_mutations_to_keep]) return 0; - entries.erase(entries.end() - storage_settings->finished_mutations_to_keep, entries.end()); + entries.erase(entries.end() - (*storage_settings)[MergeTreeSetting::finished_mutations_to_keep], entries.end()); if (entries.empty()) return 0; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h index fc83b374a77..508cb96797e 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h @@ -40,7 +40,7 @@ struct ReplicatedMergeTreeLogEntryData DROP_RANGE, /// Delete the parts in the specified partition in the specified number range. CLEAR_COLUMN, /// NOTE: Deprecated. Drop specific column from specified partition. CLEAR_INDEX, /// NOTE: Deprecated. Drop specific index from specified partition. - REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones + REPLACE_RANGE, /// Drop certain range of parts and replace them by new ones MUTATE_PART, /// Apply one or several mutations to the part. ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths SYNC_PINNED_PART_UUIDS, /// Synchronization point for ensuring that all replicas have up to date in-memory state. diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp index 3e64a4c7c52..1b7fc2f21fc 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp @@ -15,6 +15,13 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsSeconds execute_merges_on_single_replica_time_threshold; + extern const MergeTreeSettingsSeconds remote_fs_execute_merges_on_single_replica_time_threshold; +} + /// minimum interval (seconds) between checks if chosen replica finished the merge. 
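The MergeTreeSetting namespace block added above is the pattern this diff repeats in every touched file. The following self-contained analogue (not the real ClickHouse classes) is only meant to show the shape of the change: each .cpp declares identifiers for the settings it actually uses, and reads them by indexing the settings object instead of dereferencing a struct member.

    #include <cstddef>
    #include <cstdint>
    #include <unordered_map>

    // Hypothetical stand-in for a strongly-typed setting identifier.
    struct SettingID { std::size_t index; };

    namespace MergeTreeSetting
    {
        // Declared once per translation unit, only for the settings the file needs.
        inline constexpr SettingID allow_remote_fs_zero_copy_replication{0};
        inline constexpr SettingID execute_merges_on_single_replica_time_threshold{1};
    }

    class Settings
    {
    public:
        uint64_t operator[](SettingID id) const { return values.at(id.index); }
    private:
        std::unordered_map<std::size_t, uint64_t> values{{0, 0}, {1, 0}};
    };

    // Old style:  bool zero_copy = settings->allow_remote_fs_zero_copy_replication;
    // New style:  bool zero_copy = (*settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication];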
static const auto RECHECK_MERGE_READYNESS_INTERVAL_SECONDS = 1; @@ -92,10 +99,10 @@ std::optional ReplicatedMergeTreeMergeStrategyPicker::pickReplicaToExecu void ReplicatedMergeTreeMergeStrategyPicker::refreshState() { const auto settings = storage.getSettings(); - time_t threshold = settings->execute_merges_on_single_replica_time_threshold.totalSeconds(); + time_t threshold = (*settings)[MergeTreeSetting::execute_merges_on_single_replica_time_threshold].totalSeconds(); time_t threshold_init = 0; - if (settings->allow_remote_fs_zero_copy_replication) - threshold_init = settings->remote_fs_execute_merges_on_single_replica_time_threshold.totalSeconds(); + if ((*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) + threshold_init = (*settings)[MergeTreeSetting::remote_fs_execute_merges_on_single_replica_time_threshold].totalSeconds(); if (threshold == 0) /// we can reset the settings without lock (it's atomic) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index 8877ebff6a1..bb85e224c1f 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -17,6 +17,12 @@ namespace ProfileEvents namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsSeconds lock_acquire_timeout_for_background_operations; + extern const MergeTreeSettingsSeconds old_parts_lifetime; +} + namespace ErrorCodes { extern const int TABLE_DIFFERS_TOO_MUCH; @@ -292,7 +298,7 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St /// We cannot rely on exists_in_zookeeper, because the cleanup thread is probably going to remove it from ZooKeeper /// Also, it will avoid "Cannot commit empty part: Part ... (state Outdated) already exists, but it will be deleted soon" time_t lifetime = time(nullptr) - outdated->remove_time; - time_t max_lifetime = storage.getSettings()->old_parts_lifetime.totalSeconds(); + time_t max_lifetime = (*storage.getSettings())[MergeTreeSetting::old_parts_lifetime].totalSeconds(); time_t delay = lifetime >= max_lifetime ? 
0 : max_lifetime - lifetime; result.recheck_after_seconds = delay + 30; @@ -327,7 +333,7 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St time_t current_time = time(nullptr); auto zookeeper = storage.getZooKeeper(); - auto table_lock = storage.lockForShare(RWLockImpl::NO_QUERY, storage.getSettings()->lock_acquire_timeout_for_background_operations); + auto table_lock = storage.lockForShare(RWLockImpl::NO_QUERY, (*storage.getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); auto local_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums( part->getColumns(), part->checksums); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 0fa2be6a389..b1e7020f117 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -20,6 +20,14 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsUInt64 max_bytes_to_merge_at_max_space_in_pool; + extern const MergeTreeSettingsUInt64 max_number_of_merges_with_ttl_in_pool; + extern const MergeTreeSettingsUInt64 replicated_max_mutations_in_one_entry; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -1460,7 +1468,7 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( } const auto data_settings = data.getSettings(); - if (data_settings->allow_remote_fs_zero_copy_replication) + if ((*data_settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { auto disks = storage.getDisks(); DiskPtr disk_with_zero_copy = nullptr; @@ -1515,7 +1523,7 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( bool ignore_max_size = false; if (entry.type == LogEntry::MERGE_PARTS) { - ignore_max_size = max_source_parts_size == data_settings->max_bytes_to_merge_at_max_space_in_pool; + ignore_max_size = max_source_parts_size == (*data_settings)[MergeTreeSetting::max_bytes_to_merge_at_max_space_in_pool]; if (isTTLMergeType(entry.merge_type)) { @@ -1526,11 +1534,11 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( return false; } size_t total_merges_with_ttl = data.getTotalMergesWithTTLInMergeList(); - if (total_merges_with_ttl >= data_settings->max_number_of_merges_with_ttl_in_pool) + if (total_merges_with_ttl >= (*data_settings)[MergeTreeSetting::max_number_of_merges_with_ttl_in_pool]) { constexpr auto fmt_string = "Not executing log entry {} for part {} because {} merges with TTL already executing, maximum {}."; LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.new_part_name, total_merges_with_ttl, - data_settings->max_number_of_merges_with_ttl_in_pool); + (*data_settings)[MergeTreeSetting::max_number_of_merges_with_ttl_in_pool]); return false; } } @@ -2725,7 +2733,7 @@ std::optional> ReplicatedMergeTreeMergePredicate::getDesir if (in_partition == queue.mutations_by_partition.end()) return {}; - UInt64 mutations_limit = queue.storage.getSettings()->replicated_max_mutations_in_one_entry; + UInt64 mutations_limit = (*queue.storage.getSettings())[MergeTreeSetting::replicated_max_mutations_in_one_entry]; UInt64 mutations_count = 0; Int64 current_version = queue.getCurrentMutationVersion(part->info.partition_id, part->info.getDataVersion()); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp index dec1dd0bee9..9d3e26cdc8d 
100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp @@ -21,6 +21,11 @@ namespace CurrentMetrics namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsSeconds zookeeper_session_expiration_check_period; +} + namespace ErrorCodes { extern const int REPLICA_IS_ALREADY_ACTIVE; @@ -44,7 +49,7 @@ ReplicatedMergeTreeRestartingThread::ReplicatedMergeTreeRestartingThread(Storage , active_node_identifier(generateActiveNodeIdentifier()) { const auto storage_settings = storage.getSettings(); - check_period_ms = storage_settings->zookeeper_session_expiration_check_period.totalSeconds() * 1000; + check_period_ms = (*storage_settings)[MergeTreeSetting::zookeeper_session_expiration_check_period].totalSeconds() * 1000; task = storage.getContext()->getSchedulePool().createTask(log_name, [this]{ run(); }); } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 69c8ea01e85..1321af1e804 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -39,6 +39,11 @@ namespace Setting extern const SettingsBool optimize_on_insert; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsMilliseconds sleep_before_commit_local_part_in_replicated_table_ms; +} + namespace FailPoints { extern const char replicated_merge_tree_commit_zk_fail_after_op[]; @@ -811,7 +816,7 @@ std::pair, bool> ReplicatedMergeTreeSinkImpl:: auto sleep_before_commit_for_tests = [&] () { - auto sleep_before_commit_local_part_in_replicated_table_ms = storage.getSettings()->sleep_before_commit_local_part_in_replicated_table_ms; + auto sleep_before_commit_local_part_in_replicated_table_ms = (*storage.getSettings())[MergeTreeSetting::sleep_before_commit_local_part_in_replicated_table_ms]; if (sleep_before_commit_local_part_in_replicated_table_ms.totalMilliseconds()) { LOG_INFO(log, "committing part {}, triggered sleep_before_commit_local_part_in_replicated_table_ms {}", diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.cpp index c4bae5352cb..10185115da4 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.cpp @@ -15,6 +15,12 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsUInt64 index_granularity; + extern const MergeTreeSettingsUInt64 index_granularity_bytes; +} + namespace ErrorCodes { extern const int METADATA_MISMATCH; @@ -50,7 +56,7 @@ ReplicatedMergeTreeTableMetadata::ReplicatedMergeTreeTableMetadata(const MergeTr const auto data_settings = data.getSettings(); sampling_expression = formattedASTNormalized(metadata_snapshot->getSamplingKeyAST()); - index_granularity = data_settings->index_granularity; + index_granularity = (*data_settings)[MergeTreeSetting::index_granularity]; merging_params_mode = static_cast(data.merging_params.mode); sign_column = data.merging_params.sign_column; is_deleted_column = data.merging_params.is_deleted_column; @@ -96,7 +102,7 @@ ReplicatedMergeTreeTableMetadata::ReplicatedMergeTreeTableMetadata(const MergeTr projections = metadata_snapshot->getProjections().toString(); if (data.canUseAdaptiveGranularity()) - index_granularity_bytes = data_settings->index_granularity_bytes; + index_granularity_bytes = (*data_settings)[MergeTreeSetting::index_granularity_bytes]; else 
index_granularity_bytes = 0; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h b/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h index 15ed8671f9b..f09196a6788 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.cpp b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.cpp index 280783245a2..3df03b55cd4 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.cpp @@ -6,6 +6,11 @@ namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool materialize_ttl_recalculate_only; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -15,7 +20,7 @@ bool StorageFromMergeTreeDataPart::materializeTTLRecalculateOnly() const { if (parts.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "parts must not be empty for materializeTTLRecalculateOnly"); - return parts.front()->storage.getSettings()->materialize_ttl_recalculate_only; + return (*parts.front()->storage.getSettings())[MergeTreeSetting::materialize_ttl_recalculate_only]; } void StorageFromMergeTreeDataPart::read( diff --git a/src/Storages/MergeTree/VectorSimilarityCondition.cpp b/src/Storages/MergeTree/VectorSimilarityCondition.cpp index a8c61ae4894..13dc6abfbf6 100644 --- a/src/Storages/MergeTree/VectorSimilarityCondition.cpp +++ b/src/Storages/MergeTree/VectorSimilarityCondition.cpp @@ -58,7 +58,6 @@ VectorSimilarityCondition::Info::DistanceFunction stringToDistanceFunction(const VectorSimilarityCondition::VectorSimilarityCondition(const SelectQueryInfo & query_info, ContextPtr context) : block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)) - , index_granularity(context->getMergeTreeSettings().index_granularity) , max_limit_for_ann_queries(context->getSettingsRef()[Setting::max_limit_for_ann_queries]) , index_is_useful(checkQueryStructure(query_info)) {} diff --git a/src/Storages/MergeTree/VectorSimilarityCondition.h b/src/Storages/MergeTree/VectorSimilarityCondition.h index 2e9e06a31d0..254c91fbc48 100644 --- a/src/Storages/MergeTree/VectorSimilarityCondition.h +++ b/src/Storages/MergeTree/VectorSimilarityCondition.h @@ -75,7 +75,6 @@ public: size_t getDimensions() const; String getColumnName() const; Info::DistanceFunction getDistanceFunction() const; - UInt64 getIndexGranularity() const { return index_granularity; } UInt64 getLimit() const; private: @@ -156,9 +155,6 @@ private: /// true if we have one of two supported query types std::optional query_information; - // Get from settings ANNIndex parameters - const UInt64 index_granularity; - /// only queries with a lower limit can be considered to avoid memory overflow const UInt64 max_limit_for_ann_queries; diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index 975097b5fda..4b9ff276bfd 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -28,6 +28,11 @@ namespace CurrentMetrics namespace DB { +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsFloat ratio_of_defaults_for_sparse_serialization; +} + namespace ErrorCodes { extern const int CORRUPTED_DATA; @@ -77,7 +82,14 @@ bool isRetryableException(std::exception_ptr exception_ptr) #endif catch (const ErrnoException & e) { - 
return e.getErrno() == EMFILE; + return e.getErrno() == EMFILE + || e.getErrno() == ENOMEM + || isNotEnoughMemoryErrorCode(e.code()) + || e.code() == ErrorCodes::NETWORK_ERROR + || e.code() == ErrorCodes::SOCKET_TIMEOUT + || e.code() == ErrorCodes::CANNOT_SCHEDULE_TASK + || e.code() == ErrorCodes::ABORTED; + } catch (const Coordination::Exception & e) { @@ -91,6 +103,22 @@ bool isRetryableException(std::exception_ptr exception_ptr) || e.code() == ErrorCodes::CANNOT_SCHEDULE_TASK || e.code() == ErrorCodes::ABORTED; } + catch (const std::filesystem::filesystem_error & e) + { + return e.code() == std::errc::no_space_on_device || + e.code() == std::errc::read_only_file_system || + e.code() == std::errc::too_many_files_open_in_system || + e.code() == std::errc::operation_not_permitted || + e.code() == std::errc::device_or_resource_busy || + e.code() == std::errc::permission_denied || + e.code() == std::errc::too_many_files_open || + e.code() == std::errc::text_file_busy || + e.code() == std::errc::timed_out || + e.code() == std::errc::not_enough_memory || + e.code() == std::errc::not_supported || + e.code() == std::errc::too_many_links || + e.code() == std::errc::too_many_symbolic_link_levels; + } catch (const Poco::Net::NetException &) { return true; @@ -160,7 +188,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( }; }; - auto ratio_of_defaults = data_part->storage.getSettings()->ratio_of_defaults_for_sparse_serialization; + auto ratio_of_defaults = (*data_part->storage.getSettings())[MergeTreeSetting::ratio_of_defaults_for_sparse_serialization]; SerializationInfoByName serialization_infos; if (data_part_storage.exists(IMergeTreeDataPart::SERIALIZATION_FILE_NAME)) @@ -171,13 +199,9 @@ static IMergeTreeDataPart::Checksums checkDataPart( SerializationInfo::Settings settings{ratio_of_defaults, false}; serialization_infos = SerializationInfoByName::readJSON(columns_txt, settings, *serialization_file); } - catch (const Poco::Exception & ex) - { - throw Exception(ErrorCodes::CORRUPTED_DATA, "Failed to load {}, with error {}", IMergeTreeDataPart::SERIALIZATION_FILE_NAME, ex.message()); - } catch (...) { - throw; + throw Exception(ErrorCodes::CORRUPTED_DATA, "Failed to load file {} of data part {}, with error {}", IMergeTreeDataPart::SERIALIZATION_FILE_NAME, data_part->name, getCurrentExceptionMessage(true)); } } @@ -399,18 +423,45 @@ IMergeTreeDataPart::Checksums checkDataPart( ReadSettings read_settings; read_settings.enable_filesystem_cache = false; + read_settings.enable_filesystem_cache_log = false; + read_settings.enable_filesystem_read_prefetches_log = false; + read_settings.page_cache = nullptr; + read_settings.load_marks_asynchronously = false; + read_settings.remote_fs_prefetch = false; + read_settings.page_cache_inject_eviction = false; + read_settings.use_page_cache_for_disks_without_file_cache = false; + read_settings.local_fs_method = LocalFSReadMethod::pread; + + try + { + return checkDataPart( + data_part, + data_part_storage, + data_part->getColumns(), + data_part->getType(), + data_part->getFileNamesWithoutChecksums(), + read_settings, + require_checksums, + is_cancelled, + is_broken_projection, + throw_on_broken_projection); + } + catch (...) + { + if (isRetryableException(std::current_exception())) + { + LOG_DEBUG( + getLogger("checkDataPart"), + "Got retriable error {} checking data part {}, will return empty", data_part->name, getCurrentExceptionMessage(false)); + + /// We were unable to check data part because of some temporary exception + /// like Memory limit exceeded. 
If part is actually broken we will retry check + /// with the next read attempt of this data part. + return IMergeTreeDataPart::Checksums{}; + } + throw; + } - return checkDataPart( - data_part, - data_part_storage, - data_part->getColumns(), - data_part->getType(), - data_part->getFileNamesWithoutChecksums(), - read_settings, - require_checksums, - is_cancelled, - is_broken_projection, - throw_on_broken_projection); }; try @@ -431,7 +482,16 @@ IMergeTreeDataPart::Checksums checkDataPart( catch (...) { if (isRetryableException(std::current_exception())) - throw; + { + LOG_DEBUG( + getLogger("checkDataPart"), + "Got retriable error {} checking data part {}, will return empty", data_part->name, getCurrentExceptionMessage(false)); + + /// We were unable to check data part because of some temporary exception + /// like Memory limit exceeded. If part is actually broken we will retry check + /// with the next read attempt of this data part. + return {}; + } return drop_cache_and_check(); } } diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index e4918645f09..14cd7e897f2 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -38,6 +38,14 @@ namespace Setting extern const SettingsUInt64 database_replicated_allow_replicated_engine_arguments; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool add_implicit_sign_column_constraint_for_collapsing_engine; + extern const MergeTreeSettingsBool allow_floating_point_partition_key; + extern const MergeTreeSettingsDeduplicateMergeProjectionMode deduplicate_merge_projection_mode; + extern const MergeTreeSettingsUInt64 index_granularity; +} + namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -701,7 +709,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) constraints.push_back(constraint); if ((merging_params.mode == MergeTreeData::MergingParams::Collapsing || merging_params.mode == MergeTreeData::MergingParams::VersionedCollapsing) && - storage_settings->add_implicit_sign_column_constraint_for_collapsing_engine) + (*storage_settings)[MergeTreeSetting::add_implicit_sign_column_constraint_for_collapsing_engine]) { auto sign_column_check_constraint = std::make_unique(); sign_column_check_constraint->name = "check_sign_column"; @@ -767,11 +775,11 @@ static StoragePtr create(const StorageFactory::Arguments & args) const auto * ast = engine_args[arg_num]->as(); if (ast && ast->value.getType() == Field::Types::UInt64) { - storage_settings->index_granularity = ast->value.safeGet(); + (*storage_settings)[MergeTreeSetting::index_granularity] = ast->value.safeGet(); if (args.mode <= LoadingStrictnessLevel::CREATE) { SettingsChanges changes; - changes.emplace_back("index_granularity", Field(storage_settings->index_granularity)); + changes.emplace_back("index_granularity", Field((*storage_settings)[MergeTreeSetting::index_granularity])); args.getLocalContext()->checkMergeTreeSettingsConstraints(initial_storage_settings, changes); } } @@ -784,7 +792,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) } DataTypes data_types = metadata.partition_key.data_types; - if (args.mode <= LoadingStrictnessLevel::CREATE && !storage_settings->allow_floating_point_partition_key) + if (args.mode <= LoadingStrictnessLevel::CREATE && !(*storage_settings)[MergeTreeSetting::allow_floating_point_partition_key]) { for (size_t i = 0; i < data_types.size(); ++i) if (isFloat(data_types[i])) @@ 
-797,7 +805,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) /// Now let's handle the merge tree family. Note we only handle in the mode of CREATE due to backward compatibility. /// Otherwise, it would fail to start in the case of existing projections with special mergetree. if (merging_params.mode != MergeTreeData::MergingParams::Mode::Ordinary - && storage_settings->deduplicate_merge_projection_mode == DeduplicateMergeProjectionMode::THROW) + && (*storage_settings)[MergeTreeSetting::deduplicate_merge_projection_mode] == DeduplicateMergeProjectionMode::THROW) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Projection is fully supported in {}MergeTree with deduplicate_merge_projection_mode = throw. " "Use 'drop' or 'rebuild' option of deduplicate_merge_projection_mode.", diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 0b7106de949..440a4af4b64 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -426,37 +427,39 @@ std::unique_ptr StorageObjectStorageSource::createReadBuffer( const auto & object_size = object_info.metadata->size_bytes; auto read_settings = context_->getReadSettings().adjustBufferSize(object_size); - read_settings.enable_filesystem_cache = false; /// FIXME: Changing this setting to default value breaks something around parquet reading read_settings.remote_read_min_bytes_for_seek = read_settings.remote_fs_buffer_size; + /// User's object may change, don't cache it. + read_settings.enable_filesystem_cache = false; + read_settings.use_page_cache_for_disks_without_file_cache = false; const bool object_too_small = object_size <= 2 * context_->getSettingsRef()[Setting::max_download_buffer_size]; - const bool use_prefetch = object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool; - read_settings.remote_fs_method = use_prefetch ? RemoteFSReadMethod::threadpool : RemoteFSReadMethod::read; - /// User's object may change, don't cache it. - read_settings.use_page_cache_for_disks_without_file_cache = false; + const bool use_prefetch = object_too_small + && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool + && read_settings.remote_fs_prefetch; + + if (use_prefetch) + read_settings.remote_read_buffer_use_external_buffer = true; + + auto impl = object_storage->readObject(StoredObject(object_info.getPath(), "", object_size), read_settings); // Create a read buffer that will prefetch the first ~1 MB of the file. // When reading lots of tiny files, this prefetching almost doubles the throughput. // For bigger files, parallel reading is more useful. 
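The rewritten createReadBuffer() above keeps the same decision it made before, just expressed more directly. A minimal sketch of that decision, with a hypothetical helper name and boolean parameters standing in for the real read settings:

    #include <cstdint>

    // Only small objects are read through a single prefetching asynchronous buffer, and only when
    // the threadpool read method and prefetching are enabled; everything else is read directly so
    // that larger objects can use parallel range reads instead.
    bool shouldUseInitialPrefetch(
        uint64_t object_size,
        uint64_t max_download_buffer_size,
        bool threadpool_read_method,   // read_settings.remote_fs_method == RemoteFSReadMethod::threadpool
        bool remote_fs_prefetch)       // read_settings.remote_fs_prefetch
    {
        const bool object_too_small = object_size <= 2 * max_download_buffer_size;
        return object_too_small && threadpool_read_method && remote_fs_prefetch;
    }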
- if (use_prefetch) - { - LOG_TRACE(log, "Downloading object of size {} with initial prefetch", object_size); + if (!use_prefetch) + return impl; - auto async_reader = object_storage->readObjects( - StoredObjects{StoredObject{object_info.getPath(), /* local_path */ "", object_size}}, read_settings); + LOG_TRACE(log, "Downloading object of size {} with initial prefetch", object_size); - async_reader->setReadUntilEnd(); - if (read_settings.remote_fs_prefetch) - async_reader->prefetch(DEFAULT_PREFETCH_PRIORITY); + auto & reader = context_->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); + impl = std::make_unique( + std::move(impl), reader, read_settings, + context_->getAsyncReadCounters(), + context_->getFilesystemReadPrefetchesLog()); - return async_reader; - } - else - { - /// FIXME: this is inconsistent that readObject always reads synchronously ignoring read_method setting. - return object_storage->readObject(StoredObject(object_info.getPath(), "", object_size), read_settings); - } + impl->setReadUntilEnd(); + impl->prefetch(DEFAULT_PREFETCH_PRIORITY); + return impl; } StorageObjectStorageSource::IIterator::IIterator(const std::string & logger_name_) diff --git a/src/Storages/PartitionCommands.h b/src/Storages/PartitionCommands.h index f0ecf91f567..917e510f24b 100644 --- a/src/Storages/PartitionCommands.h +++ b/src/Storages/PartitionCommands.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h index af2f13bb880..8774a8963af 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h @@ -6,7 +6,7 @@ #include "PostgreSQLReplicationHandler.h" #include "MaterializedPostgreSQLSettings.h" -#include +#include #include #include #include diff --git a/src/Storages/ProjectionsDescription.h b/src/Storages/ProjectionsDescription.h index 5f091b4421b..445a4828e31 100644 --- a/src/Storages/ProjectionsDescription.h +++ b/src/Storages/ProjectionsDescription.h @@ -9,11 +9,6 @@ #include #include -#include -#include -#include -#include - namespace DB { struct StorageInMemoryMetadata; diff --git a/src/Storages/ReplaceAliasByExpressionVisitor.h b/src/Storages/ReplaceAliasByExpressionVisitor.h index 4acc1fd4be7..e8d1fe77c91 100644 --- a/src/Storages/ReplaceAliasByExpressionVisitor.h +++ b/src/Storages/ReplaceAliasByExpressionVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Storages/SelectQueryDescription.cpp b/src/Storages/SelectQueryDescription.cpp index 0c06c523515..7129c8c66f0 100644 --- a/src/Storages/SelectQueryDescription.cpp +++ b/src/Storages/SelectQueryDescription.cpp @@ -100,17 +100,20 @@ void checkAllowedQueries(const ASTSelectQuery & query) /// check if only one single select query in SelectWithUnionQuery static bool isSingleSelect(const ASTPtr & select, ASTPtr & res) { - auto new_select = select->as(); - if (new_select.list_of_selects->children.size() != 1) + auto * new_select = select->as(); + if (new_select == nullptr) return false; - auto & new_inner_query = new_select.list_of_selects->children.at(0); + + if (new_select->list_of_selects->children.size() != 1) + return false; + auto & new_inner_query = new_select->list_of_selects->children.at(0); if (new_inner_query->as()) { res = new_inner_query; return true; } - else - return isSingleSelect(new_inner_query, res); + + return isSingleSelect(new_inner_query, 
res); } SelectQueryDescription SelectQueryDescription::getSelectQueryFromASTForMatView(const ASTPtr & select, bool refreshable, ContextPtr context) diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 435c8db377f..c2c66fabb57 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include @@ -156,16 +158,77 @@ StorageMaterializedView::StorageMaterializedView( if (query.refresh_strategy) { - fixed_uuid = false; - refresher = RefreshTask::create(this, getContext(), *query.refresh_strategy); - refresh_on_start = mode < LoadingStrictnessLevel::ATTACH && !query.is_create_empty; + fixed_uuid = query.refresh_strategy->append; + + auto db = DatabaseCatalog::instance().getDatabase(getStorageID().database_name); + bool is_replicated_db = db->getEngineName() == "Replicated"; + + /// Decide whether to enable coordination. + if (is_replicated_db) + { + if (fixed_uuid) + { + /// In APPEND mode, both coordinated and uncoordinated mode make sense, so allow choosing it with a setting. + RefreshSettings s; + if (query.refresh_strategy->settings) + s.applyChanges(query.refresh_strategy->settings->changes); + refresh_coordinated = !s.all_replicas; + } + else + { + /// In non-APPEND mode, uncoordinated refresh would just break. Require coordination. + refresh_coordinated = true; + } + } + + /// Sanity-check the table engine. + if (mode < LoadingStrictnessLevel::ATTACH && !fixed_uuid) + { + String inner_engine; + if (has_inner_table) + { + auto storage = query.getTargetInnerEngine(ViewTarget::To); + if (storage && storage->engine) + inner_engine = storage->engine->name; + } + else + { + if (auto task = getContext()->getRefreshSet().tryGetTaskForInnerTable(to_table_id)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table {} is already a target of another refreshable materialized view: {}", to_table_id.getFullTableName(), task->getInfo().view_id.getFullTableName()); + + StoragePtr inner_table = DatabaseCatalog::instance().tryGetTable(to_table_id, getContext()); + if (inner_table) + inner_engine = inner_table->getName(); + } + if (!inner_engine.empty()) + { + bool is_replicated_table = inner_engine.starts_with("Replicated") || inner_engine.starts_with("Shared"); + if (is_replicated_table && !is_replicated_db) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "This combination doesn't work: refreshable materialized view, no APPEND, non-replicated database, replicated table. Each refresh would replace the replicated table locally, but other replicas wouldn't see it. Refusing to create"); + if (!is_replicated_table && refresh_coordinated) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "This combination doesn't work: refreshable materialized view, no APPEND, replicated database, non-replicated table. The refresh would be done on one replica, but the table would be replaced on other replicas too (with empty tables). Refusing to create"); + /// Combination (!is_replicated_table && refresh_coordinated && fixed_uuid) is also questionable: + /// each refresh would append to a table on one arbitrarily chosen replica. But in principle it can be useful, + /// e.g. if SELECTs are done using clusterAllReplicas(). (For the two disallowed cases above, clusterAllReplicas() wouldn't work reliably.) 
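The sanity checks added above reduce to a small compatibility matrix. A hedged restatement as a standalone predicate (function name and messages are illustrative, not the exact exception texts):

    #include <optional>
    #include <string>

    // For a refreshable materialized view without APPEND, a Replicated database forces
    // coordinated refreshes, and two combinations are rejected outright.
    std::optional<std::string> rejectRefreshableMVCombination(bool append, bool replicated_db, bool replicated_inner_table)
    {
        if (append)
            return std::nullopt;  // APPEND mode: every combination is allowed.

        const bool refresh_coordinated = replicated_db;  // non-APPEND + Replicated database => coordination required

        if (replicated_inner_table && !replicated_db)
            return "replicated target table in a non-replicated database: the swap would only happen locally";
        if (!replicated_inner_table && refresh_coordinated)
            return "non-replicated target table with coordinated refresh: only one replica would get the data";
        return std::nullopt;
    }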
+ } + } + + refresher = RefreshTask::create(this, getContext(), *query.refresh_strategy, mode >= LoadingStrictnessLevel::ATTACH, refresh_coordinated, query.is_create_empty); + } + + if (!fixed_uuid) + { + if (to_inner_uuid != UUIDHelpers::Nil) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "TO INNER UUID is not allowed for materialized views with REFRESH without APPEND"); + if (to_table_id.hasUUID()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "explicit UUID is not allowed for target table of materialized view with REFRESH without APPEND"); } if (!has_inner_table) { target_table_id = to_table_id; } - else if (LoadingStrictnessLevel::ATTACH <= mode) + else if (mode >= LoadingStrictnessLevel::ATTACH) { /// If there is an ATTACH request, then the internal table must already be created. target_table_id = StorageID(getStorageID().database_name, generateInnerTableName(getStorageID()), to_inner_uuid); @@ -178,8 +241,10 @@ StorageMaterializedView::StorageMaterializedView( /// We will create a query to create an internal table. auto create_context = Context::createCopy(local_context); auto manual_create_query = std::make_shared(); - manual_create_query->setDatabase(getStorageID().database_name); - manual_create_query->setTable(generateInnerTableName(getStorageID())); + String db_name = getStorageID().database_name; + String inner_name = generateInnerTableName(getStorageID()); + manual_create_query->setDatabase(db_name); + manual_create_query->setTable(inner_name); manual_create_query->uuid = to_inner_uuid; manual_create_query->has_uuid = to_inner_uuid != UUIDHelpers::Nil; @@ -211,7 +276,10 @@ StorageMaterializedView::StorageMaterializedView( create_interpreter.setInternal(true); create_interpreter.execute(); - target_table_id = DatabaseCatalog::instance().getTable({manual_create_query->getDatabase(), manual_create_query->getTable()}, getContext())->getStorageID(); + if (fixed_uuid) + target_table_id = DatabaseCatalog::instance().getTable({db_name, inner_name}, getContext())->getStorageID(); + else + target_table_id = StorageID(db_name, inner_name); } } @@ -242,8 +310,19 @@ void StorageMaterializedView::read( const size_t num_streams) { auto context = getInMemoryMetadataPtr()->getSQLSecurityOverriddenContext(local_context); - auto storage = getTargetTable(); - auto lock = storage->lockForShare(context->getCurrentQueryId(), context->getSettingsRef()[Setting::lock_acquire_timeout]); + StoragePtr storage; + TableLockHolder lock; + + if (fixed_uuid) + { + storage = getTargetTable(); + lock = storage->lockForShare(context->getCurrentQueryId(), context->getSettingsRef()[Setting::lock_acquire_timeout]); + } + else + { + std::tie(storage, lock) = refresher->getAndLockTargetTable(getTargetTableId(), context); + } + auto target_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto target_storage_snapshot = storage->getStorageSnapshot(target_metadata_snapshot, context); @@ -338,20 +417,39 @@ void StorageMaterializedView::drop() /// but DROP acquires DDLGuard for the name of MV. And we cannot acquire second DDLGuard for the inner name in DROP, /// because it may lead to lock-order-inversion (DDLGuards must be acquired in lexicographical order). 
dropInnerTableIfAny(/* sync */ false, getContext()); + + if (refresher) + refresher->drop(getContext()); } void StorageMaterializedView::dropInnerTableIfAny(bool sync, ContextPtr local_context) { - /// We will use `sync` argument wneh this function is called from a DROP query - /// and will ignore database_atomic_wait_for_drop_and_detach_synchronously when it's called from drop task. - /// See the comment in StorageMaterializedView::drop. - /// DDL queries with StorageMaterializedView are fundamentally broken. - /// Best-effort to make them work: the inner table name is almost always less than the MV name (so it's safe to lock DDLGuard) - auto inner_table_id = getTargetTableId(); - bool may_lock_ddl_guard = getStorageID().getQualifiedName() < inner_table_id.getQualifiedName(); - if (has_inner_table && tryGetTargetTable()) - InterpreterDropQuery::executeDropQuery(ASTDropQuery::Kind::Drop, getContext(), local_context, inner_table_id, - sync, /* ignore_sync_setting */ true, may_lock_ddl_guard); + if (!has_inner_table) + return; + + std::vector to_drop = {getTargetTableId()}; + if (!fixed_uuid) + to_drop.push_back(StorageID(to_drop[0].getDatabaseName(), ".tmp" + to_drop[0].getTableName())); + + for (const StorageID & inner_table_id : to_drop) + { + /// We will use `sync` argument when this function is called from a DROP query + /// and will ignore database_atomic_wait_for_drop_and_detach_synchronously when it's called from drop task. + /// See the comment in StorageMaterializedView::drop. + /// + /// DDL queries with StorageMaterializedView are fundamentally broken: we can't lock DDLGuard + /// here because DDLGuards must be locked in order of increasing table name (to avoid deadlocks), + /// while the inner table name is almost always less than the MV name. + /// So we just don't lock it. It's mostly ok because DatabaseReplicatedDDLWorker doesn't + /// execute queries concurrently, but presumably there are other race conditions + /// (I'm not the author of this code and don't know for sure, just commenting). + /// (Why not reverse DDLGuard locking order everywhere? Because in another place we lock + /// DDLGuard for table "" while holding DDLGuard for table "".) + auto table_exists = DatabaseCatalog::instance().tryGetTable(inner_table_id, getContext()) != nullptr; + if (table_exists) + InterpreterDropQuery::executeDropQuery(ASTDropQuery::Kind::Drop, getContext(), local_context, inner_table_id, + sync, /* ignore_sync_setting */ true, /*need_ddl_guard*/ false); + } } void StorageMaterializedView::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr local_context, TableExclusiveLockHolder &) @@ -386,19 +484,22 @@ bool StorageMaterializedView::optimize( ContextMutablePtr StorageMaterializedView::createRefreshContext() const { auto refresh_context = getInMemoryMetadataPtr()->getSQLSecurityOverriddenContext(getContext()); + refresh_context->setSetting("database_replicated_allow_replicated_engine_arguments", 3); refresh_context->setQueryKind(ClientInfo::QueryKind::INITIAL_QUERY); /// Generate a random query id. refresh_context->setCurrentQueryId(""); - /// TODO: Set view's definer as the current user in refresh_context, so that the correct user's - /// quotas and permissions apply for this query. 
return refresh_context; } -std::shared_ptr StorageMaterializedView::prepareRefresh(bool append, ContextMutablePtr refresh_context, std::optional & out_temp_table_id) const +std::tuple, std::unique_ptr> +StorageMaterializedView::prepareRefresh(bool append, ContextMutablePtr refresh_context, std::optional & out_temp_table_id) const { auto inner_table_id = getTargetTableId(); StorageID target_table = inner_table_id; + auto select_query = getInMemoryMetadataPtr()->getSelectQuery().select_query; + InterpreterSetQuery::applySettingsFromQuery(select_query, refresh_context); + if (!append) { CurrentThread::QueryScope query_scope(refresh_context); @@ -407,24 +508,30 @@ std::shared_ptr StorageMaterializedView::prepareRefresh(bool app String db_name = db->getDatabaseName(); auto new_table_name = ".tmp" + generateInnerTableName(getStorageID()); - auto create_table_query = db->getCreateTableQuery(inner_table_id.table_name, getContext()); - auto & create_query = create_table_query->as(); - create_query.setTable(new_table_name); - create_query.setDatabase(db->getDatabaseName()); - create_query.create_or_replace = true; - create_query.replace_table = true; - create_query.uuid = UUIDHelpers::Nil; + auto create_query = std::dynamic_pointer_cast(db->getCreateTableQuery(inner_table_id.table_name, getContext())); + create_query->setTable(new_table_name); + create_query->setDatabase(db_name); + create_query->create_or_replace = true; + create_query->replace_table = true; + /// Use UUID to ensure that the INSERT below inserts into the exact table we created, even if another replica replaced it. + create_query->uuid = UUIDHelpers::generateV4(); + create_query->has_uuid = true; - InterpreterCreateQuery create_interpreter(create_table_query, refresh_context); + InterpreterCreateQuery create_interpreter(create_query, refresh_context); create_interpreter.setInternal(true); + /// Notice that we discard the BlockIO that execute() returns. This means that in case of DatabaseReplicated we don't wait + /// for other replicas to execute the query, only the current replica. Same in exchangeTargetTable() and dropTempTable(). create_interpreter.execute(); - target_table = DatabaseCatalog::instance().getTable({db_name, new_table_name}, getContext())->getStorageID(); + target_table = StorageID(db_name, new_table_name, create_query->uuid); out_temp_table_id = target_table; } + // Create a thread group for the query. + auto query_scope = std::make_unique(refresh_context); + auto insert_query = std::make_shared(); - insert_query->select = getInMemoryMetadataPtr()->getSelectQuery().select_query; + insert_query->select = select_query; insert_query->setTable(target_table.table_name); insert_query->setDatabase(target_table.database_name); insert_query->table_id = target_table; @@ -440,32 +547,30 @@ std::shared_ptr StorageMaterializedView::prepareRefresh(bool app columns->children.push_back(std::make_shared(name)); insert_query->columns = std::move(columns); - return insert_query; + return {std::move(insert_query), std::move(query_scope)}; } -StorageID StorageMaterializedView::exchangeTargetTable(StorageID fresh_table, ContextPtr refresh_context) +std::optional StorageMaterializedView::exchangeTargetTable(StorageID fresh_table, ContextPtr refresh_context) const { /// Known problem: if the target table was ALTERed during refresh, this will effectively revert /// the ALTER. 
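A rough sketch of what a non-APPEND refresh boils down to after the prepareRefresh()/exchangeTargetTable() changes above, expressed as the SQL it effectively runs. The helper is hypothetical; the real code builds ASTs and clones the inner table's definition rather than using AS, and it falls back to a plain RENAME when the old target does not exist.

    #include <string>
    #include <vector>

    std::vector<std::string> nonAppendRefreshStatements(const std::string & db, const std::string & inner, const std::string & uuid)
    {
        const std::string tmp = db + ".`.tmp" + inner + "`";
        const std::string target = db + ".`" + inner + "`";
        return {
            // 1. Recreate the temporary target; the explicit UUID pins the INSERT below to this exact
            //    table even if another replica replaces it concurrently.
            "CREATE OR REPLACE TABLE " + tmp + " UUID '" + uuid + "' AS " + target,
            // 2. Run the view's SELECT into the temporary table.
            "INSERT INTO " + tmp + " SELECT /* the view's query */ ...",
            // 3. Atomically swap it in (EXCHANGE when the old target exists, RENAME otherwise).
            "EXCHANGE TABLES " + tmp + " AND " + target,
        };
    }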
auto stale_table_id = getTargetTableId(); + fresh_table.uuid = UUIDHelpers::Nil; auto db = DatabaseCatalog::instance().getDatabase(stale_table_id.database_name); auto target_db = DatabaseCatalog::instance().getDatabase(fresh_table.database_name); + bool exchange = DatabaseCatalog::instance().isTableExist(stale_table_id, refresh_context); CurrentThread::QueryScope query_scope(refresh_context); auto rename_query = std::make_shared(); - rename_query->exchange = true; + rename_query->exchange = exchange; rename_query->addElement(fresh_table.database_name, fresh_table.table_name, stale_table_id.database_name, stale_table_id.table_name); InterpreterRenameQuery(rename_query, refresh_context).execute(); - std::swap(stale_table_id.database_name, fresh_table.database_name); - std::swap(stale_table_id.table_name, fresh_table.table_name); - - setTargetTableId(std::move(fresh_table)); - return stale_table_id; + return exchange ? std::make_optional(fresh_table) : std::nullopt; } void StorageMaterializedView::dropTempTable(StorageID table_id, ContextMutablePtr refresh_context) @@ -546,7 +651,10 @@ void StorageMaterializedView::checkAlterIsPossible(const AlterCommands & command else if (command.type == AlterCommand::MODIFY_QUERY) continue; else if (command.type == AlterCommand::MODIFY_REFRESH && refresher) + { + refresher->checkAlterIsPossible(*command.refresh->as()); continue; + } throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Alter of type '{}' is not supported by storage {}", command.type, getName()); @@ -612,7 +720,7 @@ void StorageMaterializedView::renameInMemory(const StorageID & new_table_id) DatabaseCatalog::instance().updateViewDependency(select_query.select_table_id, old_table_id, select_query.select_table_id, getStorageID()); if (refresher) - refresher->rename(new_table_id); + refresher->rename(new_table_id, getTargetTableId()); } void StorageMaterializedView::startup() @@ -623,12 +731,7 @@ void StorageMaterializedView::startup() DatabaseCatalog::instance().addViewDependency(select_query.select_table_id, getStorageID()); if (refresher) - { - refresher->initializeAndStart(); - - if (refresh_on_start) - refresher->run(); - } + refresher->startup(); } void StorageMaterializedView::shutdown(bool) @@ -665,7 +768,7 @@ Strings StorageMaterializedView::getDataPaths() const void StorageMaterializedView::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) { /// We backup the target table's data only if it's inner. 
- if (hasInnerTable()) + if (hasInnerTable() && fixed_uuid) { if (auto table = tryGetTargetTable()) table->backupData(backup_entries_collector, data_path_in_backup, partitions); @@ -677,13 +780,13 @@ void StorageMaterializedView::backupData(BackupEntriesCollector & backup_entries void StorageMaterializedView::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) { - if (hasInnerTable()) + if (hasInnerTable() && fixed_uuid) getTargetTable()->restoreDataFromBackup(restorer, data_path_in_backup, partitions); } bool StorageMaterializedView::supportsBackupPartition() const { - if (hasInnerTable()) + if (hasInnerTable() && fixed_uuid) return getTargetTable()->supportsBackupPartition(); return false; } @@ -753,12 +856,6 @@ StorageID StorageMaterializedView::getTargetTableId() const return id; } -void StorageMaterializedView::setTargetTableId(DB::StorageID id) -{ - std::lock_guard guard(target_table_id_mutex); - target_table_id = std::move(id); -} - void StorageMaterializedView::updateTargetTableId(std::optional database_name, std::optional table_name) { std::lock_guard guard(target_table_id_mutex); diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index a09ee07b3f6..e39642066b4 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -2,9 +2,10 @@ #include +#include + #include #include - #include namespace DB @@ -107,7 +108,7 @@ private: StorageID target_table_id = StorageID::createEmpty(); OwnedRefreshTask refresher; - bool refresh_on_start = false; + bool refresh_coordinated = false; bool has_inner_table = false; @@ -120,15 +121,15 @@ private: void checkStatementCanBeForwarded() const; ContextMutablePtr createRefreshContext() const; - /// Prepare to refresh a refreshable materialized view: create temporary table and form the - /// insert-select query. + /// Prepare to refresh a refreshable materialized view: create temporary table (if needed) and + /// form the insert-select query. /// out_temp_table_id may be assigned before throwing an exception, in which case the caller /// must drop the temp table before rethrowing. 
- std::shared_ptr prepareRefresh(bool append, ContextMutablePtr refresh_context, std::optional & out_temp_table_id) const; - StorageID exchangeTargetTable(StorageID fresh_table, ContextPtr refresh_context); + std::tuple, std::unique_ptr> + prepareRefresh(bool append, ContextMutablePtr refresh_context, std::optional & out_temp_table_id) const; + std::optional exchangeTargetTable(StorageID fresh_table, ContextPtr refresh_context) const; void dropTempTable(StorageID table, ContextMutablePtr refresh_context); - void setTargetTableId(StorageID id); void updateTargetTableId(std::optional database_name, std::optional table_name); }; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index f4d2ee67bb6..e8bdb61d4b6 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -59,6 +59,22 @@ namespace Setting extern const SettingsBool throw_on_unsupported_query_inside_transaction; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_experimental_replacing_merge_with_cleanup; + extern const MergeTreeSettingsBool always_use_copy_instead_of_hardlinks; + extern const MergeTreeSettingsBool assign_part_uuids; + extern const MergeTreeSettingsDeduplicateMergeProjectionMode deduplicate_merge_projection_mode; + extern const MergeTreeSettingsUInt64 finished_mutations_to_keep; + extern const MergeTreeSettingsSeconds lock_acquire_timeout_for_background_operations; + extern const MergeTreeSettingsUInt64 max_number_of_merges_with_ttl_in_pool; + extern const MergeTreeSettingsUInt64 max_postpone_time_for_failed_mutations_ms; + extern const MergeTreeSettingsUInt64 merge_tree_clear_old_parts_interval_seconds; + extern const MergeTreeSettingsUInt64 merge_tree_clear_old_temporary_directories_interval_seconds; + extern const MergeTreeSettingsUInt64 non_replicated_deduplication_window; + extern const MergeTreeSettingsSeconds temporary_directories_lifetime; +} + namespace ErrorCodes { extern const int NOT_IMPLEMENTED; @@ -420,14 +436,14 @@ void StorageMergeTree::alter( /// Some additional changes in settings auto new_storage_settings = getSettings(); - if (old_storage_settings->non_replicated_deduplication_window != new_storage_settings->non_replicated_deduplication_window) + if ((*old_storage_settings)[MergeTreeSetting::non_replicated_deduplication_window] != (*new_storage_settings)[MergeTreeSetting::non_replicated_deduplication_window]) { /// We cannot place this check into settings sanityCheck because it depends on format_version. /// sanityCheck must work event without storage. 
- if (new_storage_settings->non_replicated_deduplication_window != 0 && format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) + if ((*new_storage_settings)[MergeTreeSetting::non_replicated_deduplication_window] != 0 && format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Deduplication for non-replicated MergeTree in old syntax is not supported"); - deduplication_log->setDeduplicationWindowSize(new_storage_settings->non_replicated_deduplication_window); + deduplication_log->setDeduplicationWindowSize((*new_storage_settings)[MergeTreeSetting::non_replicated_deduplication_window]); } } } @@ -585,7 +601,7 @@ void StorageMergeTree::updateMutationEntriesErrors(FutureMergedMutatedPartPtr re if (static_cast(result_part->part_info.mutation) == it->first) { - mutation_backoff_policy.addPartMutationFailure(failed_part->name, getSettings()->max_postpone_time_for_failed_mutations_ms); + mutation_backoff_policy.addPartMutationFailure(failed_part->name, (*getSettings())[MergeTreeSetting::max_postpone_time_for_failed_mutations_ms]); } } } @@ -891,7 +907,7 @@ CancellationCode StorageMergeTree::killMutation(const String & mutation_id) void StorageMergeTree::loadDeduplicationLog() { auto settings = getSettings(); - if (settings->non_replicated_deduplication_window != 0 && format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) + if ((*settings)[MergeTreeSetting::non_replicated_deduplication_window] != 0 && format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Deduplication for non-replicated MergeTree in old syntax is not supported"); auto disk = getDisks()[0]; @@ -900,7 +916,7 @@ void StorageMergeTree::loadDeduplicationLog() /// If either there is already a deduplication log, or we will be able to use it. if (!disk->isReadOnly() || disk->exists(path)) { - deduplication_log = std::make_unique(path, settings->non_replicated_deduplication_window, format_version, disk); + deduplication_log = std::make_unique(path, (*settings)[MergeTreeSetting::non_replicated_deduplication_window], format_version, disk); deduplication_log->load(); } } @@ -969,7 +985,7 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMerge( auto future_part = std::make_shared(); - if (storage_settings.get()->assign_part_uuids) + if ((*storage_settings.get())[MergeTreeSetting::assign_part_uuids]) future_part->uuid = UUIDHelpers::generateV4(); /// You must call destructor with unlocked `currently_processing_in_background_mutex`. 
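The same guard appears in alter() and loadDeduplicationLog() above; restated as a standalone check for clarity (function name is illustrative, the constant is abbreviated from MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING used in the real code):

    #include <cstdint>
    #include <optional>
    #include <string>

    // A non-zero non_replicated_deduplication_window only works with the custom-partitioning
    // on-disk format version.
    std::optional<std::string> validateDedupWindow(uint64_t window, uint32_t format_version, uint32_t min_format_with_custom_partitioning)
    {
        if (window != 0 && format_version < min_format_with_custom_partitioning)
            return "Deduplication for non-replicated MergeTree in old syntax is not supported";
        return std::nullopt;  // either disabled or the format supports it
    }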
@@ -1053,7 +1069,7 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMerge( if (is_background_memory_usage_ok(out_disable_reason)) { UInt64 max_source_parts_size = merger_mutator.getMaxSourcePartsSizeForMerge(); - bool merge_with_ttl_allowed = getTotalMergesWithTTLInMergeList() < data_settings->max_number_of_merges_with_ttl_in_pool; + bool merge_with_ttl_allowed = getTotalMergesWithTTLInMergeList() < (*data_settings)[MergeTreeSetting::max_number_of_merges_with_ttl_in_pool]; /// TTL requirements is much more strict than for regular merge, so /// if regular not possible, than merge with ttl is not also not @@ -1077,7 +1093,7 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMerge( { while (true) { - auto timeout_ms = getSettings()->lock_acquire_timeout_for_background_operations.totalMilliseconds(); + auto timeout_ms = (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations].totalMilliseconds(); auto timeout = std::chrono::milliseconds(timeout_ms); if (!is_background_memory_usage_ok(out_disable_reason)) @@ -1153,7 +1169,7 @@ bool StorageMergeTree::merge( PreformattedMessage & out_disable_reason, bool optimize_skip_merged_partitions) { - auto table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + auto table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); auto metadata_snapshot = getInMemoryMetadataPtr(); SelectPartsDecision select_decision; @@ -1223,7 +1239,7 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMutate( size_t max_ast_elements = getContext()->getSettingsRef()[Setting::max_expanded_ast_elements]; auto future_part = std::make_shared(); - if (storage_settings.get()->assign_part_uuids) + if ((*storage_settings.get())[MergeTreeSetting::assign_part_uuids]) future_part->uuid = UUIDHelpers::generateV4(); CurrentlyMergingPartsTaggerPtr tagger; @@ -1404,7 +1420,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign auto metadata_snapshot = getInMemoryMetadataPtr(); MergeMutateSelectedEntryPtr merge_entry, mutate_entry; - auto shared_lock = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + auto shared_lock = lockForShare(RWLockImpl::NO_QUERY, (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); MergeTreeTransactionHolder transaction_for_merge; MergeTreeTransactionPtr txn; @@ -1473,18 +1489,18 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign bool scheduled = false; if (auto lock = time_after_previous_cleanup_temporary_directories.compareAndRestartDeferred( - getSettings()->merge_tree_clear_old_temporary_directories_interval_seconds)) + (*getSettings())[MergeTreeSetting::merge_tree_clear_old_temporary_directories_interval_seconds])) { assignee.scheduleCommonTask(std::make_shared( [this, shared_lock] () { - return clearOldTemporaryDirectories(getSettings()->temporary_directories_lifetime.totalSeconds()); + return clearOldTemporaryDirectories((*getSettings())[MergeTreeSetting::temporary_directories_lifetime].totalSeconds()); }, common_assignee_trigger, getStorageID()), /* need_trigger */ false); scheduled = true; } if (auto lock = time_after_previous_cleanup_parts.compareAndRestartDeferred( - getSettings()->merge_tree_clear_old_parts_interval_seconds)) + (*getSettings())[MergeTreeSetting::merge_tree_clear_old_parts_interval_seconds])) { 
assignee.scheduleCommonTask(std::make_shared( [this, shared_lock] () @@ -1518,7 +1534,7 @@ UInt64 StorageMergeTree::getCurrentMutationVersion( size_t StorageMergeTree::clearOldMutations(bool truncate) { - size_t finished_mutations_to_keep = getSettings()->finished_mutations_to_keep; + size_t finished_mutations_to_keep = (*getSettings())[MergeTreeSetting::finished_mutations_to_keep]; if (!truncate && !finished_mutations_to_keep) return 0; @@ -1588,8 +1604,9 @@ bool StorageMergeTree::optimize( { assertNotReadonly(); + const auto mode = (*getSettings())[MergeTreeSetting::deduplicate_merge_projection_mode]; if (deduplicate && getInMemoryMetadataPtr()->hasProjections() - && getSettings()->deduplicate_merge_projection_mode == DeduplicateMergeProjectionMode::THROW) + && (mode == DeduplicateMergeProjectionMode::THROW || mode == DeduplicateMergeProjectionMode::IGNORE)) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "OPTIMIZE DEDUPLICATE query is not supported for table {} as it has projections. " "User should drop all the projections manually before running the query, " @@ -1612,7 +1629,7 @@ bool StorageMergeTree::optimize( if (cleanup && this->merging_params.mode != MergingParams::Mode::Replacing) throw Exception(ErrorCodes::CANNOT_ASSIGN_OPTIMIZE, "Cannot OPTIMIZE with CLEANUP table: only ReplacingMergeTree can be CLEANUP"); - if (cleanup && !getSettings()->allow_experimental_replacing_merge_with_cleanup) + if (cleanup && !(*getSettings())[MergeTreeSetting::allow_experimental_replacing_merge_with_cleanup]) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Experimental merges with CLEANUP are not allowed"); DataPartsVector data_parts = getVisibleDataPartsVector(local_context); @@ -2356,7 +2373,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const IDataPartStorage::ClonePartParams clone_params { .txn = local_context->getCurrentTransaction(), - .copy_instead_of_hardlink = getSettings()->always_use_copy_instead_of_hardlinks, + .copy_instead_of_hardlink = (*getSettings())[MergeTreeSetting::always_use_copy_instead_of_hardlinks], }; auto [dst_part, part_lock] = dest_table_storage->cloneAndLoadDataPart( diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a3d529c5fbb..fa084059611 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -170,6 +170,45 @@ namespace Setting extern const SettingsUInt64 select_sequential_consistency; } +namespace MergeTreeSetting +{ + extern const MergeTreeSettingsBool allow_experimental_replacing_merge_with_cleanup; + extern const MergeTreeSettingsBool allow_remote_fs_zero_copy_replication; + extern const MergeTreeSettingsBool always_use_copy_instead_of_hardlinks; + extern const MergeTreeSettingsBool assign_part_uuids; + extern const MergeTreeSettingsDeduplicateMergeProjectionMode deduplicate_merge_projection_mode; + extern const MergeTreeSettingsBool detach_old_local_parts_when_cloning_replica; + extern const MergeTreeSettingsBool disable_detach_partition_for_zero_copy_replication; + extern const MergeTreeSettingsBool disable_fetch_partition_for_zero_copy_replication; + extern const MergeTreeSettingsBool enable_mixed_granularity_parts; + extern const MergeTreeSettingsBool enable_the_endpoint_id_with_zookeeper_name_prefix; + extern const MergeTreeSettingsFloat fault_probability_after_part_commit; + extern const MergeTreeSettingsFloat fault_probability_before_part_commit; + extern const MergeTreeSettingsUInt64 index_granularity_bytes; 
+ extern const MergeTreeSettingsSeconds lock_acquire_timeout_for_background_operations; + extern const MergeTreeSettingsUInt64 max_bytes_to_merge_at_max_space_in_pool; + extern const MergeTreeSettingsUInt64 max_merge_selecting_sleep_ms; + extern const MergeTreeSettingsUInt64 max_number_of_merges_with_ttl_in_pool; + extern const MergeTreeSettingsUInt64 max_replicated_fetches_network_bandwidth; + extern const MergeTreeSettingsUInt64 max_replicated_merges_in_queue; + extern const MergeTreeSettingsUInt64 max_replicated_merges_with_ttl_in_queue; + extern const MergeTreeSettingsUInt64 max_replicated_mutations_in_queue; + extern const MergeTreeSettingsUInt64 max_replicated_sends_network_bandwidth; + extern const MergeTreeSettingsUInt64 merge_selecting_sleep_ms; + extern const MergeTreeSettingsFloat merge_selecting_sleep_slowdown_factor; + extern const MergeTreeSettingsUInt64 min_relative_delay_to_measure; + extern const MergeTreeSettingsUInt64 parts_to_delay_insert; + extern const MergeTreeSettingsBool remote_fs_zero_copy_path_compatible_mode; + extern const MergeTreeSettingsString remote_fs_zero_copy_zookeeper_path; + extern const MergeTreeSettingsBool replicated_can_become_leader; + extern const MergeTreeSettingsUInt64 replicated_deduplication_window; + extern const MergeTreeSettingsUInt64 replicated_deduplication_window_for_async_inserts; + extern const MergeTreeSettingsFloat replicated_max_ratio_of_wrong_parts; + extern const MergeTreeSettingsBool use_minimalistic_checksums_in_zookeeper; + extern const MergeTreeSettingsBool use_minimalistic_part_header_in_zookeeper; + extern const MergeTreeSettingsMilliseconds wait_for_unique_parts_send_before_shutdown_ms; +} + namespace FailPoints { extern const char replicated_queue_fail_next_entry[]; @@ -281,7 +320,7 @@ zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeperAndAssertNotReadonl String StorageReplicatedMergeTree::getEndpointName() const { const MergeTreeSettings & settings = getContext()->getReplicatedMergeTreeSettings(); - if (settings.enable_the_endpoint_id_with_zookeeper_name_prefix) + if (settings[MergeTreeSetting::enable_the_endpoint_id_with_zookeeper_name_prefix]) return zookeeper_info.zookeeper_name + ":" + replica_path; return replica_path; @@ -347,8 +386,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , part_check_thread(*this) , restarting_thread(*this) , part_moves_between_shards_orchestrator(*this) - , replicated_fetches_throttler(std::make_shared(getSettings()->max_replicated_fetches_network_bandwidth, getContext()->getReplicatedFetchesThrottler())) - , replicated_sends_throttler(std::make_shared(getSettings()->max_replicated_sends_network_bandwidth, getContext()->getReplicatedSendsThrottler())) + , replicated_fetches_throttler(std::make_shared((*getSettings())[MergeTreeSetting::max_replicated_fetches_network_bandwidth], getContext()->getReplicatedFetchesThrottler())) + , replicated_sends_throttler(std::make_shared((*getSettings())[MergeTreeSetting::max_replicated_sends_network_bandwidth], getContext()->getReplicatedSendsThrottler())) { initializeDirectoriesAndFormatVersion(relative_data_path_, LoadingStrictnessLevel::ATTACH <= mode, date_column_name); /// We create and deactivate all tasks for consistency. @@ -370,7 +409,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( /// Will be activated if we will achieve leader state. 
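The block of `extern const MergeTreeSettings*` lines added above follows a per-file declaration pattern: each .cpp forward-declares only the setting tags it actually reads, and every tag is defined once in the settings translation unit. The sketch below collapses both sides into one compilable file; `MergeTreeSettingsUInt64` here is an invented stand-in (a named tag), not ClickHouse's real settings machinery.

```cpp
#include <iostream>
#include <string>

struct MergeTreeSettingsUInt64
{
    std::string name;   // which setting the tag refers to
};

namespace MergeTreeSetting
{
    // Consumer side: declaration only, so the file does not depend on a header
    // that lists every MergeTree setting.
    extern const MergeTreeSettingsUInt64 max_replicated_merges_in_queue;

    // Settings-translation-unit side: the single definition (kept in the same
    // file here only so the sketch links as one program).
    const MergeTreeSettingsUInt64 max_replicated_merges_in_queue{"max_replicated_merges_in_queue"};
}

int main()
{
    // Call sites pass the tag to the settings container's operator[] (see the earlier sketch).
    std::cout << "tag declared for: " << MergeTreeSetting::max_replicated_merges_in_queue.name << '\n';
}
```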
merge_selecting_task->deactivate(); - merge_selecting_sleep_ms = getSettings()->merge_selecting_sleep_ms; + merge_selecting_sleep_ms = (*getSettings())[MergeTreeSetting::merge_selecting_sleep_ms]; mutations_finalizing_task = getContext()->getSchedulePool().createTask( getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsFinalizingTask)", [this] { mutationsFinalizingTask(); }); @@ -476,11 +515,11 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( /// Provide better initial value of merge_selecting_sleep_ms on server startup auto settings = getSettings(); size_t max_parts_in_partition = getMaxPartsCountAndSizeForPartition().first; - if (settings->parts_to_delay_insert && max_parts_in_partition < settings->parts_to_delay_insert) + if ((*settings)[MergeTreeSetting::parts_to_delay_insert] && max_parts_in_partition < (*settings)[MergeTreeSetting::parts_to_delay_insert]) { - Float64 ratio = 1.0 - static_cast(max_parts_in_partition) / settings->parts_to_delay_insert; - merge_selecting_sleep_ms = static_cast(interpolateLinear(settings->merge_selecting_sleep_ms, - settings->max_merge_selecting_sleep_ms, ratio)); + Float64 ratio = 1.0 - static_cast(max_parts_in_partition) / (*settings)[MergeTreeSetting::parts_to_delay_insert]; + merge_selecting_sleep_ms = static_cast(interpolateLinear((*settings)[MergeTreeSetting::merge_selecting_sleep_ms], + (*settings)[MergeTreeSetting::max_merge_selecting_sleep_ms], ratio)); } } @@ -780,7 +819,7 @@ void StorageReplicatedMergeTree::createNewZooKeeperNodes() futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum/parallel", String(), zkutil::CreateMode::Persistent)); /// Nodes for remote fs zero-copy replication const auto settings = getSettings(); - if (settings->allow_remote_fs_zero_copy_replication) + if ((*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { for (const auto & zero_copy_locks_root : getZookeeperZeroCopyLockPaths()) { @@ -1132,7 +1171,7 @@ zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeperIfTableShutDown() c std::vector StorageReplicatedMergeTree::getZookeeperZeroCopyLockPaths() const { const auto settings = getSettings(); - if (!settings->allow_remote_fs_zero_copy_replication) + if (!(*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { return {}; } @@ -1155,7 +1194,7 @@ std::vector StorageReplicatedMergeTree::getZookeeperZeroCopyLockPaths() for (const auto & disk_type: disk_types_with_zero_copy) { auto zero_copy = fmt::format("zero_copy_{}", disk_type); - auto zero_copy_path = fs::path(settings->remote_fs_zero_copy_zookeeper_path.toString()) / zero_copy; + auto zero_copy_path = fs::path((*settings)[MergeTreeSetting::remote_fs_zero_copy_zookeeper_path].toString()) / zero_copy; result.push_back(zero_copy_path / actual_table_shared_id); } @@ -1279,7 +1318,7 @@ bool StorageReplicatedMergeTree::dropReplica( /// Then try to remove paths that are known to be flat (all children are leafs) Strings flat_nodes = {"flags", "queue"}; - if (table_settings && table_settings->use_minimalistic_part_header_in_zookeeper) + if (table_settings && (*table_settings)[MergeTreeSetting::use_minimalistic_part_header_in_zookeeper]) flat_nodes.emplace_back("parts"); for (const auto & node : flat_nodes) { @@ -1750,7 +1789,7 @@ bool StorageReplicatedMergeTree::checkPartsImpl(bool skip_sanity_checks) total_rows_on_filesystem += part.part->rows_count; const auto storage_settings_ptr = getSettings(); - bool insane = uncovered_unexpected_parts_rows > total_rows_on_filesystem * 
storage_settings_ptr->replicated_max_ratio_of_wrong_parts; + bool insane = uncovered_unexpected_parts_rows > total_rows_on_filesystem * (*storage_settings_ptr)[MergeTreeSetting::replicated_max_ratio_of_wrong_parts]; constexpr auto sanity_report_fmt = "The local set of parts of table {} doesn't look like the set of parts in ZooKeeper: " "{} rows of {} total rows in filesystem are suspicious. " @@ -1900,7 +1939,7 @@ bool StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps( const auto storage_settings_ptr = getSettings(); String part_path = fs::path(replica_path) / "parts" / part_name; - if (storage_settings_ptr->use_minimalistic_part_header_in_zookeeper) + if ((*storage_settings_ptr)[MergeTreeSetting::use_minimalistic_part_header_in_zookeeper]) { ops.emplace_back(zkutil::makeCreateRequest( part_path, local_part_header.toString(), zkutil::CreateMode::Persistent)); @@ -1993,8 +2032,8 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd Coordination::Error e; { - Coordination::SimpleFaultInjection fault(getSettings()->fault_probability_before_part_commit, - getSettings()->fault_probability_after_part_commit, "part commit"); + Coordination::SimpleFaultInjection fault((*getSettings())[MergeTreeSetting::fault_probability_before_part_commit], + (*getSettings())[MergeTreeSetting::fault_probability_after_part_commit], "part commit"); ThreadFuzzer::maybeInjectSleep(); e = zookeeper->tryMulti(ops, responses, /* check_session_valid */ true); } @@ -2021,7 +2060,7 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd String StorageReplicatedMergeTree::getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const { return MinimalisticDataPartChecksums::getSerializedString(checksums, - getSettings()->use_minimalistic_checksums_in_zookeeper); + (*getSettings())[MergeTreeSetting::use_minimalistic_checksums_in_zookeeper]); } MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::attachPartHelperFoundValidPart(const LogEntry & entry) const @@ -2597,7 +2636,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) PartsToRemoveFromZooKeeper parts_to_remove; auto table_lock_holder_dst_table = lockForShare( - RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + RWLockImpl::NO_QUERY, (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); auto dst_metadata_snapshot = getInMemoryMetadataPtr(); for (size_t i = 0; i < entry_replace.new_part_names.size(); ++i) @@ -2673,7 +2712,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) } table_lock_holder_src_table = source_table->lockForShare( - RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + RWLockImpl::NO_QUERY, (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); DataPartStates valid_states{ MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}; @@ -2688,7 +2727,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) continue; } - bool avoid_copy_local_part = storage_settings_ptr->allow_remote_fs_zero_copy_replication && src_part->isStoredOnRemoteDiskWithZeroCopySupport(); + bool avoid_copy_local_part = (*storage_settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] && src_part->isStoredOnRemoteDiskWithZeroCopySupport(); if (avoid_copy_local_part) { @@ -2841,7 +2880,7 @@ bool 
StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) { /// Fetches with zero-copy-replication are cheap, but cloneAndLoadDataPart(must_on_same_disk=true) will do full copy. /// It's okay to check the setting for current table and disk for the source table, because src and dst part are on the same disk. - bool prefer_fetch_from_other_replica = !part_desc->replica.empty() && storage_settings_ptr->allow_remote_fs_zero_copy_replication + bool prefer_fetch_from_other_replica = !part_desc->replica.empty() && (*storage_settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] && part_desc->src_table_part && part_desc->src_table_part->isStoredOnRemoteDiskWithZeroCopySupport(); if (part_desc->src_table_part && !prefer_fetch_from_other_replica) @@ -2850,12 +2889,12 @@ bool StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) throw Exception(ErrorCodes::UNFINISHED, "Checksums of {} is suddenly changed", part_desc->src_table_part->name); /// Don't do hardlinks in case of zero-copy at any side (defensive programming) - bool source_zero_copy_enabled = dynamic_cast(source_table.get())->getSettings()->allow_remote_fs_zero_copy_replication; - bool our_zero_copy_enabled = storage_settings_ptr->allow_remote_fs_zero_copy_replication; + bool source_zero_copy_enabled = (*dynamic_cast(source_table.get())->getSettings())[MergeTreeSetting::allow_remote_fs_zero_copy_replication]; + bool our_zero_copy_enabled = (*storage_settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]; IDataPartStorage::ClonePartParams clone_params { - .copy_instead_of_hardlink = storage_settings_ptr->always_use_copy_instead_of_hardlinks || ((our_zero_copy_enabled || source_zero_copy_enabled) && part_desc->src_table_part->isStoredOnRemoteDiskWithZeroCopySupport()), + .copy_instead_of_hardlink = (*storage_settings_ptr)[MergeTreeSetting::always_use_copy_instead_of_hardlinks] || ((our_zero_copy_enabled || source_zero_copy_enabled) && part_desc->src_table_part->isStoredOnRemoteDiskWithZeroCopySupport()), .metadata_version_to_write = metadata_snapshot->getMetadataVersion() }; auto [res_part, temporary_part_lock] = cloneAndLoadDataPart( @@ -3233,7 +3272,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo LOG_WARNING(log, "Source replica does not have part {}. 
Removing it from working set.", part->name); } - if (getSettings()->detach_old_local_parts_when_cloning_replica) + if ((*getSettings())[MergeTreeSetting::detach_old_local_parts_when_cloning_replica]) { auto metadata_snapshot = getInMemoryMetadataPtr(); @@ -3849,26 +3888,26 @@ void StorageReplicatedMergeTree::mergeSelectingTask() return AttemptStatus::Limited; } - if (merges_and_mutations_sum >= storage_settings_ptr->max_replicated_merges_in_queue) + if (merges_and_mutations_sum >= (*storage_settings_ptr)[MergeTreeSetting::max_replicated_merges_in_queue]) { LOG_TRACE(log, "Number of queued merges ({}) and part mutations ({})" " is greater than max_replicated_merges_in_queue ({}), so won't select new parts to merge or mutate.", merges_and_mutations_queued.merges, merges_and_mutations_queued.mutations, - storage_settings_ptr->max_replicated_merges_in_queue); + (*storage_settings_ptr)[MergeTreeSetting::max_replicated_merges_in_queue]); return AttemptStatus::Limited; } UInt64 max_source_parts_size_for_merge = merger_mutator.getMaxSourcePartsSizeForMerge( - storage_settings_ptr->max_replicated_merges_in_queue, merges_and_mutations_sum); + (*storage_settings_ptr)[MergeTreeSetting::max_replicated_merges_in_queue], merges_and_mutations_sum); UInt64 max_source_part_size_for_mutation = merger_mutator.getMaxSourcePartSizeForMutation(); - bool merge_with_ttl_allowed = merges_and_mutations_queued.merges_with_ttl < storage_settings_ptr->max_replicated_merges_with_ttl_in_queue && - getTotalMergesWithTTLInMergeList() < storage_settings_ptr->max_number_of_merges_with_ttl_in_pool; + bool merge_with_ttl_allowed = merges_and_mutations_queued.merges_with_ttl < (*storage_settings_ptr)[MergeTreeSetting::max_replicated_merges_with_ttl_in_queue] && + getTotalMergesWithTTLInMergeList() < (*storage_settings_ptr)[MergeTreeSetting::max_number_of_merges_with_ttl_in_pool]; auto future_merged_part = std::make_shared(); - if (storage_settings.get()->assign_part_uuids) + if ((*storage_settings.get())[MergeTreeSetting::assign_part_uuids]) future_merged_part->uuid = UUIDHelpers::generateV4(); bool can_assign_merge = max_source_parts_size_for_merge > 0; @@ -3910,7 +3949,7 @@ void StorageReplicatedMergeTree::mergeSelectingTask() } /// If there are many mutations in queue, it may happen, that we cannot enqueue enough merges to merge all new parts - if (max_source_part_size_for_mutation == 0 || merges_and_mutations_queued.mutations >= storage_settings_ptr->max_replicated_mutations_in_queue) + if (max_source_part_size_for_mutation == 0 || merges_and_mutations_queued.mutations >= (*storage_settings_ptr)[MergeTreeSetting::max_replicated_mutations_in_queue]) return AttemptStatus::Limited; if (queue.countMutations() > 0) @@ -3960,13 +3999,13 @@ void StorageReplicatedMergeTree::mergeSelectingTask() Float32 new_sleep_ms = merge_selecting_sleep_ms; if (result == AttemptStatus::EntryCreated || result == AttemptStatus::NeedRetry) - new_sleep_ms /= storage_settings_ptr->merge_selecting_sleep_slowdown_factor; + new_sleep_ms /= (*storage_settings_ptr)[MergeTreeSetting::merge_selecting_sleep_slowdown_factor]; else if (result == AttemptStatus::CannotSelect) - new_sleep_ms *= storage_settings_ptr->merge_selecting_sleep_slowdown_factor; + new_sleep_ms *= (*storage_settings_ptr)[MergeTreeSetting::merge_selecting_sleep_slowdown_factor]; new_sleep_ms *= std::uniform_real_distribution(1.f, 1.1f)(thread_local_rng); merge_selecting_sleep_ms = std::clamp(static_cast(new_sleep_ms), - storage_settings_ptr->merge_selecting_sleep_ms, - 
storage_settings_ptr->max_merge_selecting_sleep_ms); + (*storage_settings_ptr)[MergeTreeSetting::merge_selecting_sleep_ms], + (*storage_settings_ptr)[MergeTreeSetting::max_merge_selecting_sleep_ms]); if (result == AttemptStatus::EntryCreated) merge_selecting_task->schedule(); @@ -4322,7 +4361,7 @@ void StorageReplicatedMergeTree::startBeingLeader() { auto zookeeper = getZooKeeper(); - if (!getSettings()->replicated_can_become_leader) + if (!(*getSettings())[MergeTreeSetting::replicated_can_become_leader]) { LOG_INFO(log, "Will not enter leader election because replicated_can_become_leader=0"); return; @@ -4407,7 +4446,7 @@ void StorageReplicatedMergeTree::waitForUniquePartsToBeFetchedByOtherReplicas(St auto settings_ptr = getSettings(); - auto wait_ms = settings_ptr->wait_for_unique_parts_send_before_shutdown_ms.totalMilliseconds(); + auto wait_ms = (*settings_ptr)[MergeTreeSetting::wait_for_unique_parts_send_before_shutdown_ms].totalMilliseconds(); if (wait_ms == 0) { LOG_INFO(log, "Will not wait for unique parts to be fetched by other replicas because wait time is zero"); @@ -4843,7 +4882,7 @@ bool StorageReplicatedMergeTree::fetchPart( auto settings_ptr = getSettings(); TableLockHolder table_lock_holder; if (!to_detached) - table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, settings_ptr->lock_acquire_timeout_for_background_operations); + table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, (*settings_ptr)[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); /// Logging Stopwatch stopwatch; @@ -4861,7 +4900,7 @@ bool StorageReplicatedMergeTree::fetchPart( auto is_zero_copy_part = [&settings_ptr](const auto & data_part) { - return settings_ptr->allow_remote_fs_zero_copy_replication && data_part->isStoredOnRemoteDiskWithZeroCopySupport(); + return (*settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] && data_part->isStoredOnRemoteDiskWithZeroCopySupport(); }; DataPartPtr part_to_clone; @@ -4933,7 +4972,7 @@ bool StorageReplicatedMergeTree::fetchPart( chassert(!is_zero_copy_part(part_to_clone)); IDataPartStorage::ClonePartParams clone_params { - .copy_instead_of_hardlink = getSettings()->always_use_copy_instead_of_hardlinks, + .copy_instead_of_hardlink = (*getSettings())[MergeTreeSetting::always_use_copy_instead_of_hardlinks], .keep_metadata_version = true, }; @@ -5107,7 +5146,7 @@ MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::fetchExistsPart( LOG_DEBUG(log, "Fetching already known part {} from {}:{}", part_name, zookeeper_info.zookeeper_name, source_replica_path); - TableLockHolder table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + TableLockHolder table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); /// Logging Stopwatch stopwatch; @@ -5328,7 +5367,7 @@ void StorageReplicatedMergeTree::flushAndPrepareForShutdown() restarting_thread.shutdown(/* part_of_full_shutdown */true); /// Explicitly set the event, because the restarting thread will not set it again startup_event.set(); - shutdown_deadline.emplace(std::chrono::system_clock::now() + std::chrono::milliseconds(settings_ptr->wait_for_unique_parts_send_before_shutdown_ms.totalMilliseconds())); + shutdown_deadline.emplace(std::chrono::system_clock::now() + std::chrono::milliseconds((*settings_ptr)[MergeTreeSetting::wait_for_unique_parts_send_before_shutdown_ms].totalMilliseconds())); } catch (...) 
{ @@ -5683,9 +5722,9 @@ SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, con const auto storage_settings_ptr = getSettings(); const Settings & query_settings = local_context->getSettingsRef(); - bool deduplicate = storage_settings_ptr->replicated_deduplication_window != 0 && query_settings[Setting::insert_deduplicate]; + bool deduplicate = (*storage_settings_ptr)[MergeTreeSetting::replicated_deduplication_window] != 0 && query_settings[Setting::insert_deduplicate]; bool async_deduplicate = async_insert && query_settings[Setting::async_insert_deduplicate] - && storage_settings_ptr->replicated_deduplication_window_for_async_inserts != 0 && query_settings[Setting::insert_deduplicate]; + && (*storage_settings_ptr)[MergeTreeSetting::replicated_deduplication_window_for_async_inserts] != 0 && query_settings[Setting::insert_deduplicate]; if (async_deduplicate) return std::make_shared( *this, @@ -5833,8 +5872,9 @@ bool StorageReplicatedMergeTree::optimize( if (!is_leader) throw Exception(ErrorCodes::NOT_A_LEADER, "OPTIMIZE cannot be done on this replica because it is not a leader"); + const auto mode = (*getSettings())[MergeTreeSetting::deduplicate_merge_projection_mode]; if (deduplicate && getInMemoryMetadataPtr()->hasProjections() - && getSettings()->deduplicate_merge_projection_mode == DeduplicateMergeProjectionMode::THROW) + && (mode == DeduplicateMergeProjectionMode::THROW || mode == DeduplicateMergeProjectionMode::IGNORE)) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "OPTIMIZE DEDUPLICATE query is not supported for table {} as it has projections. " "User should drop all the projections manually before running the query, " @@ -5843,7 +5883,7 @@ bool StorageReplicatedMergeTree::optimize( if (cleanup) { - if (!getSettings()->allow_experimental_replacing_merge_with_cleanup) + if (!(*getSettings())[MergeTreeSetting::allow_experimental_replacing_merge_with_cleanup]) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Experimental merges with CLEANUP are not allowed"); LOG_DEBUG(log, "Cleanup the ReplicatedMergeTree."); } @@ -5886,7 +5926,7 @@ bool StorageReplicatedMergeTree::optimize( ReplicatedMergeTreeMergePredicate can_merge = queue.getMergePredicate(zookeeper, std::move(partition_ids_hint)); auto future_merged_part = std::make_shared(); - if (storage_settings.get()->assign_part_uuids) + if ((*storage_settings.get())[MergeTreeSetting::assign_part_uuids]) future_merged_part->uuid = UUIDHelpers::generateV4(); constexpr const char * unknown_disable_reason = "unknown reason"; @@ -5896,7 +5936,7 @@ bool StorageReplicatedMergeTree::optimize( if (partition_id.empty()) { select_decision = merger_mutator.selectPartsToMerge( - future_merged_part, /* aggressive */ true, storage_settings_ptr->max_bytes_to_merge_at_max_space_in_pool, + future_merged_part, /* aggressive */ true, (*storage_settings_ptr)[MergeTreeSetting::max_bytes_to_merge_at_max_space_in_pool], can_merge, /* merge_with_ttl_allowed */ false, NO_TRANSACTION_PTR, disable_reason); } else @@ -6060,8 +6100,8 @@ bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMer } { - auto table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); - auto alter_lock_holder = lockForAlter(getSettings()->lock_acquire_timeout_for_background_operations); + auto table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); + auto alter_lock_holder = 
lockForAlter((*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); LOG_INFO(log, "Metadata changed in ZooKeeper. Applying changes locally."); auto metadata_diff = ReplicatedMergeTreeTableMetadata(*this, getInMemoryMetadataPtr()).checkAndFindDiff(metadata_from_entry, getInMemoryMetadataPtr()->getColumns(), getContext()); @@ -6593,8 +6633,8 @@ void StorageReplicatedMergeTree::dropPartition(const ASTPtr & partition, bool de auto settings = getSettings(); - if (detach && settings->disable_detach_partition_for_zero_copy_replication - && settings->allow_remote_fs_zero_copy_replication) + if (detach && (*settings)[MergeTreeSetting::disable_detach_partition_for_zero_copy_replication] + && (*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { for (const auto & disk : getDisks()) { @@ -7080,7 +7120,7 @@ void StorageReplicatedMergeTree::getStatus(ReplicatedTableStatus & res, bool wit const auto storage_settings_ptr = getSettings(); res.is_leader = is_leader; - res.can_become_leader = storage_settings_ptr->replicated_can_become_leader; + res.can_become_leader = (*storage_settings_ptr)[MergeTreeSetting::replicated_can_become_leader]; res.is_readonly = is_readonly; res.is_session_expired = !zookeeper || zookeeper->expired(); @@ -7218,7 +7258,7 @@ void StorageReplicatedMergeTree::getReplicaDelays(time_t & out_absolute_delay, t * Calculated only if the absolute delay is large enough. */ - if (out_absolute_delay < static_cast(storage_settings_ptr->min_relative_delay_to_measure)) + if (out_absolute_delay < static_cast((*storage_settings_ptr)[MergeTreeSetting::min_relative_delay_to_measure])) return; auto zookeeper = getZooKeeper(); @@ -7290,8 +7330,8 @@ void StorageReplicatedMergeTree::fetchPartition( if (from.empty()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "ZooKeeper path should not be empty"); - if (settings->disable_fetch_partition_for_zero_copy_replication - && settings->allow_remote_fs_zero_copy_replication) + if ((*settings)[MergeTreeSetting::disable_fetch_partition_for_zero_copy_replication] + && (*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { for (const auto & disk : getDisks()) { @@ -7689,7 +7729,7 @@ bool StorageReplicatedMergeTree::hasLightweightDeletedMask() const size_t StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() { - auto table_lock = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + auto table_lock = lockForShare(RWLockImpl::NO_QUERY, (*getSettings())[MergeTreeSetting::lock_acquire_timeout_for_background_operations]); auto zookeeper = getZooKeeper(); /// Now these parts are in Deleting state. If we fail to remove some of them we must roll them back to Outdated state. 
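The mergeSelectingTask hunks a few steps above adjust `merge_selecting_sleep_ms` adaptively: shrink the delay when an entry was created, grow it when nothing could be selected, add roughly 10% jitter, and clamp between the configured minimum and maximum. A standalone sketch of that adjustment is below, with example constants rather than ClickHouse defaults and an assumed shape for the status enum.

```cpp
#include <algorithm>
#include <cstdint>
#include <initializer_list>
#include <iostream>
#include <random>

enum class AttemptStatus { EntryCreated, NeedRetry, CannotSelect, Limited };

uint64_t adjustSleep(uint64_t current_ms, AttemptStatus result,
                     double slowdown_factor, uint64_t min_ms, uint64_t max_ms,
                     std::mt19937 & rng)
{
    double new_sleep_ms = static_cast<double>(current_ms);
    if (result == AttemptStatus::EntryCreated || result == AttemptStatus::NeedRetry)
        new_sleep_ms /= slowdown_factor;            // work was found: poll sooner
    else if (result == AttemptStatus::CannotSelect)
        new_sleep_ms *= slowdown_factor;            // nothing to do: back off

    // Jitter avoids replicas waking up in lockstep.
    new_sleep_ms *= std::uniform_real_distribution<double>(1.0, 1.1)(rng);
    return std::clamp(static_cast<uint64_t>(new_sleep_ms), min_ms, max_ms);
}

int main()
{
    std::mt19937 rng{42};
    uint64_t sleep_ms = 5000;
    for (auto status : {AttemptStatus::CannotSelect, AttemptStatus::CannotSelect, AttemptStatus::EntryCreated})
    {
        sleep_ms = adjustSleep(sleep_ms, status, /*slowdown_factor=*/1.1, /*min_ms=*/5000, /*max_ms=*/60000, rng);
        std::cout << "next merge-selecting sleep: " << sleep_ms << " ms\n";
    }
}
```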
@@ -8105,8 +8145,8 @@ void StorageReplicatedMergeTree::replacePartitionFrom( ProfileEventsScope profile_events_scope; const auto zookeeper = getZooKeeper(); - const bool zero_copy_enabled = storage_settings_ptr->allow_remote_fs_zero_copy_replication - || dynamic_cast(source_table.get())->getSettings()->allow_remote_fs_zero_copy_replication; + const bool zero_copy_enabled = (*storage_settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] + || (*dynamic_cast(source_table.get())->getSettings())[MergeTreeSetting::allow_remote_fs_zero_copy_replication]; std::unique_ptr entries[partitions.size()]; size_t idx = 0; @@ -8120,7 +8160,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( zookeeper, replace, zero_copy_enabled, - storage_settings_ptr->always_use_copy_instead_of_hardlinks, + (*storage_settings_ptr)[MergeTreeSetting::always_use_copy_instead_of_hardlinks], query_context); ++idx; } @@ -8514,12 +8554,12 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); /// Don't do hardlinks in case of zero-copy at any side (defensive programming) - bool zero_copy_enabled = storage_settings_ptr->allow_remote_fs_zero_copy_replication - || dynamic_cast(dest_table.get())->getSettings()->allow_remote_fs_zero_copy_replication; + bool zero_copy_enabled = (*storage_settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication] + || (*dynamic_cast(dest_table.get())->getSettings())[MergeTreeSetting::allow_remote_fs_zero_copy_replication]; IDataPartStorage::ClonePartParams clone_params { - .copy_instead_of_hardlink = storage_settings_ptr->always_use_copy_instead_of_hardlinks || (zero_copy_enabled && src_part->isStoredOnRemoteDiskWithZeroCopySupport()), + .copy_instead_of_hardlink = (*storage_settings_ptr)[MergeTreeSetting::always_use_copy_instead_of_hardlinks] || (zero_copy_enabled && src_part->isStoredOnRemoteDiskWithZeroCopySupport()), .metadata_version_to_write = dest_metadata_snapshot->getMetadataVersion() }; auto [dst_part, dst_part_lock] = dest_table_storage->cloneAndLoadDataPart( @@ -8807,7 +8847,7 @@ void StorageReplicatedMergeTree::getCommitPartOps( } /// Information about the part, in the replica - if (storage_settings_ptr->use_minimalistic_part_header_in_zookeeper) + if ((*storage_settings_ptr)[MergeTreeSetting::use_minimalistic_part_header_in_zookeeper]) { ops.emplace_back(zkutil::makeCreateRequest( fs::path(replica_path) / "parts" / part->name, @@ -9237,7 +9277,7 @@ std::optional StorageReplicatedMergeTree::checkDataNext(DataValidat bool StorageReplicatedMergeTree::canUseZeroCopyReplication() const { auto settings_ptr = getSettings(); - if (!settings_ptr->allow_remote_fs_zero_copy_replication) + if (!(*settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) return false; auto disks = getStoragePolicy()->getDisks(); @@ -9296,8 +9336,8 @@ void StorageReplicatedMergeTree::checkBrokenDisks() bool StorageReplicatedMergeTree::canUseAdaptiveGranularity() const { const auto storage_settings_ptr = getSettings(); - return storage_settings_ptr->index_granularity_bytes != 0 && - (storage_settings_ptr->enable_mixed_granularity_parts || + return (*storage_settings_ptr)[MergeTreeSetting::index_granularity_bytes] != 0 && + ((*storage_settings_ptr)[MergeTreeSetting::enable_mixed_granularity_parts] || (!has_non_adaptive_index_granularity_parts && !other_replicas_fixed_granularity)); } @@ -9403,7 +9443,7 @@ zkutil::EphemeralNodeHolderPtr 
StorageReplicatedMergeTree::lockSharedDataTempora { auto settings = getSettings(); - if (!disk || !disk->supportZeroCopyReplication() || !settings->allow_remote_fs_zero_copy_replication) + if (!disk || !disk->supportZeroCopyReplication() || !(*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) return {}; zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); @@ -9446,7 +9486,7 @@ void StorageReplicatedMergeTree::getLockSharedDataOps( { auto settings = getSettings(); - if (!part.isStoredOnDisk() || !settings->allow_remote_fs_zero_copy_replication) + if (!part.isStoredOnDisk() || !(*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) return; if (!part.getDataPartStorage().supportZeroCopyReplication()) @@ -9501,7 +9541,7 @@ void StorageReplicatedMergeTree::lockSharedData( { auto settings = getSettings(); - if (!part.isStoredOnDisk() || !settings->allow_remote_fs_zero_copy_replication) + if (!part.isStoredOnDisk() || !(*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) return; if (!part.getDataPartStorage().supportZeroCopyReplication()) @@ -9553,7 +9593,7 @@ std::pair StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const ZooKeeperWithFaultInjectionPtr & zookeeper) const { auto settings = getSettings(); - if (!settings->allow_remote_fs_zero_copy_replication) + if (!(*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) return std::make_pair(true, NameSet{}); if (!part.isStoredOnDisk()) @@ -9922,7 +9962,7 @@ MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::tryToFetchIfShared { const auto settings = getSettings(); auto data_source_description = disk->getDataSourceDescription(); - if (!(disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication)) + if (!(disk->supportZeroCopyReplication() && (*settings)[MergeTreeSetting::allow_remote_fs_zero_copy_replication])) return nullptr; String replica = getSharedDataReplica(part, data_source_description); @@ -10021,9 +10061,9 @@ Strings StorageReplicatedMergeTree::getZeroCopyPartPath( String zero_copy = fmt::format("zero_copy_{}", disk_type); - String new_path = fs::path(settings.remote_fs_zero_copy_zookeeper_path.toString()) / zero_copy / table_uuid / part_name; + String new_path = fs::path(settings[MergeTreeSetting::remote_fs_zero_copy_zookeeper_path].toString()) / zero_copy / table_uuid / part_name; res.push_back(std::move(new_path)); - if (settings.remote_fs_zero_copy_path_compatible_mode && !zookeeper_path_old.empty()) + if (settings[MergeTreeSetting::remote_fs_zero_copy_path_compatible_mode] && !zookeeper_path_old.empty()) { /// Compatibility mode for cluster with old and new versions String old_path = fs::path(zookeeper_path_old) / zero_copy / "shared" / part_name; res.push_back(std::move(old_path)); @@ -10483,7 +10523,7 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode( bool StorageReplicatedMergeTree::removeDetachedPart(DiskPtr disk, const String & path, const String & part_name) { auto settings_ptr = getSettings(); - if (disk->supportZeroCopyReplication() && settings_ptr->allow_remote_fs_zero_copy_replication) + if (disk->supportZeroCopyReplication() && (*settings_ptr)[MergeTreeSetting::allow_remote_fs_zero_copy_replication]) { String table_id = getTableSharedID(); return removeSharedDetachedPart(disk, path, part_name, table_id, replica_name, zookeeper_path, getContext(), current_zookeeper); diff --git a/src/Storages/System/CMakeLists.txt b/src/Storages/System/CMakeLists.txt index 899c3d5cf40..c7583713d2d 
100644 --- a/src/Storages/System/CMakeLists.txt +++ b/src/Storages/System/CMakeLists.txt @@ -51,5 +51,3 @@ target_link_libraries(clickhouse_storages_system PRIVATE clickhouse_parsers Poco::JSON ) - -target_include_directories(clickhouse_storages_system PRIVATE InformationSchema) diff --git a/src/Storages/System/StorageSystemMergeTreeSettings.cpp b/src/Storages/System/StorageSystemMergeTreeSettings.cpp index 18730010e53..35d975216f6 100644 --- a/src/Storages/System/StorageSystemMergeTreeSettings.cpp +++ b/src/Storages/System/StorageSystemMergeTreeSettings.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -38,30 +39,9 @@ void SystemMergeTreeSettings::fillData(MutableColumns & res_columns, const auto & settings = replicated ? context->getReplicatedMergeTreeSettings() : context->getMergeTreeSettings(); auto constraints_and_current_profiles = context->getSettingsConstraintsAndCurrentProfiles(); const auto & constraints = constraints_and_current_profiles->constraints; - for (const auto & setting : settings.all()) - { - const auto & setting_name = setting.getName(); - res_columns[0]->insert(setting_name); - res_columns[1]->insert(setting.getValueString()); - res_columns[2]->insert(setting.isValueChanged()); - res_columns[3]->insert(setting.getDescription()); - Field min, max; - SettingConstraintWritability writability = SettingConstraintWritability::WRITABLE; - constraints.get(settings, setting_name, min, max, writability); - - /// These two columns can accept strings only. - if (!min.isNull()) - min = Settings::valueToStringUtil(setting_name, min); - if (!max.isNull()) - max = Settings::valueToStringUtil(setting_name, max); - - res_columns[4]->insert(min); - res_columns[5]->insert(max); - res_columns[6]->insert(writability == SettingConstraintWritability::CONST); - res_columns[7]->insert(setting.getTypeName()); - res_columns[8]->insert(setting.isObsolete()); - } + MutableColumnsAndConstraints params(res_columns, constraints); + settings.dumpToSystemMergeTreeSettingsColumns(params); } template class SystemMergeTreeSettings; diff --git a/src/Storages/System/StorageSystemViewRefreshes.cpp b/src/Storages/System/StorageSystemViewRefreshes.cpp index 6e0dab1468d..d70671dc39b 100644 --- a/src/Storages/System/StorageSystemViewRefreshes.cpp +++ b/src/Storages/System/StorageSystemViewRefreshes.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace DB @@ -22,32 +22,21 @@ ColumnsDescription StorageSystemViewRefreshes::getColumnsDescription() {"view", std::make_shared(), "Table name."}, {"uuid", std::make_shared(), "Table uuid (Atomic database)."}, {"status", std::make_shared(), "Current state of the refresh."}, - {"last_refresh_result", std::make_shared(), "Outcome of the latest refresh attempt."}, - {"last_refresh_time", std::make_shared(std::make_shared()), - "Time of the last refresh attempt. NULL if no refresh attempts happened since server startup or table creation."}, {"last_success_time", std::make_shared(std::make_shared()), - "Time of the last successful refresh. NULL if no successful refreshes happened since server startup or table creation."}, - {"duration_ms", std::make_shared(), "How long the last refresh attempt took."}, - {"next_refresh_time", std::make_shared(), "Time at which the next refresh is scheduled to start."}, - {"remaining_dependencies", std::make_shared(std::make_shared()), - "If the view has refresh dependencies, this array contains the subset of those dependencies that are not satisfied for the current refresh yet. 
" - "If status = 'WaitingForDependencies', a refresh is ready to start as soon as these dependencies are fulfilled." - }, - {"exception", std::make_shared(), - "if last_refresh_result = 'Error', i.e. the last refresh attempt failed, this column contains the corresponding error message and stack trace." - }, - {"retry", std::make_shared(), "How many failed attempts there were so far, for the current refresh."}, - {"refresh_count", std::make_shared(), "Number of successful refreshes since last server restart or table creation."}, - {"progress", std::make_shared(), "Progress of the current refresh, between 0 and 1."}, - {"elapsed", std::make_shared(), "The amount of nanoseconds the current refresh took."}, - {"read_rows", std::make_shared(), "Number of rows read during the current refresh."}, - {"read_bytes", std::make_shared(), "Number of bytes read during the current refresh."}, - {"total_rows", std::make_shared(), "Estimated total number of rows that need to be read by the current refresh."}, - {"total_bytes", std::make_shared(), "Estimated total number of bytes that need to be read by the current refresh."}, - {"written_rows", std::make_shared(), "Number of rows written during the current refresh."}, - {"written_bytes", std::make_shared(), "Number rof bytes written during the current refresh."}, - {"result_rows", std::make_shared(), "Estimated total number of rows in the result set of the SELECT query."}, - {"result_bytes", std::make_shared(), "Estimated total number of bytes in the result set of the SELECT query."}, + "Time when the latest successful refresh started. NULL if no successful refreshes happened since server startup or table creation."}, + {"last_success_duration_ms", std::make_shared(std::make_shared()), "How long the latest refresh took."}, + {"last_refresh_time", std::make_shared(std::make_shared()), + "Time when the latest refresh attempt finished (if known) or started (if unknown or still running). NULL if no refresh attempts happened since server startup or table creation."}, + {"last_refresh_replica", std::make_shared(), "If coordination is enabled, name of the replica that made the current (if running) or previous (if not running) refresh attempt."}, + {"next_refresh_time", std::make_shared(std::make_shared()), "Time at which the next refresh is scheduled to start, if status = Scheduled."}, + {"exception", std::make_shared(), "Error message from previous attempt if it failed."}, + {"retry", std::make_shared(), "How many failed attempts there were so far, for the current refresh. Not available if status is `RunningOnAnotherReplica`."}, + {"progress", std::make_shared(), "Progress of the current refresh, between 0 and 1. Not available if status is RunningOnAnotherReplica."}, + {"read_rows", std::make_shared(), "Number of rows read by the current refresh so far. Not available if status is RunningOnAnotherReplica."}, + {"read_bytes", std::make_shared(), "Number of bytes read during the current refresh. Not available if status is `RunningOnAnotherReplica`."}, + {"total_rows", std::make_shared(), "Estimated total number of rows that need to be read by the current refresh. Not available if status is RunningOnAnotherReplica."}, + {"written_rows", std::make_shared(), "Number of rows written during the current refresh. Not available if status is `RunningOnAnotherReplica`."}, + {"written_bytes", std::make_shared(), "Number rof bytes written during the current refresh. 
Not available if status is `RunningOnAnotherReplica`."}, }; } @@ -58,8 +47,10 @@ void StorageSystemViewRefreshes::fillData( auto valid_access = AccessType::SHOW_TABLES; bool check_access_for_tables = !access->isGranted(valid_access); - for (const auto & refresh : context->getRefreshSet().getInfo()) + for (const RefreshTaskPtr & task : context->getRefreshSet().getTasks()) { + RefreshTask::Info refresh = task->getInfo(); + if (check_access_for_tables && !access->isGranted(valid_access, refresh.view_id.getDatabaseName(), refresh.view_id.getTableName())) continue; @@ -68,39 +59,46 @@ void StorageSystemViewRefreshes::fillData( res_columns[i++]->insert(refresh.view_id.getTableName()); res_columns[i++]->insert(refresh.view_id.uuid); res_columns[i++]->insert(toString(refresh.state)); - res_columns[i++]->insert(toString(refresh.last_refresh_result)); - if (refresh.last_attempt_time.has_value()) - res_columns[i++]->insert(refresh.last_attempt_time.value()); - else + if (refresh.znode.last_success_time.time_since_epoch().count() == 0) + { res_columns[i++]->insertDefault(); // NULL - - if (refresh.last_success_time.has_value()) - res_columns[i++]->insert(refresh.last_success_time.value()); + res_columns[i++]->insertDefault(); + } else - res_columns[i++]->insertDefault(); // NULL + { + res_columns[i++]->insert(refresh.znode.last_success_time.time_since_epoch().count()); + res_columns[i++]->insert(refresh.znode.last_success_duration.count()); + } - res_columns[i++]->insert(refresh.last_attempt_duration_ms); - res_columns[i++]->insert(refresh.next_refresh_time); + if (refresh.znode.last_attempt_time.time_since_epoch().count() == 0) + res_columns[i++]->insertDefault(); + else + res_columns[i++]->insert(refresh.znode.last_attempt_time.time_since_epoch().count()); + res_columns[i++]->insert(refresh.znode.last_attempt_replica); - Array deps; - for (const StorageID & id : refresh.remaining_dependencies) - deps.push_back(id.getFullTableName()); - res_columns[i++]->insert(Array(deps)); + res_columns[i++]->insert(std::chrono::duration_cast(refresh.next_refresh_time.time_since_epoch()).count()); - res_columns[i++]->insert(refresh.exception_message); - res_columns[i++]->insert(refresh.retry); - res_columns[i++]->insert(refresh.refresh_count); - res_columns[i++]->insert(Float64(refresh.progress.read_rows) / refresh.progress.total_rows_to_read); - res_columns[i++]->insert(refresh.progress.elapsed_ns / 1e9); + if (refresh.znode.last_attempt_succeeded || refresh.znode.last_attempt_time.time_since_epoch().count() == 0) + res_columns[i++]->insertDefault(); + else if (refresh.refresh_running) + res_columns[i++]->insert(refresh.znode.previous_attempt_error); + else if (refresh.znode.last_attempt_error.empty()) + res_columns[i++]->insert("Replica went away"); + else + res_columns[i++]->insert(refresh.znode.last_attempt_error); + + Int64 retries = refresh.znode.attempt_number; + if (refresh.refresh_running && retries) + retries -= 1; + res_columns[i++]->insert(retries); + + res_columns[i++]->insert(Float64(refresh.progress.read_rows) / std::max(refresh.progress.total_rows_to_read, UInt64(1))); res_columns[i++]->insert(refresh.progress.read_rows); res_columns[i++]->insert(refresh.progress.read_bytes); res_columns[i++]->insert(refresh.progress.total_rows_to_read); - res_columns[i++]->insert(refresh.progress.total_bytes_to_read); res_columns[i++]->insert(refresh.progress.written_rows); res_columns[i++]->insert(refresh.progress.written_bytes); - res_columns[i++]->insert(refresh.progress.result_rows); - 
res_columns[i++]->insert(refresh.progress.result_bytes); } } diff --git a/src/Storages/extractTableFunctionArgumentsFromSelectQuery.h b/src/Storages/extractTableFunctionArgumentsFromSelectQuery.h index 8bf5d95a42c..af19ef656cc 100644 --- a/src/Storages/extractTableFunctionArgumentsFromSelectQuery.h +++ b/src/Storages/extractTableFunctionArgumentsFromSelectQuery.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Storages/getStructureOfRemoteTable.h b/src/Storages/getStructureOfRemoteTable.h index 62f93dccf1a..fc62f0aea79 100644 --- a/src/Storages/getStructureOfRemoteTable.h +++ b/src/Storages/getStructureOfRemoteTable.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include diff --git a/src/TableFunctions/TableFunctionLoop.cpp b/src/TableFunctions/TableFunctionLoop.cpp index 43f122f6cb3..c9436cf02cf 100644 --- a/src/TableFunctions/TableFunctionLoop.cpp +++ b/src/TableFunctions/TableFunctionLoop.cpp @@ -71,7 +71,7 @@ namespace DB loop_table_name = id_name; } } - else if (const auto * func = args[0]->as()) + else if (const auto * /*func*/ _ = args[0]->as()) { inner_table_function_ast = args[0]; } diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 04f5a1625d1..10431ce038f 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -1133,12 +1133,15 @@ def main() -> int: if IS_CI and not pr_info.is_merge_queue: - if pr_info.is_release and pr_info.is_push_event: + if pr_info.is_master and pr_info.is_push_event: print("Release/master: CI Cache add pending records for all todo jobs") ci_cache.push_pending_all(pr_info.is_release) - # wait for pending jobs to be finished, await_jobs is a long blocking call - ci_cache.await_pending_jobs(pr_info.is_release) + if pr_info.is_master or pr_info.is_pr: + # - wait for pending jobs to be finished, await_jobs is a long blocking call + # - don't wait for release CI because some jobs may not be present there + # and we may wait until timeout in vain + ci_cache.await_pending_jobs(pr_info.is_release) # conclude results result["git_ref"] = git_ref diff --git a/tests/ci/stress.py b/tests/ci/stress.py index 4e86f551b30..3b3a6bcadb5 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -141,7 +141,8 @@ def call_with_retry(query: str, timeout: int = 30, retry_count: int = 5) -> None def make_query_command(query: str) -> str: return ( f'clickhouse client -q "{query}" --max_untracked_memory=1Gi ' - "--memory_profiler_step=1Gi --max_memory_usage_for_user=0 --max_memory_usage_in_client=1000000000" + "--memory_profiler_step=1Gi --max_memory_usage_for_user=0 --max_memory_usage_in_client=1000000000 " + "--enable-progress-table-toggle=0" ) diff --git a/tests/docker_scripts/stateless_runner.sh b/tests/docker_scripts/stateless_runner.sh index 307b41cf4f1..ba2dee87f6f 100755 --- a/tests/docker_scripts/stateless_runner.sh +++ b/tests/docker_scripts/stateless_runner.sh @@ -378,9 +378,9 @@ done # collect minio audit and server logs # wait for minio to flush its batch if it has any sleep 1 -clickhouse-client -q "SYSTEM FLUSH ASYNC INSERT QUEUE" -clickhouse-client ${logs_saver_client_options} -q "SELECT log FROM minio_audit_logs ORDER BY event_time INTO OUTFILE '/test_output/minio_audit_logs.jsonl.zst' FORMAT JSONEachRow" -clickhouse-client ${logs_saver_client_options} -q "SELECT log FROM minio_server_logs ORDER BY event_time INTO OUTFILE '/test_output/minio_server_logs.jsonl.zst' FORMAT JSONEachRow" +clickhouse-client -q "SYSTEM FLUSH ASYNC INSERT QUEUE" ||: +clickhouse-client ${logs_saver_client_options} -q "SELECT log FROM 
minio_audit_logs ORDER BY event_time INTO OUTFILE '/test_output/minio_audit_logs.jsonl.zst' FORMAT JSONEachRow" ||: +clickhouse-client ${logs_saver_client_options} -q "SELECT log FROM minio_server_logs ORDER BY event_time INTO OUTFILE '/test_output/minio_server_logs.jsonl.zst' FORMAT JSONEachRow" ||: # Stop server so we can safely read data with clickhouse-local. # Why do we read data with clickhouse-local? diff --git a/tests/integration/compose/docker_compose_rabbitmq.yml b/tests/integration/compose/docker_compose_rabbitmq.yml index 4aae2427596..0e5203b925f 100644 --- a/tests/integration/compose/docker_compose_rabbitmq.yml +++ b/tests/integration/compose/docker_compose_rabbitmq.yml @@ -1,7 +1,9 @@ services: rabbitmq1: - image: rabbitmq:3.12.6-alpine + image: rabbitmq:4.0.2-alpine hostname: rabbitmq1 + environment: + RABBITMQ_FEATURE_FLAGS: feature_flags_v2,message_containers expose: - ${RABBITMQ_PORT:-5672} - ${RABBITMQ_SECURE_PORT:-5671} @@ -14,3 +16,4 @@ services: - /misc/rabbitmq/ca-cert.pem:/etc/rabbitmq/ca-cert.pem - /misc/rabbitmq/server-cert.pem:/etc/rabbitmq/server-cert.pem - /misc/rabbitmq/server-key.pem:/etc/rabbitmq/server-key.pem + - /misc/rabbitmq/enabled_plugins:/etc/rabbitmq/enabled_plugins \ No newline at end of file diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 9cbc53f4e1b..d26487e9aa4 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -298,19 +298,32 @@ def check_postgresql_java_client_is_available(postgresql_java_client_id): return p.returncode == 0 -def check_rabbitmq_is_available(rabbitmq_id, cookie): - p = subprocess.Popen( - docker_exec( - "-e", - f"RABBITMQ_ERLANG_COOKIE={cookie}", - rabbitmq_id, - "rabbitmqctl", - "await_startup", - ), - stdout=subprocess.PIPE, - ) - p.wait(timeout=60) - return p.returncode == 0 +def check_rabbitmq_is_available(rabbitmq_id, cookie, timeout=90): + try: + subprocess.check_output( + docker_exec( + "-e", + f"RABBITMQ_ERLANG_COOKIE={cookie}", + rabbitmq_id, + "rabbitmqctl", + "await_startup", + ), + stderr=subprocess.STDOUT, + timeout=timeout, + ) + return True + except subprocess.CalledProcessError as e: + # Raised if the command returns a non-zero exit code + error_message = ( + f"RabbitMQ startup failed with return code {e.returncode}. " + f"Output: {e.output.decode(errors='replace')}" + ) + raise RuntimeError(error_message) + except subprocess.TimeoutExpired as e: + # Raised if the command times out + raise RuntimeError( + f"RabbitMQ startup timed out. 
Output: {e.output.decode(errors='replace')}" + ) def rabbitmq_debuginfo(rabbitmq_id, cookie): @@ -374,22 +387,6 @@ async def nats_connect_ssl(nats_port, user, password, ssl_ctx=None): return nc -def enable_consistent_hash_plugin(rabbitmq_id, cookie): - p = subprocess.Popen( - docker_exec( - "-e", - f"RABBITMQ_ERLANG_COOKIE={cookie}", - rabbitmq_id, - "rabbitmq-plugins", - "enable", - "rabbitmq_consistent_hash_exchange", - ), - stdout=subprocess.PIPE, - ) - p.communicate() - return p.returncode == 0 - - def get_instances_dir(name): instances_dir_name = "_instances" @@ -2355,22 +2352,14 @@ class ClickHouseCluster: self.print_all_docker_pieces() self.rabbitmq_ip = self.get_instance_ip(self.rabbitmq_host) - start = time.time() - while time.time() - start < timeout: - try: - if check_rabbitmq_is_available( - self.rabbitmq_docker_id, self.rabbitmq_cookie - ): - logging.debug("RabbitMQ is available") - if enable_consistent_hash_plugin( - self.rabbitmq_docker_id, self.rabbitmq_cookie - ): - logging.debug("RabbitMQ consistent hash plugin is available") - return True - time.sleep(0.5) - except Exception as ex: - logging.debug("Can't connect to RabbitMQ " + str(ex)) - time.sleep(0.5) + try: + if check_rabbitmq_is_available( + self.rabbitmq_docker_id, self.rabbitmq_cookie, timeout + ): + logging.debug("RabbitMQ is available") + return True + except Exception as ex: + logging.debug("RabbitMQ await_startup failed", exc_info=True) try: with open(os.path.join(self.rabbitmq_dir, "docker.log"), "w+") as f: diff --git a/tests/integration/test_compatibility_merge_tree_settings/__init__.py b/tests/integration/test_compatibility_merge_tree_settings/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_compatibility_merge_tree_settings/configs/compatibility.xml b/tests/integration/test_compatibility_merge_tree_settings/configs/compatibility.xml new file mode 100644 index 00000000000..6bca18c1c81 --- /dev/null +++ b/tests/integration/test_compatibility_merge_tree_settings/configs/compatibility.xml @@ -0,0 +1,7 @@ + + + + 24.7 + + + diff --git a/tests/integration/test_compatibility_merge_tree_settings/configs/mt_settings.xml b/tests/integration/test_compatibility_merge_tree_settings/configs/mt_settings.xml new file mode 100644 index 00000000000..e4a79f7ee90 --- /dev/null +++ b/tests/integration/test_compatibility_merge_tree_settings/configs/mt_settings.xml @@ -0,0 +1,8 @@ + + + throw + + + throw + + diff --git a/tests/integration/test_compatibility_merge_tree_settings/test.py b/tests/integration/test_compatibility_merge_tree_settings/test.py new file mode 100644 index 00000000000..96a9f7a2124 --- /dev/null +++ b/tests/integration/test_compatibility_merge_tree_settings/test.py @@ -0,0 +1,117 @@ +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance("node1", with_zookeeper=True) +node_with_compatibility = cluster.add_instance( + "node2", with_zookeeper=True, user_configs=["configs/compatibility.xml"] +) +node_with_compatibility_and_mt_setings = cluster.add_instance( + "node3", + with_zookeeper=True, + main_configs=["configs/mt_settings.xml"], + user_configs=["configs/compatibility.xml"], +) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + + +def test_check_projections_compatibility(started_cluster): + create_with_invalid_projection = """ + CREATE TABLE tp (type Int32, eventcnt UInt64, PROJECTION p 
(select sum(eventcnt), type group by type)) + engine = {} order by type; + """ + + create_no_projection = """ + CREATE TABLE tp (type Int32, eventcnt UInt64) + engine = {} order by type; + """ + + alter_add_projection = """ + ALTER TABLE tp ADD PROJECTION p (select sum(eventcnt), type group by type); + """ + + # Create with invalid projection is not supported by default + + assert "Projection is fully supported" in node.query_and_get_error( + create_with_invalid_projection.format("ReplacingMergeTree") + ) + assert "Projection is fully supported" in node.query_and_get_error( + create_with_invalid_projection.format( + "ReplicatedReplacingMergeTree('/tables/tp', '0')" + ) + ) + + # Adding invalid projection is not supported by default + + node.query(create_no_projection.format("ReplacingMergeTree")) + assert "Projection is fully supported" in node.query_and_get_error( + alter_add_projection + ) + node.query("drop table tp;") + + node.query( + create_no_projection.format("ReplicatedReplacingMergeTree('/tables/tp', '0')") + ) + assert "Projection is fully supported" in node.query_and_get_error( + alter_add_projection + ) + node.query("drop table tp;") + + # Create with invalid projection is supported with compatibility + + node_with_compatibility.query( + create_with_invalid_projection.format("ReplacingMergeTree") + ) + node_with_compatibility.query("drop table tp;") + node_with_compatibility.query( + create_with_invalid_projection.format( + "ReplicatedReplacingMergeTree('/tables/tp2', '0')" + ) + ) + node_with_compatibility.query("drop table tp;") + + # Adding invalid projection is supported with compatibility + + node_with_compatibility.query(create_no_projection.format("ReplacingMergeTree")) + node_with_compatibility.query(alter_add_projection) + node_with_compatibility.query("drop table tp;") + + node_with_compatibility.query( + create_no_projection.format("ReplicatedReplacingMergeTree('/tables/tp3', '0')") + ) + node_with_compatibility.query(alter_add_projection) + node_with_compatibility.query("drop table tp;") + + +def test_config_overrides_compatibility(started_cluster): + create_with_invalid_projection = """ + CREATE TABLE tp (type Int32, eventcnt UInt64, PROJECTION p (select sum(eventcnt), type group by type)) + engine = {} order by type; + """ + + assert ( + "Projection is fully supported" + in node_with_compatibility_and_mt_setings.query_and_get_error( + create_with_invalid_projection.format("ReplacingMergeTree") + ) + ) + assert ( + "Projection is fully supported" + in node_with_compatibility_and_mt_setings.query_and_get_error( + create_with_invalid_projection.format( + "ReplicatedReplacingMergeTree('/tables/tp', '0')" + ) + ) + ) diff --git a/tests/integration/test_keeper_four_word_command/test.py b/tests/integration/test_keeper_four_word_command/test.py index 6d6f3c9a8ad..c163ad434ae 100644 --- a/tests/integration/test_keeper_four_word_command/test.py +++ b/tests/integration/test_keeper_four_word_command/test.py @@ -344,7 +344,7 @@ def test_cmd_srvr(started_cluster): assert result["Received"] == "10" assert result["Sent"] == "10" assert int(result["Connections"]) == 1 - assert int(result["Zxid"], 16) > 10 + assert int(result["Zxid"], 16) >= 10 assert result["Mode"] == "leader" assert result["Node count"] == "14" diff --git a/tests/integration/test_refreshable_mv/__init__.py b/tests/integration/test_refreshable_mv/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_refreshable_mv/configs/config.xml 
b/tests/integration/test_refreshable_mv/configs/config.xml new file mode 100644 index 00000000000..b163c6f54a1 --- /dev/null +++ b/tests/integration/test_refreshable_mv/configs/config.xml @@ -0,0 +1 @@ + diff --git a/tests/integration/test_refreshable_mv/configs/users.xml b/tests/integration/test_refreshable_mv/configs/users.xml new file mode 100644 index 00000000000..d77a8c4ae22 --- /dev/null +++ b/tests/integration/test_refreshable_mv/configs/users.xml @@ -0,0 +1,14 @@ + + + + 1 + 1 + 1 + + + + + default + + + diff --git a/tests/integration/test_refreshable_mv/test.py b/tests/integration/test_refreshable_mv/test.py new file mode 100644 index 00000000000..5e764e381f3 --- /dev/null +++ b/tests/integration/test_refreshable_mv/test.py @@ -0,0 +1,201 @@ +import os +import re +import shutil +import threading +import time +from random import randint + +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.network import PartitionManager +from helpers.test_tools import assert_eq_with_retry, assert_logs_contain + +test_recover_staled_replica_run = 1 + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance( + "node1", + main_configs=["configs/config.xml"], + user_configs=["configs/users.xml"], + with_zookeeper=True, + macros={"shard": "shard1", "replica": "1"}, +) +node2 = cluster.add_instance( + "node2", + main_configs=["configs/config.xml"], + user_configs=["configs/users.xml"], + with_zookeeper=True, + macros={"shard": "shard1", "replica": "2"}, +) +nodes = [node1, node2] + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +def test_refreshable_mv_in_replicated_db(started_cluster): + for node in nodes: + node.query( + "create database re engine = Replicated('/test/re', 'shard1', '{replica}');" + ) + + # Table engine check. + assert "BAD_ARGUMENTS" in node1.query_and_get_error( + "create materialized view re.a refresh every 1 second (x Int64) engine Memory as select 1 as x" + ) + + # Basic refreshing. + node1.query( + "create materialized view re.a refresh every 1 second (x Int64) engine ReplicatedMergeTree order by x as select number*10 as x from numbers(2)" + ) + node1.query("system sync database replica re") + for node in nodes: + node.query("system wait view re.a") + assert node.query("select * from re.a order by all") == "0\n10\n" + assert ( + node.query( + "select database, view, last_success_time != 0, last_refresh_time != 0, last_refresh_replica in ('1','2'), exception from system.view_refreshes" + ) + == "re\ta\t1\t1\t1\t\n" + ) + + # Append mode, with and without coordination. + for coordinated in [True, False]: + name = "append" if coordinated else "append_uncoordinated" + refresh_settings = "" if coordinated else " settings all_replicas = 1" + node2.query( + f"create materialized view re.{name} refresh every 1 year{refresh_settings} append (x Int64) engine ReplicatedMergeTree order by x as select rand() as x" + ) + # Stop the clocks. + for node in nodes: + node.query( + f"system test view re.{name} set fake time '2040-01-01 00:00:01'" + ) + # Wait for quiescence. + for node in nodes: + node.query(f"system wait view re.{name}") + rows_before = int(nodes[randint(0, 1)].query(f"select count() from re.{name}")) + # Advance the clocks. + for node in nodes: + node.query( + f"system test view re.{name} set fake time '2041-01-01 00:00:01'" + ) + # Wait for refresh. 
+ for node in nodes: + assert_eq_with_retry( + node, + f"select status, last_success_time from system.view_refreshes where view = '{name}'", + "Scheduled\t2041-01-01 00:00:01", + ) + node.query(f"system wait view re.{name}") + # Check results. + rows_after = int(nodes[randint(0, 1)].query(f"select count() from re.{name}")) + expected = 1 if coordinated else 2 + assert rows_after - rows_before == expected + + # Uncoordinated append to unreplicated table. + node1.query( + "create materialized view re.unreplicated_uncoordinated refresh every 1 second settings all_replicas = 1 append (x String) engine Memory as select 1 as x" + ) + node2.query("system sync database replica re") + for node in nodes: + node.query("system wait view re.unreplicated_uncoordinated") + assert ( + node.query("select distinct x from re.unreplicated_uncoordinated") == "1\n" + ) + + # Rename. + node2.query( + "create materialized view re.c refresh every 1 year (x Int64) engine ReplicatedMergeTree order by x empty as select rand() as x" + ) + node1.query("system sync database replica re") + node1.query("rename table re.c to re.d") + node1.query( + "alter table re.d modify query select number + sleepEachRow(1) as x from numbers(5) settings max_block_size = 1" + ) + # Rename while refreshing. + node1.query("system refresh view re.d") + assert_eq_with_retry( + node2, + "select status from system.view_refreshes where view = 'd'", + "RunningOnAnotherReplica", + ) + node2.query("rename table re.d to re.e") + node1.query("system wait view re.e") + assert node1.query("select * from re.e order by x") == "0\n1\n2\n3\n4\n" + + # A view that will be stuck refreshing until dropped. + node1.query( + "create materialized view re.f refresh every 1 second (x Int64) engine ReplicatedMergeTree order by x as select sleepEachRow(1) as x from numbers(1000000) settings max_block_size = 1" + ) + assert_eq_with_retry( + node2, + "select status in ('Running', 'RunningOnAnotherReplica') from system.view_refreshes where view = 'f'", + "1", + ) + + # Locate coordination znodes. + znode_exists = ( + lambda uuid: nodes[randint(0, 1)].query( + f"select count() from system.zookeeper where path = '/clickhouse/tables/{uuid}' and name = 'shard1'" + ) + == "1\n" + ) + tables = [] + for row in node1.query( + "select table, uuid from system.tables where database = 're'" + ).split("\n")[:-1]: + name, uuid = row.split("\t") + print(f"found table {name} {uuid}") + if name.startswith(".") or name.startswith("_tmp_replace_"): + continue + coordinated = not name.endswith("uncoordinated") + tables.append((name, uuid, coordinated)) + assert coordinated == znode_exists(uuid) + assert sorted([name for (name, _, _) in tables]) == [ + "a", + "append", + "append_uncoordinated", + "e", + "f", + "unreplicated_uncoordinated", + ] + + # Drop all tables and check that coordination znodes were deleted. + for name, uuid, coordinated in tables: + maybe_sync = " sync" if randint(0, 1) == 0 else "" + nodes[randint(0, 1)].query(f"drop table re.{name}{maybe_sync}") + # TODO: After https://github.com/ClickHouse/ClickHouse/issues/61065 is done (for MVs, not ReplicatedMergeTree), check the parent znode instead. + assert not znode_exists(uuid) + + # A little stress test dropping MV while it's refreshing, hoping to hit various cases where the + # drop happens while creating/exchanging/dropping the inner table. 
+ for i in range(20): + maybe_empty = " empty" if randint(0, 2) == 0 else "" + nodes[randint(0, 1)].query( + f"create materialized view re.g refresh every 1 second (x Int64) engine ReplicatedMergeTree order by x{maybe_empty} as select 1 as x" + ) + r = randint(0, 5) + if r == 0: + pass + elif r == 1: + time.sleep(randint(0, 100) / 1000) + else: + time.sleep(randint(900, 1100) / 1000) + nodes[randint(0, 1)].query("drop table re.g") + + # Check that inner and temp tables were dropped. + for node in nodes: + assert node.query("show tables from re") == "" + + node1.query("drop database re sync") + node2.query("drop database re sync") diff --git a/tests/integration/test_startup_scripts/configs/config.d/startup_scripts.xml b/tests/integration/test_startup_scripts/configs/config.d/startup_scripts.xml index e8a711a926a..21d2865efe8 100644 --- a/tests/integration/test_startup_scripts/configs/config.d/startup_scripts.xml +++ b/tests/integration/test_startup_scripts/configs/config.d/startup_scripts.xml @@ -13,5 +13,13 @@ SELECT * FROM system.query_log LIMIT 1 + + SELECT 1 SETTINGS skip_unavailable_shards = 1 + SELECT 1; + + + SELECT 1 SETTINGS skip_unavailable_shards = 1 + SELECT 1; + diff --git a/tests/integration/test_startup_scripts/test.py b/tests/integration/test_startup_scripts/test.py index 43a871a6fc5..3146db12082 100644 --- a/tests/integration/test_startup_scripts/test.py +++ b/tests/integration/test_startup_scripts/test.py @@ -16,6 +16,12 @@ def test_startup_scripts(): try: cluster.start() assert node.query("SHOW TABLES") == "TestTable\n" + assert ( + node.query( + "SELECT value, changed FROM system.settings WHERE name = 'skip_unavailable_shards'" + ) + == "0\t0\n" + ) finally: cluster.shutdown() diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 814b17e1e73..0bade55415f 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -2761,7 +2761,7 @@ def test_kafka_produce_key_timestamp(kafka_cluster, create_query_generator, log_ ) ) - # instance.wait_for_log_line(log_line) + instance.wait_for_log_line(log_line) expected = """\ 1 1 k1 1577836801 k1 insert3 0 0 1577836801 diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index c235e5dad89..62019120140 100644 --- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -1087,7 +1087,7 @@ def test_drop_table(started_cluster): started_cluster, files_path, files_to_generate, start_ind=0, row_num=100000 ) create_mv(node, table_name, dst_table_name) - node.wait_for_log_line(f"Reading from file: test_drop_data") + node.wait_for_log_line(f"rows from file: test_drop_data") node.query(f"DROP TABLE {table_name} SYNC") assert node.contains_in_log( f"StorageS3Queue (default.{table_name}): Table is being dropped" diff --git a/tests/output.txt b/tests/output.txt deleted file mode 100644 index 14cf08aac3b..00000000000 --- a/tests/output.txt +++ /dev/null @@ -1,1218 +0,0 @@ -Using queries from 'queries' directory -Connecting to ClickHouse server... OK - -Running 1 stateless tests (MainProcess). 
- -02240_protobuflist_format_persons: [ FAIL ] - return code: 1, result: - -a7522158-3d41-4b77-ad69-6c598ee55c49 Ivan Petrov male 1980-12-29 png +74951234567\0 1 2019-01-05 18:45:00 38 capricorn ['Yesterday','Flowers'] [255,0,0] Moscow [55.753215,37.622504] 3.14 214.1 0.1 5.8 17060000000 ['meter','centimeter','kilometer'] [1,0.01,1000] 500 [501,502] -c694ad8a-f714-4ea3-907d-fd54fb25d9b5 Natalia Sokolova female 1992-03-08 jpg \N 0 \N 26 pisces [] [100,200,50] Plymouth [50.403724,-4.142123] 3.14159 \N 0.007 5.4 -20000000000000 [] [] \N [] -a7da1aa6-f425-4789-8947-b034786ed374 Vasily Sidorov male 1995-07-28 bmp +442012345678 1 2018-12-30 00:00:00 23 leo ['Sunny'] [250,244,10] Murmansk [68.970682,33.074981] 3.14159265358979 100000000000 800 -3.2 154400000 ['pound'] [16] 503 [] - -Schema 02240_protobuflist1_format_persons:Person - -Binary representation: -00000000 ba 04 0a f4 01 0a 24 61 37 35 32 32 31 35 38 2d |......$a7522158-| -00000010 33 64 34 31 2d 34 62 37 37 2d 61 64 36 39 2d 36 |3d41-4b77-ad69-6| -00000020 63 35 39 38 65 65 35 35 63 34 39 12 04 49 76 61 |c598ee55c49..Iva| -00000030 6e 1a 06 50 65 74 72 6f 76 20 01 28 af 1f 32 03 |n..Petrov .(..2.| -00000040 70 6e 67 3a 0d 2b 37 34 39 35 31 32 33 34 35 36 |png:.+7495123456| -00000050 37 00 40 01 4d fc d0 30 5c 50 26 58 09 62 09 59 |7.@.M..0\P&X.b.Y| -00000060 65 73 74 65 72 64 61 79 62 07 46 6c 6f 77 65 72 |esterdayb.Flower| -00000070 73 6a 04 ff 01 00 00 72 06 4d 6f 73 63 6f 77 7a |sj.....r.Moscowz| -00000080 08 4b 03 5f 42 72 7d 16 42 81 01 1f 85 eb 51 b8 |.K._Br}.B.....Q.| -00000090 1e 09 40 89 01 33 33 33 33 33 c3 6a 40 95 01 cd |..@..33333.j@...| -000000a0 cc cc 3d 9d 01 9a 99 b9 40 a0 01 80 c4 d7 8d 7f |..=.....@.......| -000000b0 aa 01 0c 0a 05 6d 65 74 65 72 15 00 00 80 3f aa |.....meter....?.| -000000c0 01 11 0a 0a 63 65 6e 74 69 6d 65 74 65 72 15 0a |....centimeter..| -000000d0 d7 23 3c aa 01 10 0a 09 6b 69 6c 6f 6d 65 74 65 |.#<.....kilomete| -000000e0 72 15 00 00 7a 44 b2 01 10 0a 0e a2 06 0b 0a 09 |r...zD..........| -000000f0 08 f4 03 12 04 f5 03 f6 03 0a 7e 0a 24 63 36 39 |..........~.$c69| -00000100 34 61 64 38 61 2d 66 37 31 34 2d 34 65 61 33 2d |4ad8a-f714-4ea3-| -00000110 39 30 37 64 2d 66 64 35 34 66 62 32 35 64 39 62 |907d-fd54fb25d9b| -00000120 35 12 07 4e 61 74 61 6c 69 61 1a 08 53 6f 6b 6f |5..Natalia..Soko| -00000130 6c 6f 76 61 28 a6 3f 32 03 6a 70 67 50 1a 58 0b |lova(.?2.jpgP.X.| -00000140 6a 04 64 c8 01 32 72 08 50 6c 79 6d 6f 75 74 68 |j.d..2r.Plymouth| -00000150 7a 08 6a 9d 49 42 46 8c 84 c0 81 01 6e 86 1b f0 |z.j.IBF.....n...| -00000160 f9 21 09 40 95 01 42 60 e5 3b 9d 01 cd cc ac 40 |.!.@..B`.;.....@| -00000170 a0 01 ff ff a9 ce 93 8c 09 0a c0 01 0a 24 61 37 |.............$a7| -00000180 64 61 31 61 61 36 2d 66 34 32 35 2d 34 37 38 39 |da1aa6-f425-4789| -00000190 2d 38 39 34 37 2d 62 30 33 34 37 38 36 65 64 33 |-8947-b034786ed3| -000001a0 37 34 12 06 56 61 73 69 6c 79 1a 07 53 69 64 6f |74..Vasily..Sido| -000001b0 72 6f 76 20 01 28 fb 48 32 03 62 6d 70 3a 0d 2b |rov .(.H2.bmp:.+| -000001c0 34 34 32 30 31 32 33 34 35 36 37 38 40 01 4d 50 |442012345678@.MP| -000001d0 e0 27 5c 50 17 58 04 62 05 53 75 6e 6e 79 6a 05 |.'\P.X.b.Sunnyj.| -000001e0 fa 01 f4 01 0a 72 08 4d 75 72 6d 61 6e 73 6b 7a |.....r.Murmanskz| -000001f0 08 fd f0 89 42 c8 4c 04 42 81 01 11 2d 44 54 fb |....B.L.B...-DT.| -00000200 21 09 40 89 01 00 00 00 e8 76 48 37 42 95 01 00 |!.@......vH7B...| -00000210 00 48 44 9d 01 cd cc 4c c0 a0 01 80 d4 9f 93 01 |.HD....L........| -00000220 aa 01 0c 0a 05 70 6f 75 6e 64 15 00 00 80 41 b2 
|.....pound....A.| -00000230 01 0a 0a 08 a2 06 05 0a 03 08 f7 03 |............| -0000023c - -MESSAGE #1 AT 0x00000005 -uuid: "a7522158-3d41-4b77-ad69-6c598ee55c49" -name: "Ivan" -surname: "Petrov" -gender: male -birthDate: 4015 -photo: "png" -phoneNumber: "+74951234567\000" -isOnline: true -visitTime: 1546703100 -age: 38 -zodiacSign: capricorn -songs: "Yesterday" -songs: "Flowers" -color: 255 -color: 0 -color: 0 -hometown: "Moscow" -location: 55.7532158 -location: 37.6225052 -pi: 3.14 -lotteryWin: 214.1 -someRatio: 0.1 -temperature: 5.8 -randomBigNumber: 17060000000 -measureUnits { - unit: "meter" - coef: 1 -} -measureUnits { - unit: "centimeter" - coef: 0.01 -} -measureUnits { - unit: "kilometer" - coef: 1000 -} -nestiness { - a { - b { - c { - d: 500 - e: 501 - e: 502 - } - } - } -} -MESSAGE #2 AT 0x000000FB -uuid: "c694ad8a-f714-4ea3-907d-fd54fb25d9b5" -name: "Natalia" -surname: "Sokolova" -birthDate: 8102 -photo: "jpg" -age: 26 -zodiacSign: pisces -color: 100 -color: 200 -color: 50 -hometown: "Plymouth" -location: 50.4037247 -location: -4.14212322 -pi: 3.14159 -someRatio: 0.007 -temperature: 5.4 -randomBigNumber: -20000000000000 -MESSAGE #3 AT 0x0000017C -uuid: "a7da1aa6-f425-4789-8947-b034786ed374" -name: "Vasily" -surname: "Sidorov" -gender: male -birthDate: 9339 -photo: "bmp" -phoneNumber: "+442012345678" -isOnline: true -visitTime: 1546117200 -age: 23 -zodiacSign: leo -songs: "Sunny" -color: 250 -color: 244 -color: 10 -hometown: "Murmansk" -location: 68.9706802 -location: 33.0749817 -pi: 3.14159265358979 -lotteryWin: 100000000000 -someRatio: 800 -temperature: -3.2 -randomBigNumber: 154400000 -measureUnits { - unit: "pound" - coef: 16 -} -nestiness { - a { - b { - c { - d: 503 - } - } - } -} - -Binary representation is as expected - -Roundtrip: -a7522158-3d41-4b77-ad69-6c598ee55c49 Ivan Petrov male 1980-12-29 png +74951234567\0 1 2019-01-05 18:45:00 38 capricorn ['Yesterday','Flowers'] [255,0,0] Moscow [55.753216,37.622504] 3.14 214.1 0.1 5.8 17060000000 ['meter','centimeter','kilometer'] [1,0.01,1000] 500 [501,502] -c694ad8a-f714-4ea3-907d-fd54fb25d9b5 Natalia Sokolova female 1992-03-08 jpg \N 0 \N 26 pisces [] [100,200,50] Plymouth [50.403724,-4.142123] 3.14159 \N 0.007 5.4 -20000000000000 [] [] \N [] -a7da1aa6-f425-4789-8947-b034786ed374 Vasily Sidorov male 1995-07-28 bmp +442012345678 1 2018-12-30 00:00:00 23 leo ['Sunny'] [250,244,10] Murmansk [68.97068,33.074982] 3.14159265358979 100000000000 800 -3.2 154400000 ['pound'] [16] 503 [] - -Schema 02240_protobuflist2_format_persons:AltPerson - -Binary representation: -00000000 f4 03 0a c4 01 08 01 12 04 49 76 61 6e 28 87 a8 |.........Ivan(..| -00000010 c4 9b 97 02 52 06 50 65 74 72 6f 76 72 0c 00 00 |....R.Petrovr...| -00000020 7f 43 00 00 00 00 00 00 00 00 79 fc d0 30 5c 00 |.C........y..0\.| -00000030 00 00 00 c8 02 0a c2 05 0c 00 00 80 3f 0a d7 23 |............?..#| -00000040 3c 00 00 7a 44 9a 06 05 6d 65 74 65 72 9a 06 0a |<..zD...meter...| -00000050 63 65 6e 74 69 6d 65 74 65 72 9a 06 09 6b 69 6c |centimeter...kil| -00000060 6f 6d 65 74 65 72 a1 06 00 00 00 a0 99 99 b9 3f |ometer.........?| -00000070 a8 06 37 a8 06 25 bd 06 c3 f5 48 40 fa 06 02 33 |..7..%....H@...3| -00000080 38 90 08 c6 09 e1 08 00 f1 da f8 03 00 00 00 b0 |8...............| -00000090 09 af 1f d0 0c d6 01 e2 12 24 61 37 35 32 32 31 |.........$a75221| -000000a0 35 38 2d 33 64 34 31 2d 34 62 37 37 2d 61 64 36 |58-3d41-4b77-ad6| -000000b0 39 2d 36 63 35 39 38 65 65 35 35 63 34 39 a0 38 |9-6c598ee55c49.8| -000000c0 f4 03 aa 38 04 f5 03 f6 03 0a 84 01 12 07 4e 61 
|...8..........Na| -000000d0 74 61 6c 69 61 52 08 53 6f 6b 6f 6c 6f 76 61 72 |taliaR.Sokolovar| -000000e0 0c 00 00 c8 42 00 00 48 43 00 00 48 42 c8 02 0a |....B..HC..HB...| -000000f0 a1 06 00 00 00 40 08 ac 7c 3f a8 06 32 a8 06 fc |.....@..|?..2...| -00000100 ff ff ff ff ff ff ff ff 01 b0 06 01 bd 06 d0 0f |................| -00000110 49 40 fa 06 02 32 36 90 08 db 01 e1 08 00 c0 1a |I@...26.........| -00000120 63 cf ed ff ff b0 09 a6 3f e2 12 24 63 36 39 34 |c.......?..$c694| -00000130 61 64 38 61 2d 66 37 31 34 2d 34 65 61 33 2d 39 |ad8a-f714-4ea3-9| -00000140 30 37 64 2d 66 64 35 34 66 62 32 35 64 39 62 35 |07d-fd54fb25d9b5| -00000150 0a a3 01 08 01 12 06 56 61 73 69 6c 79 28 ce ca |.......Vasily(..| -00000160 f4 cf ee 0c 52 07 53 69 64 6f 72 6f 76 72 0c 00 |....R.Sidorovr..| -00000170 00 7a 43 00 00 74 43 00 00 20 41 79 50 e0 27 5c |.zC..tC.. AyP.'\| -00000180 00 00 00 00 c8 02 05 c2 05 04 00 00 80 41 9a 06 |.............A..| -00000190 05 70 6f 75 6e 64 a1 06 00 00 00 00 00 00 89 40 |.pound.........@| -000001a0 a8 06 44 a8 06 21 bd 06 db 0f 49 40 fa 06 02 32 |..D..!....I@...2| -000001b0 33 90 08 d3 05 e1 08 00 f5 33 09 00 00 00 00 b0 |3........3......| -000001c0 09 fb 48 d0 0c 80 d0 db c3 f4 02 e2 12 24 61 37 |..H..........$a7| -000001d0 64 61 31 61 61 36 2d 66 34 32 35 2d 34 37 38 39 |da1aa6-f425-4789| -000001e0 2d 38 39 34 37 2d 62 30 33 34 37 38 36 65 64 33 |-8947-b034786ed3| -000001f0 37 34 a0 38 f7 03 |74.8..| -000001f6 - -MESSAGE #1 AT 0x00000005 -isOnline: online -name: "Ivan" -phoneNumber: 74951234567 -surname: "Petrov" -color: 255 -color: 0 -color: 0 -visitTime: 1546703100 -temperature: 5 -measureUnits_coef: 1 -measureUnits_coef: 0.01 -measureUnits_coef: 1000 -measureUnits_unit: "meter" -measureUnits_unit: "centimeter" -measureUnits_unit: "kilometer" -someRatio: 0.10000000149011612 -location: 55 -location: 37 -pi: 3.14 -age: "38" -zodiacSign: 1222 -randomBigNumber: 17060000000 -birthDate: 4015 -lotteryWin: 214 -uuid: "a7522158-3d41-4b77-ad69-6c598ee55c49" -nestiness_a_b_c_d: 500 -nestiness_a_b_c_e: 501 -nestiness_a_b_c_e: 502 -MESSAGE #2 AT 0x000000CC -name: "Natalia" -surname: "Sokolova" -color: 100 -color: 200 -color: 50 -temperature: 5 -someRatio: 0.0070000002160668373 -location: 50 -location: -4 -gender: female -pi: 3.14159 -age: "26" -zodiacSign: 219 -randomBigNumber: -20000000000000 -birthDate: 8102 -uuid: "c694ad8a-f714-4ea3-907d-fd54fb25d9b5" -MESSAGE #3 AT 0x00000153 -isOnline: online -name: "Vasily" -phoneNumber: 442012345678 -surname: "Sidorov" -color: 250 -color: 244 -color: 10 -visitTime: 1546117200 -temperature: -3 -measureUnits_coef: 16 -measureUnits_unit: "pound" -someRatio: 800 -location: 68 -location: 33 -pi: 3.14159274 -age: "23" -zodiacSign: 723 -randomBigNumber: 154400000 -birthDate: 9339 -lotteryWin: 100000000000 -uuid: "a7da1aa6-f425-4789-8947-b034786ed374" -nestiness_a_b_c_d: 503 - -Binary representation is as expected - -Roundtrip: -a7522158-3d41-4b77-ad69-6c598ee55c49 Ivan Petrov male 1980-12-29 \N 74951234567\0\0 1 2019-01-05 18:45:00 38 capricorn [] [255,0,0] [55,37] 3.140000104904175 214 0.1 5 17060000000 ['meter','centimeter','kilometer'] [1,0.01,1000] 500 [501,502] -c694ad8a-f714-4ea3-907d-fd54fb25d9b5 Natalia Sokolova female 1992-03-08 \N \N 0 \N 26 pisces [] [100,200,50] [50,-4] 3.141590118408203 \N 0.007 5 -20000000000000 [] [] \N [] -a7da1aa6-f425-4789-8947-b034786ed374 Vasily Sidorov male 1995-07-28 \N 442012345678\0 1 2018-12-30 00:00:00 23 leo [] [250,244,10] [68,33] 3.1415927410125732 100000000000 800 -3 154400000 ['pound'] [16] 503 [] - 
-Schema 02240_protobuflist3_format_persons:StrPerson as ProtobufList - -Binary representation: -00000000 e4 05 0a a6 02 0a 24 61 37 35 32 32 31 35 38 2d |......$a7522158-| -00000010 33 64 34 31 2d 34 62 37 37 2d 61 64 36 39 2d 36 |3d41-4b77-ad69-6| -00000020 63 35 39 38 65 65 35 35 63 34 39 12 04 49 76 61 |c598ee55c49..Iva| -00000030 6e 1a 06 50 65 74 72 6f 76 22 04 6d 61 6c 65 2a |n..Petrov".male*| -00000040 0a 31 39 38 30 2d 31 32 2d 32 39 3a 0d 2b 37 34 |.1980-12-29:.+74| -00000050 39 35 31 32 33 34 35 36 37 00 42 01 31 4a 13 32 |951234567.B.1J.2| -00000060 30 31 39 2d 30 31 2d 30 35 20 31 38 3a 34 35 3a |019-01-05 18:45:| -00000070 30 30 52 02 33 38 5a 09 63 61 70 72 69 63 6f 72 |00R.38Z.capricor| -00000080 6e 62 09 59 65 73 74 65 72 64 61 79 62 07 46 6c |nb.Yesterdayb.Fl| -00000090 6f 77 65 72 73 6a 03 32 35 35 6a 01 30 6a 01 30 |owersj.255j.0j.0| -000000a0 72 06 4d 6f 73 63 6f 77 7a 09 35 35 2e 37 35 33 |r.Moscowz.55.753| -000000b0 32 31 35 7a 09 33 37 2e 36 32 32 35 30 34 82 01 |215z.37.622504..| -000000c0 04 33 2e 31 34 8a 01 05 32 31 34 2e 31 92 01 03 |.3.14...214.1...| -000000d0 30 2e 31 9a 01 03 35 2e 38 a2 01 0b 31 37 30 36 |0.1...5.8...1706| -000000e0 30 30 30 30 30 30 30 aa 01 2d 0a 05 6d 65 74 65 |0000000..-..mete| -000000f0 72 0a 0a 63 65 6e 74 69 6d 65 74 65 72 0a 09 6b |r..centimeter..k| -00000100 69 6c 6f 6d 65 74 65 72 12 01 31 12 04 30 2e 30 |ilometer..1..0.0| -00000110 31 12 04 31 30 30 30 b2 01 11 0a 0f 0a 03 35 30 |1..1000.......50| -00000120 30 12 03 35 30 31 12 03 35 30 32 0a b4 01 0a 24 |0..501..502....$| -00000130 63 36 39 34 61 64 38 61 2d 66 37 31 34 2d 34 65 |c694ad8a-f714-4e| -00000140 61 33 2d 39 30 37 64 2d 66 64 35 34 66 62 32 35 |a3-907d-fd54fb25| -00000150 64 39 62 35 12 07 4e 61 74 61 6c 69 61 1a 08 53 |d9b5..Natalia..S| -00000160 6f 6b 6f 6c 6f 76 61 22 06 66 65 6d 61 6c 65 2a |okolova".female*| -00000170 0a 31 39 39 32 2d 30 33 2d 30 38 42 01 30 52 02 |.1992-03-08B.0R.| -00000180 32 36 5a 06 70 69 73 63 65 73 6a 03 31 30 30 6a |26Z.piscesj.100j| -00000190 03 32 30 30 6a 02 35 30 72 08 50 6c 79 6d 6f 75 |.200j.50r.Plymou| -000001a0 74 68 7a 09 35 30 2e 34 30 33 37 32 34 7a 09 2d |thz.50.403724z.-| -000001b0 34 2e 31 34 32 31 32 33 82 01 07 33 2e 31 34 31 |4.142123...3.141| -000001c0 35 39 92 01 05 30 2e 30 30 37 9a 01 03 35 2e 34 |59...0.007...5.4| -000001d0 a2 01 0f 2d 32 30 30 30 30 30 30 30 30 30 30 30 |...-200000000000| -000001e0 30 30 0a 81 02 0a 24 61 37 64 61 31 61 61 36 2d |00....$a7da1aa6-| -000001f0 66 34 32 35 2d 34 37 38 39 2d 38 39 34 37 2d 62 |f425-4789-8947-b| -00000200 30 33 34 37 38 36 65 64 33 37 34 12 06 56 61 73 |034786ed374..Vas| -00000210 69 6c 79 1a 07 53 69 64 6f 72 6f 76 22 04 6d 61 |ily..Sidorov".ma| -00000220 6c 65 2a 0a 31 39 39 35 2d 30 37 2d 32 38 3a 0d |le*.1995-07-28:.| -00000230 2b 34 34 32 30 31 32 33 34 35 36 37 38 42 01 31 |+442012345678B.1| -00000240 4a 13 32 30 31 38 2d 31 32 2d 33 30 20 30 30 3a |J.2018-12-30 00:| -00000250 30 30 3a 30 30 52 02 32 33 5a 03 6c 65 6f 62 05 |00:00R.23Z.leob.| -00000260 53 75 6e 6e 79 6a 03 32 35 30 6a 03 32 34 34 6a |Sunnyj.250j.244j| -00000270 02 31 30 72 08 4d 75 72 6d 61 6e 73 6b 7a 09 36 |.10r.Murmanskz.6| -00000280 38 2e 39 37 30 36 38 32 7a 09 33 33 2e 30 37 34 |8.970682z.33.074| -00000290 39 38 31 82 01 10 33 2e 31 34 31 35 39 32 36 35 |981...3.14159265| -000002a0 33 35 38 39 37 39 8a 01 0c 31 30 30 30 30 30 30 |358979...1000000| -000002b0 30 30 30 30 30 92 01 03 38 30 30 9a 01 04 2d 33 |00000...800...-3| -000002c0 2e 32 a2 01 09 31 35 34 34 30 30 30 30 30 aa 01 
|.2...154400000..| -000002d0 0b 0a 05 70 6f 75 6e 64 12 02 31 36 b2 01 07 0a |...pound..16....| -000002e0 05 0a 03 35 30 33 |...503| -000002e6 - -MESSAGE #1 AT 0x00000005 -uuid: "a7522158-3d41-4b77-ad69-6c598ee55c49" -name: "Ivan" -surname: "Petrov" -gender: "male" -birthDate: "1980-12-29" -phoneNumber: "+74951234567\000" -isOnline: "1" -visitTime: "2019-01-05 18:45:00" -age: "38" -zodiacSign: "capricorn" -songs: "Yesterday" -songs: "Flowers" -color: "255" -color: "0" -color: "0" -hometown: "Moscow" -location: "55.753215" -location: "37.622504" -pi: "3.14" -lotteryWin: "214.1" -someRatio: "0.1" -temperature: "5.8" -randomBigNumber: "17060000000" -measureUnits { - unit: "meter" - unit: "centimeter" - unit: "kilometer" - coef: "1" - coef: "0.01" - coef: "1000" -} -nestiness_a { - b_c { - d: "500" - e: "501" - e: "502" - } -} -MESSAGE #2 AT 0x0000012E -uuid: "c694ad8a-f714-4ea3-907d-fd54fb25d9b5" -name: "Natalia" -surname: "Sokolova" -gender: "female" -birthDate: "1992-03-08" -isOnline: "0" -age: "26" -zodiacSign: "pisces" -color: "100" -color: "200" -color: "50" -hometown: "Plymouth" -location: "50.403724" -location: "-4.142123" -pi: "3.14159" -someRatio: "0.007" -temperature: "5.4" -randomBigNumber: "-20000000000000" -MESSAGE #3 AT 0x000001E5 -uuid: "a7da1aa6-f425-4789-8947-b034786ed374" -name: "Vasily" -surname: "Sidorov" -gender: "male" -birthDate: "1995-07-28" -phoneNumber: "+442012345678" -isOnline: "1" -visitTime: "2018-12-30 00:00:00" -age: "23" -zodiacSign: "leo" -songs: "Sunny" -color: "250" -color: "244" -color: "10" -hometown: "Murmansk" -location: "68.970682" -location: "33.074981" -pi: "3.14159265358979" -lotteryWin: "100000000000" -someRatio: "800" -temperature: "-3.2" -randomBigNumber: "154400000" -measureUnits { - unit: "pound" - coef: "16" -} -nestiness_a { - b_c { - d: "503" - } -} - -Binary representation is as expected -Roundtrip: -a7522158-3d41-4b77-ad69-6c598ee55c49 Ivan Petrov male 1980-12-29 \N +74951234567\0 1 2019-01-05 18:45:00 38 capricorn ['Yesterday','Flowers'] [255,0,0] Moscow [55.753215,37.622504] 3.14 214.1 0.1 5.8 17060000000 ['meter','centimeter','kilometer'] [1,0.01,1000] 500 [501,502] -c694ad8a-f714-4ea3-907d-fd54fb25d9b5 Natalia Sokolova female 1992-03-08 \N \N 0 \N 26 pisces [] [100,200,50] Plymouth [50.403724,-4.142123] 3.14159 \N 0.007 5.4 -20000000000000 [] [] \N [] -a7da1aa6-f425-4789-8947-b034786ed374 Vasily Sidorov male 1995-07-28 \N +442012345678 1 2018-12-30 00:00:00 23 leo ['Sunny'] [250,244,10] Murmansk [68.970682,33.074981] 3.14159265358979 100000000000 800 -3.2 154400000 ['pound'] [16] 503 [] - -Schema 02240_protobuf_format_syntax2:Syntax2Person - -Binary representation: -00000000 bb 04 0a ef 01 0a 24 61 37 35 32 32 31 35 38 2d |......$a7522158-| -00000010 33 64 34 31 2d 34 62 37 37 2d 61 64 36 39 2d 36 |3d41-4b77-ad69-6| -00000020 63 35 39 38 65 65 35 35 63 34 39 12 04 49 76 61 |c598ee55c49..Iva| -00000030 6e 1a 06 50 65 74 72 6f 76 20 01 28 af 1f 32 03 |n..Petrov .(..2.| -00000040 70 6e 67 3a 0d 2b 37 34 39 35 31 32 33 34 35 36 |png:.+7495123456| -00000050 37 00 40 01 4d fc d0 30 5c 50 26 58 09 62 09 59 |7.@.M..0\P&X.b.Y| -00000060 65 73 74 65 72 64 61 79 62 07 46 6c 6f 77 65 72 |esterdayb.Flower| -00000070 73 6a 04 ff 01 00 00 72 06 4d 6f 73 63 6f 77 7a |sj.....r.Moscowz| -00000080 08 4b 03 5f 42 72 7d 16 42 81 01 1f 85 eb 51 b8 |.K._Br}.B.....Q.| -00000090 1e 09 40 89 01 33 33 33 33 33 c3 6a 40 95 01 cd |..@..33333.j@...| -000000a0 cc cc 3d 9d 01 9a 99 b9 40 a0 01 80 c4 d7 8d 7f |..=.....@.......| -000000b0 ab 01 0a 0c 00 00 80 3f 0a d7 
23 3c 00 00 7a 44 |.......?..#<..zD| -000000c0 12 05 6d 65 74 65 72 12 0a 63 65 6e 74 69 6d 65 |..meter..centime| -000000d0 74 65 72 12 09 6b 69 6c 6f 6d 65 74 65 72 ac 01 |ter..kilometer..| -000000e0 b3 01 0b a2 06 0b 0b 08 f4 03 12 04 f5 03 f6 03 |................| -000000f0 0c 0c b4 01 0a 80 01 0a 24 63 36 39 34 61 64 38 |........$c694ad8| -00000100 61 2d 66 37 31 34 2d 34 65 61 33 2d 39 30 37 64 |a-f714-4ea3-907d| -00000110 2d 66 64 35 34 66 62 32 35 64 39 62 35 12 07 4e |-fd54fb25d9b5..N| -00000120 61 74 61 6c 69 61 1a 08 53 6f 6b 6f 6c 6f 76 61 |atalia..Sokolova| -00000130 20 00 28 a6 3f 32 03 6a 70 67 50 1a 58 0b 6a 04 | .(.?2.jpgP.X.j.| -00000140 64 c8 01 32 72 08 50 6c 79 6d 6f 75 74 68 7a 08 |d..2r.Plymouthz.| -00000150 6a 9d 49 42 46 8c 84 c0 81 01 6e 86 1b f0 f9 21 |j.IBF.....n....!| -00000160 09 40 95 01 42 60 e5 3b 9d 01 cd cc ac 40 a0 01 |.@..B`.;.....@..| -00000170 ff ff a9 ce 93 8c 09 0a c3 01 0a 24 61 37 64 61 |...........$a7da| -00000180 31 61 61 36 2d 66 34 32 35 2d 34 37 38 39 2d 38 |1aa6-f425-4789-8| -00000190 39 34 37 2d 62 30 33 34 37 38 36 65 64 33 37 34 |947-b034786ed374| -000001a0 12 06 56 61 73 69 6c 79 1a 07 53 69 64 6f 72 6f |..Vasily..Sidoro| -000001b0 76 20 01 28 fb 48 32 03 62 6d 70 3a 0d 2b 34 34 |v .(.H2.bmp:.+44| -000001c0 32 30 31 32 33 34 35 36 37 38 40 01 4d 50 e0 27 |2012345678@.MP.'| -000001d0 5c 50 17 58 04 62 05 53 75 6e 6e 79 6a 05 fa 01 |\P.X.b.Sunnyj...| -000001e0 f4 01 0a 72 08 4d 75 72 6d 61 6e 73 6b 7a 08 fd |...r.Murmanskz..| -000001f0 f0 89 42 c8 4c 04 42 81 01 11 2d 44 54 fb 21 09 |..B.L.B...-DT.!.| -00000200 40 89 01 00 00 00 e8 76 48 37 42 95 01 00 00 48 |@......vH7B....H| -00000210 44 9d 01 cd cc 4c c0 a0 01 80 d4 9f 93 01 ab 01 |D....L..........| -00000220 0a 04 00 00 80 41 12 05 70 6f 75 6e 64 ac 01 b3 |.....A..pound...| -00000230 01 0b a2 06 05 0b 08 f7 03 0c 0c b4 01 |.............| -0000023d - -MESSAGE #1 AT 0x00000005 -uuid: "a7522158-3d41-4b77-ad69-6c598ee55c49" -name: "Ivan" -surname: "Petrov" -gender: male -birthDate: 4015 -photo: "png" -phoneNumber: "+74951234567\000" -isOnline: true -visitTime: 1546703100 -age: 38 -zodiacSign: capricorn -songs: "Yesterday" -songs: "Flowers" -color: 255 -color: 0 -color: 0 -hometown: "Moscow" -location: 55.7532158 -location: 37.6225052 -pi: 3.14 -lotteryWin: 214.1 -someRatio: 0.1 -temperature: 5.8 -randomBigNumber: 17060000000 -MeasureUnits { - coef: 1 - coef: 0.01 - coef: 1000 - unit: "meter" - unit: "centimeter" - unit: "kilometer" -} -Nestiness { - A { - b { - C { - d: 500 - e: 501 - e: 502 - } - } - } -} -MESSAGE #2 AT 0x000000F7 -uuid: "c694ad8a-f714-4ea3-907d-fd54fb25d9b5" -name: "Natalia" -surname: "Sokolova" -gender: female -birthDate: 8102 -photo: "jpg" -age: 26 -zodiacSign: pisces -color: 100 -color: 200 -color: 50 -hometown: "Plymouth" -location: 50.4037247 -location: -4.14212322 -pi: 3.14159 -someRatio: 0.007 -temperature: 5.4 -randomBigNumber: -20000000000000 -MESSAGE #3 AT 0x0000017A -uuid: "a7da1aa6-f425-4789-8947-b034786ed374" -name: "Vasily" -surname: "Sidorov" -gender: male -birthDate: 9339 -photo: "bmp" -phoneNumber: "+442012345678" -isOnline: true -visitTime: 1546117200 -age: 23 -zodiacSign: leo -songs: "Sunny" -color: 250 -color: 244 -color: 10 -hometown: "Murmansk" -location: 68.9706802 -location: 33.0749817 -pi: 3.14159265358979 -lotteryWin: 100000000000 -someRatio: 800 -temperature: -3.2 -randomBigNumber: 154400000 -MeasureUnits { - coef: 16 - unit: "pound" -} -Nestiness { - A { - b { - C { - d: 503 - } - } - } -} - -Binary representation differs from the expected one 
(listed below): -00000000 be 04 0a f1 01 0a 24 61 37 35 32 32 31 35 38 2d |......$a7522158-| -00000010 33 64 34 31 2d 34 62 37 37 2d 61 64 36 39 2d 36 |3d41-4b77-ad69-6| -00000020 63 35 39 38 65 65 35 35 63 34 39 12 04 49 76 61 |c598ee55c49..Iva| -00000030 6e 1a 06 50 65 74 72 6f 76 20 01 28 af 1f 32 03 |n..Petrov .(..2.| -00000040 70 6e 67 3a 0d 2b 37 34 39 35 31 32 33 34 35 36 |png:.+7495123456| -00000050 37 00 40 01 4d fc d0 30 5c 50 26 58 09 62 09 59 |7.@.M..0\P&X.b.Y| -00000060 65 73 74 65 72 64 61 79 62 07 46 6c 6f 77 65 72 |esterdayb.Flower| -00000070 73 68 ff 01 68 00 68 00 72 06 4d 6f 73 63 6f 77 |sh..h.h.r.Moscow| -00000080 7a 08 4b 03 5f 42 72 7d 16 42 81 01 1f 85 eb 51 |z.K._Br}.B.....Q| -00000090 b8 1e 09 40 89 01 33 33 33 33 33 c3 6a 40 95 01 |...@..33333.j@..| -000000a0 cd cc cc 3d 9d 01 9a 99 b9 40 a0 01 80 c4 d7 8d |...=.....@......| -000000b0 7f ab 01 0d 00 00 80 3f 0d 0a d7 23 3c 0d 00 00 |.......?...#<...| -000000c0 7a 44 12 05 6d 65 74 65 72 12 0a 63 65 6e 74 69 |zD..meter..centi| -000000d0 6d 65 74 65 72 12 09 6b 69 6c 6f 6d 65 74 65 72 |meter..kilometer| -000000e0 ac 01 b3 01 0b a2 06 0b 0b 08 f4 03 10 f5 03 10 |................| -000000f0 f6 03 0c 0c b4 01 0a 81 01 0a 24 63 36 39 34 61 |..........$c694a| -00000100 64 38 61 2d 66 37 31 34 2d 34 65 61 33 2d 39 30 |d8a-f714-4ea3-90| -00000110 37 64 2d 66 64 35 34 66 62 32 35 64 39 62 35 12 |7d-fd54fb25d9b5.| -00000120 07 4e 61 74 61 6c 69 61 1a 08 53 6f 6b 6f 6c 6f |.Natalia..Sokolo| -00000130 76 61 20 00 28 a6 3f 32 03 6a 70 67 50 1a 58 0b |va .(.?2.jpgP.X.| -00000140 68 64 68 c8 01 68 32 72 08 50 6c 79 6d 6f 75 74 |hdh..h2r.Plymout| -00000150 68 7a 08 6a 9d 49 42 46 8c 84 c0 81 01 6e 86 1b |hz.j.IBF.....n..| -00000160 f0 f9 21 09 40 95 01 42 60 e5 3b 9d 01 cd cc ac |..!.@..B`.;.....| -00000170 40 a0 01 ff ff a9 ce 93 8c 09 0a c3 01 0a 24 61 |@.............$a| -00000180 37 64 61 31 61 61 36 2d 66 34 32 35 2d 34 37 38 |7da1aa6-f425-478| -00000190 39 2d 38 39 34 37 2d 62 30 33 34 37 38 36 65 64 |9-8947-b034786ed| -000001a0 33 37 34 12 06 56 61 73 69 6c 79 1a 07 53 69 64 |374..Vasily..Sid| -000001b0 6f 72 6f 76 20 01 28 fb 48 32 03 62 6d 70 3a 0d |orov .(.H2.bmp:.| -000001c0 2b 34 34 32 30 31 32 33 34 35 36 37 38 40 01 4d |+442012345678@.M| -000001d0 50 e0 27 5c 50 17 58 04 62 05 53 75 6e 6e 79 68 |P.'\P.X.b.Sunnyh| -000001e0 fa 01 68 f4 01 68 0a 72 08 4d 75 72 6d 61 6e 73 |..h..h.r.Murmans| -000001f0 6b 7a 08 fd f0 89 42 c8 4c 04 42 81 01 11 2d 44 |kz....B.L.B...-D| -00000200 54 fb 21 09 40 89 01 00 00 00 e8 76 48 37 42 95 |T.!.@......vH7B.| -00000210 01 00 00 48 44 9d 01 cd cc 4c c0 a0 01 80 d4 9f |...HD....L......| -00000220 93 01 ab 01 0d 00 00 80 41 12 05 70 6f 75 6e 64 |........A..pound| -00000230 ac 01 b3 01 0b a2 06 05 0b 08 f7 03 0c 0c b4 01 |................| -00000240 - -stdout: -a7522158-3d41-4b77-ad69-6c598ee55c49 Ivan Petrov male 1980-12-29 png +74951234567\0 1 2019-01-05 18:45:00 38 capricorn ['Yesterday','Flowers'] [255,0,0] Moscow [55.753215,37.622504] 3.14 214.1 0.1 5.8 17060000000 ['meter','centimeter','kilometer'] [1,0.01,1000] 500 [501,502] -c694ad8a-f714-4ea3-907d-fd54fb25d9b5 Natalia Sokolova female 1992-03-08 jpg \N 0 \N 26 pisces [] [100,200,50] Plymouth [50.403724,-4.142123] 3.14159 \N 0.007 5.4 -20000000000000 [] [] \N [] -a7da1aa6-f425-4789-8947-b034786ed374 Vasily Sidorov male 1995-07-28 bmp +442012345678 1 2018-12-30 00:00:00 23 leo ['Sunny'] [250,244,10] Murmansk [68.970682,33.074981] 3.14159265358979 100000000000 800 -3.2 154400000 ['pound'] [16] 503 [] - -Schema 
02240_protobuflist1_format_persons:Person - -Binary representation: -00000000 ba 04 0a f4 01 0a 24 61 37 35 32 32 31 35 38 2d |......$a7522158-| -00000010 33 64 34 31 2d 34 62 37 37 2d 61 64 36 39 2d 36 |3d41-4b77-ad69-6| -00000020 63 35 39 38 65 65 35 35 63 34 39 12 04 49 76 61 |c598ee55c49..Iva| -00000030 6e 1a 06 50 65 74 72 6f 76 20 01 28 af 1f 32 03 |n..Petrov .(..2.| -00000040 70 6e 67 3a 0d 2b 37 34 39 35 31 32 33 34 35 36 |png:.+7495123456| -00000050 37 00 40 01 4d fc d0 30 5c 50 26 58 09 62 09 59 |7.@.M..0\P&X.b.Y| -00000060 65 73 74 65 72 64 61 79 62 07 46 6c 6f 77 65 72 |esterdayb.Flower| -00000070 73 6a 04 ff 01 00 00 72 06 4d 6f 73 63 6f 77 7a |sj.....r.Moscowz| -00000080 08 4b 03 5f 42 72 7d 16 42 81 01 1f 85 eb 51 b8 |.K._Br}.B.....Q.| -00000090 1e 09 40 89 01 33 33 33 33 33 c3 6a 40 95 01 cd |..@..33333.j@...| -000000a0 cc cc 3d 9d 01 9a 99 b9 40 a0 01 80 c4 d7 8d 7f |..=.....@.......| -000000b0 aa 01 0c 0a 05 6d 65 74 65 72 15 00 00 80 3f aa |.....meter....?.| -000000c0 01 11 0a 0a 63 65 6e 74 69 6d 65 74 65 72 15 0a |....centimeter..| -000000d0 d7 23 3c aa 01 10 0a 09 6b 69 6c 6f 6d 65 74 65 |.#<.....kilomete| -000000e0 72 15 00 00 7a 44 b2 01 10 0a 0e a2 06 0b 0a 09 |r...zD..........| -000000f0 08 f4 03 12 04 f5 03 f6 03 0a 7e 0a 24 63 36 39 |..........~.$c69| -00000100 34 61 64 38 61 2d 66 37 31 34 2d 34 65 61 33 2d |4ad8a-f714-4ea3-| -00000110 39 30 37 64 2d 66 64 35 34 66 62 32 35 64 39 62 |907d-fd54fb25d9b| -00000120 35 12 07 4e 61 74 61 6c 69 61 1a 08 53 6f 6b 6f |5..Natalia..Soko| -00000130 6c 6f 76 61 28 a6 3f 32 03 6a 70 67 50 1a 58 0b |lova(.?2.jpgP.X.| -00000140 6a 04 64 c8 01 32 72 08 50 6c 79 6d 6f 75 74 68 |j.d..2r.Plymouth| -00000150 7a 08 6a 9d 49 42 46 8c 84 c0 81 01 6e 86 1b f0 |z.j.IBF.....n...| -00000160 f9 21 09 40 95 01 42 60 e5 3b 9d 01 cd cc ac 40 |.!.@..B`.;.....@| -00000170 a0 01 ff ff a9 ce 93 8c 09 0a c0 01 0a 24 61 37 |.............$a7| -00000180 64 61 31 61 61 36 2d 66 34 32 35 2d 34 37 38 39 |da1aa6-f425-4789| -00000190 2d 38 39 34 37 2d 62 30 33 34 37 38 36 65 64 33 |-8947-b034786ed3| -000001a0 37 34 12 06 56 61 73 69 6c 79 1a 07 53 69 64 6f |74..Vasily..Sido| -000001b0 72 6f 76 20 01 28 fb 48 32 03 62 6d 70 3a 0d 2b |rov .(.H2.bmp:.+| -000001c0 34 34 32 30 31 32 33 34 35 36 37 38 40 01 4d 50 |442012345678@.MP| -000001d0 e0 27 5c 50 17 58 04 62 05 53 75 6e 6e 79 6a 05 |.'\P.X.b.Sunnyj.| -000001e0 fa 01 f4 01 0a 72 08 4d 75 72 6d 61 6e 73 6b 7a |.....r.Murmanskz| -000001f0 08 fd f0 89 42 c8 4c 04 42 81 01 11 2d 44 54 fb |....B.L.B...-DT.| -00000200 21 09 40 89 01 00 00 00 e8 76 48 37 42 95 01 00 |!.@......vH7B...| -00000210 00 48 44 9d 01 cd cc 4c c0 a0 01 80 d4 9f 93 01 |.HD....L........| -00000220 aa 01 0c 0a 05 70 6f 75 6e 64 15 00 00 80 41 b2 |.....pound....A.| -00000230 01 0a 0a 08 a2 06 05 0a 03 08 f7 03 |............| -0000023c - -MESSAGE #1 AT 0x00000005 -uuid: "a7522158-3d41-4b77-ad69-6c598ee55c49" -name: "Ivan" -surname: "Petrov" -gender: male -birthDate: 4015 -photo: "png" -phoneNumber: "+74951234567\000" -isOnline: true -visitTime: 1546703100 -age: 38 -zodiacSign: capricorn -songs: "Yesterday" -songs: "Flowers" -color: 255 -color: 0 -color: 0 -hometown: "Moscow" -location: 55.7532158 -location: 37.6225052 -pi: 3.14 -lotteryWin: 214.1 -someRatio: 0.1 -temperature: 5.8 -randomBigNumber: 17060000000 -measureUnits { - unit: "meter" - coef: 1 -} -measureUnits { - unit: "centimeter" - coef: 0.01 -} -measureUnits { - unit: "kilometer" - coef: 1000 -} -nestiness { - a { - b { - c { - d: 500 - e: 501 - e: 502 - } - } - } -} -MESSAGE #2 AT 
0x000000FB -uuid: "c694ad8a-f714-4ea3-907d-fd54fb25d9b5" -name: "Natalia" -surname: "Sokolova" -birthDate: 8102 -photo: "jpg" -age: 26 -zodiacSign: pisces -color: 100 -color: 200 -color: 50 -hometown: "Plymouth" -location: 50.4037247 -location: -4.14212322 -pi: 3.14159 -someRatio: 0.007 -temperature: 5.4 -randomBigNumber: -20000000000000 -MESSAGE #3 AT 0x0000017C -uuid: "a7da1aa6-f425-4789-8947-b034786ed374" -name: "Vasily" -surname: "Sidorov" -gender: male -birthDate: 9339 -photo: "bmp" -phoneNumber: "+442012345678" -isOnline: true -visitTime: 1546117200 -age: 23 -zodiacSign: leo -songs: "Sunny" -color: 250 -color: 244 -color: 10 -hometown: "Murmansk" -location: 68.9706802 -location: 33.0749817 -pi: 3.14159265358979 -lotteryWin: 100000000000 -someRatio: 800 -temperature: -3.2 -randomBigNumber: 154400000 -measureUnits { - unit: "pound" - coef: 16 -} -nestiness { - a { - b { - c { - d: 503 - } - } - } -} - -Binary representation is as expected - -Roundtrip: -a7522158-3d41-4b77-ad69-6c598ee55c49 Ivan Petrov male 1980-12-29 png +74951234567\0 1 2019-01-05 18:45:00 38 capricorn ['Yesterday','Flowers'] [255,0,0] Moscow [55.753216,37.622504] 3.14 214.1 0.1 5.8 17060000000 ['meter','centimeter','kilometer'] [1,0.01,1000] 500 [501,502] -c694ad8a-f714-4ea3-907d-fd54fb25d9b5 Natalia Sokolova female 1992-03-08 jpg \N 0 \N 26 pisces [] [100,200,50] Plymouth [50.403724,-4.142123] 3.14159 \N 0.007 5.4 -20000000000000 [] [] \N [] -a7da1aa6-f425-4789-8947-b034786ed374 Vasily Sidorov male 1995-07-28 bmp +442012345678 1 2018-12-30 00:00:00 23 leo ['Sunny'] [250,244,10] Murmansk [68.97068,33.074982] 3.14159265358979 100000000000 800 -3.2 154400000 ['pound'] [16] 503 [] - -Schema 02240_protobuflist2_format_persons:AltPerson - -Binary representation: -00000000 f4 03 0a c4 01 08 01 12 04 49 76 61 6e 28 87 a8 |.........Ivan(..| -00000010 c4 9b 97 02 52 06 50 65 74 72 6f 76 72 0c 00 00 |....R.Petrovr...| -00000020 7f 43 00 00 00 00 00 00 00 00 79 fc d0 30 5c 00 |.C........y..0\.| -00000030 00 00 00 c8 02 0a c2 05 0c 00 00 80 3f 0a d7 23 |............?..#| -00000040 3c 00 00 7a 44 9a 06 05 6d 65 74 65 72 9a 06 0a |<..zD...meter...| -00000050 63 65 6e 74 69 6d 65 74 65 72 9a 06 09 6b 69 6c |centimeter...kil| -00000060 6f 6d 65 74 65 72 a1 06 00 00 00 a0 99 99 b9 3f |ometer.........?| -00000070 a8 06 37 a8 06 25 bd 06 c3 f5 48 40 fa 06 02 33 |..7..%....H@...3| -00000080 38 90 08 c6 09 e1 08 00 f1 da f8 03 00 00 00 b0 |8...............| -00000090 09 af 1f d0 0c d6 01 e2 12 24 61 37 35 32 32 31 |.........$a75221| -000000a0 35 38 2d 33 64 34 31 2d 34 62 37 37 2d 61 64 36 |58-3d41-4b77-ad6| -000000b0 39 2d 36 63 35 39 38 65 65 35 35 63 34 39 a0 38 |9-6c598ee55c49.8| -000000c0 f4 03 aa 38 04 f5 03 f6 03 0a 84 01 12 07 4e 61 |...8..........Na| -000000d0 74 61 6c 69 61 52 08 53 6f 6b 6f 6c 6f 76 61 72 |taliaR.Sokolovar| -000000e0 0c 00 00 c8 42 00 00 48 43 00 00 48 42 c8 02 0a |....B..HC..HB...| -000000f0 a1 06 00 00 00 40 08 ac 7c 3f a8 06 32 a8 06 fc |.....@..|?..2...| -00000100 ff ff ff ff ff ff ff ff 01 b0 06 01 bd 06 d0 0f |................| -00000110 49 40 fa 06 02 32 36 90 08 db 01 e1 08 00 c0 1a |I@...26.........| -00000120 63 cf ed ff ff b0 09 a6 3f e2 12 24 63 36 39 34 |c.......?..$c694| -00000130 61 64 38 61 2d 66 37 31 34 2d 34 65 61 33 2d 39 |ad8a-f714-4ea3-9| -00000140 30 37 64 2d 66 64 35 34 66 62 32 35 64 39 62 35 |07d-fd54fb25d9b5| -00000150 0a a3 01 08 01 12 06 56 61 73 69 6c 79 28 ce ca |.......Vasily(..| -00000160 f4 cf ee 0c 52 07 53 69 64 6f 72 6f 76 72 0c 00 |....R.Sidorovr..| -00000170 00 7a 43 00 
00 74 43 00 00 20 41 79 50 e0 27 5c |.zC..tC.. AyP.'\| -00000180 00 00 00 00 c8 02 05 c2 05 04 00 00 80 41 9a 06 |.............A..| -00000190 05 70 6f 75 6e 64 a1 06 00 00 00 00 00 00 89 40 |.pound.........@| -000001a0 a8 06 44 a8 06 21 bd 06 db 0f 49 40 fa 06 02 32 |..D..!....I@...2| -000001b0 33 90 08 d3 05 e1 08 00 f5 33 09 00 00 00 00 b0 |3........3......| -000001c0 09 fb 48 d0 0c 80 d0 db c3 f4 02 e2 12 24 61 37 |..H..........$a7| -000001d0 64 61 31 61 61 36 2d 66 34 32 35 2d 34 37 38 39 |da1aa6-f425-4789| -000001e0 2d 38 39 34 37 2d 62 30 33 34 37 38 36 65 64 33 |-8947-b034786ed3| -000001f0 37 34 a0 38 f7 03 |74.8..| -000001f6 - -MESSAGE #1 AT 0x00000005 -isOnline: online -name: "Ivan" -phoneNumber: 74951234567 -surname: "Petrov" -color: 255 -color: 0 -color: 0 -visitTime: 1546703100 -temperature: 5 -measureUnits_coef: 1 -measureUnits_coef: 0.01 -measureUnits_coef: 1000 -measureUnits_unit: "meter" -measureUnits_unit: "centimeter" -measureUnits_unit: "kilometer" -someRatio: 0.10000000149011612 -location: 55 -location: 37 -pi: 3.14 -age: "38" -zodiacSign: 1222 -randomBigNumber: 17060000000 -birthDate: 4015 -lotteryWin: 214 -uuid: "a7522158-3d41-4b77-ad69-6c598ee55c49" -nestiness_a_b_c_d: 500 -nestiness_a_b_c_e: 501 -nestiness_a_b_c_e: 502 -MESSAGE #2 AT 0x000000CC -name: "Natalia" -surname: "Sokolova" -color: 100 -color: 200 -color: 50 -temperature: 5 -someRatio: 0.0070000002160668373 -location: 50 -location: -4 -gender: female -pi: 3.14159 -age: "26" -zodiacSign: 219 -randomBigNumber: -20000000000000 -birthDate: 8102 -uuid: "c694ad8a-f714-4ea3-907d-fd54fb25d9b5" -MESSAGE #3 AT 0x00000153 -isOnline: online -name: "Vasily" -phoneNumber: 442012345678 -surname: "Sidorov" -color: 250 -color: 244 -color: 10 -visitTime: 1546117200 -temperature: -3 -measureUnits_coef: 16 -measureUnits_unit: "pound" -someRatio: 800 -location: 68 -location: 33 -pi: 3.14159274 -age: "23" -zodiacSign: 723 -randomBigNumber: 154400000 -birthDate: 9339 -lotteryWin: 100000000000 -uuid: "a7da1aa6-f425-4789-8947-b034786ed374" -nestiness_a_b_c_d: 503 - -Binary representation is as expected - -Roundtrip: -a7522158-3d41-4b77-ad69-6c598ee55c49 Ivan Petrov male 1980-12-29 \N 74951234567\0\0 1 2019-01-05 18:45:00 38 capricorn [] [255,0,0] [55,37] 3.140000104904175 214 0.1 5 17060000000 ['meter','centimeter','kilometer'] [1,0.01,1000] 500 [501,502] -c694ad8a-f714-4ea3-907d-fd54fb25d9b5 Natalia Sokolova female 1992-03-08 \N \N 0 \N 26 pisces [] [100,200,50] [50,-4] 3.141590118408203 \N 0.007 5 -20000000000000 [] [] \N [] -a7da1aa6-f425-4789-8947-b034786ed374 Vasily Sidorov male 1995-07-28 \N 442012345678\0 1 2018-12-30 00:00:00 23 leo [] [250,244,10] [68,33] 3.1415927410125732 100000000000 800 -3 154400000 ['pound'] [16] 503 [] - -Schema 02240_protobuflist3_format_persons:StrPerson as ProtobufList - -Binary representation: -00000000 e4 05 0a a6 02 0a 24 61 37 35 32 32 31 35 38 2d |......$a7522158-| -00000010 33 64 34 31 2d 34 62 37 37 2d 61 64 36 39 2d 36 |3d41-4b77-ad69-6| -00000020 63 35 39 38 65 65 35 35 63 34 39 12 04 49 76 61 |c598ee55c49..Iva| -00000030 6e 1a 06 50 65 74 72 6f 76 22 04 6d 61 6c 65 2a |n..Petrov".male*| -00000040 0a 31 39 38 30 2d 31 32 2d 32 39 3a 0d 2b 37 34 |.1980-12-29:.+74| -00000050 39 35 31 32 33 34 35 36 37 00 42 01 31 4a 13 32 |951234567.B.1J.2| -00000060 30 31 39 2d 30 31 2d 30 35 20 31 38 3a 34 35 3a |019-01-05 18:45:| -00000070 30 30 52 02 33 38 5a 09 63 61 70 72 69 63 6f 72 |00R.38Z.capricor| -00000080 6e 62 09 59 65 73 74 65 72 64 61 79 62 07 46 6c |nb.Yesterdayb.Fl| -00000090 6f 77 65 72 73 
6a 03 32 35 35 6a 01 30 6a 01 30 |owersj.255j.0j.0| -000000a0 72 06 4d 6f 73 63 6f 77 7a 09 35 35 2e 37 35 33 |r.Moscowz.55.753| -000000b0 32 31 35 7a 09 33 37 2e 36 32 32 35 30 34 82 01 |215z.37.622504..| -000000c0 04 33 2e 31 34 8a 01 05 32 31 34 2e 31 92 01 03 |.3.14...214.1...| -000000d0 30 2e 31 9a 01 03 35 2e 38 a2 01 0b 31 37 30 36 |0.1...5.8...1706| -000000e0 30 30 30 30 30 30 30 aa 01 2d 0a 05 6d 65 74 65 |0000000..-..mete| -000000f0 72 0a 0a 63 65 6e 74 69 6d 65 74 65 72 0a 09 6b |r..centimeter..k| -00000100 69 6c 6f 6d 65 74 65 72 12 01 31 12 04 30 2e 30 |ilometer..1..0.0| -00000110 31 12 04 31 30 30 30 b2 01 11 0a 0f 0a 03 35 30 |1..1000.......50| -00000120 30 12 03 35 30 31 12 03 35 30 32 0a b4 01 0a 24 |0..501..502....$| -00000130 63 36 39 34 61 64 38 61 2d 66 37 31 34 2d 34 65 |c694ad8a-f714-4e| -00000140 61 33 2d 39 30 37 64 2d 66 64 35 34 66 62 32 35 |a3-907d-fd54fb25| -00000150 64 39 62 35 12 07 4e 61 74 61 6c 69 61 1a 08 53 |d9b5..Natalia..S| -00000160 6f 6b 6f 6c 6f 76 61 22 06 66 65 6d 61 6c 65 2a |okolova".female*| -00000170 0a 31 39 39 32 2d 30 33 2d 30 38 42 01 30 52 02 |.1992-03-08B.0R.| -00000180 32 36 5a 06 70 69 73 63 65 73 6a 03 31 30 30 6a |26Z.piscesj.100j| -00000190 03 32 30 30 6a 02 35 30 72 08 50 6c 79 6d 6f 75 |.200j.50r.Plymou| -000001a0 74 68 7a 09 35 30 2e 34 30 33 37 32 34 7a 09 2d |thz.50.403724z.-| -000001b0 34 2e 31 34 32 31 32 33 82 01 07 33 2e 31 34 31 |4.142123...3.141| -000001c0 35 39 92 01 05 30 2e 30 30 37 9a 01 03 35 2e 34 |59...0.007...5.4| -000001d0 a2 01 0f 2d 32 30 30 30 30 30 30 30 30 30 30 30 |...-200000000000| -000001e0 30 30 0a 81 02 0a 24 61 37 64 61 31 61 61 36 2d |00....$a7da1aa6-| -000001f0 66 34 32 35 2d 34 37 38 39 2d 38 39 34 37 2d 62 |f425-4789-8947-b| -00000200 30 33 34 37 38 36 65 64 33 37 34 12 06 56 61 73 |034786ed374..Vas| -00000210 69 6c 79 1a 07 53 69 64 6f 72 6f 76 22 04 6d 61 |ily..Sidorov".ma| -00000220 6c 65 2a 0a 31 39 39 35 2d 30 37 2d 32 38 3a 0d |le*.1995-07-28:.| -00000230 2b 34 34 32 30 31 32 33 34 35 36 37 38 42 01 31 |+442012345678B.1| -00000240 4a 13 32 30 31 38 2d 31 32 2d 33 30 20 30 30 3a |J.2018-12-30 00:| -00000250 30 30 3a 30 30 52 02 32 33 5a 03 6c 65 6f 62 05 |00:00R.23Z.leob.| -00000260 53 75 6e 6e 79 6a 03 32 35 30 6a 03 32 34 34 6a |Sunnyj.250j.244j| -00000270 02 31 30 72 08 4d 75 72 6d 61 6e 73 6b 7a 09 36 |.10r.Murmanskz.6| -00000280 38 2e 39 37 30 36 38 32 7a 09 33 33 2e 30 37 34 |8.970682z.33.074| -00000290 39 38 31 82 01 10 33 2e 31 34 31 35 39 32 36 35 |981...3.14159265| -000002a0 33 35 38 39 37 39 8a 01 0c 31 30 30 30 30 30 30 |358979...1000000| -000002b0 30 30 30 30 30 92 01 03 38 30 30 9a 01 04 2d 33 |00000...800...-3| -000002c0 2e 32 a2 01 09 31 35 34 34 30 30 30 30 30 aa 01 |.2...154400000..| -000002d0 0b 0a 05 70 6f 75 6e 64 12 02 31 36 b2 01 07 0a |...pound..16....| -000002e0 05 0a 03 35 30 33 |...503| -000002e6 - -MESSAGE #1 AT 0x00000005 -uuid: "a7522158-3d41-4b77-ad69-6c598ee55c49" -name: "Ivan" -surname: "Petrov" -gender: "male" -birthDate: "1980-12-29" -phoneNumber: "+74951234567\000" -isOnline: "1" -visitTime: "2019-01-05 18:45:00" -age: "38" -zodiacSign: "capricorn" -songs: "Yesterday" -songs: "Flowers" -color: "255" -color: "0" -color: "0" -hometown: "Moscow" -location: "55.753215" -location: "37.622504" -pi: "3.14" -lotteryWin: "214.1" -someRatio: "0.1" -temperature: "5.8" -randomBigNumber: "17060000000" -measureUnits { - unit: "meter" - unit: "centimeter" - unit: "kilometer" - coef: "1" - coef: "0.01" - coef: "1000" -} -nestiness_a { - b_c { - d: "500" - e: "501" - e: 
"502" - } -} -MESSAGE #2 AT 0x0000012E -uuid: "c694ad8a-f714-4ea3-907d-fd54fb25d9b5" -name: "Natalia" -surname: "Sokolova" -gender: "female" -birthDate: "1992-03-08" -isOnline: "0" -age: "26" -zodiacSign: "pisces" -color: "100" -color: "200" -color: "50" -hometown: "Plymouth" -location: "50.403724" -location: "-4.142123" -pi: "3.14159" -someRatio: "0.007" -temperature: "5.4" -randomBigNumber: "-20000000000000" -MESSAGE #3 AT 0x000001E5 -uuid: "a7da1aa6-f425-4789-8947-b034786ed374" -name: "Vasily" -surname: "Sidorov" -gender: "male" -birthDate: "1995-07-28" -phoneNumber: "+442012345678" -isOnline: "1" -visitTime: "2018-12-30 00:00:00" -age: "23" -zodiacSign: "leo" -songs: "Sunny" -color: "250" -color: "244" -color: "10" -hometown: "Murmansk" -location: "68.970682" -location: "33.074981" -pi: "3.14159265358979" -lotteryWin: "100000000000" -someRatio: "800" -temperature: "-3.2" -randomBigNumber: "154400000" -measureUnits { - unit: "pound" - coef: "16" -} -nestiness_a { - b_c { - d: "503" - } -} - -Binary representation is as expected -Roundtrip: -a7522158-3d41-4b77-ad69-6c598ee55c49 Ivan Petrov male 1980-12-29 \N +74951234567\0 1 2019-01-05 18:45:00 38 capricorn ['Yesterday','Flowers'] [255,0,0] Moscow [55.753215,37.622504] 3.14 214.1 0.1 5.8 17060000000 ['meter','centimeter','kilometer'] [1,0.01,1000] 500 [501,502] -c694ad8a-f714-4ea3-907d-fd54fb25d9b5 Natalia Sokolova female 1992-03-08 \N \N 0 \N 26 pisces [] [100,200,50] Plymouth [50.403724,-4.142123] 3.14159 \N 0.007 5.4 -20000000000000 [] [] \N [] -a7da1aa6-f425-4789-8947-b034786ed374 Vasily Sidorov male 1995-07-28 \N +442012345678 1 2018-12-30 00:00:00 23 leo ['Sunny'] [250,244,10] Murmansk [68.970682,33.074981] 3.14159265358979 100000000000 800 -3.2 154400000 ['pound'] [16] 503 [] - -Schema 02240_protobuf_format_syntax2:Syntax2Person - -Binary representation: -00000000 bb 04 0a ef 01 0a 24 61 37 35 32 32 31 35 38 2d |......$a7522158-| -00000010 33 64 34 31 2d 34 62 37 37 2d 61 64 36 39 2d 36 |3d41-4b77-ad69-6| -00000020 63 35 39 38 65 65 35 35 63 34 39 12 04 49 76 61 |c598ee55c49..Iva| -00000030 6e 1a 06 50 65 74 72 6f 76 20 01 28 af 1f 32 03 |n..Petrov .(..2.| -00000040 70 6e 67 3a 0d 2b 37 34 39 35 31 32 33 34 35 36 |png:.+7495123456| -00000050 37 00 40 01 4d fc d0 30 5c 50 26 58 09 62 09 59 |7.@.M..0\P&X.b.Y| -00000060 65 73 74 65 72 64 61 79 62 07 46 6c 6f 77 65 72 |esterdayb.Flower| -00000070 73 6a 04 ff 01 00 00 72 06 4d 6f 73 63 6f 77 7a |sj.....r.Moscowz| -00000080 08 4b 03 5f 42 72 7d 16 42 81 01 1f 85 eb 51 b8 |.K._Br}.B.....Q.| -00000090 1e 09 40 89 01 33 33 33 33 33 c3 6a 40 95 01 cd |..@..33333.j@...| -000000a0 cc cc 3d 9d 01 9a 99 b9 40 a0 01 80 c4 d7 8d 7f |..=.....@.......| -000000b0 ab 01 0a 0c 00 00 80 3f 0a d7 23 3c 00 00 7a 44 |.......?..#<..zD| -000000c0 12 05 6d 65 74 65 72 12 0a 63 65 6e 74 69 6d 65 |..meter..centime| -000000d0 74 65 72 12 09 6b 69 6c 6f 6d 65 74 65 72 ac 01 |ter..kilometer..| -000000e0 b3 01 0b a2 06 0b 0b 08 f4 03 12 04 f5 03 f6 03 |................| -000000f0 0c 0c b4 01 0a 80 01 0a 24 63 36 39 34 61 64 38 |........$c694ad8| -00000100 61 2d 66 37 31 34 2d 34 65 61 33 2d 39 30 37 64 |a-f714-4ea3-907d| -00000110 2d 66 64 35 34 66 62 32 35 64 39 62 35 12 07 4e |-fd54fb25d9b5..N| -00000120 61 74 61 6c 69 61 1a 08 53 6f 6b 6f 6c 6f 76 61 |atalia..Sokolova| -00000130 20 00 28 a6 3f 32 03 6a 70 67 50 1a 58 0b 6a 04 | .(.?2.jpgP.X.j.| -00000140 64 c8 01 32 72 08 50 6c 79 6d 6f 75 74 68 7a 08 |d..2r.Plymouthz.| -00000150 6a 9d 49 42 46 8c 84 c0 81 01 6e 86 1b f0 f9 21 |j.IBF.....n....!| -00000160 09 40 
95 01 42 60 e5 3b 9d 01 cd cc ac 40 a0 01 |.@..B`.;.....@..| -00000170 ff ff a9 ce 93 8c 09 0a c3 01 0a 24 61 37 64 61 |...........$a7da| -00000180 31 61 61 36 2d 66 34 32 35 2d 34 37 38 39 2d 38 |1aa6-f425-4789-8| -00000190 39 34 37 2d 62 30 33 34 37 38 36 65 64 33 37 34 |947-b034786ed374| -000001a0 12 06 56 61 73 69 6c 79 1a 07 53 69 64 6f 72 6f |..Vasily..Sidoro| -000001b0 76 20 01 28 fb 48 32 03 62 6d 70 3a 0d 2b 34 34 |v .(.H2.bmp:.+44| -000001c0 32 30 31 32 33 34 35 36 37 38 40 01 4d 50 e0 27 |2012345678@.MP.'| -000001d0 5c 50 17 58 04 62 05 53 75 6e 6e 79 6a 05 fa 01 |\P.X.b.Sunnyj...| -000001e0 f4 01 0a 72 08 4d 75 72 6d 61 6e 73 6b 7a 08 fd |...r.Murmanskz..| -000001f0 f0 89 42 c8 4c 04 42 81 01 11 2d 44 54 fb 21 09 |..B.L.B...-DT.!.| -00000200 40 89 01 00 00 00 e8 76 48 37 42 95 01 00 00 48 |@......vH7B....H| -00000210 44 9d 01 cd cc 4c c0 a0 01 80 d4 9f 93 01 ab 01 |D....L..........| -00000220 0a 04 00 00 80 41 12 05 70 6f 75 6e 64 ac 01 b3 |.....A..pound...| -00000230 01 0b a2 06 05 0b 08 f7 03 0c 0c b4 01 |.............| -0000023d - -MESSAGE #1 AT 0x00000005 -uuid: "a7522158-3d41-4b77-ad69-6c598ee55c49" -name: "Ivan" -surname: "Petrov" -gender: male -birthDate: 4015 -photo: "png" -phoneNumber: "+74951234567\000" -isOnline: true -visitTime: 1546703100 -age: 38 -zodiacSign: capricorn -songs: "Yesterday" -songs: "Flowers" -color: 255 -color: 0 -color: 0 -hometown: "Moscow" -location: 55.7532158 -location: 37.6225052 -pi: 3.14 -lotteryWin: 214.1 -someRatio: 0.1 -temperature: 5.8 -randomBigNumber: 17060000000 -MeasureUnits { - coef: 1 - coef: 0.01 - coef: 1000 - unit: "meter" - unit: "centimeter" - unit: "kilometer" -} -Nestiness { - A { - b { - C { - d: 500 - e: 501 - e: 502 - } - } - } -} -MESSAGE #2 AT 0x000000F7 -uuid: "c694ad8a-f714-4ea3-907d-fd54fb25d9b5" -name: "Natalia" -surname: "Sokolova" -gender: female -birthDate: 8102 -photo: "jpg" -age: 26 -zodiacSign: pisces -color: 100 -color: 200 -color: 50 -hometown: "Plymouth" -location: 50.4037247 -location: -4.14212322 -pi: 3.14159 -someRatio: 0.007 -temperature: 5.4 -randomBigNumber: -20000000000000 -MESSAGE #3 AT 0x0000017A -uuid: "a7da1aa6-f425-4789-8947-b034786ed374" -name: "Vasily" -surname: "Sidorov" -gender: male -birthDate: 9339 -photo: "bmp" -phoneNumber: "+442012345678" -isOnline: true -visitTime: 1546117200 -age: 23 -zodiacSign: leo -songs: "Sunny" -color: 250 -color: 244 -color: 10 -hometown: "Murmansk" -location: 68.9706802 -location: 33.0749817 -pi: 3.14159265358979 -lotteryWin: 100000000000 -someRatio: 800 -temperature: -3.2 -randomBigNumber: 154400000 -MeasureUnits { - coef: 16 - unit: "pound" -} -Nestiness { - A { - b { - C { - d: 503 - } - } - } -} - -Binary representation differs from the expected one (listed below): -00000000 be 04 0a f1 01 0a 24 61 37 35 32 32 31 35 38 2d |......$a7522158-| -00000010 33 64 34 31 2d 34 62 37 37 2d 61 64 36 39 2d 36 |3d41-4b77-ad69-6| -00000020 63 35 39 38 65 65 35 35 63 34 39 12 04 49 76 61 |c598ee55c49..Iva| -00000030 6e 1a 06 50 65 74 72 6f 76 20 01 28 af 1f 32 03 |n..Petrov .(..2.| -00000040 70 6e 67 3a 0d 2b 37 34 39 35 31 32 33 34 35 36 |png:.+7495123456| -00000050 37 00 40 01 4d fc d0 30 5c 50 26 58 09 62 09 59 |7.@.M..0\P&X.b.Y| -00000060 65 73 74 65 72 64 61 79 62 07 46 6c 6f 77 65 72 |esterdayb.Flower| -00000070 73 68 ff 01 68 00 68 00 72 06 4d 6f 73 63 6f 77 |sh..h.h.r.Moscow| -00000080 7a 08 4b 03 5f 42 72 7d 16 42 81 01 1f 85 eb 51 |z.K._Br}.B.....Q| -00000090 b8 1e 09 40 89 01 33 33 33 33 33 c3 6a 40 95 01 |...@..33333.j@..| -000000a0 cd cc cc 3d 9d 01 9a 99 b9 
40 a0 01 80 c4 d7 8d |...=.....@......| -000000b0 7f ab 01 0d 00 00 80 3f 0d 0a d7 23 3c 0d 00 00 |.......?...#<...| -000000c0 7a 44 12 05 6d 65 74 65 72 12 0a 63 65 6e 74 69 |zD..meter..centi| -000000d0 6d 65 74 65 72 12 09 6b 69 6c 6f 6d 65 74 65 72 |meter..kilometer| -000000e0 ac 01 b3 01 0b a2 06 0b 0b 08 f4 03 10 f5 03 10 |................| -000000f0 f6 03 0c 0c b4 01 0a 81 01 0a 24 63 36 39 34 61 |..........$c694a| -00000100 64 38 61 2d 66 37 31 34 2d 34 65 61 33 2d 39 30 |d8a-f714-4ea3-90| -00000110 37 64 2d 66 64 35 34 66 62 32 35 64 39 62 35 12 |7d-fd54fb25d9b5.| -00000120 07 4e 61 74 61 6c 69 61 1a 08 53 6f 6b 6f 6c 6f |.Natalia..Sokolo| -00000130 76 61 20 00 28 a6 3f 32 03 6a 70 67 50 1a 58 0b |va .(.?2.jpgP.X.| -00000140 68 64 68 c8 01 68 32 72 08 50 6c 79 6d 6f 75 74 |hdh..h2r.Plymout| -00000150 68 7a 08 6a 9d 49 42 46 8c 84 c0 81 01 6e 86 1b |hz.j.IBF.....n..| -00000160 f0 f9 21 09 40 95 01 42 60 e5 3b 9d 01 cd cc ac |..!.@..B`.;.....| -00000170 40 a0 01 ff ff a9 ce 93 8c 09 0a c3 01 0a 24 61 |@.............$a| -00000180 37 64 61 31 61 61 36 2d 66 34 32 35 2d 34 37 38 |7da1aa6-f425-478| -00000190 39 2d 38 39 34 37 2d 62 30 33 34 37 38 36 65 64 |9-8947-b034786ed| -000001a0 33 37 34 12 06 56 61 73 69 6c 79 1a 07 53 69 64 |374..Vasily..Sid| -000001b0 6f 72 6f 76 20 01 28 fb 48 32 03 62 6d 70 3a 0d |orov .(.H2.bmp:.| -000001c0 2b 34 34 32 30 31 32 33 34 35 36 37 38 40 01 4d |+442012345678@.M| -000001d0 50 e0 27 5c 50 17 58 04 62 05 53 75 6e 6e 79 68 |P.'\P.X.b.Sunnyh| -000001e0 fa 01 68 f4 01 68 0a 72 08 4d 75 72 6d 61 6e 73 |..h..h.r.Murmans| -000001f0 6b 7a 08 fd f0 89 42 c8 4c 04 42 81 01 11 2d 44 |kz....B.L.B...-D| -00000200 54 fb 21 09 40 89 01 00 00 00 e8 76 48 37 42 95 |T.!.@......vH7B.| -00000210 01 00 00 48 44 9d 01 cd cc 4c c0 a0 01 80 d4 9f |...HD....L......| -00000220 93 01 ab 01 0d 00 00 80 41 12 05 70 6f 75 6e 64 |........A..pound| -00000230 ac 01 b3 01 0b a2 06 05 0b 08 f7 03 0c 0c b4 01 |................| -00000240 - - -Settings used in the test: --max_insert_threads 0 --group_by_two_level_threshold 963158 --group_by_two_level_threshold_bytes 13149870 --distributed_aggregation_memory_efficient 1 --fsync_metadata 0 --output_format_parallel_formatting 0 --input_format_parallel_parsing 1 --min_chunk_bytes_for_parallel_parsing 18512987 --max_read_buffer_size 685645 --prefer_localhost_replica 0 --max_block_size 36563 --max_threads 2 --optimize_or_like_chain 1 --optimize_read_in_order 1 --enable_multiple_prewhere_read_steps 1 --read_in_order_two_level_merge_threshold 74 --optimize_aggregation_in_order 0 --aggregation_in_order_max_block_bytes 26213853 --min_compress_block_size 2850779 --max_compress_block_size 2496283 --use_uncompressed_cache 0 --min_bytes_to_use_direct_io 10737418240 --min_bytes_to_use_mmap_io 10737418240 --local_filesystem_read_method pread --remote_filesystem_read_method read --local_filesystem_read_prefetch 1 --filesystem_cache_segments_batch_size 10 --read_from_filesystem_cache_if_exists_otherwise_bypass_cache 0 --throw_on_error_from_cache_on_write_operations 0 --remote_filesystem_read_prefetch 1 --allow_prefetched_read_pool_for_remote_filesystem 0 --filesystem_prefetch_max_memory_usage 128Mi --filesystem_prefetches_limit 0 --filesystem_prefetch_min_bytes_for_single_read_task 1Mi --filesystem_prefetch_step_marks 50 --filesystem_prefetch_step_bytes 0 --compile_aggregate_expressions 1 --compile_sort_description 0 --merge_tree_coarse_index_granularity 8 --optimize_distinct_in_order 1 --optimize_sorting_by_input_stream_properties 1 
--http_response_buffer_size 2897457 --http_wait_end_of_query True --enable_memory_bound_merging_of_aggregation_results 1 --min_count_to_compile_expression 3 --min_count_to_compile_aggregate_expression 0 --min_count_to_compile_sort_description 0 --session_timezone Africa/Juba - -MergeTree settings used in test: --ratio_of_defaults_for_sparse_serialization 1.0 --prefer_fetch_merged_part_size_threshold 10737418240 --vertical_merge_algorithm_min_rows_to_activate 1000000 --vertical_merge_algorithm_min_columns_to_activate 1 --allow_vertical_merges_from_compact_to_wide_parts 1 --min_merge_bytes_to_use_direct_io 1041313230 --index_granularity_bytes 7044432 --merge_max_block_size 16869 --index_granularity 27099 --min_bytes_for_wide_part 1073741824 --compress_marks 1 --compress_primary_key 1 --marks_compress_block_size 60638 --primary_key_compress_block_size 64768 --replace_long_file_name_to_hash 1 --max_file_name_length 0 - -Database: test_xjjpx0p6 - -Having 1 errors! 0 tests passed. 0 tests skipped. 20.40 s elapsed (MainProcess). -Won't run stateful tests because test data wasn't loaded. -All tests have finished. diff --git a/tests/queries/0_stateless/01179_insert_values_semicolon.expect b/tests/queries/0_stateless/01179_insert_values_semicolon.expect index 4b8693126a1..bcae8047ebd 100755 --- a/tests/queries/0_stateless/01179_insert_values_semicolon.expect +++ b/tests/queries/0_stateless/01179_insert_values_semicolon.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " send -- "DROP TABLE IF EXISTS test_01179\r" diff --git a/tests/queries/0_stateless/01180_client_syntax_errors.expect b/tests/queries/0_stateless/01180_client_syntax_errors.expect index 042b16c3296..058c75e4307 100755 --- a/tests/queries/0_stateless/01180_client_syntax_errors.expect +++ b/tests/queries/0_stateless/01180_client_syntax_errors.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # Make a query with syntax error diff --git a/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect b/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect index 3de7df04ec0..5dcf999a065 100755 --- a/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect +++ b/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect @@ -25,7 +25,7 @@ expect_after { # useful debugging configuration # exp_internal 1 -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " send -- "SELECT 1\r" @@ -67,7 +67,7 @@ expect ":) " send -- "" 
expect eof -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --highlight 0 --multiline --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --highlight 0 --multiline --history_file=$history_file" expect ":) " send -- "SELECT 1;\r" diff --git a/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect b/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect index 68d2d1f0a13..8d163cc7a04 100755 --- a/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect +++ b/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect @@ -21,7 +21,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " send -- "SELECT 1\r" diff --git a/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect b/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect index 9f471bc694b..b7ff04a3844 100755 --- a/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect +++ b/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # Make a query @@ -33,7 +33,7 @@ exec kill -9 [exp_pid] close # Run client one more time and press "up" to see the last recorded query -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " send -- "\[A" expect "for the history" diff --git a/tests/queries/0_stateless/01520_client_print_query_id.expect b/tests/queries/0_stateless/01520_client_print_query_id.expect index 70f446e1584..a3fcd493f55 100755 --- a/tests/queries/0_stateless/01520_client_print_query_id.expect +++ b/tests/queries/0_stateless/01520_client_print_query_id.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # Make a query diff --git a/tests/queries/0_stateless/01565_query_loop_after_client_error.expect b/tests/queries/0_stateless/01565_query_loop_after_client_error.expect index f08ef911da4..68533b72f24 100755 --- 
a/tests/queries/0_stateless/01565_query_loop_after_client_error.expect +++ b/tests/queries/0_stateless/01565_query_loop_after_client_error.expect @@ -24,7 +24,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion -m --history_file=$history_file --highlight 0" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 -m --history_file=$history_file --highlight 0" expect "\n:) " send -- "DROP TABLE IF EXISTS t01565;\r" diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect index 3efbe478ce5..859faf196d7 100755 --- a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect +++ b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # regression for heap-buffer-overflow issue (under ASAN) diff --git a/tests/queries/0_stateless/01945_show_debug_warning.expect b/tests/queries/0_stateless/01945_show_debug_warning.expect index a6950b5ab82..d69ae6299ac 100755 --- a/tests/queries/0_stateless/01945_show_debug_warning.expect +++ b/tests/queries/0_stateless/01945_show_debug_warning.expect @@ -26,7 +26,7 @@ expect_after { set Debug_type 0 -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # Check debug type @@ -44,7 +44,7 @@ expect eof if { $Debug_type > 0} { -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect "Warnings:" expect " * Server was built in debug mode. It will work slowly." 
expect ":) " @@ -58,7 +58,7 @@ send -- "q\r" expect eof } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --max_memory_usage_for_all_queries=123 --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --max_memory_usage_for_all_queries=123 --history_file=$history_file" expect "Warnings:" expect " * Obsolete setting" expect ":) " diff --git a/tests/queries/0_stateless/02047_client_exception.expect b/tests/queries/0_stateless/02047_client_exception.expect index d7dcb97867a..3231cf74f2b 100755 --- a/tests/queries/0_stateless/02047_client_exception.expect +++ b/tests/queries/0_stateless/02047_client_exception.expect @@ -21,7 +21,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " send -- "DROP TABLE IF EXISTS test_02047\r" diff --git a/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect b/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect index ddfd6e9d158..f6b173e4069 100755 --- a/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect +++ b/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --enable-progress-table-toggle=0" expect ":) " send -- "drop table if exists t\r" diff --git a/tests/queries/0_stateless/02105_backslash_letter_commands.expect b/tests/queries/0_stateless/02105_backslash_letter_commands.expect index f09e9613a87..0520a93a806 100755 --- a/tests/queries/0_stateless/02105_backslash_letter_commands.expect +++ b/tests/queries/0_stateless/02105_backslash_letter_commands.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # Send a command diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect index 4b8524add80..aee5e7c2d67 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect @@ -22,7 +22,7 @@ expect_after { } system "echo \"drop table if exists t; create table t(i String) engine=Memory; insert into t select 'test string'\" > $CLICKHOUSE_TMP/file_02112" -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion --interactive --queries-file $CLICKHOUSE_TMP/file_02112" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT 
--disable_suggestion --enable-progress-table-toggle=0 --interactive --queries-file $CLICKHOUSE_TMP/file_02112" expect ":) " send -- "select i from t format TSV\r" diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect index 9df889e7c90..0e1cf3cb719 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --interactive --query 'create table t(i Int32) engine=Memory; insert into t select 1'" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --enable-progress-table-toggle=0 --interactive --query 'create table t(i Int32) engine=Memory; insert into t select 1'" expect ":) " send -- "select * from t format TSV\r" diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect index 0c07adf231d..9fb72c66ba5 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect @@ -21,7 +21,7 @@ expect_after { } system "echo \"drop table if exists t; create table t(i String) engine=Memory; insert into t select 'test string'\" > $CLICKHOUSE_TMP/file_02112" -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --interactive --queries-file $CLICKHOUSE_TMP/file_02112" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --enable-progress-table-toggle=0 --interactive --queries-file $CLICKHOUSE_TMP/file_02112" expect ":) " send -- "select \* from t format TSV\r" diff --git a/tests/queries/0_stateless/02116_interactive_hello.expect b/tests/queries/0_stateless/02116_interactive_hello.expect index 41cd515ea34..dfa7961a69b 100755 --- a/tests/queries/0_stateless/02116_interactive_hello.expect +++ b/tests/queries/0_stateless/02116_interactive_hello.expect @@ -21,7 +21,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" # (?n) - Do not match new lines expect -re "(?n)ClickHouse client version \[\\d\]{2}\.\[\\d\]{1,2}\.\[\\d\]{1,2}\.\[\\d\]{1,}.*\r" diff --git a/tests/queries/0_stateless/02132_client_history_navigation.expect b/tests/queries/0_stateless/02132_client_history_navigation.expect index 3fba7ab1692..b0a3f2b15c0 100755 --- a/tests/queries/0_stateless/02132_client_history_navigation.expect +++ b/tests/queries/0_stateless/02132_client_history_navigation.expect @@ -24,7 +24,7 @@ expect_after { # useful debugging configuration # exp_internal 1 -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --highlight 0 --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --highlight 0 
--history_file=$history_file" expect ":) " # Make a query diff --git a/tests/queries/0_stateless/02352_interactive_queries_from_file.expect b/tests/queries/0_stateless/02352_interactive_queries_from_file.expect index d11d55ba941..d6260853a16 100755 --- a/tests/queries/0_stateless/02352_interactive_queries_from_file.expect +++ b/tests/queries/0_stateless/02352_interactive_queries_from_file.expect @@ -22,7 +22,7 @@ expect_after { } spawn bash -c "echo 'select 1;\nselect 2;\nselect 3' > queries_02352" -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion --enable-progress-table-toggle=0" expect ":) " send -- "\\i queries_02352\r" diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index e42f91d05dc..34dcccc84c5 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -39,8 +39,8 @@ Expression (Projection) Special cases -- Non-default metric, M, ef_construction, ef_search 6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 1 [2,3.2] 0.15200169244542905 -7 [5.5,4.7] 0.3503476876550442 Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -55,7 +55,7 @@ Expression (Projection) Name: idx Description: vector_similarity GRANULARITY 2 Parts: 1/1 - Granules: 2/4 + Granules: 3/4 -- Setting "max_limit_for_ann_queries" Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) @@ -69,8 +69,8 @@ Expression (Projection) Granules: 4/4 -- Non-default quantization 1 [2,3.2] 2.3323807824711897 +4 [2.4,5.2] 3.9999999046325727 2 [4.2,3.4] 4.427188573446585 -0 [4.6,2.3] 4.609772130377966 Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -85,10 +85,10 @@ Expression (Projection) Name: idx Description: vector_similarity GRANULARITY 2 Parts: 1/1 - Granules: 2/4 + Granules: 4/4 1 [2,3.2] 2.3323807824711897 +4 [2.4,5.2] 3.9999999046325727 2 [4.2,3.4] 4.427188573446585 -0 [4.6,2.3] 4.609772130377966 Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -103,10 +103,10 @@ Expression (Projection) Name: idx Description: vector_similarity GRANULARITY 2 Parts: 1/1 - Granules: 2/4 + Granules: 4/4 1 [2,3.2] 2.3323807824711897 +4 [2.4,5.2] 3.9999999046325727 2 [4.2,3.4] 4.427188573446585 -0 [4.6,2.3] 4.609772130377966 Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -121,10 +121,10 @@ Expression (Projection) Name: idx Description: vector_similarity GRANULARITY 2 Parts: 1/1 - Granules: 2/4 + Granules: 4/4 1 [2,3.2] 2.3323807824711897 +4 [2.4,5.2] 3.9999999046325727 2 [4.2,3.4] 4.427188573446585 -0 [4.6,2.3] 4.609772130377966 Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -139,10 +139,10 @@ Expression (Projection) Name: idx Description: vector_similarity GRANULARITY 2 Parts: 1/1 - Granules: 2/4 + Granules: 4/4 1 [2,3.2] 2.3323807824711897 +4 [2.4,5.2] 3.9999999046325727 2 [4.2,3.4] 4.427188573446585 -0 [4.6,2.3] 4.609772130377966 Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -157,7 +157,7 @@ Expression (Projection) Name: idx Description: vector_similarity GRANULARITY 2 Parts: 1/1 - Granules: 2/4 
+ Granules: 3/4 -- Index on Array(Float64) column 6 [0,2] 0 7 [0,2.1] 0.10000000000000009 diff --git a/tests/queries/0_stateless/02417_repeat_input_commands.expect b/tests/queries/0_stateless/02417_repeat_input_commands.expect index 5a4b2840854..e5125025655 100755 --- a/tests/queries/0_stateless/02417_repeat_input_commands.expect +++ b/tests/queries/0_stateless/02417_repeat_input_commands.expect @@ -21,7 +21,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # ----------------------------------------- diff --git a/tests/queries/0_stateless/02793_implicit_pretty_format_settings.expect b/tests/queries/0_stateless/02793_implicit_pretty_format_settings.expect index ab70bdeca1f..28e42758c71 100755 --- a/tests/queries/0_stateless/02793_implicit_pretty_format_settings.expect +++ b/tests/queries/0_stateless/02793_implicit_pretty_format_settings.expect @@ -20,7 +20,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # Send a command diff --git a/tests/queries/0_stateless/02907_suggestions_readonly_user.expect b/tests/queries/0_stateless/02907_suggestions_readonly_user.expect index 025ccbfbae8..f6d706d3bf9 100755 --- a/tests/queries/0_stateless/02907_suggestions_readonly_user.expect +++ b/tests/queries/0_stateless/02907_suggestions_readonly_user.expect @@ -22,7 +22,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " send -- "DROP USER IF EXISTS 02907_suggestions_readonly_user\r" @@ -51,7 +51,7 @@ set timeout 60 send -- "exit\r" expect eof -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " send -- "DROP USER 02907_suggestions_readonly_user\r" diff --git a/tests/queries/0_stateless/02931_client_fuzzy_search_crash.expect b/tests/queries/0_stateless/02931_client_fuzzy_search_crash.expect index 992ff85ca1e..3386c400132 100755 --- a/tests/queries/0_stateless/02931_client_fuzzy_search_crash.expect +++ b/tests/queries/0_stateless/02931_client_fuzzy_search_crash.expect @@ -21,7 +21,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; 
\$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " # send Ctrl-R (octal code of R is 022, see ascii(7)) diff --git a/tests/queries/0_stateless/02932_refreshable_materialized_views_1.reference b/tests/queries/0_stateless/02932_refreshable_materialized_views_1.reference index bfc6add90a7..b21356db24e 100644 --- a/tests/queries/0_stateless/02932_refreshable_materialized_views_1.reference +++ b/tests/queries/0_stateless/02932_refreshable_materialized_views_1.reference @@ -1,32 +1,30 @@ -<1: created view> a [] 1 -CREATE MATERIALIZED VIEW default.a\nREFRESH AFTER 2 SECOND\n(\n `x` UInt64\n)\nENGINE = Memory\nAS SELECT number AS x\nFROM numbers(2)\nUNION ALL\nSELECT rand64() AS x +<1: created view> a +CREATE MATERIALIZED VIEW default.a\nREFRESH EVERY 2 SECOND\n(\n `x` UInt64\n)\nENGINE = Memory\nAS SELECT number AS x\nFROM numbers(2)\nUNION ALL\nSELECT rand64() AS x <2: refreshed> 3 1 1 <3: time difference at least> 1000 -<4: next refresh in> 2 -<4.1: fake clock> Scheduled 2050-01-01 00:00:01 2050-01-01 00:00:03 -<4.5: altered> Scheduled Finished 2052-01-01 00:00:00 +<4: next refresh in> 2 Scheduled +<4.1: fake clock> Scheduled 2050-01-01 00:00:01 2050-01-01 00:00:02 1 3 3 3 0 +<4.5: altered> Scheduled 2050-01-01 00:00:01 2052-01-01 00:00:00 CREATE MATERIALIZED VIEW default.a\nREFRESH EVERY 2 YEAR\n(\n `x` UInt64\n)\nENGINE = Memory\nAS SELECT x * 2 AS x\nFROM default.src <5: no refresh> 3 <6: refreshed> 2 -<7: refreshed> Scheduled Finished 2054-01-01 00:00:00 +<7: refreshed> Scheduled 2052-02-03 04:05:06 2054-01-01 00:00:00 CREATE MATERIALIZED VIEW default.b\nREFRESH EVERY 2 YEAR DEPENDS ON default.a\n(\n `y` Int32\n)\nENGINE = MergeTree\nORDER BY y\nSETTINGS index_granularity = 8192\nAS SELECT x * 10 AS y\nFROM default.a <7.5: created dependent> 2052-11-11 11:11:11 <8: refreshed> 20 -<9: refreshed> a Scheduled Finished 2054-01-01 00:00:00 -<9: refreshed> b Scheduled Finished 2054-01-01 00:00:00 +<9: refreshed> a Scheduled 2054-01-01 00:00:00 +<9: refreshed> b Scheduled 2054-01-01 00:00:00 <9.2: dropping> 0 2 <9.4: dropped> 0 2 -<10: creating> a Scheduled [] 2054-01-01 00:00:00 -<10: creating> b WaitingForDependencies ['default.a'] 2054-01-01 00:00:00 +<10: creating> a Scheduled 2054-01-01 00:00:00 +<10: creating> b WaitingForDependencies 2054-01-01 00:00:00 <11: chain-refreshed a> 4 <12: chain-refreshed b> 40 -<13: chain-refreshed> a Scheduled [] Finished 2054-01-01 00:00:01 2056-01-01 00:00:00 1 -<13: chain-refreshed> b Scheduled ['default.a'] Finished 2054-01-24 23:22:21 2056-01-01 00:00:00 1 -<14: waiting for next cycle> a Scheduled [] 2058-01-01 00:00:00 -<14: waiting for next cycle> b WaitingForDependencies ['default.a'] 2060-01-01 00:00:00 +<13: chain-refreshed> a Scheduled 2054-01-01 00:00:01 2054-01-01 00:00:01 2056-01-01 00:00:00 1 +<13: chain-refreshed> b Scheduled 2054-01-24 23:22:21 2054-01-24 23:22:21 2056-01-01 00:00:00 1 <15: chain-refreshed a> 6 <16: chain-refreshed b> 60 <17: chain-refreshed> a Scheduled 2062-01-01 00:00:00 <17: chain-refreshed> b Scheduled 2062-01-01 00:00:00 -<18: removed dependency> b Scheduled [] 2062-03-03 03:03:03 2064-01-01 00:00:00 5 +<18: removed dependency> b Scheduled 2062-03-03 03:03:03 2062-03-03 03:03:03 2064-01-01 00:00:00 CREATE MATERIALIZED VIEW default.b\nREFRESH EVERY 2 YEAR\n(\n `y` Int32\n)\nENGINE = MergeTree\nORDER BY y\nSETTINGS index_granularity = 8192\nAS SELECT x * 10 AS y\nFROM default.a diff --git 
a/tests/queries/0_stateless/02932_refreshable_materialized_views_1.sh b/tests/queries/0_stateless/02932_refreshable_materialized_views_1.sh index 057f76e63d0..cca90f36c11 100755 --- a/tests/queries/0_stateless/02932_refreshable_materialized_views_1.sh +++ b/tests/queries/0_stateless/02932_refreshable_materialized_views_1.sh @@ -16,14 +16,14 @@ $CLICKHOUSE_CLIENT -q "create view refreshes as select * from system.view_refres # Basic refreshing. $CLICKHOUSE_CLIENT -q " create materialized view a - refresh after 2 second + refresh every 2 second engine Memory empty as select number as x from numbers(2) union all select rand64() as x; - select '<1: created view>', view, remaining_dependencies, exception, last_refresh_result in ('Unknown', 'Finished') from refreshes; + select '<1: created view>', exception, view from refreshes; show create a;" # Wait for any refresh. (xargs trims the string and turns \t and \n into spaces) -while [ "`$CLICKHOUSE_CLIENT -q "select last_refresh_result from refreshes -- $LINENO" | xargs`" == 'Unknown' ] +while [ "`$CLICKHOUSE_CLIENT -q "select last_success_time is null from refreshes -- $LINENO" | xargs`" != '0' ] do sleep 0.5 done @@ -49,8 +49,15 @@ done # like crazy. This is potentially flaky, but we need at least one test that uses non-mocked timer # to make sure the clock+timer code works at all. If it turns out flaky, increase refresh period above. $CLICKHOUSE_CLIENT -q " - select '<3: time difference at least>', min2(reinterpret(now64(), 'Int64') - $start_time, 1000); - select '<4: next refresh in>', next_refresh_time-last_refresh_time from refreshes;" + select '<3: time difference at least>', min2(reinterpret(now64(), 'Int64') - $start_time, 1000);" +while : +do + # Wait for status to change to Scheduled. If status = Scheduling, next_refresh_time is stale. + res="`$CLICKHOUSE_CLIENT -q "select '<4: next refresh in>', next_refresh_time-last_success_time, status from refreshes -- $LINENO"`" + echo "$res" | grep -q 'Scheduled' && break + sleep 0.5 +done +echo "$res" # Create a source table from which views will read. $CLICKHOUSE_CLIENT -q " @@ -62,22 +69,23 @@ $CLICKHOUSE_CLIENT -q " system wait view a; system refresh view a; system wait view a; - select '<4.1: fake clock>', status, last_refresh_time, next_refresh_time from refreshes; + select '<4.1: fake clock>', status, last_success_time, next_refresh_time, progress, read_rows, total_rows, written_rows, retry from refreshes; alter table a modify refresh every 2 year; alter table a modify query select x*2 as x from src; - select '<4.5: altered>', status, last_refresh_result, next_refresh_time from refreshes; + system wait view a; + select '<4.5: altered>', status, last_success_time, next_refresh_time from refreshes; show create a;" # Advance time to trigger the refresh. $CLICKHOUSE_CLIENT -q " select '<5: no refresh>', count() from a; system test view a set fake time '2052-02-03 04:05:06';" -while [ "`$CLICKHOUSE_CLIENT -q "select last_refresh_time from refreshes -- $LINENO" | xargs`" != '2052-02-03 04:05:06' ] +while [ "`$CLICKHOUSE_CLIENT -q "select last_success_time, status from refreshes -- $LINENO" | xargs`" != '2052-02-03 04:05:06 Scheduled' ] do sleep 0.5 done $CLICKHOUSE_CLIENT -q " select '<6: refreshed>', * from a; - select '<7: refreshed>', status, last_refresh_result, next_refresh_time from refreshes;" + select '<7: refreshed>', status, last_success_time, next_refresh_time from refreshes;" # Create a dependent view, refresh it once. 
$CLICKHOUSE_CLIENT -q " @@ -86,13 +94,13 @@ $CLICKHOUSE_CLIENT -q " system test view b set fake time '2052-11-11 11:11:11'; system refresh view b; system wait view b; - select '<7.5: created dependent>', last_refresh_time from refreshes where view = 'b';" + select '<7.5: created dependent>', last_success_time from refreshes where view = 'b';" # Next refresh shouldn't start until the dependency refreshes. $CLICKHOUSE_CLIENT -q " select '<8: refreshed>', * from b; - select '<9: refreshed>', view, status, last_refresh_result, next_refresh_time from refreshes; + select '<9: refreshed>', view, status, next_refresh_time from refreshes; system test view b set fake time '2054-01-24 23:22:21';" -while [ "`$CLICKHOUSE_CLIENT -q "select status, next_refresh_time from refreshes where view = 'b' -- $LINENO" | xargs`" != 'WaitingForDependencies 2054-01-01 00:00:00' ] +while [ "`$CLICKHOUSE_CLIENT -q "select status from refreshes where view = 'b' -- $LINENO" | xargs`" != 'WaitingForDependencies' ] do sleep 0.5 done @@ -102,14 +110,14 @@ $CLICKHOUSE_CLIENT -q " select '<9.2: dropping>', countIf(name like '%tmp%'), countIf(name like '%.inner%') from system.tables where database = currentDatabase(); drop table src; system refresh view a;" -$CLICKHOUSE_CLIENT -q "system wait view a;" 2>/dev/null && echo "SYSTEM WAIT VIEW failed to fail at $LINENO" +$CLICKHOUSE_CLIENT -q "system wait view a; -- { serverError REFRESH_FAILED }" $CLICKHOUSE_CLIENT -q " select '<9.4: dropped>', countIf(name like '%tmp%'), countIf(name like '%.inner%') from system.tables where database = currentDatabase();" # Create the source table again, check that refresh succeeds (in particular that tables are looked # up by name rather than uuid). $CLICKHOUSE_CLIENT -q " - select '<10: creating>', view, status, remaining_dependencies, next_refresh_time from refreshes; + select '<10: creating>', view, status, next_refresh_time from refreshes; create table src (x Int16) engine Memory as select 2; system test view a set fake time '2054-01-01 00:00:01';" while [ "`$CLICKHOUSE_CLIENT -q "select status from refreshes where view = 'b' -- $LINENO" | xargs`" != 'Scheduled' ] @@ -120,27 +128,10 @@ done $CLICKHOUSE_CLIENT -q " select '<11: chain-refreshed a>', * from a; select '<12: chain-refreshed b>', * from b; - select '<13: chain-refreshed>', view, status, remaining_dependencies, last_refresh_result, last_refresh_time, next_refresh_time, exception == '' from refreshes;" + select '<13: chain-refreshed>', view, status, last_success_time, last_refresh_time, next_refresh_time, exception == '' from refreshes;" -# Make the dependent table run ahead by one refresh cycle, make sure it waits for the dependency to -# catch up to the same cycle. 
-$CLICKHOUSE_CLIENT -q " - system test view b set fake time '2059-01-01 00:00:00'; - system refresh view b;" -while [ "`$CLICKHOUSE_CLIENT -q "select next_refresh_time from refreshes where view = 'b' -- $LINENO" | xargs`" != '2060-01-01 00:00:00' ] -do - sleep 0.5 -done $CLICKHOUSE_CLIENT -q " system test view b set fake time '2061-01-01 00:00:00'; - system test view a set fake time '2057-01-01 00:00:00';" -while [ "`$CLICKHOUSE_CLIENT -q "select status, next_refresh_time from refreshes -- $LINENO" | xargs`" != 'Scheduled 2058-01-01 00:00:00 WaitingForDependencies 2060-01-01 00:00:00' ] -do - sleep 0.5 -done - -$CLICKHOUSE_CLIENT -q " - select '<14: waiting for next cycle>', view, status, remaining_dependencies, next_refresh_time from refreshes; truncate src; insert into src values (3); system test view a set fake time '2060-02-02 02:02:02';" @@ -167,9 +158,13 @@ do sleep 0.5 done $CLICKHOUSE_CLIENT -q " - select '<18: removed dependency>', view, status, remaining_dependencies, last_refresh_time,next_refresh_time, refresh_count from refreshes where view = 'b'; + select '<18: removed dependency>', view, status, last_success_time, last_refresh_time, next_refresh_time from refreshes where view = 'b'; show create b;" +# Can't use the same time unit multiple times. +$CLICKHOUSE_CLIENT -q " + create materialized view c refresh every 1 second 2 second (x Int64) engine Memory empty as select * from src; -- { clientError SYNTAX_ERROR }" + $CLICKHOUSE_CLIENT -q " drop table src; drop table a; diff --git a/tests/queries/0_stateless/02932_refreshable_materialized_views_2.reference b/tests/queries/0_stateless/02932_refreshable_materialized_views_2.reference index cdaad32de0a..3eeab4f574e 100644 --- a/tests/queries/0_stateless/02932_refreshable_materialized_views_2.reference +++ b/tests/queries/0_stateless/02932_refreshable_materialized_views_2.reference @@ -1,11 +1,11 @@ <19: exception> 1 <20: unexception> 1 <21: rename> 1 -<22: rename> d Finished +<22: rename> d 0 <23: simple refresh> 1 <24: rename during refresh> 1 <25: rename during refresh> f Running -<27: cancelled> f Scheduled Cancelled +<27: cancelled> f Scheduled cancelled <28: drop during refresh> 0 0 CREATE MATERIALIZED VIEW default.g\nREFRESH EVERY 1 WEEK OFFSET 3 DAY 4 HOUR RANDOMIZE FOR 4 DAY 1 HOUR\n(\n `x` Int64\n)\nENGINE = Memory\nAS SELECT 42 AS x <29: randomize> 1 1 @@ -13,18 +13,16 @@ CREATE MATERIALIZED VIEW default.h\nREFRESH EVERY 1 SECOND TO default.dest\n(\n <30: to existing table> 10 <31: to existing table> 10 <31: to existing table> 20 -<31.5: will retry> Error 1 +<31.5: will retry> 1 1 <31.6: did retry> 10 -<32: empty> i Scheduled Unknown 0 -<32: empty> j Scheduled Finished 0 +<32: empty> i Scheduled 1 0 +<32: empty> j Scheduled 0 0 <34: append> 10 <35: append> 10 <35: append> 20 <35: append> 30 -<36: not append> 20 -<36: not append> 30 <37: append chain> 100 <38: append chain> 100 <38: append chain> 100 <38: append chain> 200 -creating MergeTree without ORDER BY failed, as expected +<39: append> 0 1 1 0 diff --git a/tests/queries/0_stateless/02932_refreshable_materialized_views_2.sh b/tests/queries/0_stateless/02932_refreshable_materialized_views_2.sh index 2d00d61f253..41ed88686f6 100755 --- a/tests/queries/0_stateless/02932_refreshable_materialized_views_2.sh +++ b/tests/queries/0_stateless/02932_refreshable_materialized_views_2.sh @@ -2,8 +2,6 @@ # Tags: atomic-database CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# reset --log_comment -CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh @@ -20,7 +18,7 @@ $CLICKHOUSE_CLIENT -q " create table src (x Int8) engine Memory as select 1; create materialized view c refresh every 1 second (x Int64) engine Memory empty as select * from src; drop table src;" -while [ "`$CLICKHOUSE_CLIENT -q "select last_refresh_result from refreshes where view = 'c' -- $LINENO" | xargs`" != 'Error' ] +while [ "`$CLICKHOUSE_CLIENT -q "select exception == '' from refreshes -- $LINENO" | xargs`" != '0' ] do sleep 0.5 done @@ -28,17 +26,14 @@ done $CLICKHOUSE_CLIENT -q " select '<19: exception>', exception ilike '%UNKNOWN_TABLE%' ? '1' : exception from refreshes where view = 'c'; create table src (x Int64) engine Memory as select 1; - system refresh view c;" -while [ "`$CLICKHOUSE_CLIENT -q "select last_refresh_result from refreshes -- $LINENO" | xargs`" != 'Finished' ] -do - sleep 0.5 -done + system refresh view c; + system wait view c;" # Rename table. $CLICKHOUSE_CLIENT -q " select '<20: unexception>', * from c; rename table c to d; select '<21: rename>', * from d; - select '<22: rename>', view, last_refresh_result from refreshes;" + select '<22: rename>', view, last_success_time is null from refreshes;" # Do various things during a refresh. # First make a nonempty view. @@ -46,11 +41,8 @@ $CLICKHOUSE_CLIENT -q " drop table d; truncate src; insert into src values (1); - create materialized view e refresh every 1 second (x Int64) engine MergeTree order by x empty as select x + sleepEachRow(1) as x from src settings max_block_size = 1;" -while [ "`$CLICKHOUSE_CLIENT -q "select last_refresh_result from refreshes -- $LINENO" | xargs`" != 'Finished' ] -do - sleep 0.5 -done + create materialized view e refresh every 1 second (x Int64) engine MergeTree order by x as select x + sleepEachRow(1) as x from src settings max_block_size = 1; + system wait view e;" # Stop refreshes. $CLICKHOUSE_CLIENT -q " select '<23: simple refresh>', * from e; @@ -73,18 +65,18 @@ $CLICKHOUSE_CLIENT -q " rename table e to f; select '<24: rename during refresh>', * from f; select '<25: rename during refresh>', view, status from refreshes where view = 'f'; - alter table f modify refresh after 10 year;" - + alter table f modify refresh after 10 year settings refresh_retries = 0;" +sleep 1 # make it likely that at least one row was processed # Cancel. $CLICKHOUSE_CLIENT -q " system cancel view f;" -while [ "`$CLICKHOUSE_CLIENT -q "select status from refreshes where view = 'f' -- $LINENO" | xargs`" != 'Scheduled' ] +while [ "`$CLICKHOUSE_CLIENT -q "select status from refreshes -- $LINENO" | xargs`" != 'Scheduled' ] do sleep 0.5 done # Check that another refresh doesn't immediately start after the cancelled one. 
$CLICKHOUSE_CLIENT -q " - select '<27: cancelled>', view, status, last_refresh_result from refreshes where view = 'f'; + select '<27: cancelled>', view, status, exception from refreshes where view = 'f'; system refresh view f;" while [ "`$CLICKHOUSE_CLIENT -q "select status from refreshes where view = 'f' -- $LINENO" | xargs`" != 'Running' ] do @@ -115,13 +107,10 @@ $CLICKHOUSE_CLIENT -q " create table dest (x Int64) engine MergeTree order by x; truncate src; insert into src values (1); - create materialized view h refresh every 1 second to dest empty as select x*10 as x from src; - show create h;" -while [ "`$CLICKHOUSE_CLIENT -q "select last_refresh_result from refreshes -- $LINENO" | xargs`" != 'Finished' ] -do - sleep 0.5 -done -$CLICKHOUSE_CLIENT -q " + create materialized view h refresh every 1 second to dest as select x*10 as x from src; + create materialized view hh refresh every 2 second to dest as select x from src; -- { serverError BAD_ARGUMENTS } + show create h; + system wait view h; select '<30: to existing table>', * from dest; insert into src values (2);" while [ "`$CLICKHOUSE_CLIENT -q "select count() from dest -- $LINENO" | xargs`" != '2' ] @@ -136,14 +125,14 @@ $CLICKHOUSE_CLIENT -q " # Retries. $CLICKHOUSE_CLIENT -q " create materialized view h2 refresh after 1 year settings refresh_retries = 10 (x Int64) engine Memory as select x*10 + throwIf(x % 2 == 0) as x from src;" -$CLICKHOUSE_CLIENT -q "system wait view h2;" 2>/dev/null && echo "SYSTEM WAIT VIEW failed to fail at $LINENO" +$CLICKHOUSE_CLIENT -q "system wait view h2; -- { serverError REFRESH_FAILED }" $CLICKHOUSE_CLIENT -q " - select '<31.5: will retry>', last_refresh_result, retry > 0 from refreshes; - create table src2 (x Int8) engine Memory; + select '<31.5: will retry>', exception != '', retry > 0 from refreshes; + create table src2 empty as src; insert into src2 values (1); exchange tables src and src2; drop table src2;" -while [ "`$CLICKHOUSE_CLIENT -q "select last_refresh_result, retry from refreshes -- $LINENO" | xargs`" != 'Finished 0' ] +while [ "`$CLICKHOUSE_CLIENT -nq "select status, retry from refreshes -- $LINENO" | xargs`" != 'Scheduled 0' ] do sleep 0.5 done @@ -154,13 +143,14 @@ $CLICKHOUSE_CLIENT -q " # EMPTY $CLICKHOUSE_CLIENT -q " create materialized view i refresh after 1 year engine Memory empty as select number as x from numbers(2); + system wait view i; create materialized view j refresh after 1 year engine Memory as select number as x from numbers(2);" while [ "`$CLICKHOUSE_CLIENT -q "select sum(last_success_time is null) from refreshes -- $LINENO" | xargs`" == '2' ] do sleep 0.5 done $CLICKHOUSE_CLIENT -q " - select '<32: empty>', view, status, last_refresh_result, retry from refreshes order by view; + select '<32: empty>', view, status, last_success_time is null, retry from refreshes order by view; drop table i; drop table j;" @@ -178,11 +168,8 @@ $CLICKHOUSE_CLIENT -q " select '<35: append>', * from k order by x;" # ALTER to non-APPEND $CLICKHOUSE_CLIENT -q " - alter table k modify refresh every 10 year; - system wait view k; - system refresh view k; - system wait view k; - select '<36: not append>', * from k order by x; + alter table k modify refresh every 10 year; -- { serverError NOT_IMPLEMENTED }" +$CLICKHOUSE_CLIENT -q " drop table k; truncate table src;" @@ -205,7 +192,7 @@ $CLICKHOUSE_CLIENT -q " # Failing to create inner table. 
$CLICKHOUSE_CLIENT -q " - create materialized view n refresh every 1 second (x Int64) engine MergeTree as select 1 as x from numbers(2);" 2>/dev/null || echo "creating MergeTree without ORDER BY failed, as expected" + create materialized view n refresh every 1 second (x Int64) engine MergeTree as select 1 as x from numbers(2); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}" $CLICKHOUSE_CLIENT -q " create materialized view n refresh every 1 second (x Int64) engine MergeTree order by x as select 1 as x from numbers(2); drop table n;" @@ -218,5 +205,11 @@ $CLICKHOUSE_CLIENT -q " create materialized view o refresh every 1 second (x Int64) engine Memory as select x from nope.nonexist settings allow_materialized_view_with_bad_select = 1; drop table o;" +$CLICKHOUSE_CLIENT -q " + create materialized view o refresh every 1 second append (number UInt64) engine Memory as select number from numbers(2); + system wait view o; + select '<39: append>', count() % 2, count() > 0, sum(number) > 0, sum(number)*2 - count() from o; + drop table o;" + $CLICKHOUSE_CLIENT -q " drop table refreshes;" diff --git a/tests/queries/0_stateless/02961_sumMapFiltered_keepKey.reference b/tests/queries/0_stateless/02961_sumMapFiltered_keepKey.reference new file mode 100644 index 00000000000..3753b994d82 --- /dev/null +++ b/tests/queries/0_stateless/02961_sumMapFiltered_keepKey.reference @@ -0,0 +1,2 @@ +([1,2,3],[10,10,20]) +([1,2,3],[10,10,20]) diff --git a/tests/queries/0_stateless/02961_sumMapFiltered_keepKey.sql b/tests/queries/0_stateless/02961_sumMapFiltered_keepKey.sql new file mode 100644 index 00000000000..dc5aa743bb8 --- /dev/null +++ b/tests/queries/0_stateless/02961_sumMapFiltered_keepKey.sql @@ -0,0 +1,3 @@ + +SELECT sumMapFiltered([1,2,3])(a,b) FROM values('a Array(Int64), b Array(Int64)',([1, 2, 3], [10, 10, 10]), ([3, 4, 5], [10, 10, 10]),([4, 5, 6], [10, 10, 10]),([6, 7, 8], [10, 10, 10])); +SELECT sumMapFiltered([1,2,3,toInt8(-3)])(a,b) FROM values('a Array(UInt64), b Array(Int64)',([1, 2, 3], [10, 10, 10]), ([3, 4, 5], [10, 10, 10]),([4, 5, 6], [10, 10, 10]),([6, 7, 8], [10, 10, 10])); diff --git a/tests/queries/0_stateless/03014_invalid_utf8_client.expect b/tests/queries/0_stateless/03014_invalid_utf8_client.expect index 6689e1a1179..b0a9de920bc 100755 --- a/tests/queries/0_stateless/03014_invalid_utf8_client.expect +++ b/tests/queries/0_stateless/03014_invalid_utf8_client.expect @@ -21,7 +21,7 @@ expect_after { -i $any_spawn_id timeout { exit 1 } } -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --history_file=$history_file" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --enable-progress-table-toggle=0 --history_file=$history_file" expect ":) " send -- "SELECT \x99\r" diff --git a/tests/queries/0_stateless/03093_special_column_errors.sql b/tests/queries/0_stateless/03093_special_column_errors.sql index 1464927db7e..5daf72d222a 100644 --- a/tests/queries/0_stateless/03093_special_column_errors.sql +++ b/tests/queries/0_stateless/03093_special_column_errors.sql @@ -21,6 +21,7 @@ CREATE TABLE collapsing (key Int64, sign Int8) ENGINE = CollapsingMergeTree(sign ALTER TABLE collapsing MODIFY COLUMN sign String; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } ALTER TABLE collapsing DROP COLUMN sign; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } ALTER TABLE collapsing RENAME COLUMN sign TO sign2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER 
TABLE collapsing MODIFY COLUMN sign MODIFY SETTING max_compress_block_size = 123456; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } CREATE TABLE versioned_collapsing_wrong (key Int64, version UInt8, sign Int8) ENGINE = VersionedCollapsingMergeTree(sign, sign) ORDER BY key; -- { serverError BAD_ARGUMENTS } diff --git a/tests/queries/0_stateless/03206_projection_merge_special_mergetree.sql b/tests/queries/0_stateless/03206_projection_merge_special_mergetree.sql index 82684f754b6..d3448138396 100644 --- a/tests/queries/0_stateless/03206_projection_merge_special_mergetree.sql +++ b/tests/queries/0_stateless/03206_projection_merge_special_mergetree.sql @@ -103,4 +103,4 @@ SELECT FROM system.projection_parts WHERE (database = currentDatabase()) AND (`table` = 'tp') AND (active = 1); -DROP TABLE tp; +DROP TABLE tp; \ No newline at end of file diff --git a/tests/queries/0_stateless/03206_projection_merge_special_mergetree_ignore.reference b/tests/queries/0_stateless/03206_projection_merge_special_mergetree_ignore.reference new file mode 100644 index 00000000000..4913ceae376 --- /dev/null +++ b/tests/queries/0_stateless/03206_projection_merge_special_mergetree_ignore.reference @@ -0,0 +1,6 @@ +2 0 +2 1 +2 2 +3 0 +3 1 +3 2 diff --git a/tests/queries/0_stateless/03206_projection_merge_special_mergetree_ignore.sql b/tests/queries/0_stateless/03206_projection_merge_special_mergetree_ignore.sql new file mode 100644 index 00000000000..113b5ce4ba6 --- /dev/null +++ b/tests/queries/0_stateless/03206_projection_merge_special_mergetree_ignore.sql @@ -0,0 +1,31 @@ +DROP TABLE IF EXISTS tp; + +CREATE TABLE tp ( + type Int32, + eventcnt UInt64, + PROJECTION p (select sum(eventcnt), type group by type) +) engine = ReplacingMergeTree order by type +SETTINGS deduplicate_merge_projection_mode = 'ignore'; + +INSERT INTO tp SELECT number%3, 1 FROM numbers(3); +INSERT INTO tp SELECT number%3, 2 FROM numbers(3); + +OPTIMIZE TABLE tp DEDUPLICATE; -- { serverError SUPPORT_IS_DISABLED } + +OPTIMIZE TABLE tp FINAL; + +SET optimize_use_projections = false, force_optimize_projection = false; + +SELECT sum(eventcnt) eventcnt, type +FROM tp +GROUP BY type +ORDER BY eventcnt, type; + +SET optimize_use_projections = true, force_optimize_projection = true; + +SELECT sum(eventcnt) eventcnt, type +FROM tp +GROUP BY type +ORDER By eventcnt, type; + +DROP TABLE tp; diff --git a/tests/queries/0_stateless/03237_insert_sparse_columns_mem.sh b/tests/queries/0_stateless/03237_insert_sparse_columns_mem.sh index ac682a0f574..af3e1c9fe80 100755 --- a/tests/queries/0_stateless/03237_insert_sparse_columns_mem.sh +++ b/tests/queries/0_stateless/03237_insert_sparse_columns_mem.sh @@ -11,7 +11,9 @@ for i in {1..250}; do table_structure+=", c$i String" done -$CLICKHOUSE_CLIENT --query " +MY_CLICKHOUSE_CLIENT="$CLICKHOUSE_CLIENT --enable_parsing_to_custom_serialization 1" + +$MY_CLICKHOUSE_CLIENT --query " DROP TABLE IF EXISTS t_insert_mem; DROP TABLE IF EXISTS t_reference; @@ -23,7 +25,7 @@ $CLICKHOUSE_CLIENT --query " filename="test_data_sparse_$CLICKHOUSE_DATABASE.json" -$CLICKHOUSE_CLIENT --query " +$MY_CLICKHOUSE_CLIENT --query " INSERT INTO FUNCTION file('$filename', LineAsString) SELECT format('{{ \"id\": {}, \"c{}\": \"{}\" }}', number, number % 250, hex(number * 1000000)) FROM numbers(30000) SETTINGS engine_file_truncate_on_insert = 1; @@ -34,15 +36,19 @@ $CLICKHOUSE_CLIENT --query " " for _ in {1..4}; do - $CLICKHOUSE_CLIENT --query "INSERT INTO t_reference SELECT * FROM file('$filename', JSONEachRow)" + $MY_CLICKHOUSE_CLIENT --query 
"INSERT INTO t_reference SELECT * FROM file('$filename', JSONEachRow)" done; -$CLICKHOUSE_CLIENT --enable_parsing_to_custom_serialization 1 --query "INSERT INTO t_insert_mem SELECT * FROM file('$filename', JSONEachRow)" -$CLICKHOUSE_CLIENT --enable_parsing_to_custom_serialization 1 --query "INSERT INTO t_insert_mem SELECT * FROM file('$filename', JSONEachRow)" -$CLICKHOUSE_CLIENT --enable_parsing_to_custom_serialization 1 --query "INSERT INTO t_insert_mem SELECT * FROM s3(s3_conn, filename='$filename', format='JSONEachRow')" -$CLICKHOUSE_CLIENT --query "SELECT * FROM file('$filename', LineAsString) FORMAT LineAsString" | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+t_insert_mem+FORMAT+JSONEachRow&enable_parsing_to_custom_serialization=1" --data-binary @- +$MY_CLICKHOUSE_CLIENT --query "INSERT INTO t_insert_mem SELECT * FROM file('$filename', JSONEachRow)" +$MY_CLICKHOUSE_CLIENT --query "INSERT INTO t_insert_mem SELECT * FROM file('$filename', JSONEachRow)" -$CLICKHOUSE_CLIENT --query " +$MY_CLICKHOUSE_CLIENT --query "DETACH TABLE t_insert_mem" +$MY_CLICKHOUSE_CLIENT --query "ATTACH TABLE t_insert_mem" + +$MY_CLICKHOUSE_CLIENT --query "INSERT INTO t_insert_mem SELECT * FROM s3(s3_conn, filename='$filename', format='JSONEachRow')" +$MY_CLICKHOUSE_CLIENT --query "SELECT * FROM file('$filename', LineAsString) FORMAT LineAsString" | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+t_insert_mem+FORMAT+JSONEachRow&enable_parsing_to_custom_serialization=1" --data-binary @- + +$MY_CLICKHOUSE_CLIENT --query " SELECT count() FROM t_insert_mem; SELECT sum(sipHash64(*)) FROM t_insert_mem; SELECT sum(sipHash64(*)) FROM t_reference; @@ -53,7 +59,7 @@ $CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; - SELECT written_bytes <= 3000000 FROM system.query_log + SELECT written_bytes <= 10000000 FROM system.query_log WHERE query LIKE 'INSERT INTO t_insert_mem%' AND current_database = '$CLICKHOUSE_DATABASE' AND type = 'QueryFinish' ORDER BY event_time_microseconds; diff --git a/tests/queries/0_stateless/03237_max_map_state_decimal_serialization.reference b/tests/queries/0_stateless/03237_max_map_state_decimal_serialization.reference new file mode 100644 index 00000000000..31b1da2af43 --- /dev/null +++ b/tests/queries/0_stateless/03237_max_map_state_decimal_serialization.reference @@ -0,0 +1 @@ +{"x":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"} diff --git a/tests/queries/0_stateless/03237_max_map_state_decimal_serialization.sql b/tests/queries/0_stateless/03237_max_map_state_decimal_serialization.sql new file mode 100644 index 00000000000..8742f470cf0 --- /dev/null +++ b/tests/queries/0_stateless/03237_max_map_state_decimal_serialization.sql @@ -0,0 +1 @@ +select maxMapState([0], [toDateTime64(0, 0)]) as x format JSONEachRow; diff --git a/tests/queries/0_stateless/03246_range_literal_replacement_works.reference b/tests/queries/0_stateless/03246_range_literal_replacement_works.reference index 9766475a418..d00491fd7e5 100644 --- a/tests/queries/0_stateless/03246_range_literal_replacement_works.reference +++ b/tests/queries/0_stateless/03246_range_literal_replacement_works.reference @@ -1 +1 @@ -ok +1 diff --git a/tests/queries/0_stateless/03246_range_literal_replacement_works.sql b/tests/queries/0_stateless/03246_range_literal_replacement_works.sql index 3771a9eb921..57bd369e6e1 100644 --- a/tests/queries/0_stateless/03246_range_literal_replacement_works.sql +++ b/tests/queries/0_stateless/03246_range_literal_replacement_works.sql @@ -1,13 +1,10 @@ -CREATE TABLE my_table ( 
- str String, -) ORDER BY str; +SET input_format_values_interpret_expressions = 0; +SET input_format_values_accurate_types_of_literals = 0; -INSERT INTO my_table VALUES -( -CASE WHEN - (0 BETWEEN 0 AND 2) THEN 'ok' ELSE - 'wat' -END -); +CREATE TABLE IF NOT EXISTS 03246_range_literal_replacement_works (id UInt8) Engine=Memory; -SELECT * FROM my_table; \ No newline at end of file +INSERT INTO 03246_range_literal_replacement_works VALUES (1 BETWEEN 0 AND 2); + +SELECT * FROM 03246_range_literal_replacement_works; + +DROP TABLE IF EXISTS 03246_range_literal_replacement_works; diff --git a/tests/queries/0_stateless/03247_generic_arrayMin_arrayMax_fixes.reference b/tests/queries/0_stateless/03247_generic_arrayMin_arrayMax_fixes.reference new file mode 100644 index 00000000000..8a143e535e2 --- /dev/null +++ b/tests/queries/0_stateless/03247_generic_arrayMin_arrayMax_fixes.reference @@ -0,0 +1,37 @@ +-- { echoOn } +-- https://github.com/ClickHouse/ClickHouse/issues/68895 +SELECT arrayMax(x -> toFixedString('.', 1), []); +. +-- https://github.com/ClickHouse/ClickHouse/issues/69600 +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; +-1 +SELECT arrayMax(x -> toUInt16(-x), [1, 2, 4]) AS res; +65535 +-- https://github.com/ClickHouse/ClickHouse/pull/69640 +SELECT arrayMin(x1 -> (x1 * toNullable(-1)), materialize([1, 2, 3])); +-3 +SELECT arrayMin(x1 -> x1 * -1, [1,2,3]); +-3 +DROP TABLE IF EXISTS test_aggregation_array; +CREATE TABLE test_aggregation_array (x Array(Int)) ENGINE=MergeTree() ORDER by tuple(); +INSERT INTO test_aggregation_array VALUES ([1,2,3,4,5,6]), ([]), ([1,2,3]); +SELECT [arrayMin(x1 -> (x1 * materialize(-1)), [toNullable(toUInt256(0)), materialize(4)])], arrayMin([arrayMin([0])]) FROM test_aggregation_array GROUP BY arrayAvg([1]), [0, toUInt256(8)] WITH CUBE SETTINGS allow_experimental_analyzer = 1; +[-4] 0 +[-4] 0 +[-4] 0 +[-4] 0 +SELECT [arrayMin([3, arrayMin([toUInt128(8)]), 4, 5]), arrayMax([materialize(1)]), arrayMin([arrayMax([1]), 2]), 2], arrayMin([0, toLowCardinality(8)]), 2, arrayMax(x1 -> (x1 * -1), x) FROM test_aggregation_array; +[3,1,1,2] 0 2 -1 +[3,1,1,2] 0 2 0 +[3,1,1,2] 0 2 -1 +select arrayMax(x -> x.1, [(1, 'a'), (0, 'b')]); +1 +select arrayMin(x -> x.2, [(1, 'a'), (0, 'b')]); +a +-- Extra validation of generic arrayMin/arrayMax +WITH [(1,2),(1,3)] AS t SELECT arrayMin(t), arrayMax(t); +(1,2) (1,3) +WITH [map('a', 1, 'b', 2), map('a',1,'b',3)] AS t SELECT arrayMin(t), arrayMax(t); +{'a':1,'b':2} {'a':1,'b':3} +WITH [map('a', 1, 'b', 2, 'c', 10), map('a',1,'b',3, 'c', 0)] AS t SELECT arrayMin(x -> x['c'], t), arrayMax(x -> x['c'], t); +0 10 diff --git a/tests/queries/0_stateless/03247_generic_arrayMin_arrayMax_fixes.sql b/tests/queries/0_stateless/03247_generic_arrayMin_arrayMax_fixes.sql new file mode 100644 index 00000000000..2cd052917b2 --- /dev/null +++ b/tests/queries/0_stateless/03247_generic_arrayMin_arrayMax_fixes.sql @@ -0,0 +1,26 @@ +-- { echoOn } +-- https://github.com/ClickHouse/ClickHouse/issues/68895 +SELECT arrayMax(x -> toFixedString('.', 1), []); + +-- https://github.com/ClickHouse/ClickHouse/issues/69600 +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; +SELECT arrayMax(x -> toUInt16(-x), [1, 2, 4]) AS res; + +-- https://github.com/ClickHouse/ClickHouse/pull/69640 +SELECT arrayMin(x1 -> (x1 * toNullable(-1)), materialize([1, 2, 3])); +SELECT arrayMin(x1 -> x1 * -1, [1,2,3]); + +DROP TABLE IF EXISTS test_aggregation_array; +CREATE TABLE test_aggregation_array (x Array(Int)) ENGINE=MergeTree() ORDER by tuple(); +INSERT INTO test_aggregation_array VALUES 
([1,2,3,4,5,6]), ([]), ([1,2,3]); + +SELECT [arrayMin(x1 -> (x1 * materialize(-1)), [toNullable(toUInt256(0)), materialize(4)])], arrayMin([arrayMin([0])]) FROM test_aggregation_array GROUP BY arrayAvg([1]), [0, toUInt256(8)] WITH CUBE SETTINGS allow_experimental_analyzer = 1; +SELECT [arrayMin([3, arrayMin([toUInt128(8)]), 4, 5]), arrayMax([materialize(1)]), arrayMin([arrayMax([1]), 2]), 2], arrayMin([0, toLowCardinality(8)]), 2, arrayMax(x1 -> (x1 * -1), x) FROM test_aggregation_array; + +select arrayMax(x -> x.1, [(1, 'a'), (0, 'b')]); +select arrayMin(x -> x.2, [(1, 'a'), (0, 'b')]); + +-- Extra validation of generic arrayMin/arrayMax +WITH [(1,2),(1,3)] AS t SELECT arrayMin(t), arrayMax(t); +WITH [map('a', 1, 'b', 2), map('a',1,'b',3)] AS t SELECT arrayMin(t), arrayMax(t); +WITH [map('a', 1, 'b', 2, 'c', 10), map('a',1,'b',3, 'c', 0)] AS t SELECT arrayMin(x -> x['c'], t), arrayMax(x -> x['c'], t); diff --git a/tests/queries/0_stateless/03247_materialized_view_select_intersect.reference b/tests/queries/0_stateless/03247_materialized_view_select_intersect.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03247_materialized_view_select_intersect.sql b/tests/queries/0_stateless/03247_materialized_view_select_intersect.sql new file mode 100644 index 00000000000..72efac0ce27 --- /dev/null +++ b/tests/queries/0_stateless/03247_materialized_view_select_intersect.sql @@ -0,0 +1 @@ +CREATE MATERIALIZED VIEW v0 AS (SELECT 1) INTERSECT (SELECT 1); --{serverError QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW} diff --git a/tests/queries/0_stateless/03248_with_fill_string_crash.reference b/tests/queries/0_stateless/03248_with_fill_string_crash.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03248_with_fill_string_crash.sql b/tests/queries/0_stateless/03248_with_fill_string_crash.sql new file mode 100644 index 00000000000..ba00640f842 --- /dev/null +++ b/tests/queries/0_stateless/03248_with_fill_string_crash.sql @@ -0,0 +1,7 @@ +CREATE TABLE users (date DateTime, name String, age Int16) ENGINE=MergeTree() ORDER BY date; + +INSERT INTO users VALUES ('2024-01-01', 'John', 33), + ('2024-02-01', 'Ksenia', 48), + ('2024-02-15', 'Alice', 50); + +SELECT * FROM users ORDER BY date WITH FILL TO '2024-02-17' STEP toIntervalHour(1); -- { serverError INVALID_WITH_FILL_EXPRESSION } diff --git a/tests/queries/0_stateless/03249_dynamic_alter_consistency.reference b/tests/queries/0_stateless/03249_dynamic_alter_consistency.reference new file mode 100644 index 00000000000..a24c35449ed --- /dev/null +++ b/tests/queries/0_stateless/03249_dynamic_alter_consistency.reference @@ -0,0 +1,2 @@ +600000 UInt64 false +400000 String true diff --git a/tests/queries/0_stateless/03249_dynamic_alter_consistency.sql b/tests/queries/0_stateless/03249_dynamic_alter_consistency.sql new file mode 100644 index 00000000000..5840866628e --- /dev/null +++ b/tests/queries/0_stateless/03249_dynamic_alter_consistency.sql @@ -0,0 +1,9 @@ +set allow_experimental_dynamic_type=1; + +drop table if exists test; +create table test (d Dynamic) engine=MergeTree order by tuple() settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1; +insert into test select number < 600000 ? 
number::Dynamic : ('str_' || number)::Dynamic from numbers(1000000); +alter table test modify column d Dynamic(max_types=1); +select count(), dynamicType(d), isDynamicElementInSharedData(d) from test group by dynamicType(d), isDynamicElementInSharedData(d); +drop table test; + diff --git a/tests/queries/0_stateless/helpers/client.py b/tests/queries/0_stateless/helpers/client.py index ac0896f2e93..b721931e46d 100644 --- a/tests/queries/0_stateless/helpers/client.py +++ b/tests/queries/0_stateless/helpers/client.py @@ -16,7 +16,10 @@ class client(object): def __init__(self, command=None, name="", log=None): self.client = uexpect.spawn(["/bin/bash", "--noediting"]) if command is None: - command = os.environ.get("CLICKHOUSE_BINARY", "clickhouse") + " client" + options = "--enable-progress-table-toggle=0" + command = ( + os.environ.get("CLICKHOUSE_BINARY", "clickhouse") + " client " + options + ) self.client.command = command self.client.eol("\r") self.client.logger(log, prefix=name) diff --git a/tests/result b/tests/result deleted file mode 100644 index b76f44f1e6a..00000000000 --- a/tests/result +++ /dev/null @@ -1,12 +0,0 @@ -Using queries from 'queries' directory -Connecting to ClickHouse server...... OK -Connected to server 24.7.1.1 @ 246f421f2402799fd11b22a608b4d0d497cb8438 chesema-processor-onCancel - -Running 1 stateless tests (MainProcess). - -00993_system_parts_race_condition_drop_zookeeper: [ OK ] - -1 tests passed. 0 tests skipped. 124.59 s elapsed (MainProcess). - -0 tests passed. 0 tests skipped. 0.00 s elapsed (MainProcess). -All tests have finished. diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 2260705323b..74d19062e53 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1422,8 +1422,6 @@ config configs conformant congruential -conjuction -conjuctive connectionId const contrib @@ -1700,7 +1698,6 @@ formatReadableSize formatReadableTimeDelta formatRow formatRowNoNewline -formated formatschema formatter formatters @@ -3052,3 +3049,89 @@ znodes zookeeperSessionUptime zstd postgres +ArrowCompression +CapnProtoEnumComparingMode +DateTimeInputFormat +DateTimeOutputFormat +DateTimeOverflowBehavior +deserialize +dotall +EachRow +EscapingRule +IdentifierQuotingRule +IdentifierQuotingStyle +IntervalOutputFormat +MsgPackUUIDRepresentation +ORCCompression +ParquetCompression +ParquetVersion +SchemaInferenceMode +alloc +CacheWarmer +conjuctive +cors +CORS +countIf +DefaultTableEngine +dereference +DistributedDDLOutputMode +DistributedProductMode +formatdatetime +inequal +INVOKER +ITION +JoinAlgorithm +JoinStrictness +keepalive +ListObject +ListObjects +LoadBalancing +LocalFSReadMethod +LogQueriesType +LogsLevel +MaxThreads +MemorySample +multibuffer +multiif +multiread +multithreading +MySQLDataTypesSupport +nonconst +NonZeroUInt +nullptr +OverflowMode +OverflowModeGroupBy +ParallelReplicasMode +param +parsedatetime +perf +PerfEventInfo +perkey +prefetched +prefetches +prefetching +preimage +QueryCacheNondeterministicFunctionHandling +QueryCacheSystemTableHandling +remerge +replcase +rerange +RetryStrategy +rowlist +SetOperationMode +ShortCircuitFunctionEvaluation +SQLSecurityType +sumIf +TCPHandler +throwif +TotalsMode +TransactionsWaitCSNMode +undelete +unmerged +DataPacket +DDLs +DistributedCacheLogMode +DistributedCachePoolBehaviourOnLimit +SharedJoin +ShareSet +unacked diff --git a/utils/check-style/check-style 
b/utils/check-style/check-style index 2737be85a91..d5753e3010b 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -14,7 +14,8 @@ LC_ALL="en_US.UTF-8" ROOT_PATH=$(git rev-parse --show-toplevel) -EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/.*.cpp|utils/keeper-bench/example.yaml' +EXCLUDE='build/|integration/|widechar_width/|glibc-compatibility/|poco/|memcpy/|consistent-hashing|benchmark|tests/.*.cpp|utils/keeper-bench/example.yaml' +EXCLUDE_DOCS='Settings\.cpp|FormatFactorySettingsDeclaration\.h' # From [1]: # But since array_to_string_internal() in array.c still loops over array @@ -31,7 +32,8 @@ function in_array() } find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | + grep -vP $EXCLUDE_DOCS | xargs grep $@ -P '((class|struct|namespace|enum|if|for|while|else|throw|switch).*|\)(\s*const)?(\s*override)?\s*)\{$|\s$|^ {1,3}[^\* ]\S|\t|^\s*(if|else if|if constexpr|else if constexpr|for|while|catch|switch)\(|\( [^\s\\]|\S \)' | # a curly brace not in a new line, but not for the case of C++11 init or agg. initialization | trailing whitespace | number of ws not a multiple of 4, but not in the case of comment continuation | missing whitespace after for/if/while... before opening brace | whitespaces inside braces grep -v -P '(//|:\s+\*|\$\(\()| \)"' @@ -39,12 +41,12 @@ find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/n # Tabs find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | - grep -vP $EXCLUDE_DIRS | - xargs grep $@ -F $'\t' + grep -vP $EXCLUDE | + xargs grep $@ -F $'\t' && echo '^ tabs are not allowed' # // namespace comments are unneeded find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep $@ -P '}\s*//+\s*namespace\s*' # Broken symlinks @@ -52,22 +54,46 @@ find -L $ROOT_PATH -type l 2>/dev/null | grep -v contrib && echo "^ Broken symli # Duplicated or incorrect setting declarations SETTINGS_FILE=$(mktemp) -cat $ROOT_PATH/src/Core/Settings.cpp $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h | grep "M(" | awk '{print substr($2, 0, length($2) - 1) " " substr($1, 3, length($1) - 3) " SettingsDeclaration" }' > ${SETTINGS_FILE} -find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep "extern const Settings" -T | awk '{print substr($5, 0, length($5) -1) " " substr($4, 9) " " substr($1, 0, length($1) - 1)}' >> ${SETTINGS_FILE} +ALL_DECLARATION_FILES=" + $ROOT_PATH/src/Core/Settings.cpp + $ROOT_PATH/src/Storages/MergeTree/MergeTreeSettings.cpp + $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h" + +cat $ROOT_PATH/src/Core/Settings.cpp $ROOT_PATH/src/Core/FormatFactorySettingsDeclaration.h | grep "M(" | awk '{print substr($2, 0, length($2) - 1) " Settings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq > ${SETTINGS_FILE} +cat $ROOT_PATH/src/Storages/MergeTree/MergeTreeSettings.cpp | grep "M(" | awk '{print substr($2, 0, length($2) - 1) " MergeTreeSettings" substr($1, 3, length($1) - 3) " SettingsDeclaration" }' | sort | uniq >> ${SETTINGS_FILE} + +# Check that if there are duplicated settings (declared in different objects) they all have the same type (it's simpler to validate style with that assert) +for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | sed -e 's/MergeTreeSettings//g' -e 
's/Settings//g' | sort | uniq | awk '{ print $1 }' | uniq -d); +do + echo "# Found multiple definitions of setting ${setting} with different types: " + grep --line-number " ${setting}," ${ALL_DECLARATION_FILES} | awk '{print " > " $0 }' +done + +# We append all uses of extern found in implementation files to validate them in a single pass and avoid reading the same files over and over +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | xargs grep -e "^\s*extern const Settings" -e "^\s**extern const MergeTreeSettings" -T | awk '{print substr($5, 0, length($5) -1) " " $4 " " substr($1, 0, length($1) - 1)}' >> ${SETTINGS_FILE} # Duplicate extern declarations for settings awk '{if (seen[$0]++) print $3 " -> " $1 ;}' ${SETTINGS_FILE} | while read line; do - echo "Found duplicated setting declaration in: $line" + echo "# Found duplicated setting declaration in: $line" done -# Incorrect declarations for settings -for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | sort | uniq | awk '{ print $1 }' | sort | uniq -d); +# Find missing declarations (obsolete settings being used) +# Note that SettingsDeclaration are first in the file +# Disabled for now pending fixing the code +#awk '{print $1 " " $3}' ${SETTINGS_FILE} | awk '{if (!seen[$1]++) print $0}' | grep -v SettingsDeclaration | while read setting; +#do +# echo "Could not find setting (maybe obsolete but used?) $setting" +#done + +# Look for settings declared with multiple types +for setting in $(awk '{print $1 " " $2}' ${SETTINGS_FILE} | sed -e 's/MergeTreeSettings//g' -e 's/Settings//g' | sort | uniq | awk '{ print $1 }' | sort | uniq -d); do + echo $setting expected=$(grep "^$setting " ${SETTINGS_FILE} | grep SettingsDeclaration | awk '{ print $2 }') grep "^$setting " ${SETTINGS_FILE} | grep -v " $expected" | awk '{ print $3 " found setting " $1 " with type " $2 }' | while read line; do - echo "In $line but it should be $expected" + echo "# In $line but it should be ${expected/$'\n'/ }" done done @@ -91,12 +117,14 @@ EXTERN_TYPES_EXCLUDES=( ProfileEvents::Timer ProfileEvents::Type ProfileEvents::TypeEnum + ProfileEvents::ValueType ProfileEvents::dumpToMapColumn ProfileEvents::getProfileEvents ProfileEvents::ThreadIdToCountersSnapshot ProfileEvents::LOCAL_NAME ProfileEvents::keeper_profile_events ProfileEvents::CountersIncrement + ProfileEvents::size CurrentMetrics::add CurrentMetrics::sub @@ -108,6 +136,7 @@ EXTERN_TYPES_EXCLUDES=( CurrentMetrics::values CurrentMetrics::Value CurrentMetrics::keeper_metrics + CurrentMetrics::size ErrorCodes::ErrorCode ErrorCodes::getName @@ -130,7 +159,7 @@ for extern_type in ${!EXTERN_TYPES[@]}; do # and this matches with zkutil::CreateMode grep -v -e 'src/Common/ZooKeeper/Types.h' -e 'src/Coordination/KeeperConstants.cpp' } | { - grep -vP $EXCLUDE_DIRS | xargs grep -l -P "extern const $type_of_extern $allowed_chars" + grep -vP $EXCLUDE | xargs grep -l -P "extern const $type_of_extern $allowed_chars" } | while read file; do grep -P "extern const $type_of_extern $allowed_chars;" $file | sed -r -e "s/^.*?extern const $type_of_extern ($allowed_chars);.*?$/\1/" | while read val; do if ! 
grep -q "$extern_type::$val" $file; then @@ -148,7 +177,7 @@ for extern_type in ${!EXTERN_TYPES[@]}; do # sed -i -r "0,/(\s*)extern const $type_of_extern [$allowed_chars]+/s//\1extern const $type_of_extern $val;\n&/" $file || \ # awk '{ print; if (ns == 1) { ns = 2 }; if (ns == 2) { ns = 0; print "namespace $extern_type\n{\n extern const $type_of_extern '$val';\n}" } }; /namespace DB/ { ns = 1; };' < $file > ${file}.tmp && mv ${file}.tmp $file ) find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | { - grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars" + grep -vP $EXCLUDE | xargs grep -l -P "$extern_type::$allowed_chars" } | while read file; do grep -P "$extern_type::$allowed_chars" $file | grep -P -v '^\s*//' | sed -r -e "s/^.*?$extern_type::($allowed_chars).*?$/\1/" | while read val; do if ! grep -q "extern const $type_of_extern $val" $file; then @@ -161,7 +190,7 @@ for extern_type in ${!EXTERN_TYPES[@]}; do # Duplicates find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | { - grep -vP $EXCLUDE_DIRS | xargs grep -l -P "$extern_type::$allowed_chars" + grep -vP $EXCLUDE | xargs grep -l -P "$extern_type::$allowed_chars" } | while read file; do grep -P "extern const $type_of_extern $allowed_chars;" $file | sort | uniq -c | grep -v -P ' +1 ' && echo "Duplicate $extern_type in file $file" done @@ -169,16 +198,16 @@ done # Three or more consecutive empty lines find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | while read file; do awk '/^$/ { ++i; if (i > 2) { print "More than two consecutive empty lines in file '$file'" } } /./ { i = 0 }' $file; done # Broken XML files (requires libxml2-utils) find $ROOT_PATH/{src,base,programs,utils} -name '*.xml' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs xmllint --noout --nonet find $ROOT_PATH -not -path $ROOT_PATH'/contrib*' \( -name '*.yaml' -or -name '*.yml' \) -type f | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs yamllint --config-file=$ROOT_PATH/.yamllint # Tests should not be named with "fail" in their names. It makes looking at the results less convenient. 
@@ -286,11 +315,11 @@ done find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL" # There shouldn't be any docker compose files outside docker directory -find $ROOT_PATH -name '*compose*.yml' -type f -not -path $ROOT_PATH'/docker' -not -path $ROOT_PATH'/tests/integration*' -not -path $ROOT_PATH'/docker*' -not -path $ROOT_PATH'/contrib*' 2>/dev/null | grep -vP $EXCLUDE_DIRS | xargs --no-run-if-empty grep -l "version:" | xargs --no-run-if-empty -n1 echo "Please move docker compose to the 'docker' or 'tests' directory:" +find $ROOT_PATH -name '*compose*.yml' -type f -not -path $ROOT_PATH'/docker' -not -path $ROOT_PATH'/tests/integration*' -not -path $ROOT_PATH'/docker*' -not -path $ROOT_PATH'/contrib*' 2>/dev/null | grep -vP $EXCLUDE | xargs --no-run-if-empty grep -l "version:" | xargs --no-run-if-empty -n1 echo "Please move docker compose to the 'docker' or 'tests' directory:" # Check that every header file has #pragma once in first line find $ROOT_PATH/{src,programs,utils} -name '*.h' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | while read file; do [[ $(head -n1 $file) != '#pragma once' ]] && echo "File $file must have '#pragma once' in first line"; done # Check for executable bit on non-executable files @@ -303,22 +332,22 @@ find $ROOT_PATH/{src,base,programs,utils,tests,docs,cmake} -name '*.md' -or -nam # Too many exclamation marks find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -F '!!!' | grep -P '.' && echo "Too many exclamation marks (looks dirty, unconfident)." # Exclamation mark in a message find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -F '!",' | grep -P '.' && echo "No need for an exclamation mark (looks dirty, unconfident)." # Trailing whitespaces find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -n -P ' $' | grep -n -P '.' && echo "^ Trailing whitespaces." # Forbid stringstream because it's easy to use them incorrectly and hard to debug possible issues find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P 'std::[io]?stringstream' | grep -v "STYLE_CHECK_ALLOW_STD_STRING_STREAM" && echo "Use WriteBufferFromOwnString or ReadBufferFromString instead of std::stringstream" # Forbid std::cerr/std::cout in src (fine in programs/utils) @@ -328,6 +357,7 @@ std_cerr_cout_excludes=( _fuzzer # OK src/Common/ProgressIndication.cpp + src/Common/ProgressTable.cpp # only under #ifdef DBMS_HASH_MAP_DEBUG_RESIZES, that is used only in tests src/Common/HashTable/HashTable.h # SensitiveDataMasker::printStats() @@ -354,7 +384,7 @@ std_cerr_cout_excludes=( ) sources_with_std_cerr_cout=( $( find $ROOT_PATH/{src,base} -name '*.h' -or -name '*.cpp' | \ - grep -vP $EXCLUDE_DIRS | \ + grep -vP $EXCLUDE | \ grep -F -v $(printf -- "-e %s " "${std_cerr_cout_excludes[@]}") | \ xargs grep -F --with-filename -e std::cerr -e std::cout | cut -d: -f1 | sort -u ) ) @@ -371,7 +401,7 @@ done # NOTE: it is not that accuate, but at least something. 
tests_with_event_time_date=( $( find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep --with-filename -e event_time -e event_date | cut -d: -f1 | sort -u ) ) for test_case in "${tests_with_event_time_date[@]}"; do @@ -427,23 +457,23 @@ find $ROOT_PATH | sort -f | uniq -i -c | awk '{ if ($1 > 1) print }' # Forbid std::filesystem::is_symlink and std::filesystem::read_symlink, because it's easy to use them incorrectly find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P '::(is|read)_symlink' | grep -v "STYLE_CHECK_ALLOW_STD_FS_SYMLINK" && echo "Use DB::FS::isSymlink and DB::FS::readSymlink instead" # Forbid __builtin_unreachable(), because it's hard to debug when it becomes reachable find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P '__builtin_unreachable' && echo "Use UNREACHABLE() from defines.h instead" # Forbid mt19937() and random_device() which are outdated and slow find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P '(std::mt19937|std::mersenne_twister_engine|std::random_device)' && echo "Use pcg64_fast (from pcg_random.h) and randomSeed (from Common/randomSeed.h) instead" # Require checking return value of close(), # since it can hide fd misuse and break other places. find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -e ' close(.*fd' -e ' ::close(' | grep -v = && echo "Return value of close() should be checked" # Check for existence of __init__.py files @@ -479,12 +509,12 @@ find $ROOT_PATH/{base,src,programs,utils,docs} -name '*.md' -or -name '*.h' -or # Wrong spelling of abbreviations, e.g. SQL is right, Sql is wrong. XMLHttpRequest is very wrong. find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -P 'Sql|Html|Xml|Cpu|Tcp|Udp|Http|Db|Json|Yaml' | grep -v -P 'RabbitMQ|Azure|Aws|aws|Avro|IO/S3' && echo "Abbreviations such as SQL, XML, HTTP, should be in all caps. For example, SQL is right, Sql is wrong. XMLHttpRequest is very wrong." find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | - grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE | xargs grep -F -i 'ErrorCodes::LOGICAL_ERROR, "Logical error:' && echo "If an exception has LOGICAL_ERROR code, there is no need to include the text 'Logical error' in the exception message, because then the phrase 'Logical error' will be printed twice." 
diff --git a/utils/check-style/check-whitespaces b/utils/check-style/check-whitespaces index 507b1dd2ede..4a17cf4e2bb 100755 --- a/utils/check-style/check-whitespaces +++ b/utils/check-style/check-whitespaces @@ -2,9 +2,11 @@ ROOT_PATH=$(git rev-parse --show-toplevel) EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|memcpy/|consistent-hashing/|Parsers/New' +EXCLUDE_FILES='Settings\.cpp|FormatFactorySettingsDeclaration\.h' NPROC=$(($(nproc) + 3)) # Double whitespaces find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' 2>/dev/null | grep -vP $EXCLUDE_DIRS | + grep -vP $EXCLUDE_FILES | xargs -P "$NPROC" -n 20 "${ROOT_PATH}/utils/check-style/double-whitespaces.pl" diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index fec72709174..5554b916c39 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v24.9.2.42-stable 2024-10-03 v24.9.1.3278-stable 2024-09-26 v24.8.4.13-lts 2024-09-06 v24.8.3.59-lts 2024-09-03