diff --git a/.clang-tidy b/.clang-tidy index e2f318562ec..219ac263ab3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -22,6 +22,7 @@ Checks: [ '-bugprone-exception-escape', '-bugprone-forward-declaration-namespace', '-bugprone-implicit-widening-of-multiplication-result', + '-bugprone-multi-level-implicit-pointer-conversion', '-bugprone-narrowing-conversions', '-bugprone-not-null-terminated-result', '-bugprone-reserved-identifier', # useful but too slow, TODO retry when https://reviews.llvm.org/rG1c282052624f9d0bd273bde0b47b30c96699c6c7 is merged @@ -98,6 +99,7 @@ Checks: [ '-modernize-use-nodiscard', '-modernize-use-trailing-return-type', + '-performance-enum-size', '-performance-inefficient-string-concatenation', '-performance-no-int-to-ptr', '-performance-avoid-endl', @@ -105,6 +107,7 @@ Checks: [ '-portability-simd-intrinsics', + '-readability-avoid-nested-conditional-operator', '-readability-avoid-unconditional-preprocessor-if', '-readability-braces-around-statements', '-readability-convert-member-functions-to-static', @@ -118,6 +121,12 @@ Checks: [ '-readability-magic-numbers', '-readability-named-parameter', '-readability-redundant-declaration', + '-readability-redundant-inline-specifier', + '-readability-redundant-member-init', # Useful but triggers another problem. Imagine a struct S with multiple String members. Structs are often instantiated via designated + # initializer S s{.s1 = [...], .s2 = [...], [...]}. In this case, compiler warning `missing-field-initializers` requires specifying all members which are not in-struct + # initialized (example: s1 in struct S { String s1; String s2{};}; is not in-struct initialized, therefore it must be specified at instantiation time). As explicitly + # specifying all members is tedious for large structs, `missing-field-initializers` makes programmers initialize as many members as possible in-struct. Clang-tidy + # warning `readability-redundant-member-init` does the opposite thing; the two are not compatible with each other. '-readability-simplify-boolean-expr', '-readability-suspicious-call-argument', '-readability-uppercase-literal-suffix', @@ -125,17 +134,6 @@ Checks: [ '-zircon-*', - # These are new in clang-18, and we have to sort them out: - '-readability-avoid-nested-conditional-operator', - '-modernize-use-designated-initializers', - '-performance-enum-size', - '-readability-redundant-inline-specifier', - '-readability-redundant-member-init', - '-bugprone-crtp-constructor-accessibility', - '-bugprone-suspicious-stringview-data-usage', - '-bugprone-multi-level-implicit-pointer-conversion', - '-cert-err33-c', - # This is a good check, but clang-tidy crashes, see https://github.com/llvm/llvm-project/issues/91872 '-modernize-use-constraints', # https://github.com/abseil/abseil-cpp/issues/1667 diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 3e0131a388a..64dc9049bc2 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -42,25 +42,25 @@ At a minimum, the following information should be added (but add more as needed) > Information about CI checks: https://clickhouse.com/docs/en/development/continuous-integration/
- Modify your CI run + CI Settings **NOTE:** If you merge the PR with modified CI, you **MUST KNOW** what you are doing **NOTE:** Checked options will be applied if set before CI RunConfig/PrepareRunConfig step -#### Include tests (required builds will be added automatically): -- [ ] Fast test +#### Run these jobs only (required builds will be added automatically): - [ ] Integration Tests - [ ] Stateless tests - [ ] Stateful tests - [ ] Unit tests - [ ] Performance tests +- [ ] All with aarch64 - [ ] All with ASAN - [ ] All with TSAN - [ ] All with Analyzer - [ ] All with Azure - [ ] Add your option here -#### Exclude tests: +#### Deny these jobs: - [ ] Fast test - [ ] Integration Tests - [ ] Stateless tests @@ -72,7 +72,6 @@ At a minimum, the following information should be added (but add more as needed) - [ ] All with UBSAN - [ ] All with Coverage - [ ] All with Aarch64 -- [ ] Add your option here #### Extra options: - [ ] do not test (only style check) diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml index 1b6cc320ec4..97aa0db4cdb 100644 --- a/.github/workflows/merge_queue.yml +++ b/.github/workflows/merge_queue.yml @@ -22,6 +22,9 @@ jobs: clear-repository: true # to ensure correct digests fetch-depth: 0 # to get version filter: tree:0 + - name: Cancel PR workflow + run: | + python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --cancel-previous-run - name: Python unit tests run: | cd "$GITHUB_WORKSPACE/tests/ci" diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 9f16e32707e..f20e987db97 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -130,15 +130,21 @@ jobs: with: stage: Tests_2 data: ${{ needs.RunConfig.outputs.data }} + # stage for jobs that do not prohibit merge + Tests_3: + needs: [RunConfig, Tests_1, Tests_2] + if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Tests_3') }} + uses: ./.github/workflows/reusable_test_stage.yml + with: + stage: Tests_3 + data: ${{ needs.RunConfig.outputs.data }} ################################# Reports ################################# - # Reports should by run even if Builds_1/2 fail, so put them separatly in wf (not in Tests_1/2) + # Reports should be run even if Builds_1/2 fail, so put them separately in wf (not in Tests_1/2) Builds_1_Report: # run report check for failed builds to indicate the CI error - if: ${{ !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'ClickHouse build check') }} - needs: - - RunConfig - - Builds_1 + if: ${{ !cancelled() && needs.StyleCheck.result == 'success' && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'ClickHouse build check') }} + needs: [RunConfig, StyleCheck, Builds_1] uses: ./.github/workflows/reusable_test.yml with: test_name: ClickHouse build check @@ -146,25 +152,39 @@ jobs: data: ${{ needs.RunConfig.outputs.data }} Builds_2_Report: # run report check for failed builds to indicate the CI error - if: ${{ !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'ClickHouse special build check') }} - needs: - - RunConfig - - Builds_2 + if: ${{ !cancelled() && needs.StyleCheck.result == 'success' && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'ClickHouse special build check') }} + needs: [RunConfig, StyleCheck, Builds_2] uses: ./.github/workflows/reusable_test.yml with: test_name: ClickHouse special build check runner_type: 
style-checker-aarch64 data: ${{ needs.RunConfig.outputs.data }} + CheckReadyForMerge: + if: ${{ !cancelled() && needs.StyleCheck.result == 'success' }} + needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Builds_2, Builds_1_Report, Builds_2_Report, Tests_1, Tests_2] + runs-on: [self-hosted, style-checker-aarch64] + steps: + - name: Check out repository code + uses: ClickHouse/checkout@v1 + with: + filter: tree:0 + - name: Check and set merge status + run: | + cd "$GITHUB_WORKSPACE/tests/ci" + python3 merge_pr.py --set-ci-status --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} + ################################# Stage Final ################################# # FinishCheck: if: ${{ !failure() && !cancelled() }} - needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Builds_2, Builds_1_Report, Builds_2_Report, Tests_1, Tests_2] + needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Builds_2, Builds_1_Report, Builds_2_Report, Tests_1, Tests_2, Tests_3] runs-on: [self-hosted, style-checker] steps: - name: Check out repository code uses: ClickHouse/checkout@v1 + with: + filter: tree:0 - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" diff --git a/CMakeLists.txt b/CMakeLists.txt index abbc48ab23a..96ba2961d3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,13 +61,16 @@ if (ENABLE_CHECK_HEAVY_BUILDS) # set CPU time limit to 1000 seconds set (RLIMIT_CPU 1000) - # Sanitizers are too heavy - if (SANITIZE OR SANITIZE_COVERAGE OR WITH_COVERAGE) - set (RLIMIT_DATA 10000000000) # 10G + # Sanitizers are too heavy. Some architectures too. + if (SANITIZE OR SANITIZE_COVERAGE OR WITH_COVERAGE OR ARCH_RISCV64 OR ARCH_LOONGARCH64) + # Twice as large + set (RLIMIT_DATA 10000000000) + set (RLIMIT_AS 20000000000) endif() - # For some files currently building RISCV64 might be too slow. TODO: Improve compilation times per file - if (ARCH_RISCV64) + # For some files currently building RISCV64/LOONGARCH64 might be too slow. + # TODO: Improve compilation times per file + if (ARCH_RISCV64 OR ARCH_LOONGARCH64) set (RLIMIT_CPU 1800) endif() diff --git a/base/base/cgroupsv2.cpp b/base/base/cgroupsv2.cpp index bea2e99fa51..f20b9daf22e 100644 --- a/base/base/cgroupsv2.cpp +++ b/base/base/cgroupsv2.cpp @@ -9,11 +9,18 @@ bool cgroupsV2Enabled() { #if defined(OS_LINUX) - /// This file exists iff the host has cgroups v2 enabled. - auto controllers_file = default_cgroups_mount / "cgroup.controllers"; - if (!std::filesystem::exists(controllers_file)) - return false; - return true; + try + { + /// This file exists iff the host has cgroups v2 enabled. + auto controllers_file = default_cgroups_mount / "cgroup.controllers"; + if (!std::filesystem::exists(controllers_file)) + return false; + return true; + } + catch (const std::filesystem::filesystem_error &) /// all "underlying OS API errors", typically: permission denied + { + return false; /// not logging the exception as most callers fall back to cgroups v1 + } #else return false; #endif diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index f8ff71876c6..dfbbb66a1e9 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. 
-SET(VERSION_REVISION 54486) +SET(VERSION_REVISION 54487) SET(VERSION_MAJOR 24) -SET(VERSION_MINOR 5) +SET(VERSION_MINOR 6) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH 6d4b31322d168356c8b10c43b4cef157c82337ff) -SET(VERSION_DESCRIBE v24.5.1.1-testing) -SET(VERSION_STRING 24.5.1.1) +SET(VERSION_GITHASH 70a1d3a63d47f0be077d67b8deb907230fc7cfb0) +SET(VERSION_DESCRIBE v24.6.1.1-testing) +SET(VERSION_STRING 24.6.1.1) # end of autochange diff --git a/contrib/libunwind b/contrib/libunwind index 854538ce337..d6a01c46327 160000 --- a/contrib/libunwind +++ b/contrib/libunwind @@ -1 +1 @@ -Subproject commit 854538ce337d631b619010528adff22cd58f9dce +Subproject commit d6a01c46327e56fd86beb8aaa31591fcd9a6b7df diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 5d53d03606f..172fbce6406 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -11,6 +11,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ aspell \ curl \ git \ + gh \ file \ libxml2-utils \ moreutils \ diff --git a/docs/_description_templates/template-setting.md b/docs/_description_templates/template-setting.md index fc912aba3e1..f4525d872df 100644 --- a/docs/_description_templates/template-setting.md +++ b/docs/_description_templates/template-setting.md @@ -2,7 +2,7 @@ Description. -For the switch setting, use the typical phrase: “Enables or disables something …”. +For the switch setting, use the typical phrase: “Enables or disables something ...”. Possible values: diff --git a/docs/changelogs/v20.7.1.4310-prestable.md b/docs/changelogs/v20.7.1.4310-prestable.md index f47c7334228..aa1d993b263 100644 --- a/docs/changelogs/v20.7.1.4310-prestable.md +++ b/docs/changelogs/v20.7.1.4310-prestable.md @@ -166,4 +166,4 @@ * NO CL ENTRY: 'Revert "Abort on std::out_of_range in debug builds"'. [#12752](https://github.com/ClickHouse/ClickHouse/pull/12752) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * NO CL ENTRY: 'Bump protobuf from 3.12.2 to 3.12.4 in /docs/tools'. [#13102](https://github.com/ClickHouse/ClickHouse/pull/13102) ([dependabot-preview[bot]](https://github.com/apps/dependabot-preview)). * NO CL ENTRY: 'Merge [#12574](https://github.com/ClickHouse/ClickHouse/issues/12574)'. [#13158](https://github.com/ClickHouse/ClickHouse/pull/13158) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* NO CL ENTRY: 'Revert "Add QueryTimeMicroseconds, SelectQueryTimeMicroseconds and InsertQuer…"'. [#13303](https://github.com/ClickHouse/ClickHouse/pull/13303) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* NO CL ENTRY: 'Revert "Add QueryTimeMicroseconds, SelectQueryTimeMicroseconds and InsertQuer..."'. [#13303](https://github.com/ClickHouse/ClickHouse/pull/13303) ([Alexey Milovidov](https://github.com/alexey-milovidov)). diff --git a/docs/changelogs/v21.12.1.9017-prestable.md b/docs/changelogs/v21.12.1.9017-prestable.md index 88b8260e312..bd84873e67a 100644 --- a/docs/changelogs/v21.12.1.9017-prestable.md +++ b/docs/changelogs/v21.12.1.9017-prestable.md @@ -421,5 +421,5 @@ sidebar_label: 2022 * Fix possible crash in DataTypeAggregateFunction [#32287](https://github.com/ClickHouse/ClickHouse/pull/32287) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). * Update backport.py [#32323](https://github.com/ClickHouse/ClickHouse/pull/32323) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix graphite-bench build [#32351](https://github.com/ClickHouse/ClickHouse/pull/32351) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). 
-* Revert "graphite: split tagged/plain rollup rules (for merges perfoma… [#32376](https://github.com/ClickHouse/ClickHouse/pull/32376) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Revert "graphite: split tagged/plain rollup rules (for merges perfoma... [#32376](https://github.com/ClickHouse/ClickHouse/pull/32376) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). * Another attempt to fix unit test Executor::RemoveTasksStress [#32390](https://github.com/ClickHouse/ClickHouse/pull/32390) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). diff --git a/docs/changelogs/v21.3.3.14-lts.md b/docs/changelogs/v21.3.3.14-lts.md index 57bde602f21..91d99deaa6b 100644 --- a/docs/changelogs/v21.3.3.14-lts.md +++ b/docs/changelogs/v21.3.3.14-lts.md @@ -18,4 +18,4 @@ sidebar_label: 2022 #### NOT FOR CHANGELOG / INSIGNIFICANT -* fix incorrect number of rows for Chunks with no columns in PartialSor… [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* fix incorrect number of rows for Chunks with no columns in PartialSor... [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). diff --git a/docs/changelogs/v21.4.1.6422-prestable.md b/docs/changelogs/v21.4.1.6422-prestable.md index 2eadb0d4754..66937c3be15 100644 --- a/docs/changelogs/v21.4.1.6422-prestable.md +++ b/docs/changelogs/v21.4.1.6422-prestable.md @@ -223,7 +223,7 @@ sidebar_label: 2022 * Do not overlap zookeeper path for ReplicatedMergeTree in stateless *.sh tests [#21724](https://github.com/ClickHouse/ClickHouse/pull/21724) ([Azat Khuzhin](https://github.com/azat)). * make the fuzzer use sources from the CI [#21754](https://github.com/ClickHouse/ClickHouse/pull/21754) ([Alexander Kuzmenkov](https://github.com/akuzm)). * Add one more variant to memcpy benchmark [#21759](https://github.com/ClickHouse/ClickHouse/pull/21759) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* fix incorrect number of rows for Chunks with no columns in PartialSor… [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* fix incorrect number of rows for Chunks with no columns in PartialSor... [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). * docs(fix): typo [#21775](https://github.com/ClickHouse/ClickHouse/pull/21775) ([Ali Demirci](https://github.com/depyronick)). * DDLWorker.cpp: fixed exceeded amount of tries typo [#21807](https://github.com/ClickHouse/ClickHouse/pull/21807) ([Eldar Nasyrov](https://github.com/3ldar-nasyrov)). * fix integration MaterializeMySQL test [#21819](https://github.com/ClickHouse/ClickHouse/pull/21819) ([TCeason](https://github.com/TCeason)). diff --git a/docs/changelogs/v21.4.2.10-prestable.md b/docs/changelogs/v21.4.2.10-prestable.md index 3db17ddfcf3..b9bdbd80c0c 100644 --- a/docs/changelogs/v21.4.2.10-prestable.md +++ b/docs/changelogs/v21.4.2.10-prestable.md @@ -226,7 +226,7 @@ sidebar_label: 2022 * Do not overlap zookeeper path for ReplicatedMergeTree in stateless *.sh tests [#21724](https://github.com/ClickHouse/ClickHouse/pull/21724) ([Azat Khuzhin](https://github.com/azat)). * make the fuzzer use sources from the CI [#21754](https://github.com/ClickHouse/ClickHouse/pull/21754) ([Alexander Kuzmenkov](https://github.com/akuzm)). 
* Add one more variant to memcpy benchmark [#21759](https://github.com/ClickHouse/ClickHouse/pull/21759) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* fix incorrect number of rows for Chunks with no columns in PartialSor… [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* fix incorrect number of rows for Chunks with no columns in PartialSor... [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). * docs(fix): typo [#21775](https://github.com/ClickHouse/ClickHouse/pull/21775) ([Ali Demirci](https://github.com/depyronick)). * DDLWorker.cpp: fixed exceeded amount of tries typo [#21807](https://github.com/ClickHouse/ClickHouse/pull/21807) ([Eldar Nasyrov](https://github.com/3ldar-nasyrov)). * fix integration MaterializeMySQL test [#21819](https://github.com/ClickHouse/ClickHouse/pull/21819) ([TCeason](https://github.com/TCeason)). diff --git a/docs/changelogs/v22.6.1.1985-stable.md b/docs/changelogs/v22.6.1.1985-stable.md index c915d24fe00..7bd7038377a 100644 --- a/docs/changelogs/v22.6.1.1985-stable.md +++ b/docs/changelogs/v22.6.1.1985-stable.md @@ -160,7 +160,7 @@ sidebar_label: 2022 * fix toString error on DatatypeDate32. [#37775](https://github.com/ClickHouse/ClickHouse/pull/37775) ([LiuNeng](https://github.com/liuneng1994)). * The clickhouse-keeper setting `dead_session_check_period_ms` was transformed into microseconds (multiplied by 1000), which lead to dead sessions only being cleaned up after several minutes (instead of 500ms). [#37824](https://github.com/ClickHouse/ClickHouse/pull/37824) ([Michael Lex](https://github.com/mlex)). * Fix possible "No more packets are available" for distributed queries (in case of `async_socket_for_remote`/`use_hedged_requests` is disabled). [#37826](https://github.com/ClickHouse/ClickHouse/pull/37826) ([Azat Khuzhin](https://github.com/azat)). -* Do not drop the inner target table when executing `ALTER TABLE … MODIFY QUERY` in WindowView. [#37879](https://github.com/ClickHouse/ClickHouse/pull/37879) ([vxider](https://github.com/Vxider)). +* Do not drop the inner target table when executing `ALTER TABLE ... MODIFY QUERY` in WindowView. [#37879](https://github.com/ClickHouse/ClickHouse/pull/37879) ([vxider](https://github.com/Vxider)). * Fix directory ownership of coordination dir in clickhouse-keeper Docker image. Fixes [#37914](https://github.com/ClickHouse/ClickHouse/issues/37914). [#37915](https://github.com/ClickHouse/ClickHouse/pull/37915) ([James Maidment](https://github.com/jamesmaidment)). * Dictionaries fix custom query with update field and `{condition}`. Closes [#33746](https://github.com/ClickHouse/ClickHouse/issues/33746). [#37947](https://github.com/ClickHouse/ClickHouse/pull/37947) ([Maksim Kita](https://github.com/kitaisreal)). * Fix possible incorrect result of `SELECT ... WITH FILL` in the case when `ORDER BY` should be applied after `WITH FILL` result (e.g. for outer query). Incorrect result was caused by optimization for `ORDER BY` expressions ([#35623](https://github.com/ClickHouse/ClickHouse/issues/35623)). Closes [#37904](https://github.com/ClickHouse/ClickHouse/issues/37904). [#37959](https://github.com/ClickHouse/ClickHouse/pull/37959) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). @@ -180,7 +180,7 @@ sidebar_label: 2022 #### NO CL ENTRY * NO CL ENTRY: 'Revert "Fix mutations in tables with columns of type `Object`"'. 
[#37355](https://github.com/ClickHouse/ClickHouse/pull/37355) ([Alexander Tokmakov](https://github.com/tavplubix)). -* NO CL ENTRY: 'Revert "Remove height restrictions from the query div in play web tool, and m…"'. [#37501](https://github.com/ClickHouse/ClickHouse/pull/37501) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* NO CL ENTRY: 'Revert "Remove height restrictions from the query div in play web tool, and m..."'. [#37501](https://github.com/ClickHouse/ClickHouse/pull/37501) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * NO CL ENTRY: 'Revert "Add support for preprocessing ZooKeeper operations in `clickhouse-keeper`"'. [#37534](https://github.com/ClickHouse/ClickHouse/pull/37534) ([Antonio Andelic](https://github.com/antonio2368)). * NO CL ENTRY: 'Revert "(only with zero-copy replication, non-production experimental feature not recommended to use) fix possible deadlock during fetching part"'. [#37545](https://github.com/ClickHouse/ClickHouse/pull/37545) ([Alexander Tokmakov](https://github.com/tavplubix)). * NO CL ENTRY: 'Revert "RFC: Fix converting types for UNION queries (may produce LOGICAL_ERROR)"'. [#37582](https://github.com/ClickHouse/ClickHouse/pull/37582) ([Dmitry Novik](https://github.com/novikd)). diff --git a/docs/changelogs/v22.7.1.2484-stable.md b/docs/changelogs/v22.7.1.2484-stable.md index 7464b0449ee..c4a76c66e0c 100644 --- a/docs/changelogs/v22.7.1.2484-stable.md +++ b/docs/changelogs/v22.7.1.2484-stable.md @@ -410,7 +410,7 @@ sidebar_label: 2022 * Add test for [#39132](https://github.com/ClickHouse/ClickHouse/issues/39132) [#39173](https://github.com/ClickHouse/ClickHouse/pull/39173) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Suppression for BC check (`Cannot parse string 'Hello' as UInt64`) [#39176](https://github.com/ClickHouse/ClickHouse/pull/39176) ([Alexander Tokmakov](https://github.com/tavplubix)). * Fix 01961_roaring_memory_tracking test [#39187](https://github.com/ClickHouse/ClickHouse/pull/39187) ([Dmitry Novik](https://github.com/novikd)). -* Cleanup: done during [#38719](https://github.com/ClickHouse/ClickHouse/issues/38719) (SortingStep: deduce way to sort based on … [#39191](https://github.com/ClickHouse/ClickHouse/pull/39191) ([Igor Nikonov](https://github.com/devcrafter)). +* Cleanup: done during [#38719](https://github.com/ClickHouse/ClickHouse/issues/38719) (SortingStep: deduce way to sort based on ... [#39191](https://github.com/ClickHouse/ClickHouse/pull/39191) ([Igor Nikonov](https://github.com/devcrafter)). * Fix exception in AsynchronousMetrics for s390x [#39193](https://github.com/ClickHouse/ClickHouse/pull/39193) ([Harry Lee](https://github.com/HarryLeeIBM)). * Optimize accesses to system.stack_trace (filter by name before sending signal) [#39212](https://github.com/ClickHouse/ClickHouse/pull/39212) ([Azat Khuzhin](https://github.com/azat)). * Enable warning "-Wdeprecated-dynamic-exception-spec" [#39213](https://github.com/ClickHouse/ClickHouse/pull/39213) ([Robert Schulze](https://github.com/rschu1ze)). diff --git a/docs/changelogs/v22.8.13.20-lts.md b/docs/changelogs/v22.8.13.20-lts.md index 0734f40bf3e..ad44fbfc5d6 100644 --- a/docs/changelogs/v22.8.13.20-lts.md +++ b/docs/changelogs/v22.8.13.20-lts.md @@ -20,4 +20,4 @@ sidebar_label: 2023 * Fix wrong approved_at, simplify conditions [#45302](https://github.com/ClickHouse/ClickHouse/pull/45302) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). 
* Get rid of artifactory in favor of r2 + ch-repos-manager [#45421](https://github.com/ClickHouse/ClickHouse/pull/45421) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). * Trim refs/tags/ from GITHUB_TAG in release workflow [#45636](https://github.com/ClickHouse/ClickHouse/pull/45636) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Merge pull request [#38262](https://github.com/ClickHouse/ClickHouse/issues/38262) from PolyProgrammist/fix-ordinary-system-un… [#45650](https://github.com/ClickHouse/ClickHouse/pull/45650) ([alesapin](https://github.com/alesapin)). +* Merge pull request [#38262](https://github.com/ClickHouse/ClickHouse/issues/38262) from PolyProgrammist/fix-ordinary-system-un... [#45650](https://github.com/ClickHouse/ClickHouse/pull/45650) ([alesapin](https://github.com/alesapin)). diff --git a/docs/changelogs/v23.11.1.2711-stable.md b/docs/changelogs/v23.11.1.2711-stable.md index e32dee41dc7..0bdee08f5c9 100644 --- a/docs/changelogs/v23.11.1.2711-stable.md +++ b/docs/changelogs/v23.11.1.2711-stable.md @@ -217,7 +217,7 @@ sidebar_label: 2023 * S3Queue minor fix [#56999](https://github.com/ClickHouse/ClickHouse/pull/56999) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix file path validation for DatabaseFileSystem [#57029](https://github.com/ClickHouse/ClickHouse/pull/57029) ([San](https://github.com/santrancisco)). * Fix `fuzzBits` with `ARRAY JOIN` [#57033](https://github.com/ClickHouse/ClickHouse/pull/57033) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix Nullptr dereference in partial merge join with joined_subquery_re… [#57048](https://github.com/ClickHouse/ClickHouse/pull/57048) ([vdimir](https://github.com/vdimir)). +* Fix Nullptr dereference in partial merge join with joined_subquery_re... [#57048](https://github.com/ClickHouse/ClickHouse/pull/57048) ([vdimir](https://github.com/vdimir)). * Fix race condition in RemoteSource [#57052](https://github.com/ClickHouse/ClickHouse/pull/57052) ([Raúl Marín](https://github.com/Algunenano)). * Implement `bitHammingDistance` for big integers [#57073](https://github.com/ClickHouse/ClickHouse/pull/57073) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * S3-style links bug fix [#57075](https://github.com/ClickHouse/ClickHouse/pull/57075) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). diff --git a/docs/changelogs/v23.12.1.1368-stable.md b/docs/changelogs/v23.12.1.1368-stable.md index 1a322ae9c0f..cb8ba57100e 100644 --- a/docs/changelogs/v23.12.1.1368-stable.md +++ b/docs/changelogs/v23.12.1.1368-stable.md @@ -272,7 +272,7 @@ sidebar_label: 2023 * Bump Azure to v1.6.0 [#58052](https://github.com/ClickHouse/ClickHouse/pull/58052) ([Robert Schulze](https://github.com/rschu1ze)). * Correct values for randomization [#58058](https://github.com/ClickHouse/ClickHouse/pull/58058) ([Anton Popov](https://github.com/CurtizJ)). * Non post request should be readonly [#58060](https://github.com/ClickHouse/ClickHouse/pull/58060) ([San](https://github.com/santrancisco)). -* Revert "Merge pull request [#55710](https://github.com/ClickHouse/ClickHouse/issues/55710) from guoxiaolongzte/clickhouse-test… [#58066](https://github.com/ClickHouse/ClickHouse/pull/58066) ([Raúl Marín](https://github.com/Algunenano)). +* Revert "Merge pull request [#55710](https://github.com/ClickHouse/ClickHouse/issues/55710) from guoxiaolongzte/clickhouse-test... [#58066](https://github.com/ClickHouse/ClickHouse/pull/58066) ([Raúl Marín](https://github.com/Algunenano)). 
* fix typo in the test 02479 [#58072](https://github.com/ClickHouse/ClickHouse/pull/58072) ([Sema Checherinda](https://github.com/CheSema)). * Bump Azure to 1.7.2 [#58075](https://github.com/ClickHouse/ClickHouse/pull/58075) ([Robert Schulze](https://github.com/rschu1ze)). * Fix flaky test `02567_and_consistency` [#58076](https://github.com/ClickHouse/ClickHouse/pull/58076) ([Anton Popov](https://github.com/CurtizJ)). diff --git a/docs/changelogs/v23.3.1.2823-lts.md b/docs/changelogs/v23.3.1.2823-lts.md index 0c9be3601da..f81aba53ebe 100644 --- a/docs/changelogs/v23.3.1.2823-lts.md +++ b/docs/changelogs/v23.3.1.2823-lts.md @@ -520,7 +520,7 @@ sidebar_label: 2023 * Improve script for updating clickhouse-docs [#48135](https://github.com/ClickHouse/ClickHouse/pull/48135) ([Alexander Tokmakov](https://github.com/tavplubix)). * Fix stdlib compatibility issues [#48150](https://github.com/ClickHouse/ClickHouse/pull/48150) ([DimasKovas](https://github.com/DimasKovas)). * Make test test_disallow_concurrency less flaky [#48152](https://github.com/ClickHouse/ClickHouse/pull/48152) ([Vitaly Baranov](https://github.com/vitlibar)). -* Remove unused mockSystemDatabase from gtest_transform_query_for_exter… [#48162](https://github.com/ClickHouse/ClickHouse/pull/48162) ([Vladimir C](https://github.com/vdimir)). +* Remove unused mockSystemDatabase from gtest_transform_query_for_exter... [#48162](https://github.com/ClickHouse/ClickHouse/pull/48162) ([Vladimir C](https://github.com/vdimir)). * Update environmental-sensors.md [#48166](https://github.com/ClickHouse/ClickHouse/pull/48166) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Correctly handle NULL constants in logical optimizer for new analyzer [#48168](https://github.com/ClickHouse/ClickHouse/pull/48168) ([Antonio Andelic](https://github.com/antonio2368)). * Try making KeeperMap test more stable [#48170](https://github.com/ClickHouse/ClickHouse/pull/48170) ([Antonio Andelic](https://github.com/antonio2368)). diff --git a/docs/changelogs/v23.5.1.3174-stable.md b/docs/changelogs/v23.5.1.3174-stable.md index 2212eb6e893..4bdd4139afc 100644 --- a/docs/changelogs/v23.5.1.3174-stable.md +++ b/docs/changelogs/v23.5.1.3174-stable.md @@ -474,7 +474,7 @@ sidebar_label: 2023 * Fix flakiness of test_distributed_load_balancing test [#49921](https://github.com/ClickHouse/ClickHouse/pull/49921) ([Azat Khuzhin](https://github.com/azat)). * Add some logging [#49925](https://github.com/ClickHouse/ClickHouse/pull/49925) ([Kseniia Sumarokova](https://github.com/kssenii)). * Support hardlinking parts transactionally [#49931](https://github.com/ClickHouse/ClickHouse/pull/49931) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix for analyzer: 02377_ optimize_sorting_by_input_stream_properties_e… [#49943](https://github.com/ClickHouse/ClickHouse/pull/49943) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix for analyzer: 02377_ optimize_sorting_by_input_stream_properties_e... [#49943](https://github.com/ClickHouse/ClickHouse/pull/49943) ([Igor Nikonov](https://github.com/devcrafter)). * Follow up to [#49429](https://github.com/ClickHouse/ClickHouse/issues/49429) [#49964](https://github.com/ClickHouse/ClickHouse/pull/49964) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix flaky test_ssl_cert_authentication to use urllib3 [#49982](https://github.com/ClickHouse/ClickHouse/pull/49982) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). 
* Fix woboq codebrowser build with -Wno-poison-system-directories [#49992](https://github.com/ClickHouse/ClickHouse/pull/49992) ([Azat Khuzhin](https://github.com/azat)). diff --git a/docs/changelogs/v23.8.1.2992-lts.md b/docs/changelogs/v23.8.1.2992-lts.md index 7c224b19350..05385d9c52b 100644 --- a/docs/changelogs/v23.8.1.2992-lts.md +++ b/docs/changelogs/v23.8.1.2992-lts.md @@ -272,7 +272,7 @@ sidebar_label: 2023 * Add more checks into ThreadStatus ctor. [#42019](https://github.com/ClickHouse/ClickHouse/pull/42019) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Refactor Query Tree visitor [#46740](https://github.com/ClickHouse/ClickHouse/pull/46740) ([Dmitry Novik](https://github.com/novikd)). * Revert "Revert "Randomize JIT settings in tests"" [#48282](https://github.com/ClickHouse/ClickHouse/pull/48282) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix outdated cache configuration in s3 tests: s3_storage_policy_by_defau… [#48424](https://github.com/ClickHouse/ClickHouse/pull/48424) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix outdated cache configuration in s3 tests: s3_storage_policy_by_defau... [#48424](https://github.com/ClickHouse/ClickHouse/pull/48424) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix IN with decimal in analyzer [#48754](https://github.com/ClickHouse/ClickHouse/pull/48754) ([vdimir](https://github.com/vdimir)). * Some unclear change in StorageBuffer::reschedule() for something [#49723](https://github.com/ClickHouse/ClickHouse/pull/49723) ([DimasKovas](https://github.com/DimasKovas)). * MergeTree & SipHash checksum big-endian support [#50276](https://github.com/ClickHouse/ClickHouse/pull/50276) ([ltrk2](https://github.com/ltrk2)). diff --git a/docs/changelogs/v24.1.3.31-stable.md b/docs/changelogs/v24.1.3.31-stable.md index 046ca451fbc..e898fba5c87 100644 --- a/docs/changelogs/v24.1.3.31-stable.md +++ b/docs/changelogs/v24.1.3.31-stable.md @@ -13,7 +13,7 @@ sidebar_label: 2024 #### Bug Fix (user-visible misbehavior in an official stable release) -* Fix `ASTAlterCommand::formatImpl` in case of column specific settings… [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix `ASTAlterCommand::formatImpl` in case of column specific settings... [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). * Make MAX use the same rules as permutation for complex types [#59498](https://github.com/ClickHouse/ClickHouse/pull/59498) ([Raúl Marín](https://github.com/Algunenano)). * Fix corner case when passing `update_insert_deduplication_token_in_dependent_materialized_views` [#59544](https://github.com/ClickHouse/ClickHouse/pull/59544) ([Jordi Villar](https://github.com/jrdi)). * Fix incorrect result of arrayElement / map[] on empty value [#59594](https://github.com/ClickHouse/ClickHouse/pull/59594) ([Raúl Marín](https://github.com/Algunenano)). diff --git a/docs/changelogs/v24.2.1.2248-stable.md b/docs/changelogs/v24.2.1.2248-stable.md index 6113dd51ab1..02affe12c43 100644 --- a/docs/changelogs/v24.2.1.2248-stable.md +++ b/docs/changelogs/v24.2.1.2248-stable.md @@ -130,7 +130,7 @@ sidebar_label: 2024 * Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)). 
* Fix digest calculation in Keeper [#59439](https://github.com/ClickHouse/ClickHouse/pull/59439) ([Antonio Andelic](https://github.com/antonio2368)). * Fix stacktraces for binaries without debug symbols [#59444](https://github.com/ClickHouse/ClickHouse/pull/59444) ([Azat Khuzhin](https://github.com/azat)). -* Fix `ASTAlterCommand::formatImpl` in case of column specific settings… [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix `ASTAlterCommand::formatImpl` in case of column specific settings... [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). * Fix `SELECT * FROM [...] ORDER BY ALL` with Analyzer [#59462](https://github.com/ClickHouse/ClickHouse/pull/59462) ([zhongyuankai](https://github.com/zhongyuankai)). * Fix possible uncaught exception during distributed query cancellation [#59487](https://github.com/ClickHouse/ClickHouse/pull/59487) ([Azat Khuzhin](https://github.com/azat)). * Make MAX use the same rules as permutation for complex types [#59498](https://github.com/ClickHouse/ClickHouse/pull/59498) ([Raúl Marín](https://github.com/Algunenano)). diff --git a/docs/changelogs/v24.3.1.2672-lts.md b/docs/changelogs/v24.3.1.2672-lts.md index e5d008680a8..006ab941203 100644 --- a/docs/changelogs/v24.3.1.2672-lts.md +++ b/docs/changelogs/v24.3.1.2672-lts.md @@ -526,7 +526,7 @@ sidebar_label: 2024 * No "please" [#61916](https://github.com/ClickHouse/ClickHouse/pull/61916) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Update version_date.tsv and changelogs after v23.12.6.19-stable [#61917](https://github.com/ClickHouse/ClickHouse/pull/61917) ([robot-clickhouse](https://github.com/robot-clickhouse)). * Update version_date.tsv and changelogs after v24.1.8.22-stable [#61918](https://github.com/ClickHouse/ClickHouse/pull/61918) ([robot-clickhouse](https://github.com/robot-clickhouse)). -* Fix flaky test_broken_projestions/test.py::test_broken_ignored_replic… [#61932](https://github.com/ClickHouse/ClickHouse/pull/61932) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix flaky test_broken_projestions/test.py::test_broken_ignored_replic... [#61932](https://github.com/ClickHouse/ClickHouse/pull/61932) ([Kseniia Sumarokova](https://github.com/kssenii)). * Check is Rust avaiable for build, if not, suggest a way to disable Rust support [#61938](https://github.com/ClickHouse/ClickHouse/pull/61938) ([Azat Khuzhin](https://github.com/azat)). * CI: new ci menu in PR body [#61948](https://github.com/ClickHouse/ClickHouse/pull/61948) ([Max K.](https://github.com/maxknv)). * Remove flaky test `01193_metadata_loading` [#61961](https://github.com/ClickHouse/ClickHouse/pull/61961) ([Nikita Taranov](https://github.com/nickitat)). diff --git a/docs/en/development/style.md b/docs/en/development/style.md index d201bbb0d3c..0f097d27607 100644 --- a/docs/en/development/style.md +++ b/docs/en/development/style.md @@ -57,7 +57,7 @@ memcpy(&buf[place_value], &x, sizeof(x)); for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** Add spaces around binary operators (`+`, `-`, `*`, `/`, `%`, …) and the ternary operator `?:`. +**7.** Add spaces around binary operators (`+`, `-`, `*`, `/`, `%`, ...) and the ternary operator `?:`. 
``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); @@ -86,7 +86,7 @@ dst.ClickGoodEvent = click.GoodEvent; If necessary, the operator can be wrapped to the next line. In this case, the offset in front of it is increased. -**11.** Do not use a space to separate unary operators (`--`, `++`, `*`, `&`, …) from the argument. +**11.** Do not use a space to separate unary operators (`--`, `++`, `*`, `&`, ...) from the argument. **12.** Put a space after a comma, but not before it. The same rule goes for a semicolon inside a `for` expression. @@ -115,7 +115,7 @@ public: **16.** If the same `namespace` is used for the entire file, and there isn’t anything else significant, an offset is not necessary inside `namespace`. -**17.** If the block for an `if`, `for`, `while`, or other expression consists of a single `statement`, the curly brackets are optional. Place the `statement` on a separate line, instead. This rule is also valid for nested `if`, `for`, `while`, … +**17.** If the block for an `if`, `for`, `while`, or other expression consists of a single `statement`, the curly brackets are optional. Place the `statement` on a separate line, instead. This rule is also valid for nested `if`, `for`, `while`, ... But if the inner `statement` contains curly brackets or `else`, the external block should be written in curly brackets. diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index dbd1c270a4a..2749fa7e479 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -118,7 +118,7 @@ If the listing of files contains number ranges with leading zeros, use the const **Example** -Create table with files named `file000`, `file001`, … , `file999`: +Create table with files named `file000`, `file001`, ... , `file999`: ``` sql CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index dfa06801d04..cb1da1c8e68 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -178,7 +178,7 @@ If the listing of files contains number ranges with leading zeros, use the const **Example with wildcards 1** -Create table with files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Create table with files named `file-000.csv`, `file-001.csv`, ... , `file-999.csv`: ``` sql CREATE TABLE big_table (name String, value UInt32) diff --git a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md index 23d98d4b20e..eda87fd06c1 100644 --- a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -71,7 +71,7 @@ WHERE table = 'visits' └───────────┴───────────────────┴────────┘ ``` -The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md) queries. +The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. 
You can use this column value to specify the partition name in [ALTER ... PARTITION](../../../sql-reference/statements/alter/partition.md) queries. The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 7862eef69f8..a009c4a32f3 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -954,7 +954,7 @@ In the case of `MergeTree` tables, data is getting to disk in different ways: - As a result of an insert (`INSERT` query). - During background merges and [mutations](/docs/en/sql-reference/statements/alter/index.md#alter-mutations). - When downloading from another replica. -- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](/docs/en/sql-reference/statements/alter/partition.md/#alter_freeze-partition). +- As a result of partition freezing [ALTER TABLE ... FREEZE PARTITION](/docs/en/sql-reference/statements/alter/partition.md/#alter_freeze-partition). In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy: @@ -966,7 +966,7 @@ Under the hood, mutations and partition freezing make use of [hard links](https: In the background, parts are moved between volumes on the basis of the amount of free space (`move_factor` parameter) according to the order the volumes are declared in the configuration file. Data is never transferred from the last one and into the first one. One may use system tables [system.part_log](/docs/en/operations/system-tables/part_log.md/#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](/docs/en/operations/system-tables/parts.md/#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs. -User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](/docs/en/sql-reference/statements/alter/partition.md/#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met. +User can force moving a part or a partition from one volume to another using the query [ALTER TABLE ... MOVE PART\|PARTITION ... TO VOLUME\|DISK ...](/docs/en/sql-reference/statements/alter/partition.md/#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met. Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas. 
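For illustration, a minimal sketch of such a forced move, assuming a hypothetical table `visits` whose storage policy defines a volume named `external` and a disk named `s3_disk` (these names are illustrative, not taken from the document):

``` sql
-- Move a whole partition to another volume; the query performs the move itself
-- rather than waiting for the background mover.
ALTER TABLE visits MOVE PARTITION '201902' TO VOLUME 'external';

-- Move a single data part to a specific disk.
ALTER TABLE visits MOVE PART 'all_1_1_0' TO DISK 's3_disk';
```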
diff --git a/docs/en/engines/table-engines/special/external-data.md b/docs/en/engines/table-engines/special/external-data.md index 7ea3f3e30d6..f6d6dae7eb6 100644 --- a/docs/en/engines/table-engines/special/external-data.md +++ b/docs/en/engines/table-engines/special/external-data.md @@ -29,7 +29,7 @@ Only a single table can be retrieved from stdin. The following parameters are optional: **–name**– Name of the table. If omitted, _data is used. **–format** – Data format in the file. If omitted, TabSeparated is used. -One of the following parameters is required:**–types** – A list of comma-separated column types. For example: `UInt64,String`. The columns will be named _1, _2, … +One of the following parameters is required: **–types** – A list of comma-separated column types. For example: `UInt64,String`. The columns will be named _1, _2, ... **–structure**– The table structure in the format`UserID UInt64`, `URL String`. Defines the column names and types. The files specified in ‘file’ will be parsed by the format specified in ‘format’, using the data types specified in ‘types’ or ‘structure’. The table will be uploaded to the server and accessible there as a temporary table with the name in ‘name’. diff --git a/docs/en/engines/table-engines/special/file.md b/docs/en/engines/table-engines/special/file.md index fdf5242ba3b..0d422f64762 100644 --- a/docs/en/engines/table-engines/special/file.md +++ b/docs/en/engines/table-engines/special/file.md @@ -14,6 +14,10 @@ Usage scenarios: - Convert data from one format to another. - Updating data in ClickHouse via editing a file on a disk. +:::note +This engine is not currently available in ClickHouse Cloud; please [use the S3 table function instead](/docs/en/sql-reference/table-functions/s3.md). +::: + ## Usage in ClickHouse Server {#usage-in-clickhouse-server} ``` sql diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a137eb2bdf2..66d5bd2e574 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -197,6 +197,7 @@ SELECT * FROM nestedt FORMAT TSV - [input_format_tsv_enum_as_number](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_enum_as_number) - treat inserted enum values in TSV formats as enum indices. Default value - `false`. - [input_format_tsv_use_best_effort_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_use_best_effort_in_schema_inference) - use some tweaks and heuristics to infer schema in TSV format. If disabled, all fields will be inferred as Strings. Default value - `true`. - [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`. +- [input_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV input format will be `\r\n` instead of `\n`. Default value - `false`. - [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`. - [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`. 
- [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index 2ba50b39934..46c24ad8491 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -22,7 +22,7 @@ description: In order to effectively mitigate possible human errors, you should TEMPORARY TABLE table_name [AS table_name_in_backup] | VIEW view_name [AS view_name_in_backup] ALL TEMPORARY TABLES [EXCEPT ...] | - ALL DATABASES [EXCEPT ...] } [,...] + ALL [EXCEPT ...] } [,...] [ON CLUSTER 'cluster_name'] TO|FROM File('<path>/<filename>') | Disk('<disk_name>', '<path>/') | S3('<S3 endpoint>/<path>', '<Access key ID>', '<Secret access key>') [SETTINGS base_backup = File('<path>/<filename>') | Disk(...) | S3('<S3 endpoint>/<path>', '<Access key ID>', '<Secret access key>')] diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 28831404a1f..a5fe74fd0c6 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -561,6 +561,25 @@ Default value: 5000 <max_table_num_to_warn>400</max_table_num_to_warn> ``` +## max\_view\_num\_to\_warn {#max-view-num-to-warn} +If the number of attached views exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table. +Default value: 10000 + +**Example** + +``` xml +<max_view_num_to_warn>400</max_view_num_to_warn> +``` + +## max\_dictionary\_num\_to\_warn {#max-dictionary-num-to-warn} +If the number of attached dictionaries exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table. +Default value: 1000 + +**Example** + +``` xml +<max_dictionary_num_to_warn>400</max_dictionary_num_to_warn> +``` ## max\_part\_num\_to\_warn {#max-part-num-to-warn} If the number of active parts exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table. diff --git a/docs/en/operations/settings/query-complexity.md b/docs/en/operations/settings/query-complexity.md index d86f18ff982..2a20e74e20f 100644 --- a/docs/en/operations/settings/query-complexity.md +++ b/docs/en/operations/settings/query-complexity.md @@ -303,7 +303,7 @@ What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘ Limits the number of rows in the hash table that is used when joining tables. -This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. +This setting applies to [SELECT ... JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. If a query contains multiple joins, ClickHouse checks this setting for every intermediate result. @@ -320,7 +320,7 @@ Default value: 0. Limits the size in bytes of the hash table used when joining tables. -This setting applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). +This setting applies to [SELECT ... JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). If the query contains joins, ClickHouse checks this setting for every intermediate result.
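As a minimal sketch, both join limits can be applied per session or per query; the table and column names below are hypothetical:

``` sql
-- Fail any query whose join hash table exceeds 100 million rows or ~10 GB.
SET max_rows_in_join = 100000000;
SET max_bytes_in_join = 10000000000;
SET join_overflow_mode = 'throw';

SELECT t1.id, t2.value
FROM t1
INNER JOIN t2 ON t1.id = t2.id;
```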
diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 6666f68c177..1a27b350652 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -831,7 +831,13 @@ Default value: `0`. ### output_format_tsv_crlf_end_of_line {#output_format_tsv_crlf_end_of_line} -Use DOC/Windows-style line separator (CRLF) in TSV instead of Unix style (LF). +Use DOS/Windows-style line separator (CRLF) in TSV instead of Unix style (LF). + +Disabled by default. + +### input_format_tsv_crlf_end_of_line {#input_format_tsv_crlf_end_of_line} + +Use DOS/Windows-style line separator (CRLF) for TSV input files instead of Unix style (LF). Disabled by default. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 91b544c6a82..2b5cd11819a 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -2248,7 +2248,7 @@ Default value: 0. ## count_distinct_implementation {#count_distinct_implementation} -Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction. +Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction. Possible values: diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 7005783dd60..53ecd66396d 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -7,27 +7,27 @@ title: "External Disks for Storing Data" Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely. Various storages are supported: 1. [Amazon S3](https://aws.amazon.com/s3/) object storage. -2. The Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)) -3. [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs). +2. [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs). +3. Unsupported: The Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)) :::note ClickHouse also has support for external table engines, which are different from external storage option described on this page as they allow to read data stored in some general file format (like Parquet), while on this page we are describing storage configuration for ClickHouse `MergeTree` family or `Log` family tables. 1. to work with data stored on `Amazon S3` disks, use [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine. -2. to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine. -3. to work with data stored in Azure Blob Storage use [AzureBlobStorage](/docs/en/engines/table-engines/integrations/azureBlobStorage.md) table engine. +2. to work with data stored in Azure Blob Storage use [AzureBlobStorage](/docs/en/engines/table-engines/integrations/azureBlobStorage.md) table engine. +3. 
Unsupported: to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine. ::: ## Configuring external storage {#configuring-external-storage} -[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) family table engines can store data to `S3`, `AzureBlobStorage`, `HDFS` using a disk with types `s3`, `azure_blob_storage`, `hdfs` accordingly. +[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) family table engines can store data to `S3`, `AzureBlobStorage`, `HDFS` (unsupported) using a disk with types `s3`, `azure_blob_storage`, `hdfs` (unsupported) accordingly. Disk configuration requires: -1. `type` section, equal to one of `s3`, `azure_blob_storage`, `hdfs`, `local_blob_storage`, `web`. +1. `type` section, equal to one of `s3`, `azure_blob_storage`, `hdfs` (unsupported), `local_blob_storage`, `web`. 2. Configuration of a specific external storage type. Starting from 24.1 clickhouse version, it is possible to use a new configuration option. It requires specifying: 1. `type` equal to `object_storage` -2. `object_storage_type`, equal to one of `s3`, `azure_blob_storage` (or just `azure` from `24.3`), `hdfs`, `local_blob_storage` (or just `local` from `24.3`), `web`. +2. `object_storage_type`, equal to one of `s3`, `azure_blob_storage` (or just `azure` from `24.3`), `hdfs` (unsupported), `local_blob_storage` (or just `local` from `24.3`), `web`. Optionally, `metadata_type` can be specified (it is equal to `local` by default), but it can also be set to `plain`, `web` and, starting from `24.4`, `plain_rewritable`. Usage of `plain` metadata type is described in [plain storage section](/docs/en/operations/storing-data.md/#storing-data-on-webserver), `web` metadata type can be used only with `web` object storage type, `local` metadata type stores metadata files locally (each metadata file contains mapping to files in object storage and some additional meta information about them). @@ -328,7 +328,7 @@ Configuration: ``` -Starting from `24.1` it is possible configure any object storage disk (`s3`, `azure`, `hdfs`, `local`) using `plain` metadata type. +Starting from `24.1` it is possible to configure any object storage disk (`s3`, `azure`, `hdfs` (unsupported), `local`) using `plain` metadata type. Configuration: ``` xml @@ -421,6 +421,7 @@ Other parameters: * `skip_access_check` - If true, disk access checks will not be performed on disk start-up. Default value is `false`. * `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). * `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). +* `metadata_keep_free_space_bytes` - the amount of free metadata disk space to be reserved. Examples of working configurations can be found in integration tests directory (see e.g.
[test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)).

@@ -428,12 +429,14 @@ Examples of working configurations can be found in integration tests directory (
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
:::

-## Using HDFS storage {#hdfs-storage}
+## Using HDFS storage (Unsupported) {#hdfs-storage}

In this sample configuration:
-- the disk is of type `hdfs`
+- the disk is of type `hdfs` (unsupported)
- the data is hosted at `hdfs://hdfs1:9000/clickhouse/`

+Note that HDFS is unsupported, so there might be issues when using it. Feel free to make a pull request with a fix if any issue arises.
+
```xml

@@ -464,9 +467,11 @@ In this sample configuration:

```

+Keep in mind that HDFS may not work in corner cases.
+
### Using Data Encryption {#encrypted-virtual-file-system}

-You can encrypt the data stored on [S3](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-s3), or [HDFS](#configuring-hdfs) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one.
+You can encrypt the data stored on [S3](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-s3), or [HDFS](#hdfs-storage) (unsupported) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one.

Example of disk configuration:

@@ -529,7 +534,7 @@ Example of disk configuration:
It is possible to configure local cache over disks in storage configuration starting from version 22.3.
For versions 22.3 - 22.7 cache is supported only for `s3` disk type. For versions >= 22.8 cache is supported for any disk type: S3, Azure, Local, Encrypted, etc.
-For versions >= 23.5 cache is supported only for remote disk types: S3, Azure, HDFS.
+For versions >= 23.5 cache is supported only for remote disk types: S3, Azure, HDFS (unsupported).

Cache uses `LRU` cache policy.

@@ -971,7 +976,7 @@ Use [http_max_single_read_retries](/docs/en/operations/settings/settings.md/#htt

### Zero-copy Replication (not ready for production) {#zero-copy}

-Zero-copy replication is possible, but not recommended, with `S3` and `HDFS` disks. Zero-copy replication means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself.
+Zero-copy replication is possible, but not recommended, with `S3` and `HDFS` (unsupported) disks. 
Zero-copy replication means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself.

:::note Zero-copy replication is not ready for production
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.

diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md
index 8981ac1f752..1dc89b8dcf9 100644
--- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md
+++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md
@@ -82,7 +82,7 @@ FROM
In this case, you should remember that you do not know the histogram bin borders.

-## sequenceMatch(pattern)(timestamp, cond1, cond2, …)
+## sequenceMatch(pattern)(timestamp, cond1, cond2, ...)

Checks whether the sequence contains an event chain that matches the pattern.

@@ -172,7 +172,7 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM
- [sequenceCount](#function-sequencecount)

-## sequenceCount(pattern)(time, cond1, cond2, …)
+## sequenceCount(pattern)(time, cond1, cond2, ...)

Counts the number of event chains that matched the pattern. The function searches event chains that do not overlap. It starts to search for the next chain after the current chain is matched.

diff --git a/docs/en/sql-reference/aggregate-functions/reference/analysis_of_variance.md b/docs/en/sql-reference/aggregate-functions/reference/analysis_of_variance.md
new file mode 100644
index 00000000000..d9b44b3ff07
--- /dev/null
+++ b/docs/en/sql-reference/aggregate-functions/reference/analysis_of_variance.md
@@ -0,0 +1,45 @@
+---
+slug: /en/sql-reference/aggregate-functions/reference/analysis_of_variance
+sidebar_position: 6
+---
+
+# analysisOfVariance
+
+Provides a statistical test for one-way analysis of variance (ANOVA test). It is a test over several groups of normally distributed observations to find out whether all groups have the same mean or not.
+
+**Syntax**
+
+```sql
+analysisOfVariance(val, group_no)
+```
+
+Alias: `anova`.
+
+**Parameters**
+- `val`: Value.
+- `group_no`: Group number that `val` belongs to.
+
+:::note
+Groups are enumerated starting from 0 and there should be at least two groups to perform a test.
+There should be at least one group with the number of observations greater than one.
+:::
+
+**Returned value**
+
+- `(f_statistic, p_value)`. [Tuple](../../data-types/tuple.md)([Float64](../../data-types/float.md), [Float64](../../data-types/float.md)). 
+
+**Example**
+
+Query:
+
+```sql
+SELECT analysisOfVariance(number, number % 2) FROM numbers(1048575);
+```
+
+Result:
+
+```response
+┌─analysisOfVariance(number, modulo(number, 2))─┐
+│ (0,1) │
+└───────────────────────────────────────────────┘
+```
diff --git a/docs/en/sql-reference/aggregate-functions/reference/index.md b/docs/en/sql-reference/aggregate-functions/reference/index.md
index e9a7fe4fc2b..451ee2aae9d 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/index.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/index.md
@@ -37,6 +37,7 @@ Standard aggregate functions:

ClickHouse-specific aggregate functions:

+- [analysisOfVariance](/docs/en/sql-reference/aggregate-functions/reference/analysis_of_variance.md)
- [any](/docs/en/sql-reference/aggregate-functions/reference/any_respect_nulls.md)
- [anyHeavy](/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md)
- [anyLast](/docs/en/sql-reference/aggregate-functions/reference/anylast.md)
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md
index e2a5bc53e32..856d447ac13 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md
@@ -7,7 +7,7 @@ sidebar_position: 201

## quantiles

-Syntax: `quantiles(level1, level2, …)(x)`
+Syntax: `quantiles(level1, level2, ...)(x)`

All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`, `quantilesDD`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.

diff --git a/docs/en/sql-reference/data-types/aggregatefunction.md b/docs/en/sql-reference/data-types/aggregatefunction.md
index 87511a505dc..37f0d0e50ae 100644
--- a/docs/en/sql-reference/data-types/aggregatefunction.md
+++ b/docs/en/sql-reference/data-types/aggregatefunction.md
@@ -6,9 +6,9 @@ sidebar_label: AggregateFunction

# AggregateFunction

-Aggregate functions can have an implementation-defined intermediate state that can be serialized to an `AggregateFunction(…)` data type and stored in a table, usually, by means of [a materialized view](../../sql-reference/statements/create/view.md). The common way to produce an aggregate function state is by calling the aggregate function with the `-State` suffix. To get the final result of aggregation in the future, you must use the same aggregate function with the `-Merge`suffix.
+Aggregate functions can have an implementation-defined intermediate state that can be serialized to an `AggregateFunction(...)` data type and stored in a table, usually, by means of [a materialized view](../../sql-reference/statements/create/view.md). The common way to produce an aggregate function state is by calling the aggregate function with the `-State` suffix. To get the final result of aggregation in the future, you must use the same aggregate function with the `-Merge` suffix.

-`AggregateFunction(name, types_of_arguments…)` — parametric data type.
+`AggregateFunction(name, types_of_arguments...)` — parametric data type. 
**Parameters**

diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md
new file mode 100644
index 00000000000..955fd54e641
--- /dev/null
+++ b/docs/en/sql-reference/data-types/dynamic.md
@@ -0,0 +1,495 @@
+---
+slug: /en/sql-reference/data-types/dynamic
+sidebar_position: 56
+sidebar_label: Dynamic
+---
+
+# Dynamic
+
+This type allows storing values of any type inside it without knowing all of them in advance.
+
+To declare a column of `Dynamic` type, use the following syntax:
+
+``` sql
+ Dynamic(max_types=N)
+```
+
+Where `N` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a column with type `Dynamic` within a single block of data that is stored separately (for example, within a single data part of a MergeTree table). If this limit is exceeded, all new types will be converted to type `String`. The default value of `max_types` is `32`.
+
+:::note
+The Dynamic data type is an experimental feature. To use it, set `allow_experimental_dynamic_type = 1`.
+:::
+
+## Creating Dynamic
+
+Using `Dynamic` type in table column definition:
+
+```sql
+CREATE TABLE test (d Dynamic) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT d, dynamicType(d) FROM test;
+```
+
+```text
+┌─d─────────────┬─dynamicType(d)─┐
+│ ᴺᵁᴸᴸ │ None │
+│ 42 │ Int64 │
+│ Hello, World! │ String │
+│ [1,2,3] │ Array(Int64) │
+└───────────────┴────────────────┘
+```
+
+Using CAST from ordinary column:
+
+```sql
+SELECT 'Hello, World!'::Dynamic as d, dynamicType(d);
+```
+
+```text
+┌─d─────────────┬─dynamicType(d)─┐
+│ Hello, World! │ String │
+└───────────────┴────────────────┘
+```
+
+Using CAST from `Variant` column:
+
+```sql
+SET allow_experimental_variant_type = 1, use_variant_as_common_type = 1;
+SELECT multiIf((number % 3) = 0, number, (number % 3) = 1, range(number + 1), NULL)::Dynamic AS d, dynamicType(d) FROM numbers(3)
+```
+
+```text
+┌─d─────┬─dynamicType(d)─┐
+│ 0 │ UInt64 │
+│ [0,1] │ Array(UInt64) │
+│ ᴺᵁᴸᴸ │ None │
+└───────┴────────────────┘
+```
+
+
+## Reading Dynamic nested types as subcolumns
+
+`Dynamic` type supports reading a single nested type from a `Dynamic` column using the type name as a subcolumn.
+So, if you have a column `d Dynamic`, you can read a subcolumn of any valid type `T` using the syntax `d.T`;
+this subcolumn will have type `Nullable(T)` if `T` can be inside `Nullable` and `T` otherwise. This subcolumn will
+be the same size as the original `Dynamic` column and will contain `NULL` values (or empty values if `T` cannot be inside `Nullable`)
+in all rows in which the original `Dynamic` column doesn't have type `T`.
+
+`Dynamic` subcolumns can also be read using the function `dynamicElement(dynamic_column, type_name)`.
+
+Examples:
+
+```sql
+CREATE TABLE test (d Dynamic) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT d, dynamicType(d), d.String, d.Int64, d.`Array(Int64)`, d.Date, d.`Array(String)` FROM test;
+```
+
+```text
+┌─d─────────────┬─dynamicType(d)─┬─d.String──────┬─d.Int64─┬─d.Array(Int64)─┬─d.Date─┬─d.Array(String)─┐
+│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
+│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ [] │ ᴺᵁᴸᴸ │ [] │
+│ Hello, World! │ String │ Hello, World! 
│ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
+│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ ᴺᵁᴸᴸ │ [] │
+└───────────────┴────────────────┴───────────────┴─────────┴────────────────┴────────┴─────────────────┘
+```
+
+```sql
+SELECT toTypeName(d.String), toTypeName(d.Int64), toTypeName(d.`Array(Int64)`), toTypeName(d.Date), toTypeName(d.`Array(String)`) FROM test LIMIT 1;
+```
+
+```text
+┌─toTypeName(d.String)─┬─toTypeName(d.Int64)─┬─toTypeName(d.Array(Int64))─┬─toTypeName(d.Date)─┬─toTypeName(d.Array(String))─┐
+│ Nullable(String) │ Nullable(Int64) │ Array(Int64) │ Nullable(Date) │ Array(String) │
+└──────────────────────┴─────────────────────┴────────────────────────────┴────────────────────┴─────────────────────────────┘
+```
+
+```sql
+SELECT d, dynamicType(d), dynamicElement(d, 'String'), dynamicElement(d, 'Int64'), dynamicElement(d, 'Array(Int64)'), dynamicElement(d, 'Date'), dynamicElement(d, 'Array(String)') FROM test;
+```
+
+```text
+┌─d─────────────┬─dynamicType(d)─┬─dynamicElement(d, 'String')─┬─dynamicElement(d, 'Int64')─┬─dynamicElement(d, 'Array(Int64)')─┬─dynamicElement(d, 'Date')─┬─dynamicElement(d, 'Array(String)')─┐
+│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
+│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ [] │ ᴺᵁᴸᴸ │ [] │
+│ Hello, World! │ String │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
+│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ ᴺᵁᴸᴸ │ [] │
+└───────────────┴────────────────┴─────────────────────────────┴────────────────────────────┴───────────────────────────────────┴───────────────────────────┴────────────────────────────────────┘
+```
+
+To know what type is stored in each row, the function `dynamicType(dynamic_column)` can be used. It returns a `String` with the value type name for each row (or `'None'` if the row is `NULL`).
+
+Example:
+
+```sql
+CREATE TABLE test (d Dynamic) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT dynamicType(d) from test;
+```
+
+```text
+┌─dynamicType(d)─┐
+│ None │
+│ Int64 │
+│ String │
+│ Array(Int64) │
+└────────────────┘
+```
+
+## Conversion between Dynamic column and other columns
+
+There are 4 possible conversions that can be performed with a `Dynamic` column.
+
+### Converting an ordinary column to a Dynamic column
+
+```sql
+SELECT 'Hello, World!'::Dynamic as d, dynamicType(d);
+```
+
+```text
+┌─d─────────────┬─dynamicType(d)─┐
+│ Hello, World! │ String │
+└───────────────┴────────────────┘
+```
+
+### Converting a String column to a Dynamic column through parsing
+
+To parse `Dynamic` type values from a `String` column you can enable the setting `cast_string_to_dynamic_use_inference`:
+
+```sql
+SET cast_string_to_dynamic_use_inference = 1;
+SELECT CAST(materialize(map('key1', '42', 'key2', 'true', 'key3', '2020-01-01')), 'Map(String, Dynamic)') as map_of_dynamic, mapApply((k, v) -> (k, dynamicType(v)), map_of_dynamic) as map_of_dynamic_types;
+```
+
+```text
+┌─map_of_dynamic──────────────────────────────┬─map_of_dynamic_types─────────────────────────┐
+│ {'key1':42,'key2':true,'key3':'2020-01-01'} │ {'key1':'Int64','key2':'Bool','key3':'Date'} │
+└─────────────────────────────────────────────┴──────────────────────────────────────────────┘
+```
+
+### Converting a Dynamic column to an ordinary column
+
+It is possible to convert a `Dynamic` column to an ordinary column. 
In this case, all nested types will be converted to the destination type:
+
+```sql
+CREATE TABLE test (d Dynamic) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('42.42'), (true), ('e10');
+SELECT d::Nullable(Float64) FROM test;
+```
+
+```text
+┌─CAST(d, 'Nullable(Float64)')─┐
+│ ᴺᵁᴸᴸ │
+│ 42 │
+│ 42.42 │
+│ 1 │
+│ 0 │
+└──────────────────────────────┘
+```
+
+### Converting a Variant column to Dynamic column
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('String'), ([1, 2, 3]);
+SELECT v::Dynamic as d, dynamicType(d) from test;
+```
+
+```text
+┌─d───────┬─dynamicType(d)─┐
+│ ᴺᵁᴸᴸ │ None │
+│ 42 │ UInt64 │
+│ String │ String │
+│ [1,2,3] │ Array(UInt64) │
+└─────────┴────────────────┘
+```
+
+### Converting a Dynamic(max_types=N) column to another Dynamic(max_types=K)
+
+If `K >= N`, the data doesn't change during conversion:
+
+```sql
+CREATE TABLE test (d Dynamic(max_types=3)) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true);
+SELECT d::Dynamic(max_types=5) as d2, dynamicType(d2) FROM test;
+```
+
+```text
+┌─d2────┬─dynamicType(d2)─┐
+│ ᴺᵁᴸᴸ │ None │
+│ 42 │ Int64 │
+│ 43 │ Int64 │
+│ 42.42 │ String │
+│ true │ Bool │
+└───────┴─────────────────┘
+```
+
+If `K < N`, then the values with the rarest types are converted to `String`:
+```sql
+CREATE TABLE test (d Dynamic(max_types=4)) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true), ([1, 2, 3]);
+SELECT d, dynamicType(d), d::Dynamic(max_types=2) as d2, dynamicType(d2) FROM test;
+```
+
+```text
+┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┐
+│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │
+│ 42 │ Int64 │ 42 │ Int64 │
+│ 43 │ Int64 │ 43 │ Int64 │
+│ 42.42 │ String │ 42.42 │ String │
+│ true │ Bool │ true │ String │
+│ [1,2,3] │ Array(Int64) │ [1,2,3] │ String │
+└─────────┴────────────────┴─────────┴─────────────────┘
+```
+
+If `K=1`, all types are converted to `String`:
+
+```sql
+CREATE TABLE test (d Dynamic(max_types=4)) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true), ([1, 2, 3]);
+SELECT d, dynamicType(d), d::Dynamic(max_types=1) as d2, dynamicType(d2) FROM test;
+```
+
+```text
+┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┐
+│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │
+│ 42 │ Int64 │ 42 │ String │
+│ 43 │ Int64 │ 43 │ String │
+│ 42.42 │ String │ 42.42 │ String │
+│ true │ Bool │ true │ String │
+│ [1,2,3] │ Array(Int64) │ [1,2,3] │ String │
+└─────────┴────────────────┴─────────┴─────────────────┘
+```
+
+## Reading Dynamic type from the data
+
+All text formats (TSV, CSV, CustomSeparated, Values, JSONEachRow, etc.) support reading the `Dynamic` type. During data parsing, ClickHouse tries to infer the type of each value and uses it when inserting into the `Dynamic` column.
+
+Example:
+
+```sql
+SELECT
+  d,
+  dynamicType(d),
+  dynamicElement(d, 'String') AS str,
+  dynamicElement(d, 'Int64') AS num,
+  dynamicElement(d, 'Float64') AS float,
+  dynamicElement(d, 'Date') AS date,
+  dynamicElement(d, 'Array(Int64)') AS arr
+FROM format(JSONEachRow, 'd Dynamic', $$
+{"d" : "Hello, World!"},
+{"d" : 42},
+{"d" : 42.42},
+{"d" : "2020-01-01"},
+{"d" : [1, 2, 3]}
+$$)
+```
+
+```text
+┌─d─────────────┬─dynamicType(d)─┬─str───────────┬──num─┬─float─┬───────date─┬─arr─────┐
+│ Hello, World! │ String │ Hello, World! 
│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
+│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
+│ 42.42 │ Float64 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 42.42 │ ᴺᵁᴸᴸ │ [] │
+│ 2020-01-01 │ Date │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-01 │ [] │
+│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │
+└───────────────┴────────────────┴───────────────┴──────┴───────┴────────────┴─────────┘
+```
+
+## Comparing values of Dynamic type
+
+Values of `Dynamic` types are compared similarly to values of `Variant` type:
+The result of operator `<` for values `d1` with underlying type `T1` and `d2` with underlying type `T2` of type `Dynamic` is defined as follows:
+- If `T1 = T2 = T`, the result will be `d1.T < d2.T` (underlying values will be compared).
+- If `T1 != T2`, the result will be `T1 < T2` (type names will be compared).
+
+Examples:
+```sql
+CREATE TABLE test (d1 Dynamic, d2 Dynamic) ENGINE=Memory;
+INSERT INTO test VALUES (42, 42), (42, 43), (42, 'abc'), (42, [1, 2, 3]), (42, []), (42, NULL);
+```
+
+```sql
+SELECT d2, dynamicType(d2) as d2_type from test order by d2;
+```
+
+```text
+┌─d2──────┬─d2_type──────┐
+│ [] │ Array(Int64) │
+│ [1,2,3] │ Array(Int64) │
+│ 42 │ Int64 │
+│ 43 │ Int64 │
+│ abc │ String │
+│ ᴺᵁᴸᴸ │ None │
+└─────────┴──────────────┘
+```
+
+```sql
+SELECT d1, dynamicType(d1) as d1_type, d2, dynamicType(d2) as d2_type, d1 = d2, d1 < d2, d1 > d2 from test;
+```
+
+```text
+┌─d1─┬─d1_type─┬─d2──────┬─d2_type──────┬─equals(d1, d2)─┬─less(d1, d2)─┬─greater(d1, d2)─┐
+│ 42 │ Int64 │ 42 │ Int64 │ 1 │ 0 │ 0 │
+│ 42 │ Int64 │ 43 │ Int64 │ 0 │ 1 │ 0 │
+│ 42 │ Int64 │ abc │ String │ 0 │ 1 │ 0 │
+│ 42 │ Int64 │ [1,2,3] │ Array(Int64) │ 0 │ 0 │ 1 │
+│ 42 │ Int64 │ [] │ Array(Int64) │ 0 │ 0 │ 1 │
+│ 42 │ Int64 │ ᴺᵁᴸᴸ │ None │ 0 │ 1 │ 0 │
+└────┴─────────┴─────────┴──────────────┴────────────────┴──────────────┴─────────────────┘
+```
+
+If you need to find a row with a specific `Dynamic` value, you can do one of the following:
+
+- Cast the value to the `Dynamic` type:
+
+```sql
+SELECT * FROM test WHERE d2 == [1,2,3]::Array(UInt32)::Dynamic;
+```
+
+```text
+┌─d1─┬─d2──────┐
+│ 42 │ [1,2,3] │
+└────┴─────────┘
+```
+
+- Compare the `Dynamic` subcolumn with the required type:
+
+```sql
+SELECT * FROM test WHERE d2.`Array(Int64)` == [1,2,3] -- or using dynamicElement(d2, 'Array(Int64)')
+```
+
+```text
+┌─d1─┬─d2──────┐
+│ 42 │ [1,2,3] │
+└────┴─────────┘
+```
+
+Sometimes it can be useful to make an additional check on the dynamic type, as subcolumns with complex types like `Array/Map/Tuple` cannot be inside `Nullable` and will have default values instead of `NULL` in rows with different types:
+
+```sql
+SELECT d2, d2.`Array(Int64)`, dynamicType(d2) FROM test WHERE d2.`Array(Int64)` == [];
+```
+
+```text
+┌─d2───┬─d2.Array(Int64)─┬─dynamicType(d2)─┐
+│ 42 │ [] │ Int64 │
+│ 43 │ [] │ Int64 │
+│ abc │ [] │ String │
+│ [] │ [] │ Array(Int64) │
+│ ᴺᵁᴸᴸ │ [] │ None │
+└──────┴─────────────────┴─────────────────┘
+```
+
+```sql
+SELECT d2, d2.`Array(Int64)`, dynamicType(d2) FROM test WHERE dynamicType(d2) == 'Array(Int64)' AND d2.`Array(Int64)` == [];
+```
+
+```text
+┌─d2─┬─d2.Array(Int64)─┬─dynamicType(d2)─┐
+│ [] │ [] │ Array(Int64) │
+└────┴─────────────────┴─────────────────┘
+```
+
+**Note:** values of dynamic types with different numeric types are considered different values and are not compared with each other; their type names are compared instead. 
+
+Example:
+
+```sql
+CREATE TABLE test (d Dynamic) ENGINE=Memory;
+INSERT INTO test VALUES (1::UInt32), (1::Int64), (100::UInt32), (100::Int64);
+SELECT d, dynamicType(d) FROM test ORDER BY d;
+```
+
+```text
+┌─d───┬─dynamicType(d)─┐
+│ 1 │ Int64 │
+│ 100 │ Int64 │
+│ 1 │ UInt32 │
+│ 100 │ UInt32 │
+└─────┴────────────────┘
+```
+
+## Reaching the limit in number of different data types stored inside Dynamic
+
+The `Dynamic` data type can store only a limited number of different data types inside. By default, this limit is 32, but you can change it in the type declaration using the syntax `Dynamic(max_types=N)` where N is between 1 and 255 (due to implementation details, it's impossible to have more than 255 different data types inside Dynamic).
+When the limit is reached, all new data types inserted into a `Dynamic` column will be cast to `String` and stored as `String` values.
+
+Let's see what happens when the limit is reached in different scenarios.
+
+### Reaching the limit during data parsing
+
+During parsing of `Dynamic` values from the data, when the limit is reached for the current block of data, all new values will be inserted as `String` values:
+
+```sql
+SELECT d, dynamicType(d) FROM format(JSONEachRow, 'd Dynamic(max_types=3)', '
+{"d" : 42}
+{"d" : [1, 2, 3]}
+{"d" : "Hello, World!"}
+{"d" : "2020-01-01"}
+{"d" : ["str1", "str2", "str3"]}
+{"d" : {"a" : 1, "b" : [1, 2, 3]}}
+')
+```
+
+```text
+┌─d──────────────────────────┬─dynamicType(d)─┐
+│ 42 │ Int64 │
+│ [1,2,3] │ Array(Int64) │
+│ Hello, World! │ String │
+│ 2020-01-01 │ String │
+│ ["str1", "str2", "str3"] │ String │
+│ {"a" : 1, "b" : [1, 2, 3]} │ String │
+└────────────────────────────┴────────────────┘
+```
+
+As we can see, after inserting 3 different data types (`Int64`, `Array(Int64)` and `String`), all new types were converted to `String`.
+
+### During merges of data parts in MergeTree table engines
+
+During a merge of several data parts in a MergeTree table, the `Dynamic` column in the resulting data part can reach the limit of different data types and won't be able to store all types from the source parts.
+In this case, ClickHouse chooses which types remain after the merge and which types are cast to `String`. In most cases, ClickHouse tries to keep the most frequent types and casts the rarest types to `String`, but it depends on the implementation.
+
+Let's see an example of such a merge. 
First, let's create a table with a `Dynamic` column, set the limit of different data types to `3` and insert values with `5` different types:
+
+```sql
+CREATE TABLE test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree ORDER BY id;
+SYSTEM STOP MERGES test;
+INSERT INTO test SELECT number, number FROM numbers(5);
+INSERT INTO test SELECT number, range(number) FROM numbers(4);
+INSERT INTO test SELECT number, toDate(number) FROM numbers(3);
+INSERT INTO test SELECT number, map(number, number) FROM numbers(2);
+INSERT INTO test SELECT number, 'str_' || toString(number) FROM numbers(1);
+```
+
+Each insert will create a separate data part with a `Dynamic` column containing a single type:
+```sql
+SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) ORDER BY _part;
+```
+
+```text
+┌─count()─┬─dynamicType(d)──────┬─_part─────┐
+│ 5 │ UInt64 │ all_1_1_0 │
+│ 4 │ Array(UInt64) │ all_2_2_0 │
+│ 3 │ Date │ all_3_3_0 │
+│ 2 │ Map(UInt64, UInt64) │ all_4_4_0 │
+│ 1 │ String │ all_5_5_0 │
+└─────────┴─────────────────────┴───────────┘
+```
+
+Now, let's merge all parts into one and see what happens:
+
+```sql
+SYSTEM START MERGES test;
+OPTIMIZE TABLE test FINAL;
+SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) ORDER BY _part;
+```
+
+```text
+┌─count()─┬─dynamicType(d)─┬─_part─────┐
+│ 5 │ UInt64 │ all_1_5_2 │
+│ 6 │ String │ all_1_5_2 │
+│ 4 │ Array(UInt64) │ all_1_5_2 │
+└─────────┴────────────────┴───────────┘
+```
+
+As we can see, ClickHouse kept the most frequent types `UInt64` and `Array(UInt64)` and cast all other types to `String`.
diff --git a/docs/en/sql-reference/data-types/fixedstring.md b/docs/en/sql-reference/data-types/fixedstring.md
index 0316df7fe34..0c021b28f74 100644
--- a/docs/en/sql-reference/data-types/fixedstring.md
+++ b/docs/en/sql-reference/data-types/fixedstring.md
@@ -21,8 +21,8 @@ The `FixedString` type is efficient when data has the length of precisely `N` by
Examples of the values that can be efficiently stored in `FixedString`-typed columns:

- The binary representation of IP addresses (`FixedString(16)` for IPv6).
-- Language codes (ru_RU, en_US … ).
-- Currency codes (USD, RUB … ).
+- Language codes (ru_RU, en_US ... ).
+- Currency codes (USD, RUB ... ).
- Binary representation of hashes (`FixedString(16)` for MD5, `FixedString(32)` for SHA256).

To store UUID values, use the [UUID](../../sql-reference/data-types/uuid.md) data type.

diff --git a/docs/en/sql-reference/data-types/nested-data-structures/index.md b/docs/en/sql-reference/data-types/nested-data-structures/index.md
index d118170cd39..579ee9bfa8b 100644
--- a/docs/en/sql-reference/data-types/nested-data-structures/index.md
+++ b/docs/en/sql-reference/data-types/nested-data-structures/index.md
@@ -6,7 +6,7 @@ sidebar_label: Nested(Name1 Type1, Name2 Type2, ...)

# Nested

-## Nested(name1 Type1, Name2 Type2, …)
+## Nested(name1 Type1, Name2 Type2, ...)

A nested data structure is like a table inside a cell. The parameters of a nested data structure – the column names and types – are specified the same way as in a [CREATE TABLE](../../../sql-reference/statements/create/table.md) query. Each table row can correspond to any number of rows in a nested data structure. 
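Since only the intro of the Nested section is visible in this hunk, a minimal illustrative sketch of declaring and flattening a nested structure may help (the table and column names here are invented for the example, not taken from the patch):

```sql
-- A Nested column is stored as a group of parallel arrays:
-- Goals.Name and Goals.Value must have equal lengths in every row.
CREATE TABLE visits
(
    CounterID UInt32,
    Goals Nested
    (
        Name String,
        Value UInt32
    )
) ENGINE = MergeTree ORDER BY CounterID;

-- One table row holding two nested rows.
INSERT INTO visits VALUES (1, ['signup', 'purchase'], [0, 99]);

-- ARRAY JOIN unfolds the nested structure into one result row per nested element.
SELECT CounterID, Goals.Name, Goals.Value
FROM visits
ARRAY JOIN Goals;
```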
diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md
index 39f8409c1e1..4fb74ac30e4 100644
--- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md
+++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md
@@ -5,7 +5,7 @@ sidebar_label: SimpleAggregateFunction
---
# SimpleAggregateFunction

-`SimpleAggregateFunction(name, types_of_arguments…)` data type stores current value of the aggregate function, and does not store its full state as [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) does. This optimization can be applied to functions for which the following property holds: the result of applying a function `f` to a row set `S1 UNION ALL S2` can be obtained by applying `f` to parts of the row set separately, and then again applying `f` to the results: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. This property guarantees that partial aggregation results are enough to compute the combined one, so we do not have to store and process any extra data.
+`SimpleAggregateFunction(name, types_of_arguments...)` data type stores current value of the aggregate function, and does not store its full state as [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) does. This optimization can be applied to functions for which the following property holds: the result of applying a function `f` to a row set `S1 UNION ALL S2` can be obtained by applying `f` to parts of the row set separately, and then again applying `f` to the results: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. This property guarantees that partial aggregation results are enough to compute the combined one, so we do not have to store and process any extra data.

The common way to produce an aggregate function value is by calling the aggregate function with the [-SimpleState](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-simplestate) suffix.

diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md
index 6d95f3dc358..7b079152907 100644
--- a/docs/en/sql-reference/functions/arithmetic-functions.md
+++ b/docs/en/sql-reference/functions/arithmetic-functions.md
@@ -140,6 +140,70 @@ Same as `intDiv` but returns zero when dividing by zero or when dividing a minim
intDivOrZero(a, b)
```

+## isFinite
+
+Returns 1 if the Float32 or Float64 argument is not infinite and not a NaN, otherwise this function returns 0.
+
+**Syntax**
+
+```sql
+isFinite(x)
+```
+
+## isInfinite
+
+Returns 1 if the Float32 or Float64 argument is infinite, otherwise this function returns 0. Note that 0 is returned for a NaN.
+
+**Syntax**
+
+```sql
+isInfinite(x)
+```
+
+## ifNotFinite
+
+Checks whether a floating point value is finite.
+
+**Syntax**
+
+```sql
+ifNotFinite(x,y)
+```
+
+**Arguments**
+
+- `x` — Value to check for infinity. [Float\*](../../sql-reference/data-types/float.md).
+- `y` — Fallback value. [Float\*](../../sql-reference/data-types/float.md).
+
+**Returned value**
+
+- `x` if `x` is finite.
+- `y` if `x` is not finite.
+
+**Example**
+
+Query:
+
+    SELECT 1/0 as infimum, ifNotFinite(infimum,42)
+
+Result:
+
+    ┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐
+    │ inf │ 42 │
+    └─────────┴───────────────────────────────┘
+
+You can get a similar result by using the [ternary operator](../../sql-reference/functions/conditional-functions.md#ternary-operator): `isFinite(x) ? x : y`. 
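A quick illustrative check of the predicates above, relying only on the documented behavior (`1/0` evaluates to `inf` and `0/0` to `nan` in ClickHouse floating-point division, as the `ifNotFinite` example shows):

```sql
SELECT
    isFinite(1/0)   AS fin_inf,  -- 0: inf is not finite
    isInfinite(1/0) AS inf_inf,  -- 1: inf is infinite
    isFinite(0/0)   AS fin_nan,  -- 0: nan is not finite
    isInfinite(0/0) AS inf_nan,  -- 0: nan is not infinite either
    isFinite(1.0)   AS fin_one;  -- 1: ordinary values are finite
```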
+
+## isNaN
+
+Returns 1 if the Float32 or Float64 argument is NaN, otherwise this function returns 0.
+
+**Syntax**
+
+```sql
+isNaN(x)
+```
+
## modulo

Calculates the remainder of the division of two values `a` by `b`.

diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md
index 87e733a4b0c..f929ea00b8b 100644
--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@@ -561,7 +561,7 @@ Result:
└─────────────┴─────────────┴────────────────┴─────────────────┘
```

-## array(x1, …), operator \[x1, …\]
+## array(x1, ...), operator \[x1, ...\]

Creates an array from the function arguments.
The arguments must be constants and have types that have the smallest common type. At least one argument must be passed, because otherwise it isn’t clear which type of array to create. That is, you can’t use this function to create an empty array (to do that, use the ‘emptyArray\*’ function described above).

@@ -768,9 +768,9 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)

Elements set to `NULL` are handled as normal values.

-## arrayCount(\[func,\] arr1, …)
+## arrayCount(\[func,\] arr1, ...)

-Returns the number of elements for which `func(arr1[i], …, arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array.
+Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array.

Note that the `arrayCount` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument.

@@ -847,7 +847,7 @@ SELECT countEqual([1, 2, NULL, NULL], NULL)

## arrayEnumerate(arr)

-Returns the array \[1, 2, 3, …, length (arr) \]
+Returns the array \[1, 2, 3, ..., length (arr) \]

This function is normally used with ARRAY JOIN. It allows counting something just once for each array after applying ARRAY JOIN. Example:

@@ -887,7 +887,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached)

This function can also be used in higher-order functions. For example, you can use it to get array indexes for elements that match a condition.

-## arrayEnumerateUniq(arr, …)
+## arrayEnumerateUniq(arr, ...)

Returns an array the same size as the source array, indicating for each element what its position is among elements with the same value.
For example: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\].

@@ -1206,7 +1206,7 @@ Result:
└───────────────────┘
```

-## arraySort(\[func,\] arr, …) {#sort}
+## arraySort(\[func,\] arr, ...) {#sort}

Sorts the elements of the `arr` array in ascending order. If the `func` function is specified, sorting order is determined by the result of the `func` function applied to the elements of the array. If `func` accepts multiple arguments, the `arraySort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arraySort` description.

@@ -1307,11 +1307,11 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res;
To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia.org/wiki/Schwartzian_transform) is used.
:::

-## arrayPartialSort(\[func,\] limit, arr, …)
+## arrayPartialSort(\[func,\] limit, arr, ...)

Same as `arraySort` with additional `limit` argument allowing partial sorting. 
Returns an array of the same size as the original array where elements in range `[1..limit]` are sorted in ascending order. Remaining elements `(limit..N]` shall contain elements in unspecified order. -## arrayReverseSort(\[func,\] arr, …) {#reverse-sort} +## arrayReverseSort(\[func,\] arr, ...) {#reverse-sort} Sorts the elements of the `arr` array in descending order. If the `func` function is specified, `arr` is sorted according to the result of the `func` function applied to the elements of the array, and then the sorted array is reversed. If `func` accepts multiple arguments, the `arrayReverseSort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arrayReverseSort` description. @@ -1412,7 +1412,7 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; └─────────┘ ``` -## arrayPartialReverseSort(\[func,\] limit, arr, …) +## arrayPartialReverseSort(\[func,\] limit, arr, ...) Same as `arrayReverseSort` with additional `limit` argument allowing partial sorting. Returns an array of the same size as the original array where elements in range `[1..limit]` are sorted in descending order. Remaining elements `(limit..N]` shall contain elements in unspecified order. @@ -1535,7 +1535,7 @@ Result: [3,9,1,4,5,6,7,8,2,10] ``` -## arrayUniq(arr, …) +## arrayUniq(arr, ...) If one argument is passed, it counts the number of different elements in the array. If multiple arguments are passed, it counts the number of different tuples of elements at corresponding positions in multiple arrays. @@ -2079,9 +2079,9 @@ Result: └───────────────────────────────────────────────┘ ``` -## arrayMap(func, arr1, …) +## arrayMap(func, arr1, ...) -Returns an array obtained from the original arrays by application of `func(arr1[i], …, arrN[i])` for each element. Arrays `arr1` … `arrN` must have the same number of elements. +Returns an array obtained from the original arrays by application of `func(arr1[i], ..., arrN[i])` for each element. Arrays `arr1` ... `arrN` must have the same number of elements. Examples: @@ -2109,9 +2109,9 @@ SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res Note that the `arrayMap` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayFilter(func, arr1, …) +## arrayFilter(func, arr1, ...) -Returns an array containing only the elements in `arr1` for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns an array containing only the elements in `arr1` for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Examples: @@ -2142,9 +2142,9 @@ SELECT Note that the `arrayFilter` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayFill(func, arr1, …) +## arrayFill(func, arr1, ...) -Scan through `arr1` from the first element to the last element and replace `arr1[i]` by `arr1[i - 1]` if `func(arr1[i], …, arrN[i])` returns 0. The first element of `arr1` will not be replaced. +Scan through `arr1` from the first element to the last element and replace `arr1[i]` by `arr1[i - 1]` if `func(arr1[i], ..., arrN[i])` returns 0. The first element of `arr1` will not be replaced. 
Examples: @@ -2160,9 +2160,9 @@ SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, Note that the `arrayFill` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayReverseFill(func, arr1, …) +## arrayReverseFill(func, arr1, ...) -Scan through `arr1` from the last element to the first element and replace `arr1[i]` by `arr1[i + 1]` if `func(arr1[i], …, arrN[i])` returns 0. The last element of `arr1` will not be replaced. +Scan through `arr1` from the last element to the first element and replace `arr1[i]` by `arr1[i + 1]` if `func(arr1[i], ..., arrN[i])` returns 0. The last element of `arr1` will not be replaced. Examples: @@ -2178,9 +2178,9 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, Note that the `arrayReverseFill` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arraySplit(func, arr1, …) +## arraySplit(func, arr1, ...) -Split `arr1` into multiple arrays. When `func(arr1[i], …, arrN[i])` returns something other than 0, the array will be split on the left hand side of the element. The array will not be split before the first element. +Split `arr1` into multiple arrays. When `func(arr1[i], ..., arrN[i])` returns something other than 0, the array will be split on the left hand side of the element. The array will not be split before the first element. Examples: @@ -2196,9 +2196,9 @@ SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res Note that the `arraySplit` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayReverseSplit(func, arr1, …) +## arrayReverseSplit(func, arr1, ...) -Split `arr1` into multiple arrays. When `func(arr1[i], …, arrN[i])` returns something other than 0, the array will be split on the right hand side of the element. The array will not be split after the last element. +Split `arr1` into multiple arrays. When `func(arr1[i], ..., arrN[i])` returns something other than 0, the array will be split on the right hand side of the element. The array will not be split after the last element. Examples: @@ -2214,30 +2214,30 @@ SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res Note that the `arrayReverseSplit` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayExists(\[func,\] arr1, …) +## arrayExists(\[func,\] arr1, ...) -Returns 1 if there is at least one element in `arr` for which `func(arr1[i], …, arrN[i])` returns something other than 0. Otherwise, it returns 0. +Returns 1 if there is at least one element in `arr` for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Otherwise, it returns 0. Note that the `arrayExists` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -## arrayAll(\[func,\] arr1, …) +## arrayAll(\[func,\] arr1, ...) -Returns 1 if `func(arr1[i], …, arrN[i])` returns something other than 0 for all the elements in arrays. Otherwise, it returns 0. 
+Returns 1 if `func(arr1[i], ..., arrN[i])` returns something other than 0 for all the elements in arrays. Otherwise, it returns 0. Note that the `arrayAll` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -## arrayFirst(func, arr1, …) +## arrayFirst(func, arr1, ...) -Returns the first element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns the first element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0. ## arrayFirstOrNull -Returns the first element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0, otherwise it returns `NULL`. +Returns the first element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0, otherwise it returns `NULL`. **Syntax** ```sql -arrayFirstOrNull(func, arr1, …) +arrayFirstOrNull(func, arr1, ...) ``` **Parameters** @@ -2292,20 +2292,20 @@ Result: \N ``` -## arrayLast(func, arr1, …) +## arrayLast(func, arr1, ...) -Returns the last element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns the last element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Note that the `arrayLast` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. ## arrayLastOrNull -Returns the last element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0, otherwise returns `NULL`. +Returns the last element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0, otherwise returns `NULL`. **Syntax** ```sql -arrayLastOrNull(func, arr1, …) +arrayLastOrNull(func, arr1, ...) ``` **Parameters** @@ -2348,15 +2348,15 @@ Result: \N ``` -## arrayFirstIndex(func, arr1, …) +## arrayFirstIndex(func, arr1, ...) -Returns the index of the first element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns the index of the first element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Note that the `arrayFirstIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayLastIndex(func, arr1, …) +## arrayLastIndex(func, arr1, ...) -Returns the index of the last element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns the index of the last element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Note that the `arrayLastIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. @@ -2580,9 +2580,9 @@ Result: └─────┘ ``` -## arrayCumSum(\[func,\] arr1, …) +## arrayCumSum(\[func,\] arr1, ...) -Returns an array of the partial (running) sums of the elements in the source array `arr1`. If `func` is specified, then the sum is computed from applying `func` to `arr1`, `arr2`, ..., `arrN`, i.e. `func(arr1[i], …, arrN[i])`. +Returns an array of the partial (running) sums of the elements in the source array `arr1`. 
If `func` is specified, then the sum is computed from applying `func` to `arr1`, `arr2`, ..., `arrN`, i.e. `func(arr1[i], ..., arrN[i])`. **Syntax** @@ -2614,9 +2614,9 @@ SELECT arrayCumSum([1, 1, 1, 1]) AS res Note that the `arrayCumSum` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -## arrayCumSumNonNegative(\[func,\] arr1, …) +## arrayCumSumNonNegative(\[func,\] arr1, ...) -Same as `arrayCumSum`, returns an array of the partial (running) sums of the elements in the source array. If `func` is specified, then the sum is computed from applying `func` to `arr1`, `arr2`, ..., `arrN`, i.e. `func(arr1[i], …, arrN[i])`. Unlike `arrayCumSum`, if the current running sum is smaller than `0`, it is replaced by `0`. +Same as `arrayCumSum`, returns an array of the partial (running) sums of the elements in the source array. If `func` is specified, then the sum is computed from applying `func` to `arr1`, `arr2`, ..., `arrN`, i.e. `func(arr1[i], ..., arrN[i])`. Unlike `arrayCumSum`, if the current running sum is smaller than `0`, it is replaced by `0`. **Syntax** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 843f22e5a6f..1a56691ffc0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1499,7 +1499,7 @@ This function returns the week number for date or datetime. The two-argument for The following table describes how the mode argument works. -| Mode | First day of week | Range | Week 1 is the first week … | +| Mode | First day of week | Range | Week 1 is the first week ... | |------|-------------------|-------|-------------------------------| | 0 | Sunday | 0-53 | with a Sunday in this year | | 1 | Monday | 0-53 | with 4 or more days this year | diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index e920ab82988..ba72b3cc6ed 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -386,7 +386,7 @@ SELECT isValidJSON('{"a": "hello", "b": [-100, 200.0, 300]}') = 1 SELECT isValidJSON('not a json') = 0 ``` -## JSONHas(json\[, indices_or_keys\]…) +## JSONHas(json\[, indices_or_keys\]...) If the value exists in the JSON document, `1` will be returned. @@ -419,7 +419,7 @@ SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' ``` -## JSONLength(json\[, indices_or_keys\]…) +## JSONLength(json\[, indices_or_keys\]...) Return the length of a JSON array or a JSON object. @@ -432,7 +432,7 @@ SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 ``` -## JSONType(json\[, indices_or_keys\]…) +## JSONType(json\[, indices_or_keys\]...) Return the type of a JSON value. @@ -446,13 +446,13 @@ SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' ``` -## JSONExtractUInt(json\[, indices_or_keys\]…) +## JSONExtractUInt(json\[, indices_or_keys\]...) -## JSONExtractInt(json\[, indices_or_keys\]…) +## JSONExtractInt(json\[, indices_or_keys\]...) -## JSONExtractFloat(json\[, indices_or_keys\]…) +## JSONExtractFloat(json\[, indices_or_keys\]...) 
-## JSONExtractBool(json\[, indices_or_keys\]…) +## JSONExtractBool(json\[, indices_or_keys\]...) Parses a JSON and extract a value. These functions are similar to `visitParam` functions. @@ -466,7 +466,7 @@ SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200 SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 ``` -## JSONExtractString(json\[, indices_or_keys\]…) +## JSONExtractString(json\[, indices_or_keys\]...) Parses a JSON and extract a string. This function is similar to `visitParamExtractString` functions. @@ -484,7 +484,7 @@ SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' ``` -## JSONExtract(json\[, indices_or_keys…\], Return_type) +## JSONExtract(json\[, indices_or_keys...\], Return_type) Parses a JSON and extract a value of the given ClickHouse data type. @@ -506,7 +506,7 @@ SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' ``` -## JSONExtractKeysAndValues(json\[, indices_or_keys…\], Value_type) +## JSONExtractKeysAndValues(json\[, indices_or_keys...\], Value_type) Parses key-value pairs from a JSON where the values are of the given ClickHouse data type. @@ -554,7 +554,7 @@ text └────────────────────────────────────────────────────────────┘ ``` -## JSONExtractRaw(json\[, indices_or_keys\]…) +## JSONExtractRaw(json\[, indices_or_keys\]...) Returns a part of JSON as unparsed string. @@ -566,7 +566,7 @@ Example: SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]'; ``` -## JSONExtractArrayRaw(json\[, indices_or_keys…\]) +## JSONExtractArrayRaw(json\[, indices_or_keys...\]) Returns an array with elements of JSON array, each represented as unparsed string. diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 945166056af..324adbfb4b3 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -947,3 +947,49 @@ Result: │ 11 │ └──────────────────────────────────┘ ``` + +## proportionsZTest + +Returns test statistics for the two proportion Z-test - a statistical test for comparing the proportions from two populations `x` and `y`. + +**Syntax** + +```sql +proportionsZTest(successes_x, successes_y, trials_x, trials_y, conf_level, pool_type) +``` + +**Arguments** + +- `successes_x`: Number of successes in population `x`. [UInt64](../data-types/int-uint.md). +- `successes_y`: Number of successes in population `y`. [UInt64](../data-types/int-uint.md). +- `trials_x`: Number of trials in population `x`. [UInt64](../data-types/int-uint.md). +- `trials_y`: Number of trials in population `y`. [UInt64](../data-types/int-uint.md). +- `conf_level`: Confidence level for the test. [Float64](../data-types/float.md). +- `pool_type`: Selection of pooling (way in which the standard error is estimated). Can be either `unpooled` or `pooled`. [String](../data-types/string.md). + +:::note +For argument `pool_type`: In the pooled version, the two proportions are averaged, and only one proportion is used to estimate the standard error. In the unpooled version, the two proportions are used separately. +::: + +**Returned value** + +- `z_stat`: Z statistic. [Float64](../data-types/float.md). +- `p_val`: P value. 
[Float64](../data-types/float.md). +- `ci_low`: The lower confidence interval. [Float64](../data-types/float.md). +- `ci_high`: The upper confidence interval. [Float64](../data-types/float.md). + +**Example** + +Query: + +```sql +SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'); +``` + +Result: + +```response +┌─proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled')───────────────────────────────┐ +│ (-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) │ +└────────────────────────────────────────────────────────────────────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 11ee471d709..c16e8af1ef0 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -6,11 +6,21 @@ sidebar_label: Other # Other Functions -## hostName() +## hostName Returns the name of the host on which this function was executed. If the function executes on a remote server (distributed processing), the remote server name is returned. If the function executes in the context of a distributed table, it generates a normal column with values relevant to each shard. Otherwise it produces a constant value. +**Syntax** + +```sql +hostName() +``` + +**Returned value** + +- Host name. [String](../data-types/string.md). + ## getMacro {#getMacro} Returns a named value from the [macros](../../operations/server-configuration-parameters/settings.md#macros) section of the server configuration. @@ -27,9 +37,7 @@ getMacro(name); **Returned value** -- Value of the specified macro. - -Type: [String](../../sql-reference/data-types/string.md). +- Value of the specified macro. [String](../../sql-reference/data-types/string.md). **Example** @@ -82,9 +90,7 @@ This function is case-insensitive. **Returned value** -- String with the fully qualified domain name. - -Type: `String`. +- String with the fully qualified domain name. [String](../data-types/string.md). **Example** @@ -163,34 +169,58 @@ Result: └────────────────┴────────────────────────────┘ ``` -## visibleWidth(x) +## visibleWidth Calculates the approximate width when outputting values to the console in text format (tab-separated). -This function is used by the system to implement Pretty formats. +This function is used by the system to implement [Pretty formats](../../interfaces/formats.md). `NULL` is represented as a string corresponding to `NULL` in `Pretty` formats. +**Syntax** + +```sql +visibleWidth(x) +``` + +**Example** + +Query: + ```sql SELECT visibleWidth(NULL) ``` +Result: + ```text ┌─visibleWidth(NULL)─┐ │ 4 │ └────────────────────┘ ``` -## toTypeName(x) +## toTypeName Returns the type name of the passed argument. If `NULL` is passed, then the function returns type `Nullable(Nothing)`, which corresponds to ClickHouse's internal `NULL` representation. -## blockSize() {#blockSize} +**Syntax** + +```sql +toTypeName(x) +``` + +## blockSize {#blockSize} In ClickHouse, queries are processed in blocks (chunks). This function returns the size (row count) of the block the function is called on. +**Syntax** + +```sql +blockSize() +``` + ## byteSize Returns an estimation of uncompressed byte size of its arguments in memory. @@ -207,9 +237,7 @@ byteSize(argument [, ...]) **Returned value** -- Estimation of byte size of the arguments in memory. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- Estimation of byte size of the arguments in memory. 
[UInt64](../../sql-reference/data-types/int-uint.md). **Examples** @@ -288,16 +316,28 @@ Result: └────────────────────────────┘ ``` -## materialize(x) +## materialize Turns a constant into a full column containing a single value. Full columns and constants are represented differently in memory. Functions usually execute different code for normal and constant arguments, although the result should typically be the same. This function can be used to debug this behavior. -## ignore(…) +**Syntax** + +```sql +materialize(x) +``` + +## ignore Accepts any arguments, including `NULL` and does nothing. Always returns 0. The argument is internally still evaluated. Useful e.g. for benchmarks. +**Syntax** + +```sql +ignore(x) +``` + ## sleep Used to introduce a delay or pause in the execution of a query. It is primarily used for testing and debugging purposes. @@ -392,27 +432,33 @@ The `sleepEachRow()` function is primarily used for testing and debugging purpos Like the [`sleep()` function](#sleep), it's important to use `sleepEachRow()` judiciously and only when necessary, as it can significantly impact the overall performance and responsiveness of your ClickHouse system, especially when dealing with large result sets. -## currentDatabase() +## currentDatabase Returns the name of the current database. Useful in table engine parameters of `CREATE TABLE` queries where you need to specify the database. -## currentUser() {#currentUser} +**Syntax** + +```sql +currentDatabase() +``` + +## currentUser {#currentUser} Returns the name of the current user. In case of a distributed query, the name of the user who initiated the query is returned. +**Syntax** + ```sql -SELECT currentUser(); +currentUser() ``` Aliases: `user()`, `USER()`, `current_user()`. Aliases are case insensitive. **Returned values** -- The name of the current user. -- In distributed queries, the login of the user who initiated the query. - -Type: `String`. +- The name of the current user. [String](../data-types/string.md). +- In distributed queries, the login of the user who initiated the query. [String](../data-types/string.md). **Example** @@ -448,10 +494,8 @@ isConstant(x) **Returned values** -- `1` if `x` is constant. -- `0` if `x` is non-constant. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `x` is constant. [UInt8](../../sql-reference/data-types/int-uint.md). +- `0` if `x` is non-constant. [UInt8](../../sql-reference/data-types/int-uint.md). **Examples** @@ -497,52 +541,6 @@ Result: └────────────────────┘ ``` -## isFinite(x) - -Returns 1 if the Float32 or Float64 argument not infinite and not a NaN, otherwise this function returns 0. - -## isInfinite(x) - -Returns 1 if the Float32 or Float64 argument is infinite, otherwise this function returns 0. Note that 0 is returned for a NaN. - -## ifNotFinite - -Checks whether a floating point value is finite. - -**Syntax** - -```sql -ifNotFinite(x,y) -``` - -**Arguments** - -- `x` — Value to check for infinity. Type: [Float\*](../../sql-reference/data-types/float.md). -- `y` — Fallback value. Type: [Float\*](../../sql-reference/data-types/float.md). - -**Returned value** - -- `x` if `x` is finite. -- `y` if `x` is not finite. - -**Example** - -Query: - - SELECT 1/0 as infimum, ifNotFinite(infimum,42) - -Result: - - ┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ - │ inf │ 42 │ - └─────────┴───────────────────────────────┘ - -You can get similar result by using the [ternary operator](../../sql-reference/functions/conditional-functions.md#ternary-operator): `isFinite(x) ? 
x : y`. - -## isNaN(x) - -Returns 1 if the Float32 and Float64 argument is NaN, otherwise this function 0. - ## hasColumnInTable Given the database name, the table name, and the column name as constant strings, returns 1 if the given column exists, otherwise 0. @@ -733,11 +731,19 @@ LIMIT 10 └────────────────┴─────────┘ ``` -## formatReadableDecimalSize(x) +## formatReadableDecimalSize Given a size (number of bytes), this function returns a readable, rounded size with suffix (KB, MB, etc.) as a string. -Example: +**Syntax** + +```sql +formatReadableDecimalSize(x) +``` + +**Example** + +Query: ```sql SELECT @@ -745,6 +751,8 @@ SELECT formatReadableDecimalSize(filesize_bytes) AS filesize ``` +Result: + ```text ┌─filesize_bytes─┬─filesize───┐ │ 1 │ 1.00 B │ @@ -754,11 +762,20 @@ SELECT └────────────────┴────────────┘ ``` -## formatReadableSize(x) +## formatReadableSize Given a size (number of bytes), this function returns a readable, rounded size with suffix (KiB, MiB, etc.) as a string. -Example: +**Syntax** + +```sql +formatReadableSize(x) +``` + +Alias: `FORMAT_BYTES`. + +**Example** + +Query: ```sql SELECT @@ -766,7 +783,7 @@ SELECT formatReadableSize(filesize_bytes) AS filesize ``` -Alias: `FORMAT_BYTES`. +Result: ```text ┌─filesize_bytes─┬─filesize───┐ │ 1 │ 1.00 B │ @@ -777,11 +794,19 @@ Alias: `FORMAT_BYTES`. └────────────────┴────────────┘ ``` -## formatReadableQuantity(x) +## formatReadableQuantity Given a number, this function returns a rounded number with suffix (thousand, million, billion, etc.) as a string. -Example: +**Syntax** + +```sql +formatReadableQuantity(x) +``` + +**Example** + +Query: ```sql SELECT @@ -789,6 +814,8 @@ SELECT formatReadableQuantity(number) AS number_for_humans ``` +Result: + ```text ┌─────────number─┬─number_for_humans─┐ │ 1024 │ 1.02 thousand │ @@ -903,15 +930,27 @@ SELECT parseTimeDelta('1yr2mo') └──────────────────────────┘ ``` -## least(a, b) +## least Returns the smaller value of a and b. -## greatest(a, b) +**Syntax** + +```sql +least(a, b) +``` + +## greatest Returns the larger value of a and b. -## uptime() +**Syntax** + +```sql +greatest(a, b) +``` + +## uptime Returns the server’s uptime in seconds. If executed in the context of a distributed table, this function generates a normal column with values relevant to each shard. Otherwise it produces a constant value. @@ -924,9 +963,7 @@ uptime() **Returned value** -- Time value of seconds. - -Type: [UInt32](/docs/en/sql-reference/data-types/int-uint.md). +- Server uptime in seconds. [UInt32](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -944,7 +981,7 @@ Result: └────────┘ ``` -## version() +## version Returns the current version of ClickHouse as a string in the form of: @@ -971,7 +1008,7 @@ None. **Returned value** -Type: [String](../data-types/string) +- Current version of ClickHouse. [String](../data-types/string.md) **Implementation details** @@ -993,22 +1030,178 @@ SELECT version() └───────────┘ ``` -## buildId() +## buildId Returns the build ID generated by a compiler for the running ClickHouse server binary. If executed in the context of a distributed table, this function generates a normal column with values relevant to each shard. Otherwise it produces a constant value. +**Syntax** -## blockNumber() +```sql +buildId() +``` -Returns the sequence number of the data block where the row is located. +## blockNumber -## rowNumberInBlock() {#rowNumberInBlock} - -Returns the ordinal number of the row in the data block. Different data blocks are always recalculated.
+Returns a monotonically increasing sequence number of the [block](../../development/architecture.md#block) containing the row. +The returned block number is updated on a best-effort basis, i.e. it may not be fully accurate. -## rowNumberInAllBlocks() **Syntax** -Returns the ordinal number of the row in the data block. This function only considers the affected data blocks. +```sql +blockNumber() +``` + +**Returned value** + +- Sequence number of the data block where the row is located. [UInt64](../data-types/int-uint.md). + +**Example** + +Query: + +```sql +SELECT blockNumber() +FROM +( + SELECT * + FROM system.numbers + LIMIT 10 +) SETTINGS max_block_size = 2 +``` + +Result: + +```response +┌─blockNumber()─┐ +│ 7 │ +│ 7 │ +└───────────────┘ +┌─blockNumber()─┐ +│ 8 │ +│ 8 │ +└───────────────┘ +┌─blockNumber()─┐ +│ 9 │ +│ 9 │ +└───────────────┘ +┌─blockNumber()─┐ +│ 10 │ +│ 10 │ +└───────────────┘ +┌─blockNumber()─┐ +│ 11 │ +│ 11 │ +└───────────────┘ +``` + +## rowNumberInBlock {#rowNumberInBlock} + +Returns, for each [block](../../development/architecture.md#block) processed by `rowNumberInBlock`, the number of the current row. +The returned number starts at 0 for each block. + +**Syntax** + +```sql +rowNumberInBlock() +``` + +**Returned value** + +- Ordinal number of the row in the data block starting from 0. [UInt64](../data-types/int-uint.md). + +**Example** + +Query: + +```sql +SELECT rowNumberInBlock() +FROM +( + SELECT * + FROM system.numbers_mt + LIMIT 10 +) SETTINGS max_block_size = 2 +``` + +Result: + +```response +┌─rowNumberInBlock()─┐ +│ 0 │ +│ 1 │ +└────────────────────┘ +┌─rowNumberInBlock()─┐ +│ 0 │ +│ 1 │ +└────────────────────┘ +┌─rowNumberInBlock()─┐ +│ 0 │ +│ 1 │ +└────────────────────┘ +┌─rowNumberInBlock()─┐ +│ 0 │ +│ 1 │ +└────────────────────┘ +┌─rowNumberInBlock()─┐ +│ 0 │ +│ 1 │ +└────────────────────┘ +``` + +## rowNumberInAllBlocks + +Returns a unique row number for each row processed by `rowNumberInAllBlocks`. The returned numbers start at 0. + +**Syntax** + +```sql +rowNumberInAllBlocks() +``` + +**Returned value** + +- Unique row number across all blocks processed by the query, starting from 0. [UInt64](../data-types/int-uint.md). + +**Example** + +Query: + +```sql +SELECT rowNumberInAllBlocks() +FROM +( + SELECT * + FROM system.numbers_mt + LIMIT 10 +) +SETTINGS max_block_size = 2 +``` + +Result: + +```response +┌─rowNumberInAllBlocks()─┐ +│ 0 │ +│ 1 │ +└────────────────────────┘ +┌─rowNumberInAllBlocks()─┐ +│ 4 │ +│ 5 │ +└────────────────────────┘ +┌─rowNumberInAllBlocks()─┐ +│ 2 │ +│ 3 │ +└────────────────────────┘ +┌─rowNumberInAllBlocks()─┐ +│ 6 │ +│ 7 │ +└────────────────────────┘ +┌─rowNumberInAllBlocks()─┐ +│ 8 │ +│ 9 │ +└────────────────────────┘ +``` ## neighbor @@ -1128,7 +1321,7 @@ Result: └────────────┴───────┴───────────┴────────────────┘ ``` -## runningDifference(x) {#runningDifference} +## runningDifference {#runningDifference} Calculates the difference between two consecutive row values in the data block. Returns 0 for the first row, and for subsequent rows the difference to the previous row. @@ -1143,7 +1336,15 @@ The result of the function depends on the affected data blocks and the order of The order of rows during calculation of `runningDifference()` can differ from the order of rows returned to the user. To prevent that you can create a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
-Example: +**Syntax** + +```sql +runningDifference(x) +``` + +**Example** + +Query: ```sql SELECT @@ -1162,6 +1363,8 @@ FROM ) ``` +Result: + ```text ┌─EventID─┬───────────EventTime─┬─delta─┐ │ 1106 │ 2016-11-24 00:00:04 │ 0 │ @@ -1174,6 +1377,8 @@ FROM Please note that the block size affects the result. The internal state of `runningDifference` is reset for each new block. +Query: + ```sql SELECT number, @@ -1182,6 +1387,8 @@ FROM numbers(100000) WHERE diff != 1 ``` +Result: + ```text ┌─number─┬─diff─┐ │ 0 │ 0 │ @@ -1191,6 +1398,8 @@ WHERE diff != 1 └────────┴──────┘ ``` +Query: + ```sql set max_block_size=100000 -- default value is 65536! @@ -1201,6 +1410,8 @@ FROM numbers(100000) WHERE diff != 1 ``` +Result: + ```text ┌─number─┬─diff─┐ │ 0 │ 0 │ @@ -1238,9 +1449,7 @@ runningConcurrency(start, end) **Returned values** -- The number of concurrent events at each event start time. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md) +- The number of concurrent events at each event start time. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -1272,23 +1481,43 @@ Result: └────────────┴────────────────────────────────┘ ``` -## MACNumToString(num) +## MACNumToString Interprets a UInt64 number as a MAC address in big endian format. Returns the corresponding MAC address in format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form) as a string. -## MACStringToNum(s) +**Syntax** + +```sql +MACNumToString(num) +``` + +## MACStringToNum The inverse function of MACNumToString. If the MAC address has an invalid format, it returns 0. -## MACStringToOUI(s) +**Syntax** + +```sql +MACStringToNum(s) +``` + +## MACStringToOUI Given a MAC address in format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form), returns the first three octets as a UInt64 number. If the MAC address has an invalid format, it returns 0. +**Syntax** + +```sql +MACStringToOUI(s) +``` + ## getSizeOfEnumType Returns the number of fields in [Enum](../../sql-reference/data-types/enum.md). An exception is thrown if the type is not `Enum`. +**Syntax** + ```sql getSizeOfEnumType(value) ``` @@ -1349,6 +1578,8 @@ Result: Returns the internal name of the data type that represents the value. +**Syntax** + ```sql toColumnTypeName(value) ``` @@ -1427,6 +1658,8 @@ Returns the default value for the given data type. Does not include default values for custom columns set by the user. +**Syntax** + ```sql defaultValueOfArgumentType(expression) ``` @@ -1625,29 +1858,31 @@ Result: Creates an array with a single value. -Used for the internal implementation of [arrayJoin](../../sql-reference/functions/array-join.md#functions_arrayjoin). +:::note +This function is used for the internal implementation of [arrayJoin](../../sql-reference/functions/array-join.md#functions_arrayjoin). +::: + +**Syntax** ```sql -SELECT replicate(x, arr); +replicate(x, arr) ``` -**Arguments:** +**Arguments** -- `arr` — An array. - `x` — The value to fill the result array with. +- `arr` — An array. [Array](../data-types/array.md). **Returned value** -An array of the lame length as `arr` filled with value `x`. - -Type: `Array`. +An array of the same length as `arr` filled with value `x`. [Array](../data-types/array.md). **Example** Query: ```sql -SELECT replicate(1, ['a', 'b', 'c']) +SELECT replicate(1, ['a', 'b', 'c']); ``` Result: @@ -1658,6 +1893,36 @@ Result: └───────────────────────────────┘ ``` +## revision + +Returns the current ClickHouse [server revision](../../operations/system-tables/metrics#revision).
+ +**Syntax** + +```sql +revision() +``` + +**Returned value** + +- The current ClickHouse server revision. [UInt32](../data-types/int-uint.md). + +**Example** + +Query: + +```sql +SELECT revision(); +``` + +Result: + +```response +┌─revision()─┐ +│ 54485 │ +└────────────┘ +``` + ## filesystemAvailable Returns the amount of free space in the filesystem hosting the database persistence. The returned value is always smaller than total free space ([filesystemFree](#filesystemfree)) because some space is reserved for the operating system. @@ -1670,9 +1935,7 @@ filesystemAvailable() **Returned value** -- The amount of remaining space available in bytes. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The amount of remaining space available in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). **Example** @@ -1702,9 +1965,7 @@ filesystemFree() **Returned value** -- The amount of free space in bytes. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The amount of free space in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). **Example** @@ -1734,9 +1995,7 @@ filesystemCapacity() **Returned value** -- Capacity of the filesystem in bytes. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- Capacity of the filesystem in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). **Example** @@ -2100,7 +2359,7 @@ Result: └──────────────────────────────────────────────────┘ ``` -## catboostEvaluate(path_to_model, feature_1, feature_2, …, feature_n) +## catboostEvaluate :::note This function is not available in ClickHouse Cloud. @@ -2109,6 +2368,14 @@ This function is not available in ClickHouse Cloud. Evaluate an external catboost model. [CatBoost](https://catboost.ai) is an open-source gradient boosting library developed by Yandex for machine learning. Accepts a path to a catboost model and model arguments (features). Returns Float64. +**Syntax** + +```sql +catboostEvaluate(path_to_model, feature_1, feature_2, ..., feature_n) +``` + +**Example** + ```sql SELECT feat1, ..., feat_n, catboostEvaluate('/path/to/model.bin', feat_1, ..., feat_n) AS prediction FROM data_table @@ -2145,10 +2412,16 @@ communicate using a HTTP interface. By default, port `9012` is used. A different See [Training and applying models](https://catboost.ai/docs/features/training.html#training) for how to train catboost models from a training data set. -## throwIf(x\[, message\[, error_code\]\]) +## throwIf Throw an exception if argument `x` is true. +**Syntax** + +```sql +throwIf(x[, message[, error_code]]) +``` + **Arguments** - `x` - the condition to check. @@ -2284,9 +2557,7 @@ countDigits(x) **Returned value** -Number of digits. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). +- Number of digits. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). :::note For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). @@ -2310,9 +2581,7 @@ Result: ## errorCodeToName -Returns the textual name of an error code. - -Type: [LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md). +- Returns the textual name of an error code. [LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md). 
**Syntax** @@ -2343,9 +2612,7 @@ tcpPort() **Returned value** -- The TCP port number. - -Type: [UInt16](../../sql-reference/data-types/int-uint.md). +- The TCP port number. [UInt16](../../sql-reference/data-types/int-uint.md). **Example** @@ -2381,9 +2648,7 @@ currentProfiles() **Returned value** -- List of the current user settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the current user settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## enabledProfiles @@ -2397,9 +2662,7 @@ enabledProfiles() **Returned value** -- List of the enabled settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the enabled settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## defaultProfiles @@ -2413,9 +2676,7 @@ defaultProfiles() **Returned value** -- List of the default settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the default settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## currentRoles @@ -2429,9 +2690,7 @@ currentRoles() **Returned value** -- A list of the current roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- A list of the current roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## enabledRoles @@ -2445,9 +2704,7 @@ enabledRoles() **Returned value** -- List of the enabled roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the enabled roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## defaultRoles @@ -2461,9 +2718,7 @@ defaultRoles() **Returned value** -- List of the default roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the default roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## getServerPort @@ -2492,9 +2747,7 @@ getServerPort(port_name) **Returned value** -- The number of the server port. - -Type: [UInt16](../../sql-reference/data-types/int-uint.md). +- The number of the server port. [UInt16](../../sql-reference/data-types/int-uint.md). **Example** @@ -2526,9 +2779,7 @@ queryID() **Returned value** -- The ID of the current query. - -Type: [String](../../sql-reference/data-types/string.md) +- The ID of the current query. [String](../../sql-reference/data-types/string.md). **Example** @@ -2562,9 +2813,7 @@ initialQueryID() **Returned value** -- The ID of the initial current query. - -Type: [String](../../sql-reference/data-types/string.md) +- The ID of the initial current query. [String](../../sql-reference/data-types/string.md). **Example** @@ -2597,9 +2846,7 @@ shardNum() **Returned value** -- Shard index or constant `0`. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Shard index or constant `0`. [UInt32](../../sql-reference/data-types/int-uint.md). 
**Example** @@ -2639,9 +2886,7 @@ shardCount() **Returned value** -- Total number of shards or `0`. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Total number of shards or `0`. [UInt32](../../sql-reference/data-types/int-uint.md). **See Also** @@ -2663,9 +2908,7 @@ getOSKernelVersion() **Returned value** -- The current OS kernel version. - -Type: [String](../../sql-reference/data-types/string.md). +- The current OS kernel version. [String](../../sql-reference/data-types/string.md). **Example** @@ -2699,9 +2942,7 @@ zookeeperSessionUptime() **Returned value** -- Uptime of the current ZooKeeper session in seconds. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Uptime of the current ZooKeeper session in seconds. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -2738,9 +2979,7 @@ All arguments must be constant. **Returned value** -- Randomly generated table structure. - -Type: [String](../../sql-reference/data-types/string.md). +- Randomly generated table structure. [String](../../sql-reference/data-types/string.md). **Examples** @@ -2807,9 +3046,7 @@ structureToCapnProtoSchema(structure) **Returned value** -- CapnProto schema - -Type: [String](../../sql-reference/data-types/string.md). +- CapnProto schema. [String](../../sql-reference/data-types/string.md). **Examples** @@ -2908,9 +3145,7 @@ structureToProtobufSchema(structure) **Returned value** -- Protobuf schema - -Type: [String](../../sql-reference/data-types/string.md). +- Protobuf schema. [String](../../sql-reference/data-types/string.md). **Examples** diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 0b761b62006..0e183626555 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -139,7 +139,7 @@ Format the `pattern` string with the values (strings, integers, etc.) listed in **Syntax** ```sql -format(pattern, s0, s1, …) +format(pattern, s0, s1, ...) ``` **Example** diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 9738c19bf3c..a6eb4a4ceff 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -799,7 +799,7 @@ If you only want to search multiple substrings in a string, you can use function **Syntax** ```sql -multiMatchAny(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\]) +multiMatchAny(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>\]) ``` ## multiMatchAnyIndex Like `multiMatchAny` but returns any index that matches the haystack.
**Syntax** ```sql -multiMatchAnyIndex(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\]) +multiMatchAnyIndex(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>\]) ``` ## multiMatchAllIndices @@ -819,7 +819,7 @@ Like `multiMatchAny` but returns the array of all indices that match the haystac **Syntax** ```sql -multiMatchAllIndices(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\]) +multiMatchAllIndices(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>\]) ``` ## multiFuzzyMatchAny @@ -833,7 +833,7 @@ Like `multiMatchAny` but returns 1 if any pattern matches the haystack within a **Syntax** ```sql -multiFuzzyMatchAny(haystack, distance, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\]) +multiFuzzyMatchAny(haystack, distance, \[pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>\]) ``` ## multiFuzzyMatchAnyIndex @@ -843,7 +843,7 @@ Like `multiFuzzyMatchAny` but returns any index that matches the haystack within **Syntax** ```sql -multiFuzzyMatchAnyIndex(haystack, distance, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\]) +multiFuzzyMatchAnyIndex(haystack, distance, \[pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>\]) ``` ## multiFuzzyMatchAllIndices @@ -853,7 +853,7 @@ Like `multiFuzzyMatchAny` but returns the array of all indices in any order that **Syntax** ```sql -multiFuzzyMatchAllIndices(haystack, distance, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\]) +multiFuzzyMatchAllIndices(haystack, distance, \[pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>\]) ``` ## extract diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 64b1732597f..c2219bb3f90 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -7,15 +7,15 @@ sidebar_label: Tuples ## tuple A function that allows grouping multiple columns. -For columns with the types T1, T2, …, it returns a Tuple(T1, T2, …) type tuple containing these columns. There is no cost to execute the function. +For columns with the types T1, T2, ..., it returns a Tuple(T1, T2, ...) type tuple containing these columns. There is no cost to execute the function. Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can’t be written to a table. -The function implements the operator `(x, y, …)`. +The function implements the operator `(x, y, ...)`. **Syntax** ``` sql -tuple(x, y, …) +tuple(x, y, ...) ``` ## tupleElement diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index 377283bc006..6386b4d5b1d 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -589,7 +589,7 @@ mapApply(func, map) **Returned value** -- Returns a map obtained from the original map by application of `func(map1[i], …, mapN[i])` for each element. +- Returns a map obtained from the original map by application of `func(map1[i], ..., mapN[i])` for each element. **Example** @@ -629,7 +629,7 @@ mapFilter(func, map) **Returned value** -- Returns a map containing only the elements in `map` for which `func(map1[i], …, mapN[i])` returns something other than 0. +- Returns a map containing only the elements in `map` for which `func(map1[i], ..., mapN[i])` returns something other than 0.
**Example** diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index cf2940d63e1..b1ab7bd8ebe 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -16,7 +16,7 @@ If the relevant part isn’t present in a URL, an empty string is returned. Extracts the protocol from a URL. -Examples of typical returned values: http, https, ftp, mailto, tel, magnet… +Examples of typical returned values: http, https, ftp, mailto, tel, magnet... ### domain diff --git a/docs/en/sql-reference/statements/alter/comment.md b/docs/en/sql-reference/statements/alter/comment.md index f6fb179d969..320828f0de9 100644 --- a/docs/en/sql-reference/statements/alter/comment.md +++ b/docs/en/sql-reference/statements/alter/comment.md @@ -4,7 +4,7 @@ sidebar_position: 51 sidebar_label: COMMENT --- -# ALTER TABLE … MODIFY COMMENT +# ALTER TABLE ... MODIFY COMMENT Adds, modifies, or removes a comment on the table, regardless of whether it was set before or not. Comment change is reflected in both [system.tables](../../../operations/system-tables/tables.md) and `SHOW CREATE TABLE` query. diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md index b6f45b67d52..af56bec7a11 100644 --- a/docs/en/sql-reference/statements/alter/delete.md +++ b/docs/en/sql-reference/statements/alter/delete.md @@ -4,7 +4,7 @@ sidebar_position: 39 sidebar_label: DELETE --- -# ALTER TABLE … DELETE Statement +# ALTER TABLE ... DELETE Statement ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr diff --git a/docs/en/sql-reference/statements/alter/index.md b/docs/en/sql-reference/statements/alter/index.md index 7961315c193..3cfb99cff83 100644 --- a/docs/en/sql-reference/statements/alter/index.md +++ b/docs/en/sql-reference/statements/alter/index.md @@ -42,7 +42,7 @@ These `ALTER` statements modify entities related to role-based access control: ## Mutations -`ALTER` queries that are intended to manipulate table data are implemented with a mechanism called “mutations”, most notably [ALTER TABLE … DELETE](/docs/en/sql-reference/statements/alter/delete.md) and [ALTER TABLE … UPDATE](/docs/en/sql-reference/statements/alter/update.md). They are asynchronous background processes similar to merges in [MergeTree](/docs/en/engines/table-engines/mergetree-family/index.md) tables that to produce new “mutated” versions of parts. +`ALTER` queries that are intended to manipulate table data are implemented with a mechanism called “mutations”, most notably [ALTER TABLE ... DELETE](/docs/en/sql-reference/statements/alter/delete.md) and [ALTER TABLE ... UPDATE](/docs/en/sql-reference/statements/alter/update.md). They are asynchronous background processes similar to merges in [MergeTree](/docs/en/engines/table-engines/mergetree-family/index.md) tables that produce new “mutated” versions of parts. For `*MergeTree` tables mutations execute by **rewriting whole data parts**. There is no atomicity - parts are substituted for mutated parts as soon as they are ready and a `SELECT` query that started executing during a mutation will see data from parts that have already been mutated along with data from parts that have not been mutated yet.
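To make this concrete, here is a minimal sketch of the two most common mutation statements (the `visits` table and its columns are hypothetical):

``` sql
-- Both statements run asynchronously in the background and rewrite whole data parts.
ALTER TABLE visits DELETE WHERE visit_date < '2022-01-01';
ALTER TABLE visits UPDATE is_archived = 1 WHERE visit_date < '2023-01-01';
```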
diff --git a/docs/en/sql-reference/statements/alter/update.md b/docs/en/sql-reference/statements/alter/update.md index ab7d0ca7378..0b300e5849a 100644 --- a/docs/en/sql-reference/statements/alter/update.md +++ b/docs/en/sql-reference/statements/alter/update.md @@ -4,7 +4,7 @@ sidebar_position: 40 sidebar_label: UPDATE --- -# ALTER TABLE … UPDATE Statements +# ALTER TABLE ... UPDATE Statements ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr diff --git a/docs/en/sql-reference/statements/alter/view.md b/docs/en/sql-reference/statements/alter/view.md index e063b27424e..83e8e9311b4 100644 --- a/docs/en/sql-reference/statements/alter/view.md +++ b/docs/en/sql-reference/statements/alter/view.md @@ -4,9 +4,9 @@ sidebar_position: 50 sidebar_label: VIEW --- -# ALTER TABLE … MODIFY QUERY Statement +# ALTER TABLE ... MODIFY QUERY Statement -You can modify `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE … MODIFY QUERY` statement without interrupting ingestion process. +You can modify the `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE ... MODIFY QUERY` statement without interrupting the ingestion process. This command is intended to change a materialized view created with the `TO [db.]name` clause. It does not change the structure of the underlying storage table and it does not change the columns' definition of the materialized view; because of this, the application of this command is very limited for materialized views created without the `TO [db.]name` clause. @@ -198,6 +198,6 @@ SELECT * FROM mv; `ALTER LIVE VIEW ... REFRESH` statement refreshes a [Live view](../create/view.md#live-view). See [Force Live View Refresh](../create/view.md#live-view-alter-refresh). -## ALTER TABLE … MODIFY REFRESH Statement +## ALTER TABLE ... MODIFY REFRESH Statement `ALTER TABLE ... MODIFY REFRESH` statement changes refresh parameters of a [Refreshable Materialized View](../create/view.md#refreshable-materialized-view). See [Changing Refresh Parameters](../create/view.md#changing-refresh-parameters). diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 073a3c0d246..b526c94e508 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -306,7 +306,7 @@ CREATE WINDOW VIEW test.wv TO test.dst WATERMARK=ASCENDING ALLOWED_LATENESS=INTE Note that elements emitted by a late firing should be treated as updated results of a previous computation. Instead of firing at the end of windows, the window view will fire immediately when the late event arrives. Thus, it will result in multiple outputs for the same window. Users need to take these duplicated results into account or deduplicate them. -You can modify `SELECT` query that was specified in the window view by using `ALTER TABLE … MODIFY QUERY` statement. The data structure resulting in a new `SELECT` query should be the same as the original `SELECT` query when with or without `TO [db.]name` clause. Note that the data in the current window will be lost because the intermediate state cannot be reused. +You can modify the `SELECT` query that was specified in the window view by using the `ALTER TABLE ... MODIFY QUERY` statement.
The data structure resulting from the new `SELECT` query should be the same as that of the original `SELECT` query, with or without the `TO [db.]name` clause. Note that the data in the current window will be lost because the intermediate state cannot be reused. ### Monitoring New Windows diff --git a/docs/en/sql-reference/statements/insert-into.md b/docs/en/sql-reference/statements/insert-into.md index a76692cf291..f3dadabd25f 100644 --- a/docs/en/sql-reference/statements/insert-into.md +++ b/docs/en/sql-reference/statements/insert-into.md @@ -73,7 +73,7 @@ Data can be passed to the INSERT in any [format](../../interfaces/formats.md#for INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` -For example, the following query format is identical to the basic version of INSERT … VALUES: +For example, the following query format is identical to the basic version of INSERT ... VALUES: ``` sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... diff --git a/docs/en/sql-reference/statements/select/limit.md b/docs/en/sql-reference/statements/select/limit.md index d61a5a44b58..58fdf988bf3 100644 --- a/docs/en/sql-reference/statements/select/limit.md +++ b/docs/en/sql-reference/statements/select/limit.md @@ -17,11 +17,11 @@ If there is no [ORDER BY](../../../sql-reference/statements/select/order-by.md) The number of rows in the result set can also depend on the [limit](../../../operations/settings/settings.md#limit) setting. ::: -## LIMIT … WITH TIES Modifier +## LIMIT ... WITH TIES Modifier When you set the `WITH TIES` modifier for `LIMIT n[,m]` and specify `ORDER BY expr_list`, the result contains the first `n` (or `n,m`) rows plus all rows whose `ORDER BY` field values are equal to those of the row at position `n` for `LIMIT n` (or `m` for `LIMIT n,m`). -This modifier also can be combined with [ORDER BY … WITH FILL modifier](../../../sql-reference/statements/select/order-by.md#orderby-with-fill). +This modifier can also be combined with the [ORDER BY ... WITH FILL modifier](../../../sql-reference/statements/select/order-by.md#orderby-with-fill). For example, the following query diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md index d6432a7b4f8..512a58d7cd9 100644 --- a/docs/en/sql-reference/statements/select/order-by.md +++ b/docs/en/sql-reference/statements/select/order-by.md @@ -283,7 +283,7 @@ In `MaterializedView`-engine tables the optimization works with views like `SELE ## ORDER BY Expr WITH FILL Modifier -This modifier also can be combined with [LIMIT … WITH TIES modifier](../../../sql-reference/statements/select/limit.md#limit-with-ties). +This modifier can also be combined with the [LIMIT ... WITH TIES modifier](../../../sql-reference/statements/select/limit.md#limit-with-ties). The `WITH FILL` modifier can be set after `ORDER BY expr` with optional `FROM expr`, `TO expr` and `STEP expr` parameters. All missing values of the `expr` column will be filled sequentially, and the other columns will be filled with defaults.
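For illustration, a minimal sketch of `WITH FILL` (the data and column names are hypothetical):

``` sql
-- Rows with n = 0, 2 and 4 are missing from the subresult and are filled in;
-- the other column receives its default value ('' for String) in the filled rows.
SELECT number AS n, 'original' AS source
FROM numbers(6)
WHERE number % 2 = 1
ORDER BY n WITH FILL FROM 0 TO 6
```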
diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 3a63811add6..f66178afbb2 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -169,7 +169,7 @@ If your listing of files contains number ranges with leading zeros, use the cons **Example** -Query the total number of rows in files named `file000`, `file001`, … , `file999`: +Query the total number of rows in files named `file000`, `file001`, ... , `file999`: ``` sql SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32'); diff --git a/docs/en/sql-reference/table-functions/gcs.md b/docs/en/sql-reference/table-functions/gcs.md index 80077ecdb33..b891d88df31 100644 --- a/docs/en/sql-reference/table-functions/gcs.md +++ b/docs/en/sql-reference/table-functions/gcs.md @@ -130,7 +130,7 @@ FROM gcs('https://storage.googleapis.com/my-test-bucket-768/{some,another}_prefi If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. ::: -Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, ... , `file-999.csv`: ``` sql SELECT count(*) diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index 92f904b8841..d65615e7588 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -85,7 +85,7 @@ If your listing of files contains number ranges with leading zeros, use the cons **Example** -Query the data from files named `file000`, `file001`, … , `file999`: +Query the data from files named `file000`, `file001`, ... , `file999`: ``` sql SELECT count(*) diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 38d77a98749..cbef80371a3 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -137,7 +137,7 @@ FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/ If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. ::: -Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, ... , `file-999.csv`: ``` sql SELECT count(*) diff --git a/docs/ru/development/style.md b/docs/ru/development/style.md index 4aa2073d75b..a071d0fb00d 100644 --- a/docs/ru/development/style.md +++ b/docs/ru/development/style.md @@ -57,7 +57,7 @@ memcpy(&buf[place_value], &x, sizeof(x)); for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** Вокруг бинарных операторов (`+`, `-`, `*`, `/`, `%`, …), а также тернарного оператора `?:` ставятся пробелы. +**7.** Вокруг бинарных операторов (`+`, `-`, `*`, `/`, `%`, ...), а также тернарного оператора `?:` ставятся пробелы. ``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); @@ -86,7 +86,7 @@ dst.ClickGoodEvent = click.GoodEvent; При необходимости, оператор может быть перенесён на новую строку. В этом случае, перед ним увеличивается отступ. -**11.** Унарные операторы `--`, `++`, `*`, `&`, … не отделяются от аргумента пробелом. +**11.** Унарные операторы `--`, `++`, `*`, `&`, ... 
не отделяются от аргумента пробелом. **12.** После запятой ставится пробел, а перед — нет. Аналогично для точки с запятой внутри выражения `for`. @@ -115,7 +115,7 @@ public: **16.** Если на весь файл один `namespace` и кроме него ничего существенного нет, то отступ внутри `namespace` не нужен. -**17.** Если блок для выражения `if`, `for`, `while`, … состоит из одного `statement`, то фигурные скобки не обязательны. Вместо этого поместите `statement` на отдельную строку. Это правило справедливо и для вложенных `if`, `for`, `while`, … +**17.** Если блок для выражения `if`, `for`, `while`, ... состоит из одного `statement`, то фигурные скобки не обязательны. Вместо этого поместите `statement` на отдельную строку. Это правило справедливо и для вложенных `if`, `for`, `while`, ... Если внутренний `statement` содержит фигурные скобки или `else`, то внешний блок следует писать в фигурных скобках. @@ -266,7 +266,7 @@ void executeQuery( Пример взят с ресурса http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/. -**7.** Нельзя писать мусорные комментарии (автор, дата создания…) в начале каждого файла. +**7.** Нельзя писать мусорные комментарии (автор, дата создания...) в начале каждого файла. **8.** Однострочные комментарии начинаются с трёх слешей: `///` , многострочные с `/**`. Такие комментарии считаются «документирующими». diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index 72087b56652..cf43eef73e3 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -103,7 +103,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs **Example** -Создадим таблицу с именами `file000`, `file001`, … , `file999`: +Создадим таблицу с именами `file000`, `file001`, ... , `file999`: ``` sql CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') diff --git a/docs/ru/engines/table-engines/integrations/s3.md b/docs/ru/engines/table-engines/integrations/s3.md index 720aa589122..a1c69df4d0a 100644 --- a/docs/ru/engines/table-engines/integrations/s3.md +++ b/docs/ru/engines/table-engines/integrations/s3.md @@ -73,7 +73,7 @@ SELECT * FROM s3_engine_table LIMIT 2; **Пример подстановки 1** -Таблица содержит данные из файлов с именами `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Таблица содержит данные из файлов с именами `file-000.csv`, `file-001.csv`, ... , `file-999.csv`: ``` sql CREATE TABLE big_table (name String, value UInt32) diff --git a/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md index 46597c94370..c3203804211 100644 --- a/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -66,7 +66,7 @@ WHERE table = 'visits' └───────────┴───────────────────┴────────┘ ``` -Столбец `partition` содержит имена всех партиций таблицы. Таблица `visits` из нашего примера содержит две партиции: `201901` и `201902`. Используйте значения из этого столбца в запросах [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md). +Столбец `partition` содержит имена всех партиций таблицы. Таблица `visits` из нашего примера содержит две партиции: `201901` и `201902`. Используйте значения из этого столбца в запросах [ALTER ... 
PARTITION](../../../sql-reference/statements/alter/partition.md). Столбец `name` содержит названия кусков партиций. Значения из этого столбца можно использовать в запросах [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition). diff --git a/docs/ru/engines/table-engines/mergetree-family/mergetree.md b/docs/ru/engines/table-engines/mergetree-family/mergetree.md index faa492d4d85..49ba229b1d5 100644 --- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md @@ -771,7 +771,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' - В результате вставки (запрос `INSERT`). - В фоновых операциях слияний и [мутаций](../../../sql-reference/statements/alter/index.md#mutations). - При скачивании данных с другой реплики. -- В результате заморозки партиций [ALTER TABLE … FREEZE PARTITION](../../../engines/table-engines/mergetree-family/mergetree.md#alter_freeze-partition). +- В результате заморозки партиций [ALTER TABLE ... FREEZE PARTITION](../../../engines/table-engines/mergetree-family/mergetree.md#alter_freeze-partition). Во всех случаях, кроме мутаций и заморозки партиций, при записи куска выбирается том и диск в соответствии с указанной конфигурацией хранилища: @@ -781,7 +781,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' Мутации и запросы заморозки партиций в реализации используют [жесткие ссылки](https://ru.wikipedia.org/wiki/%D0%96%D1%91%D1%81%D1%82%D0%BA%D0%B0%D1%8F_%D1%81%D1%81%D1%8B%D0%BB%D0%BA%D0%B0). Жесткие ссылки между различными дисками не поддерживаются, поэтому в случае таких операций куски размещаются на тех же дисках, что и исходные. В фоне куски перемещаются между томами на основе информации о занятом месте (настройка `move_factor`) по порядку, в котором указаны тома в конфигурации. Данные никогда не перемещаются с последнего тома и на первый том. Следить за фоновыми перемещениями можно с помощью системных таблиц [system.part_log](../../../engines/table-engines/mergetree-family/mergetree.md#system_tables-part-log) (поле `type = MOVE_PART`) и [system.parts](../../../engines/table-engines/mergetree-family/mergetree.md#system_tables-parts) (поля `path` и `disk`). Также подробная информация о перемещениях доступна в логах сервера. -С помощью запроса [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../engines/table-engines/mergetree-family/mergetree.md#alter_move-partition) пользователь может принудительно перенести кусок или партицию с одного раздела на другой. При этом учитываются все ограничения, указанные для фоновых операций. Запрос самостоятельно инициирует процесс перемещения не дожидаясь фоновых операций. В случае недостатка места или неудовлетворения ограничениям пользователь получит сообщение об ошибке. +С помощью запроса [ALTER TABLE ... MOVE PART\|PARTITION ... TO VOLUME\|DISK ...](../../../engines/table-engines/mergetree-family/mergetree.md#alter_move-partition) пользователь может принудительно перенести кусок или партицию с одного раздела на другой. При этом учитываются все ограничения, указанные для фоновых операций. Запрос самостоятельно инициирует процесс перемещения не дожидаясь фоновых операций. В случае недостатка места или неудовлетворения ограничениям пользователь получит сообщение об ошибке. Перемещения данных не взаимодействуют с репликацией данных, поэтому на разных репликах одной и той же таблицы могут быть указаны разные политики хранения. 
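Для наглядности приведём минимальный пример такого запроса (имена таблицы, партиции и тома условные):

``` sql
-- Принудительно переносим партицию на указанный том; ограничения политики хранения при этом учитываются.
ALTER TABLE visits MOVE PARTITION 201902 TO VOLUME 'external';
```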
diff --git a/docs/ru/engines/table-engines/special/external-data.md b/docs/ru/engines/table-engines/special/external-data.md index 881566e5f34..3d9737096f5 100644 --- a/docs/ru/engines/table-engines/special/external-data.md +++ b/docs/ru/engines/table-engines/special/external-data.md @@ -31,7 +31,7 @@ ClickHouse позволяет отправить на сервер данные, - **--format** - формат данных в файле. Если не указано - используется TabSeparated. Должен быть указан один из следующих параметров: -- **--types** - список типов столбцов через запятую. Например, `UInt64,String`. Столбцы будут названы _1, _2, … +- **--types** - список типов столбцов через запятую. Например, `UInt64,String`. Столбцы будут названы _1, _2, ... - **--structure** - структура таблицы, в форме `UserID UInt64`, `URL String`. Определяет имена и типы столбцов. Файлы, указанные в file, будут разобраны форматом, указанным в format, с использованием типов данных, указанных в types или structure. Таблица будет загружена на сервер, и доступна там в качестве временной таблицы с именем name. diff --git a/docs/ru/faq/general/olap.md b/docs/ru/faq/general/olap.md index c9021f7c92e..bcfe9663381 100644 --- a/docs/ru/faq/general/olap.md +++ b/docs/ru/faq/general/olap.md @@ -9,13 +9,13 @@ sidebar_position: 100 [OLAP](https://ru.wikipedia.org/wiki/OLAP) (OnLine Analytical Processing) переводится как обработка данных в реальном времени. Это широкий термин, который можно рассмотреть с двух сторон: с технической и с точки зрения бизнеса. Для самого общего понимания можно просто прочитать его с конца: **Processing** - Обрабатываются некие исходные данные… + Обрабатываются некие исходные данные... **Analytical** -: … чтобы получить какие-то аналитические отчеты или новые знания… +: ... чтобы получить какие-то аналитические отчеты или новые знания... **OnLine** -: … в реальном времени, практически без задержек на обработку. +: ... в реальном времени, практически без задержек на обработку. ## OLAP с точки зрения бизнеса {#olap-from-the-business-perspective} diff --git a/docs/ru/getting-started/example-datasets/nyc-taxi.md b/docs/ru/getting-started/example-datasets/nyc-taxi.md index 12d0c18c3a1..a42033e7d41 100644 --- a/docs/ru/getting-started/example-datasets/nyc-taxi.md +++ b/docs/ru/getting-started/example-datasets/nyc-taxi.md @@ -196,7 +196,7 @@ real 75m56.214s (Импорт данных напрямую из Postgres также возможен с использованием `COPY ... TO PROGRAM`.) -К сожалению, все поля, связанные с погодой (precipitation…average_wind_speed) заполнены NULL. Из-за этого мы исключим их из финального набора данных. +К сожалению, все поля, связанные с погодой (precipitation...average_wind_speed) заполнены NULL. Из-за этого мы исключим их из финального набора данных. Для начала мы создадим таблицу на одном сервере. Позже мы сделаем таблицу распределенной. 
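Схематично (имена и структура таблиц условные) эти два шага выглядят так:

```sql
-- Сначала локальная таблица на одном сервере...
CREATE TABLE trips_local (trip_id UInt64, pickup_datetime DateTime)
ENGINE = MergeTree ORDER BY pickup_datetime;

-- ...позже поверх неё создаётся распределённая таблица.
CREATE TABLE trips_all AS trips_local
ENGINE = Distributed(my_cluster, default, trips_local, rand());
```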
diff --git a/docs/ru/index.md b/docs/ru/index.md index 29f2bbe07fb..02be8912b94 100644 --- a/docs/ru/index.md +++ b/docs/ru/index.md @@ -12,10 +12,10 @@ ClickHouse — столбцовая система управления база | Строка | WatchID | JavaEnable | Title | GoodEvent | EventTime | |--------|-------------|------------|--------------------|-----------|---------------------| -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | … | … | … | … | … | +| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| #N | ... | ... | ... | ... | ... | То есть, значения, относящиеся к одной строке, физически хранятся рядом. В столбцовых СУБД данные хранятся в таком порядке: -| Строка: | #0 | #1 | #2 | #N | +| Строка: | #0 | #1 | #2 | #N | |-------------|---------------------|---------------------|---------------------|-----| -| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | -| JavaEnable: | 1 | 0 | 1 | … | -| Title: | Investor Relations | Contact us | Mission | … | -| GoodEvent: | 1 | 1 | 1 | … | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | +| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | +| JavaEnable: | 1 | 0 | 1 | ... | +| Title: | Investor Relations | Contact us | Mission | ... | +| GoodEvent: | 1 | 1 | 1 | ... | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | В примерах изображён только порядок расположения данных. То есть значения из разных столбцов хранятся отдельно, а данные одного столбца — вместе. diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index a9280de9c7b..4ed42b6fb22 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -119,6 +119,7 @@ Hello\nworld Hello\ world ``` +`\r\n` (CRLF) поддерживается с помощью настройки `input_format_tsv_crlf_end_of_line`. Второй вариант поддерживается, так как его использует MySQL при записи tab-separated дампа. diff --git a/docs/ru/operations/settings/query-complexity.md b/docs/ru/operations/settings/query-complexity.md index d1d38a587c6..e82a5a008eb 100644 --- a/docs/ru/operations/settings/query-complexity.md +++ b/docs/ru/operations/settings/query-complexity.md @@ -260,7 +260,7 @@ FORMAT Null; Ограничивает количество строк в хэш-таблице, используемой при соединении таблиц. -Параметр применяется к операциям [SELECT… JOIN](../../sql-reference/statements/select/join.md#select-join) и к движку таблиц [Join](../../engines/table-engines/special/join.md). +Параметр применяется к операциям [SELECT... JOIN](../../sql-reference/statements/select/join.md#select-join) и к движку таблиц [Join](../../engines/table-engines/special/join.md). Если запрос содержит несколько `JOIN`, то ClickHouse проверяет значение настройки для каждого промежуточного результата. @@ -277,7 +277,7 @@ FORMAT Null; Ограничивает размер (в байтах) хэш-таблицы, используемой при объединении таблиц. -Параметр применяется к операциям [SELECT… JOIN](../../sql-reference/statements/select/join.md#select-join) и к движку таблиц [Join](../../engines/table-engines/special/join.md). +Параметр применяется к операциям [SELECT...
JOIN](../../sql-reference/statements/select/join.md#select-join) и к движку таблиц [Join](../../engines/table-engines/special/join.md). Если запрос содержит несколько `JOIN`, то ClickHouse проверяет значение настройки для каждого промежуточного результата. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 2b3607dcf08..3a70a0bac12 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1859,7 +1859,7 @@ SELECT * FROM test_table ## count_distinct_implementation {#settings-count_distinct_implementation} -Задаёт, какая из функций `uniq*` используется при выполнении конструкции [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count). +Задаёт, какая из функций `uniq*` используется при выполнении конструкции [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count). Возможные значения: diff --git a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md index 6463f6bd95d..e6a61d9b381 100644 --- a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md @@ -82,7 +82,7 @@ FROM В этом случае необходимо помнить, что границы корзин гистограммы не известны. -## sequenceMatch(pattern)(timestamp, cond1, cond2, …) {#function-sequencematch} +## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} Проверяет, содержит ли последовательность событий цепочку, которая соответствует указанному шаблону. @@ -172,7 +172,7 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM - [sequenceCount](#function-sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount} +## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} Вычисляет количество цепочек событий, соответствующих шаблону. Функция обнаруживает только непересекающиеся цепочки событий. Она начинает искать следующую цепочку только после того, как полностью совпала текущая цепочка событий. diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md b/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md index fed0f8b328b..a0a430f7a68 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md @@ -7,7 +7,7 @@ sidebar_position: 201 ## quantiles {#quantiles} -Синтаксис: `quantiles(level1, level2, …)(x)` +Синтаксис: `quantiles(level1, level2, ...)(x)` Все функции для вычисления квантилей имеют соответствующие функции для вычисления нескольких квантилей: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`, `quantilesBFloat16`. Эти функции вычисляют все квантили указанных уровней в один проход и возвращают массив с вычисленными значениями. 
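Небольшой пример для наглядности (данные условные):

```sql
-- Три квантиля вычисляются за один проход по данным и возвращаются массивом.
SELECT quantiles(0.25, 0.5, 0.75)(number) FROM numbers(100);
```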
diff --git a/docs/ru/sql-reference/data-types/aggregatefunction.md b/docs/ru/sql-reference/data-types/aggregatefunction.md index e42b467e4af..0481151c7e4 100644 --- a/docs/ru/sql-reference/data-types/aggregatefunction.md +++ b/docs/ru/sql-reference/data-types/aggregatefunction.md @@ -6,9 +6,9 @@ sidebar_label: AggregateFunction # AggregateFunction {#data-type-aggregatefunction} -Агрегатные функции могут обладать определяемым реализацией промежуточным состоянием, которое может быть сериализовано в тип данных, соответствующий AggregateFunction(…), и быть записано в таблицу обычно посредством [материализованного представления](../../sql-reference/statements/create/view.md). Чтобы получить промежуточное состояние, обычно используются агрегатные функции с суффиксом `-State`. Чтобы в дальнейшем получить агрегированные данные необходимо использовать те же агрегатные функции с суффиксом `-Merge`. +Агрегатные функции могут обладать определяемым реализацией промежуточным состоянием, которое может быть сериализовано в тип данных, соответствующий AggregateFunction(...), и быть записано в таблицу обычно посредством [материализованного представления](../../sql-reference/statements/create/view.md). Чтобы получить промежуточное состояние, обычно используются агрегатные функции с суффиксом `-State`. Чтобы в дальнейшем получить агрегированные данные необходимо использовать те же агрегатные функции с суффиксом `-Merge`. -`AggregateFunction(name, types_of_arguments…)` — параметрический тип данных. +`AggregateFunction(name, types_of_arguments...)` — параметрический тип данных. **Параметры** diff --git a/docs/ru/sql-reference/data-types/fixedstring.md b/docs/ru/sql-reference/data-types/fixedstring.md index d7a4e865903..56a5632f88d 100644 --- a/docs/ru/sql-reference/data-types/fixedstring.md +++ b/docs/ru/sql-reference/data-types/fixedstring.md @@ -21,8 +21,8 @@ sidebar_label: FixedString(N) Примеры значений, которые можно эффективно хранить в столбцах типа `FixedString`: - Двоичное представление IP-адреса (`FixedString(16)` для IPv6). -- Коды языков (ru_RU, en_US … ). -- Коды валют (USD, RUB … ). +- Коды языков (ru_RU, en_US ... ). +- Коды валют (USD, RUB ... ). - Двоичное представление хэшей (`FixedString(16)` для MD5, `FixedString(32)` для SHA256). Для хранения значений UUID используйте тип данных [UUID](uuid.md). diff --git a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md index 4ec8333d563..8fd293a0415 100644 --- a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md @@ -3,7 +3,7 @@ slug: /ru/sql-reference/data-types/nested-data-structures/nested --- # Nested {#nested} -## Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} +## Nested(Name1 Type1, Name2 Type2, ...) {#nestedname1-type1-name2-type2} Вложенная структура данных - это как будто вложенная таблица. Параметры вложенной структуры данных - имена и типы столбцов, указываются так же, как у запроса CREATE. Каждой строке таблицы может соответствовать произвольное количество строк вложенной структуры данных. diff --git a/docs/ru/sql-reference/data-types/tuple.md b/docs/ru/sql-reference/data-types/tuple.md index 8953134d154..9d86c26c563 100644 --- a/docs/ru/sql-reference/data-types/tuple.md +++ b/docs/ru/sql-reference/data-types/tuple.md @@ -4,7 +4,7 @@ sidebar_position: 54 sidebar_label: Tuple(T1, T2, ...) --- -# Tuple(T1, T2, …) {#tuplet1-t2} +# Tuple(T1, T2, ...) 
{#tuplet1-t2} Кортеж из элементов любого [типа](index.md#data_types). Элементы кортежа могут быть одного или разных типов. diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 1f06bdf264a..825e3f06be2 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -161,7 +161,7 @@ SELECT range(5), range(1, 5), range(1, 5, 2); ``` -## array(x1, …), оператор \[x1, …\] {#arrayx1-operator-x1} +## array(x1, ...), оператор \[x1, ...\] {#arrayx1-operator-x1} Создаёт массив из аргументов функции. Аргументы должны быть константами и иметь типы, для которых есть наименьший общий тип. Должен быть передан хотя бы один аргумент, так как иначе непонятно, какого типа создавать массив. То есть, с помощью этой функции невозможно создать пустой массив (для этого используйте функции emptyArray\*, описанные выше). @@ -308,7 +308,7 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) Элементы, равные `NULL`, обрабатываются как обычные значения. -## arrayCount(\[func,\] arr1, …) {#array-count} +## arrayCount(\[func,\] arr1, ...) {#array-count} Возвращает количество элементов массива `arr`, для которых функция `func` возвращает не 0. Если `func` не указана - возвращает количество ненулевых элементов массива. @@ -335,7 +335,7 @@ SELECT countEqual([1, 2, NULL, NULL], NULL) ## arrayEnumerate(arr) {#array_functions-arrayenumerate} -Возвращает массив \[1, 2, 3, …, length(arr)\] +Возвращает массив \[1, 2, 3, ..., length(arr)\] Эта функция обычно используется совместно с ARRAY JOIN. Она позволяет, после применения ARRAY JOIN, посчитать что-либо только один раз для каждого массива. Пример: @@ -375,7 +375,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached) Также эта функция может быть использована в функциях высшего порядка. Например, с её помощью можно достать индексы массива для элементов, удовлетворяющих некоторому условию. -## arrayEnumerateUniq(arr, …) {#arrayenumerateuniqarr} +## arrayEnumerateUniq(arr, ...) {#arrayenumerateuniqarr} Возвращает массив, такого же размера, как исходный, где для каждого элемента указано, какой он по счету среди элементов с таким же значением. Например: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\]. @@ -597,7 +597,7 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res; Элементы массива равные `NULL` обрабатываются как обычные значения. -## arraySort(\[func,\] arr, …) {#array_functions-sort} +## arraySort(\[func,\] arr, ...) {#array_functions-sort} Возвращает массив `arr`, отсортированный в восходящем порядке. Если задана функция `func`, то порядок сортировки определяется результатом применения этой функции на элементы массива `arr`. Если `func` принимает несколько аргументов, то в функцию `arraySort` нужно передавать несколько массивов, которые будут соответствовать аргументам функции `func`. Подробные примеры рассмотрены в конце описания `arraySort`. @@ -698,11 +698,11 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; Для улучшения эффективности сортировки применяется [преобразование Шварца](https://ru.wikipedia.org/wiki/%D0%9F%D1%80%D0%B5%D0%BE%D0%B1%D1%80%D0%B0%D0%B7%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%A8%D0%B2%D0%B0%D1%80%D1%86%D0%B0). ::: -## arrayPartialSort(\[func,\] limit, arr, …) {#array_functions-sort} +## arrayPartialSort(\[func,\] limit, arr, ...) {#array_functions-sort} То же, что и `arraySort` с дополнительным аргументом `limit`, позволяющим частичную сортировку. 
Возвращает массив того же размера, как и исходный, в котором элементы `[1..limit]` отсортированы в возрастающем порядке. Остальные элементы `(limit..N]` остаются в неспецифицированном порядке. -## arrayReverseSort(\[func,\] arr, …) {#array_functions-reverse-sort} +## arrayReverseSort(\[func,\] arr, ...) {#array_functions-reverse-sort} Возвращает массив `arr`, отсортированный в нисходящем порядке. Если указана функция `func`, то массив `arr` сначала сортируется в порядке, который определяется функцией `func`, а затем отсортированный массив переворачивается. Если функция `func` принимает несколько аргументов, то в функцию `arrayReverseSort` необходимо передавать несколько массивов, которые будут соответствовать аргументам функции `func`. Подробные примеры рассмотрены в конце описания функции `arrayReverseSort`. @@ -803,11 +803,11 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; └─────────┘ ``` -## arrayPartialReverseSort(\[func,\] limit, arr, …) {#array_functions-sort} +## arrayPartialReverseSort(\[func,\] limit, arr, ...) {#array_functions-sort} То же, что и `arrayReverseSort` с дополнительным аргументом `limit`, позволяющим частичную сортировку. Возвращает массив того же размера, как и исходный, в котором элементы `[1..limit]` отсортированы в убывающем порядке. Остальные элементы `(limit..N]` остаются в неспецифицированном порядке. -## arrayUniq(arr, …) {#array-functions-arrayuniq} +## arrayUniq(arr, ...) {#array-functions-arrayuniq} Если передан один аргумент, считает количество разных элементов в массиве. Если передано несколько аргументов, считает количество разных кортежей из элементов на соответствующих позициях в нескольких массивах. @@ -1174,7 +1174,7 @@ SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]); └──────────────────────────────────────┘ ``` -## arrayMap(func, arr1, …) {#array-map} +## arrayMap(func, arr1, ...) {#array-map} Возвращает массив, полученный на основе результатов применения функции `func` к каждому элементу массива `arr`. @@ -1204,7 +1204,7 @@ SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res; Функция `arrayMap` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayFilter(func, arr1, …) {#array-filter} +## arrayFilter(func, arr1, ...) {#array-filter} Возвращает массив, содержащий только те элементы массива `arr1`, для которых функция `func` возвращает не 0. @@ -1237,7 +1237,7 @@ SELECT Функция `arrayFilter` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayFill(func, arr1, …) {#array-fill} +## arrayFill(func, arr1, ...) {#array-fill} Перебирает `arr1` от первого элемента к последнему и заменяет `arr1[i]` на `arr1[i - 1]`, если `func` вернула 0. Первый элемент `arr1` остаётся неизменным. @@ -1255,7 +1255,7 @@ SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, Функция `arrayFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayReverseFill(func, arr1, …) {#array-reverse-fill} +## arrayReverseFill(func, arr1, ...) 
{#array-reverse-fill} Перебирает `arr1` от последнего элемента к первому и заменяет `arr1[i]` на `arr1[i + 1]`, если `func` вернула 0. Последний элемент `arr1` остаётся неизменным. @@ -1273,7 +1273,7 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, Функция `arrayReverseFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arraySplit(func, arr1, …) {#array-split} +## arraySplit(func, arr1, ...) {#array-split} Разделяет массив `arr1` на несколько. Если `func` возвращает не 0, то массив разделяется, а элемент помещается в левую часть. Массив не разбивается по первому элементу. @@ -1291,7 +1291,7 @@ SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res Функция `arraySplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayReverseSplit(func, arr1, …) {#array-reverse-split} +## arrayReverseSplit(func, arr1, ...) {#array-reverse-split} Разделяет массив `arr1` на несколько. Если `func` возвращает не 0, то массив разделяется, а элемент помещается в правую часть. Массив не разбивается по последнему элементу. @@ -1309,25 +1309,25 @@ SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res Функция `arrayReverseSplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} +## arrayExists(\[func,\] arr1, ...) {#arrayexistsfunc-arr1} Возвращает 1, если существует хотя бы один элемент массива `arr`, для которого функция func возвращает не 0. Иначе возвращает 0. Функция `arrayExists` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) - в качестве первого аргумента ей можно передать лямбда-функцию. -## arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} +## arrayAll(\[func,\] arr1, ...) {#arrayallfunc-arr1} Возвращает 1, если для всех элементов массива `arr`, функция `func` возвращает не 0. Иначе возвращает 0. Функция `arrayAll` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) - в качестве первого аргумента ей можно передать лямбда-функцию. -## arrayFirst(func, arr1, …) {#array-first} +## arrayFirst(func, arr1, ...) {#array-first} Возвращает первый элемент массива `arr1`, для которого функция func возвращает не 0. Функция `arrayFirst` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayFirstIndex(func, arr1, …) {#array-first-index} +## arrayFirstIndex(func, arr1, ...) {#array-first-index} Возвращает индекс первого элемента массива `arr1`, для которого функция func возвращает не 0. @@ -1599,7 +1599,7 @@ SELECT arraySum(x -> x*x, [2, 3]) AS res; └─────┘ ``` -## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} +## arrayCumSum(\[func,\] arr1, ...) {#arraycumsumfunc-arr1} Возвращает массив из частичных сумм элементов исходного массива (сумма с накоплением). Если указана функция `func`, то значения элементов массива преобразуются этой функцией перед суммированием. 
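To make the `[func,]` convention concrete, a minimal sketch of `arrayCumSum` as described above (expected results in comments; the optional lambda transforms each element before summing):

``` sql
-- Running (cumulative) sum, with and without a transforming lambda
SELECT
    arrayCumSum([1, 1, 1, 1]) AS plain,               -- [1, 2, 3, 4]
    arrayCumSum(x -> x * 2, [1, 1, 1, 1]) AS doubled; -- [2, 4, 6, 8]
```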
diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 56ae4359bf1..bcc5f807c32 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -559,7 +559,7 @@ SELECT Описание режимов (mode): -| Mode | Первый день недели | Диапазон | Неделя 1 это первая неделя … | +| Mode | Первый день недели | Диапазон | Неделя 1 это первая неделя ... | | ----------- | -------- | -------- | ------------------ | |0|Воскресенье|0-53|с воскресеньем в этом году |1|Понедельник|0-53|с 4-мя или более днями в этом году diff --git a/docs/ru/sql-reference/functions/json-functions.md b/docs/ru/sql-reference/functions/json-functions.md index 123f40ce05d..18f625bf80f 100644 --- a/docs/ru/sql-reference/functions/json-functions.md +++ b/docs/ru/sql-reference/functions/json-functions.md @@ -88,7 +88,7 @@ SELECT isValidJSON('{"a": "hello", "b": [-100, 200.0, 300]}') = 1 SELECT isValidJSON('not a json') = 0 ``` -## JSONHas(json\[, indices_or_keys\]…) {#jsonhasjson-indices-or-keys} +## JSONHas(json\[, indices_or_keys\]...) {#jsonhasjson-indices-or-keys} Если значение существует в документе JSON, то возвращается `1`. @@ -121,7 +121,7 @@ SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' ``` -## JSONLength(json\[, indices_or_keys\]…) {#jsonlengthjson-indices-or-keys} +## JSONLength(json\[, indices_or_keys\]...) {#jsonlengthjson-indices-or-keys} Возвращает длину массива JSON или объекта JSON. @@ -134,7 +134,7 @@ SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 ``` -## JSONType(json\[, indices_or_keys\]…) {#jsontypejson-indices-or-keys} +## JSONType(json\[, indices_or_keys\]...) {#jsontypejson-indices-or-keys} Возвращает тип значения JSON. @@ -148,13 +148,13 @@ SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' ``` -## JSONExtractUInt(json\[, indices_or_keys\]…) {#jsonextractuintjson-indices-or-keys} +## JSONExtractUInt(json\[, indices_or_keys\]...) {#jsonextractuintjson-indices-or-keys} -## JSONExtractInt(json\[, indices_or_keys\]…) {#jsonextractintjson-indices-or-keys} +## JSONExtractInt(json\[, indices_or_keys\]...) {#jsonextractintjson-indices-or-keys} -## JSONExtractFloat(json\[, indices_or_keys\]…) {#jsonextractfloatjson-indices-or-keys} +## JSONExtractFloat(json\[, indices_or_keys\]...) {#jsonextractfloatjson-indices-or-keys} -## JSONExtractBool(json\[, indices_or_keys\]…) {#jsonextractbooljson-indices-or-keys} +## JSONExtractBool(json\[, indices_or_keys\]...) {#jsonextractbooljson-indices-or-keys} Парсит JSON и извлекает значение. Эти функции аналогичны функциям `visitParam`. @@ -168,7 +168,7 @@ SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200 SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 ``` -## JSONExtractString(json\[, indices_or_keys\]…) {#jsonextractstringjson-indices-or-keys} +## JSONExtractString(json\[, indices_or_keys\]...) {#jsonextractstringjson-indices-or-keys} Парсит JSON и извлекает строку. Эта функция аналогична функции `visitParamExtractString`. 
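For reference, a minimal sketch of the `indices_or_keys` convention shared by the JSON functions above, reusing the sample document from these sections (string arguments address object keys; integer arguments address positions, counted from one, or from the end when negative):

``` sql
SELECT
    JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) AS has_first,  -- 1
    JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') AS arr_length, -- 3
    JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b')   AS arr_type;   -- Array
```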
@@ -186,7 +186,7 @@ SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' ``` -## JSONExtract(json\[, indices_or_keys…\], Return_type) {#jsonextractjson-indices-or-keys-return-type} +## JSONExtract(json\[, indices_or_keys...\], Return_type) {#jsonextractjson-indices-or-keys-return-type} Парсит JSON и извлекает значение с заданным типом данных. @@ -207,7 +207,7 @@ SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' ``` -## JSONExtractKeysAndValues(json\[, indices_or_keys…\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} +## JSONExtractKeysAndValues(json\[, indices_or_keys...\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} Разбор пар ключ-значение из JSON, где значение имеет тип данных ClickHouse. @@ -255,7 +255,7 @@ text └────────────────────────────────────────────────────────────┘ ``` -## JSONExtractRaw(json\[, indices_or_keys\]…) {#jsonextractrawjson-indices-or-keys} +## JSONExtractRaw(json\[, indices_or_keys\]...) {#jsonextractrawjson-indices-or-keys} Возвращает часть JSON в виде строки, содержащей неразобранную подстроку. @@ -267,7 +267,7 @@ text SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]'; ``` -## JSONExtractArrayRaw(json\[, indices_or_keys\]…) {#jsonextractarrayrawjson-indices-or-keys} +## JSONExtractArrayRaw(json\[, indices_or_keys\]...) {#jsonextractarrayrawjson-indices-or-keys} Возвращает массив из элементов JSON массива, каждый из которых представлен в виде строки с неразобранными подстроками из JSON. diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md index 835aed934d5..f7637cfa3f7 100644 --- a/docs/ru/sql-reference/functions/other-functions.md +++ b/docs/ru/sql-reference/functions/other-functions.md @@ -286,7 +286,7 @@ SELECT byteSize(NULL, 1, 0.3, ''); Превращает константу в полноценный столбец, содержащий только одно значение. В ClickHouse полноценные столбцы и константы представлены в памяти по-разному. Функции по-разному работают для аргументов-констант и обычных аргументов (выполняется разный код), хотя результат почти всегда должен быть одинаковым. Эта функция предназначена для отладки такого поведения. -## ignore(…) {#ignore} +## ignore(...) {#ignore} Принимает любые аргументы, в т.ч. `NULL`, всегда возвращает 0. При этом, аргумент всё равно вычисляется. Это может использоваться для бенчмарков. diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index eeb5752c626..fc258f7b4cf 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -358,7 +358,7 @@ SELECT repeat('abc', 10); Разворачивает последовательность кодовых точек Unicode, при допущении, что строка содержит набор байтов, представляющий текст в кодировке UTF-8. Иначе — что-то делает (не кидает исключение). -## format(pattern, s0, s1, …) {#format} +## format(pattern, s0, s1, ...) {#format} Форматирует константный шаблон со строками, перечисленными в аргументах. `pattern` — упрощенная версия шаблона в языке Python. Шаблон содержит «заменяющие поля», которые окружены фигурными скобками `{}`. 
Всё, что не содержится в скобках, интерпретируется как обычный текст и просто копируется. Если нужно использовать символ фигурной скобки, можно экранировать двойной скобкой `{{ '{{' }}` или `{{ '}}' }}`. Имя полей могут быть числами (нумерация с нуля) или пустыми (тогда они интерпретируются как последовательные числа). diff --git a/docs/ru/sql-reference/functions/string-search-functions.md b/docs/ru/sql-reference/functions/string-search-functions.md index 4f9ae4428a4..53da9a6e791 100644 --- a/docs/ru/sql-reference/functions/string-search-functions.md +++ b/docs/ru/sql-reference/functions/string-search-functions.md @@ -311,19 +311,19 @@ Result: Смотрите `multiSearchAllPositions`. -## multiSearchFirstPosition(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstpositionhaystack-needle1-needle2-needlen} +## multiSearchFirstPosition(haystack, \[needle1, needle2, ..., needlen\]) {#multisearchfirstpositionhaystack-needle1-needle2-needlen} Так же, как и `position`, только возвращает оффсет первого вхождения любого из needles. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`. -## multiSearchFirstIndex(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} +## multiSearchFirstIndex(haystack, \[needle1, needle2, ..., needlen\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} Возвращает индекс `i` (нумерация с единицы) первой найденной строки needlei в строке `haystack` и 0 иначе. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. -## multiSearchAny(haystack, \[needle1, needle2, …, needlen\]) {#function-multisearchany} +## multiSearchAny(haystack, \[needle1, needle2, ..., needlen\]) {#function-multisearchany} Возвращает 1, если хотя бы одна подстрока needlei нашлась в строке `haystack` и 0 иначе. @@ -343,30 +343,30 @@ Result: Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты. Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее. -## multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} +## multiMatchAny(haystack, \[pattern1, pattern2, ..., patternn\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется библиотека [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearchAny`, так как она работает существенно быстрее. :::note Примечание Длина любой строки из `haystack` должна быть меньше 232 байт, иначе бросается исключение. Это ограничение связано с ограничением hyperscan API. ::: -## multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} +## multiMatchAnyIndex(haystack, \[pattern1, pattern2, ..., patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения. 
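A small illustrative sketch for the multi-needle and multi-pattern functions above (assuming a build with hyperscan support for the `multiMatch*` family, as this section notes):

``` sql
SELECT
    multiSearchAny('ClickHouse', ['House', 'SQLite']) AS substring_hit, -- 1
    multiMatchAny('abcdef', ['x.z', 'a.c'])           AS regexp_hit,    -- 1
    multiMatchAnyIndex('abcdef', ['x.z', 'a.c'])      AS regexp_index;  -- 2
```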
-## multiMatchAllIndices(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchallindiceshaystack-pattern1-pattern2-patternn} +## multiMatchAllIndices(haystack, \[pattern1, pattern2, ..., patternn\]) {#multimatchallindiceshaystack-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке. -## multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} +## multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, ..., patternn\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, но возвращает 1 если любой шаблон соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция основана на экспериментальной библиотеке [hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching) и может быть медленной для некоторых частных случаев. Производительность зависит от значения редакционного расстояния и используемых шаблонов, но всегда медленнее по сравнению с non-fuzzy вариантами. -## multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} +## multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, ..., patternn\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния. -## multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchallindiceshaystack-distance-pattern1-pattern2-patternn} +## multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, ..., patternn\]) {#multifuzzymatchallindiceshaystack-distance-pattern1-pattern2-patternn} То же, что и `multiFuzzyMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке в пределах константного редакционного расстояния. diff --git a/docs/ru/sql-reference/functions/tuple-functions.md b/docs/ru/sql-reference/functions/tuple-functions.md index c702e5d00b1..70ae44aa627 100644 --- a/docs/ru/sql-reference/functions/tuple-functions.md +++ b/docs/ru/sql-reference/functions/tuple-functions.md @@ -9,15 +9,15 @@ sidebar_label: Функции для работы с кортежами ## tuple {#tuple} Функция, позволяющая сгруппировать несколько столбцов. -Для столбцов, имеющих типы T1, T2, … возвращает кортеж типа Tuple(T1, T2, …), содержащий эти столбцы. Выполнение функции ничего не стоит. +Для столбцов, имеющих типы T1, T2, ... возвращает кортеж типа Tuple(T1, T2, ...), содержащий эти столбцы. Выполнение функции ничего не стоит. Кортежи обычно используются как промежуточное значение в качестве аргумента операторов IN, или для создания списка формальных параметров лямбда-функций. Кортежи не могут быть записаны в таблицу. -С помощью функции реализуется оператор `(x, y, …)`. +С помощью функции реализуется оператор `(x, y, ...)`. **Синтаксис** ``` sql -tuple(x, y, …) +tuple(x, y, ...) 
``` ## tupleElement {#tupleelement} diff --git a/docs/ru/sql-reference/functions/url-functions.md b/docs/ru/sql-reference/functions/url-functions.md index 3c6e6151ef8..087891f4347 100644 --- a/docs/ru/sql-reference/functions/url-functions.md +++ b/docs/ru/sql-reference/functions/url-functions.md @@ -14,7 +14,7 @@ sidebar_label: "Функции для работы с URL" ### protocol {#protocol} -Возвращает протокол. Примеры: http, ftp, mailto, magnet… +Возвращает протокол. Примеры: http, ftp, mailto, magnet... ### domain {#domain} diff --git a/docs/ru/sql-reference/statements/alter/comment.md b/docs/ru/sql-reference/statements/alter/comment.md index 727af15d03e..f841c8540f3 100644 --- a/docs/ru/sql-reference/statements/alter/comment.md +++ b/docs/ru/sql-reference/statements/alter/comment.md @@ -4,7 +4,7 @@ sidebar_position: 51 sidebar_label: COMMENT --- -# ALTER TABLE … MODIFY COMMENT {#alter-modify-comment} +# ALTER TABLE ... MODIFY COMMENT {#alter-modify-comment} Добавляет, изменяет или удаляет комментарий к таблице, независимо от того, был ли он установлен раньше или нет. Изменение комментария отражается как в системной таблице [system.tables](../../../operations/system-tables/tables.md), так и в результате выполнения запроса `SHOW CREATE TABLE`. diff --git a/docs/ru/sql-reference/statements/alter/delete.md b/docs/ru/sql-reference/statements/alter/delete.md index dc968a17349..c91a79f5cdd 100644 --- a/docs/ru/sql-reference/statements/alter/delete.md +++ b/docs/ru/sql-reference/statements/alter/delete.md @@ -4,7 +4,7 @@ sidebar_position: 39 sidebar_label: DELETE --- -# ALTER TABLE … DELETE {#alter-mutations} +# ALTER TABLE ... DELETE {#alter-mutations} ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr diff --git a/docs/ru/sql-reference/statements/alter/index.md b/docs/ru/sql-reference/statements/alter/index.md index 07f5ff0a298..e8b8af39e11 100644 --- a/docs/ru/sql-reference/statements/alter/index.md +++ b/docs/ru/sql-reference/statements/alter/index.md @@ -46,7 +46,7 @@ ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN ### Мутации {#mutations} -Мутации - разновидность запроса ALTER, позволяющая изменять или удалять данные в таблице. В отличие от стандартных запросов [ALTER TABLE … DELETE](../../../sql-reference/statements/alter/delete.md) и [ALTER TABLE … UPDATE](../../../sql-reference/statements/alter/update.md), рассчитанных на точечное изменение данных, область применения мутаций - достаточно тяжёлые изменения, затрагивающие много строк в таблице. Поддержана для движков таблиц семейства [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md), в том числе для движков с репликацией. +Мутации - разновидность запроса ALTER, позволяющая изменять или удалять данные в таблице. В отличие от стандартных запросов [ALTER TABLE ... DELETE](../../../sql-reference/statements/alter/delete.md) и [ALTER TABLE ... UPDATE](../../../sql-reference/statements/alter/update.md), рассчитанных на точечное изменение данных, область применения мутаций - достаточно тяжёлые изменения, затрагивающие много строк в таблице. Поддержана для движков таблиц семейства [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md), в том числе для движков с репликацией. Конвертировать существующие таблицы для работы с мутациями не нужно. Но после применения первой мутации формат данных таблицы становится несовместимым с предыдущими версиями и откатиться на предыдущую версию уже не получится. 
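As a sketch of the two mutation statements discussed above (the table `visits` and the columns `EventDate`, `Status`, `UserID` are hypothetical; the shape follows the `filter_expr` syntax shown in these hunks):

``` sql
-- Mutations are heavyweight: each one asynchronously rewrites whole data parts
ALTER TABLE visits DELETE WHERE EventDate < '2020-01-01';
ALTER TABLE visits UPDATE Status = 0 WHERE UserID = 123;
```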
diff --git a/docs/ru/sql-reference/statements/alter/update.md b/docs/ru/sql-reference/statements/alter/update.md index b2032ac77d1..01574a8a9b7 100644 --- a/docs/ru/sql-reference/statements/alter/update.md +++ b/docs/ru/sql-reference/statements/alter/update.md @@ -4,7 +4,7 @@ sidebar_position: 40 sidebar_label: UPDATE --- -# ALTER TABLE … UPDATE {#alter-table-update-statements} +# ALTER TABLE ... UPDATE {#alter-table-update-statements} ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] WHERE filter_expr diff --git a/docs/ru/sql-reference/statements/alter/view.md b/docs/ru/sql-reference/statements/alter/view.md index e6f6730ff99..53e295f6bbe 100644 --- a/docs/ru/sql-reference/statements/alter/view.md +++ b/docs/ru/sql-reference/statements/alter/view.md @@ -4,9 +4,9 @@ sidebar_position: 50 sidebar_label: VIEW --- -# Выражение ALTER TABLE … MODIFY QUERY {#alter-modify-query} +# Выражение ALTER TABLE ... MODIFY QUERY {#alter-modify-query} -Вы можете изменить запрос `SELECT`, который был задан при создании [материализованного представления](../create/view.md#materialized), с помощью запроса 'ALTER TABLE … MODIFY QUERY'. Используйте его если при создании материализованного представления не использовалась секция `TO [db.]name`. Настройка `allow_experimental_alter_materialized_view_structure` должна быть включена. +Вы можете изменить запрос `SELECT`, который был задан при создании [материализованного представления](../create/view.md#materialized), с помощью запроса 'ALTER TABLE ... MODIFY QUERY'. Используйте его если при создании материализованного представления не использовалась секция `TO [db.]name`. Настройка `allow_experimental_alter_materialized_view_structure` должна быть включена. Если при создании материализованного представления использовалась конструкция `TO [db.]name`, то для изменения отсоедините представление с помощью [DETACH](../detach.md), измените таблицу с помощью [ALTER TABLE](index.md), а затем снова присоедините запрос с помощью [ATTACH](../attach.md). diff --git a/docs/ru/sql-reference/statements/create/view.md b/docs/ru/sql-reference/statements/create/view.md index 032bdc6e6d4..8fa30446bb3 100644 --- a/docs/ru/sql-reference/statements/create/view.md +++ b/docs/ru/sql-reference/statements/create/view.md @@ -60,7 +60,7 @@ AS SELECT ... Если указано `POPULATE`, то при создании представления в него будут добавлены данные, уже содержащиеся в исходной таблице, как если бы был сделан запрос `CREATE TABLE ... AS SELECT ...` . Если `POPULATE` не указано, представление будет содержать только данные, добавленные в таблицу после создания представления. Использовать `POPULATE` не рекомендуется, так как в представление не попадут данные, добавляемые в таблицу во время создания представления. -Запрос `SELECT` может содержать `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Следует иметь ввиду, что соответствующие преобразования будут выполняться независимо, на каждый блок вставляемых данных. Например, при наличии `GROUP BY`, данные будут агрегироваться при вставке, но только в рамках одной пачки вставляемых данных. Далее, данные не будут доагрегированы. Исключение - использование ENGINE, производящего агрегацию данных самостоятельно, например, `SummingMergeTree`. +Запрос `SELECT` может содержать `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`... Следует иметь ввиду, что соответствующие преобразования будут выполняться независимо, на каждый блок вставляемых данных. 
Например, при наличии `GROUP BY`, данные будут агрегироваться при вставке, но только в рамках одной пачки вставляемых данных. Далее, данные не будут доагрегированы. Исключение - использование ENGINE, производящего агрегацию данных самостоятельно, например, `SummingMergeTree`. Выполнение запросов [ALTER](../../../sql-reference/statements/alter/view.md) над материализованными представлениями имеет свои особенности, поэтому эти запросы могут быть неудобными для использования. Если материализованное представление использует конструкцию `TO [db.]name`, то можно выполнить `DETACH` представления, `ALTER` для целевой таблицы и последующий `ATTACH` ранее отсоединенного (`DETACH`) представления. diff --git a/docs/ru/sql-reference/statements/insert-into.md b/docs/ru/sql-reference/statements/insert-into.md index 747e36b8809..309d4852b11 100644 --- a/docs/ru/sql-reference/statements/insert-into.md +++ b/docs/ru/sql-reference/statements/insert-into.md @@ -73,7 +73,7 @@ INSERT INTO insert_select_testtable VALUES (1, DEFAULT, 1) ; INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` -Например, следующий формат запроса идентичен базовому варианту INSERT … VALUES: +Например, следующий формат запроса идентичен базовому варианту INSERT ... VALUES: ``` sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index 5331cf00728..546a674d41a 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -116,7 +116,7 @@ SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UIn **Пример** -Запрос данных из файлов с именами `file000`, `file001`, … , `file999`: +Запрос данных из файлов с именами `file000`, `file001`, ... , `file999`: ``` sql SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32'); diff --git a/docs/ru/sql-reference/table-functions/s3.md b/docs/ru/sql-reference/table-functions/s3.md index fe40cb0c507..2847a95bf19 100644 --- a/docs/ru/sql-reference/table-functions/s3.md +++ b/docs/ru/sql-reference/table-functions/s3.md @@ -108,7 +108,7 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefi Если список файлов содержит диапазоны чисел с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры отдельно или используйте `?`. ::: -Подсчитаем общее количество строк в файлах с именами `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Подсчитаем общее количество строк в файлах с именами `file-000.csv`, `file-001.csv`, ... , `file-999.csv`: ``` sql SELECT count(*) diff --git a/docs/zh/changelog/index.md b/docs/zh/changelog/index.md index 7afcc07c6fb..c91d8bcf4d1 100644 --- a/docs/zh/changelog/index.md +++ b/docs/zh/changelog/index.md @@ -190,7 +190,7 @@ sidebar_label: "\u53D8\u66F4\u65E5\u5FD7" - 如果在获取系统数据时发生了zookeeper异常。副本,将其显示在单独的列中。 这实现了 [#9137](https://github.com/ClickHouse/ClickHouse/issues/9137) [#9138](https://github.com/ClickHouse/ClickHouse/pull/9138) ([阿列克谢-米洛维多夫](https://github.com/alexey-milovidov)) - 原子删除destroy上的MergeTree数据部分。 [#8402](https://github.com/ClickHouse/ClickHouse/pull/8402) ([Vladimir Chebotarev](https://github.com/excitoon)) - 支持分布式表的行级安全性。 [#8926](https://github.com/ClickHouse/ClickHouse/pull/8926) ([伊万](https://github.com/abyss7)) -- Now we recognize suffix (like KB, KiB…) in settings values. 
[#8072](https://github.com/ClickHouse/ClickHouse/pull/8072) ([米哈伊尔\*科罗托夫](https://github.com/millb)) +- Now we recognize suffix (like KB, KiB...) in settings values. [#8072](https://github.com/ClickHouse/ClickHouse/pull/8072) ([米哈伊尔\*科罗托夫](https://github.com/millb)) - 在构建大型连接的结果时防止内存不足。 [#8637](https://github.com/ClickHouse/ClickHouse/pull/8637) ([Artem Zuikov](https://github.com/4ertus2)) - 在交互模式下为建议添加群集名称 `clickhouse-client`. [#8709](https://github.com/ClickHouse/ClickHouse/pull/8709) ([阿列克谢-米洛维多夫](https://github.com/alexey-milovidov)) - Initialize query profiler for all threads in a group, e.g. it allows to fully profile insert-queries [#8820](https://github.com/ClickHouse/ClickHouse/pull/8820) ([伊万](https://github.com/abyss7)) @@ -523,7 +523,7 @@ sidebar_label: "\u53D8\u66F4\u65E5\u5FD7" - 现在后台在磁盘之间移动,运行它的seprate线程池。 [#7670](https://github.com/ClickHouse/ClickHouse/pull/7670) ([Vladimir Chebotarev](https://github.com/excitoon)) - `SYSTEM RELOAD DICTIONARY` 现在同步执行。 [#8240](https://github.com/ClickHouse/ClickHouse/pull/8240) ([维塔利\*巴拉诺夫](https://github.com/vitlibar)) - 堆栈跟踪现在显示物理地址(对象文件中的偏移量),而不是虚拟内存地址(加载对象文件的位置)。 这允许使用 `addr2line` 当二进制独立于位置并且ASLR处于活动状态时。 这修复 [#8360](https://github.com/ClickHouse/ClickHouse/issues/8360). [#8387](https://github.com/ClickHouse/ClickHouse/pull/8387) ([阿列克谢-米洛维多夫](https://github.com/alexey-milovidov)) -- 支持行级安全筛选器的新语法: `…
`. 修复 [#5779](https://github.com/ClickHouse/ClickHouse/issues/5779). [#8381](https://github.com/ClickHouse/ClickHouse/pull/8381) ([伊万](https://github.com/abyss7)) +- 支持行级安全筛选器的新语法: `...
`. 修复 [#5779](https://github.com/ClickHouse/ClickHouse/issues/5779). [#8381](https://github.com/ClickHouse/ClickHouse/pull/8381) ([伊万](https://github.com/abyss7)) - 现在 `cityHash` 功能可以与工作 `Decimal` 和 `UUID` 类型。 修复 [#5184](https://github.com/ClickHouse/ClickHouse/issues/5184). [#7693](https://github.com/ClickHouse/ClickHouse/pull/7693) ([米哈伊尔\*科罗托夫](https://github.com/millb)) - 从系统日志中删除了固定的索引粒度(它是1024),因为它在实现自适应粒度之后已经过时。 [#7698](https://github.com/ClickHouse/ClickHouse/pull/7698) ([阿列克谢-米洛维多夫](https://github.com/alexey-milovidov)) - 当ClickHouse在没有SSL的情况下编译时,启用MySQL兼容服务器。 [#7852](https://github.com/ClickHouse/ClickHouse/pull/7852) ([尤里\*巴拉诺夫](https://github.com/yurriy)) diff --git a/docs/zh/development/style.md index c0a08291e02..724b22ad461 100644 --- a/docs/zh/development/style.md +++ b/docs/zh/development/style.md @@ -53,7 +53,7 @@ memcpy(&buf[place_value], &x, sizeof(x)); for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** 在二元运算符(`+`,`-`,`*`,`/`,`%`,…)和三元运算符 `?:` 周围添加空格。 +**7.** 在二元运算符(`+`,`-`,`*`,`/`,`%`,...)和三元运算符 `?:` 周围添加空格。 ``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); @@ -82,7 +82,7 @@ dst.ClickGoodEvent = click.GoodEvent; 如有必要,运算符可以包裹到下一行。 在这种情况下,它前面的偏移量增加。 -**11.** 不要使用空格来分开一元运算符 (`--`, `++`, `*`, `&`, …) 和参数。 +**11.** 不要使用空格来分开一元运算符 (`--`, `++`, `*`, `&`, ...) 和参数。 **12.** 在逗号后面加一个空格,而不是在之前。同样的规则也适合 `for` 循环中的分号。 @@ -111,7 +111,7 @@ public: **16.** 如果对整个文件使用相同的 `namespace`,并且没有其他重要的东西,则 `namespace` 中不需要偏移量。 -**17.** 在 `if`, `for`, `while` 中包裹的代码块中,若代码是一个单行的 `statement`,那么大括号是可选的。 可以将 `statement` 放到一行中。这个规则同样适用于嵌套的 `if`, `for`, `while`, … +**17.** 在 `if`, `for`, `while` 中包裹的代码块中,若代码是一个单行的 `statement`,那么大括号是可选的。 可以将 `statement` 放到一行中。这个规则同样适用于嵌套的 `if`, `for`, `while`, ... 但是如果内部 `statement` 包含大括号或 `else`,则外部块应该用大括号括起来。 @@ -262,7 +262,7 @@ void executeQuery( 这个示例来源于 http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/。 -**7.** 不要在每个文件的开头写入垃圾注释(作者,创建日期…)。 +**7.** 不要在每个文件的开头写入垃圾注释(作者,创建日期...)。 **8.** 单行注释用三个斜杆: `///` ,多行注释以 `/**`开始。 这些注释会当做文档。 diff --git a/docs/zh/engines/table-engines/integrations/hdfs.md index 55648afe407..be673b6ce92 100644 --- a/docs/zh/engines/table-engines/integrations/hdfs.md +++ b/docs/zh/engines/table-engines/integrations/hdfs.md @@ -103,7 +103,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs **示例** -创建具有名为文件的表 `file000`, `file001`, … , `file999`: +创建具有名为文件的表 `file000`, `file001`, ... , `file999`: ``` sql CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') diff --git a/docs/zh/engines/table-engines/integrations/s3.md index f2585decabf..f18814675c3 100644 --- a/docs/zh/engines/table-engines/integrations/s3.md +++ b/docs/zh/engines/table-engines/integrations/s3.md @@ -109,7 +109,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = S3('https: **示例** -使用文件`file-000.csv`, `file-001.csv`, … , `file-999.csv`来创建表: +使用文件`file-000.csv`, `file-001.csv`, ... , `file-999.csv`来创建表: ``` sql CREATE TABLE big_table (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV'); @@ -202,7 +202,7 @@ ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_p !!! warning "Warning" 如果文件列表中包含有从0开头的数字范围,请对每个数字分别使用带括号的结构,或者使用`?`. -4. 
从文件`file-000.csv`, `file-001.csv`, … , `file-999.csv`创建表: +4. 从文件`file-000.csv`, `file-001.csv`, ... , `file-999.csv`创建表: ``` sql CREATE TABLE big_table (name String, value UInt32) diff --git a/docs/zh/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/zh/engines/table-engines/mergetree-family/custom-partitioning-key.md index 4fecf4e5669..e283a4c7510 100644 --- a/docs/zh/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/zh/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -59,7 +59,7 @@ WHERE table = 'visits' └───────────┴────────────────┴────────┘ ``` -`partition` 列存储分区的名称。此示例中有两个分区:`201901` 和 `201902`。在 [ALTER … PARTITION](#alter_manipulations-with-partitions) 语句中你可以使用该列值来指定分区名称。 +`partition` 列存储分区的名称。此示例中有两个分区:`201901` 和 `201902`。在 [ALTER ... PARTITION](#alter_manipulations-with-partitions) 语句中你可以使用该列值来指定分区名称。 `name` 列为分区中数据片段的名称。在 [ALTER ATTACH PART](#alter_attach-partition) 语句中你可以使用此列值中来指定片段名称。 diff --git a/docs/zh/engines/table-engines/mergetree-family/mergetree.md b/docs/zh/engines/table-engines/mergetree-family/mergetree.md index bfa69338657..67bd681269b 100644 --- a/docs/zh/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/zh/engines/table-engines/mergetree-family/mergetree.md @@ -702,7 +702,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' - 插入(`INSERT`查询) - 后台合并和[数据变异](../../../sql-reference/statements/alter.md#alter-mutations) - 从另一个副本下载 -- [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter.md#alter_freeze-partition) 冻结分区 +- [ALTER TABLE ... FREEZE PARTITION](../../../sql-reference/statements/alter.md#alter_freeze-partition) 冻结分区 除了数据变异和冻结分区以外的情况下,数据按照以下逻辑存储到卷或磁盘上: @@ -713,7 +713,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' 在后台,数据片段基于剩余空间(`move_factor`参数)根据卷在配置文件中定义的顺序进行转移。数据永远不会从最后一个移出也不会从第一个移入。可以通过系统表 [system.part_log](../../../operations/system-tables/part_log.md#system_tables-part-log) (字段 `type = MOVE_PART`) 和 [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) (字段 `path` 和 `disk`) 来监控后台的移动情况。具体细节可以通过服务器日志查看。 -用户可以通过 [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter.md#alter_move-partition) 强制移动一个数据片段或分区到另外一个卷,所有后台移动的限制都会被考虑在内。这个查询会自行启动,无需等待后台操作完成。如果没有足够的可用空间或任何必须条件没有被满足,用户会收到报错信息。 +用户可以通过 [ALTER TABLE ... MOVE PART\|PARTITION ... TO VOLUME\|DISK ...](../../../sql-reference/statements/alter.md#alter_move-partition) 强制移动一个数据片段或分区到另外一个卷,所有后台移动的限制都会被考虑在内。这个查询会自行启动,无需等待后台操作完成。如果没有足够的可用空间或任何必须条件没有被满足,用户会收到报错信息。 数据移动不会妨碍到数据复制。也就是说,同一张表的不同副本可以指定不同的存储策略。 diff --git a/docs/zh/engines/table-engines/special/external-data.md b/docs/zh/engines/table-engines/special/external-data.md index 688e25402ab..06c6331b4f3 100644 --- a/docs/zh/engines/table-engines/special/external-data.md +++ b/docs/zh/engines/table-engines/special/external-data.md @@ -26,7 +26,7 @@ ClickHouse 允许向服务器发送处理查询所需的数据以及 SELECT 查 以下的参数是可选的:**–name** – 表的名称,如果省略,则采用 _data。 **–format** – 文件中的数据格式。 如果省略,则使用 TabSeparated。 -以下的参数必选一个:**–types** – 逗号分隔列类型的列表。例如:`UInt64,String`。列将被命名为 _1,_2,… +以下的参数必选一个:**–types** – 逗号分隔列类型的列表。例如:`UInt64,String`。列将被命名为 _1,_2,... 
**–structure**– 表结构的格式 `UserID UInt64`,`URL String`。定义列的名字以及类型。 在 «file» 中指定的文件将由 «format» 中指定的格式解析,使用在 «types» 或 «structure» 中指定的数据类型。该表将被上传到服务器,并在作为名称为 «name»临时表。 diff --git a/docs/zh/faq/general/olap.md b/docs/zh/faq/general/olap.md index b014419578b..c4b36b138fa 100644 --- a/docs/zh/faq/general/olap.md +++ b/docs/zh/faq/general/olap.md @@ -10,13 +10,13 @@ sidebar_position: 100 [OLAP](https://en.wikipedia.org/wiki/Online_analytical_processing) stands for Online Analytical Processing. It is a broad term that can be looked at from two perspectives: technical and business. But at the very high level, you can just read these words backward: Processing -: Some source data is processed… +: Some source data is processed... Analytical -: …to produce some analytical reports and insights… +: ...to produce some analytical reports and insights... Online -: …in real-time. +: ...in real-time. ## OLAP from the Business Perspective {#olap-from-the-business-perspective} diff --git a/docs/zh/getting-started/example-datasets/nyc-taxi.md b/docs/zh/getting-started/example-datasets/nyc-taxi.md index 9c487140df3..ceeb6fbb9e0 100644 --- a/docs/zh/getting-started/example-datasets/nyc-taxi.md +++ b/docs/zh/getting-started/example-datasets/nyc-taxi.md @@ -196,7 +196,7 @@ real 75m56.214s (也可以直接使用`COPY ... TO PROGRAM`从Postgres中导入数据) -数据中所有与天气相关的字段(precipitation……average_wind_speed)都填充了NULL。 所以,我们将从最终数据集中删除它们 +数据中所有与天气相关的字段(precipitation...average_wind_speed)都填充了NULL。 所以,我们将从最终数据集中删除它们 首先,我们使用单台服务器创建表,后面我们将在多台节点上创建这些表。 diff --git a/docs/zh/getting-started/example-datasets/uk-price-paid.mdx b/docs/zh/getting-started/example-datasets/uk-price-paid.mdx index ecfdcddbbe2..7d4c299b919 100644 --- a/docs/zh/getting-started/example-datasets/uk-price-paid.mdx +++ b/docs/zh/getting-started/example-datasets/uk-price-paid.mdx @@ -212,7 +212,7 @@ ORDER BY year └──────┴─────────┴───────────────────────────────────────────────────────┘ ``` -2020 年房价出事了!但这并不令人意外…… +2020 年房价出事了!但这并不令人意外... ### 查询 3. 
最昂贵的社区 {#most-expensive-neighborhoods} diff --git a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md index 758992e4084..975d5eb764c 100644 --- a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md +++ b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md @@ -371,7 +371,7 @@ UserID.bin,URL.bin,和EventTime.bin是UserID :::note - 最后一个索引条目(上图中的“mark 1082”)存储了上图中颗粒1082的主键列的最大值。 -- 索引条目(索引标记)不是基于表中的特定行,而是基于颗粒。例如,对于上图中的索引条目‘mark 0’,在我们的表中没有UserID为240.923且URL为“goal://metry=10000467796a411…”的行,相反,对于该表,有一个颗粒0,在该颗粒中,最小UserID值是240.923,最小URL值是“goal://metry=10000467796a411…”,这两个值来自不同的行。 +- 索引条目(索引标记)不是基于表中的特定行,而是基于颗粒。例如,对于上图中的索引条目‘mark 0’,在我们的表中没有UserID为240.923且URL为“goal://metry=10000467796a411...”的行,相反,对于该表,有一个颗粒0,在该颗粒中,最小UserID值是240.923,最小URL值是“goal://metry=10000467796a411...”,这两个值来自不同的行。 - 主索引文件完全加载到主内存中。如果文件大于可用的空闲内存空间,则ClickHouse将发生错误。 ::: diff --git a/docs/zh/index.md b/docs/zh/index.md index fab00dbcd1b..c092f296722 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -13,10 +13,10 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) | Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | |-----|-------------|------------|--------------------|-----------|---------------------| -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | … | … | … | … | … | +| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| #N | ... | ... | ... | ... | ... | 处于同一行中的数据总是被物理的存储在一起。 @@ -24,13 +24,13 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) 在列式数据库系统中,数据按如下的顺序存储: -| Row: | #0 | #1 | #2 | #N | +| Row: | #0 | #1 | #2 | #N | |-------------|---------------------|---------------------|---------------------|-----| -| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | -| JavaEnable: | 1 | 0 | 1 | … | -| Title: | Investor Relations | Contact us | Mission | … | -| GoodEvent: | 1 | 1 | 1 | … | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | +| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | +| JavaEnable: | 1 | 0 | 1 | ... | +| Title: | Investor Relations | Contact us | Mission | ... | +| GoodEvent: | 1 | 1 | 1 | ... | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | 这些示例只显示了数据的排列顺序。来自不同列的值被单独存储,来自同一列的数据被存储在一起。 diff --git a/docs/zh/operations/settings/query-complexity.md b/docs/zh/operations/settings/query-complexity.md index 124d5fa5d1a..b1b5ca75018 100644 --- a/docs/zh/operations/settings/query-complexity.md +++ b/docs/zh/operations/settings/query-complexity.md @@ -196,7 +196,7 @@ Restrictions on the «maximum amount of something» can take the value 0, which Limits the number of rows in the hash table that is used when joining tables. -This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. +This setting applies to [SELECT ... JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. If a query contains multiple joins, ClickHouse checks this setting for every intermediate result. 
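For instance, a per-query sketch of this limit (this section corresponds to the `max_rows_in_join` setting; table and column names are hypothetical, and `join_overflow_mode` selects between throwing an exception and truncating the result when the limit is reached):

``` sql
SELECT l.id, r.value
FROM left_table AS l
INNER JOIN right_table AS r ON l.id = r.id
SETTINGS max_rows_in_join = 1000000, join_overflow_mode = 'break';
```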
@@ -213,7 +213,7 @@ Default value: 0. Limits the size in bytes of the hash table used when joining tables. -This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). +This setting applies to [SELECT ... JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). If the query contains joins, ClickHouse checks this setting for every intermediate result. diff --git a/docs/zh/operations/settings/settings.md b/docs/zh/operations/settings/settings.md index c3b4194ed44..5e59196f56c 100644 --- a/docs/zh/operations/settings/settings.md +++ b/docs/zh/operations/settings/settings.md @@ -1002,7 +1002,7 @@ ClickHouse生成异常 ## count_distinct_implementation {#settings-count_distinct_implementation} -指定其中的 `uniq*` 函数应用于执行 [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) 建筑。 +指定其中的 `uniq*` 函数应用于执行 [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) 建筑。 可能的值: diff --git a/docs/zh/operations/system-tables/dictionaries.md b/docs/zh/operations/system-tables/dictionaries.md index 0cf91e45e86..c7b1bdd04be 100644 --- a/docs/zh/operations/system-tables/dictionaries.md +++ b/docs/zh/operations/system-tables/dictionaries.md @@ -21,7 +21,7 @@ machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. - `origin` ([字符串](../../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. - `type` ([字符串](../../sql-reference/data-types/string.md)) — Type of dictionary allocation. [在内存中存储字典](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). -- `key` — [密钥类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key):数字键 ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) or Сomposite key ([字符串](../../sql-reference/data-types/string.md)) — form “(type 1, type 2, …, type n)”. +- `key` — [密钥类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key):数字键 ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) or Composite key ([字符串](../../sql-reference/data-types/string.md)) — form “(type 1, type 2, ..., type n)”. - `attribute.names` ([阵列](../../sql-reference/data-types/array.md)([字符串](../../sql-reference/data-types/string.md))) — Array of [属性名称](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) 由字典提供。 - `attribute.types` ([阵列](../../sql-reference/data-types/array.md)([字符串](../../sql-reference/data-types/string.md))) — Corresponding array of [属性类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) 这是由字典提供。 - `bytes_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Amount of RAM allocated for the dictionary. 
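And a minimal sketch against the `system.dictionaries` columns documented above (assuming a server with at least one dictionary configured):

``` sql
SELECT name, status, origin, type, bytes_allocated
FROM system.dictionaries;
```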
diff --git a/docs/zh/sql-reference/aggregate-functions/parametric-functions.md b/docs/zh/sql-reference/aggregate-functions/parametric-functions.md index cb1dcc35f5c..27d3375aebb 100644 --- a/docs/zh/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/zh/sql-reference/aggregate-functions/parametric-functions.md @@ -80,7 +80,7 @@ FROM 在这种情况下,您应该记住您不知道直方图bin边界。 -## sequenceMatch(pattern)(timestamp, cond1, cond2, …) {#function-sequencematch} +## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} 检查序列是否包含与模式匹配的事件链。 @@ -167,7 +167,7 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM - [sequenceCount](#function-sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount} +## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} 计算与模式匹配的事件链的数量。该函数搜索不重叠的事件链。当前链匹配后,它开始搜索下一个链。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md index 4dce65af1ed..253eb9ef82d 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md @@ -7,7 +7,7 @@ sidebar_position: 201 **语法** ``` sql -quantiles(level1, level2, …)(x) +quantiles(level1, level2, ...)(x) ``` 所有分位数函数(quantile)也有相应的分位数(quantiles)函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。 这些函数一次计算所列的级别的所有分位数, 并返回结果值的数组。 diff --git a/docs/zh/sql-reference/data-types/aggregatefunction.md b/docs/zh/sql-reference/data-types/aggregatefunction.md index e8f28b367a5..80648eb165b 100644 --- a/docs/zh/sql-reference/data-types/aggregatefunction.md +++ b/docs/zh/sql-reference/data-types/aggregatefunction.md @@ -1,7 +1,7 @@ --- slug: /zh/sql-reference/data-types/aggregatefunction --- -# AggregateFunction(name, types_of_arguments…) {#data-type-aggregatefunction} +# AggregateFunction(name, types_of_arguments...) {#data-type-aggregatefunction} 聚合函数的中间状态,可以通过聚合函数名称加`-State`后缀的形式得到它。与此同时,当您需要访问该类型的最终状态数据时,您需要以相同的聚合函数名加`-Merge`后缀的形式来得到最终状态数据。 diff --git a/docs/zh/sql-reference/data-types/domains/index.md b/docs/zh/sql-reference/data-types/domains/index.md index c123b10f6fe..9f12018732b 100644 --- a/docs/zh/sql-reference/data-types/domains/index.md +++ b/docs/zh/sql-reference/data-types/domains/index.md @@ -19,9 +19,9 @@ Domain类型是特定实现的类型,它总是与某个现存的基础类型 ### Domains的额外特性 {#domainsde-e-wai-te-xing} - 在执行SHOW CREATE TABLE 或 DESCRIBE TABLE时,其对应的列总是展示为Domain类型的名称 -- 在INSERT INTO domain_table(domain_column) VALUES(…)中输入数据总是以更人性化的格式进行输入 +- 在INSERT INTO domain_table(domain_column) VALUES(...)中输入数据总是以更人性化的格式进行输入 - 在SELECT domain_column FROM domain_table中数据总是以更人性化的格式输出 -- 在INSERT INTO domain_table FORMAT CSV …中,实现外部源数据以更人性化的格式载入 +- 在INSERT INTO domain_table FORMAT CSV ...中,实现外部源数据以更人性化的格式载入 ### Domains类型的限制 {#domainslei-xing-de-xian-zhi} diff --git a/docs/zh/sql-reference/data-types/fixedstring.md b/docs/zh/sql-reference/data-types/fixedstring.md index 633307938a9..d454e935fe7 100644 --- a/docs/zh/sql-reference/data-types/fixedstring.md +++ b/docs/zh/sql-reference/data-types/fixedstring.md @@ -18,8 +18,8 @@ slug: /zh/sql-reference/data-types/fixedstring 可以有效存储在`FixedString`类型的列中的值的示例: - 二进制表示的IP地址(IPv6使用`FixedString(16)`) -- 语言代码(ru_RU, en_US … ) -- 货币代码(USD, RUB … ) +- 语言代码(ru_RU, en_US ... ) +- 货币代码(USD, RUB ... 
) - 二进制表示的哈希值(MD5使用`FixedString(16)`,SHA256使用`FixedString(32)`) 请使用[UUID](uuid.md)数据类型来存储UUID值,。 diff --git a/docs/zh/sql-reference/data-types/nested-data-structures/nested.md b/docs/zh/sql-reference/data-types/nested-data-structures/nested.md index 5ef8256b483..57b30de0881 100644 --- a/docs/zh/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/zh/sql-reference/data-types/nested-data-structures/nested.md @@ -1,7 +1,7 @@ --- slug: /zh/sql-reference/data-types/nested-data-structures/nested --- -# Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} +# Nested(Name1 Type1, Name2 Type2, ...) {#nestedname1-type1-name2-type2} 嵌套数据结构类似于嵌套表。嵌套数据结构的参数(列名和类型)与 CREATE 查询类似。每个表可以包含任意多行嵌套数据结构。 diff --git a/docs/zh/sql-reference/data-types/simpleaggregatefunction.md b/docs/zh/sql-reference/data-types/simpleaggregatefunction.md index 601cb602a78..fbaa76365ec 100644 --- a/docs/zh/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/zh/sql-reference/data-types/simpleaggregatefunction.md @@ -3,7 +3,7 @@ slug: /zh/sql-reference/data-types/simpleaggregatefunction --- # SimpleAggregateFunction {#data-type-simpleaggregatefunction} -`SimpleAggregateFunction(name, types_of_arguments…)` 数据类型存储聚合函数的当前值, 并不像 [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) 那样存储其全部状态。这种优化可以应用于具有以下属性函数: 将函数 `f` 应用于行集合 `S1 UNION ALL S2` 的结果,可以通过将 `f` 分别应用于行集合的部分, 然后再将 `f` 应用于结果来获得: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`。 这个属性保证了部分聚合结果足以计算出合并的结果,所以我们不必存储和处理任何额外的数据。 +`SimpleAggregateFunction(name, types_of_arguments...)` 数据类型存储聚合函数的当前值, 并不像 [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) 那样存储其全部状态。这种优化可以应用于具有以下属性函数: 将函数 `f` 应用于行集合 `S1 UNION ALL S2` 的结果,可以通过将 `f` 分别应用于行集合的部分, 然后再将 `f` 应用于结果来获得: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`。 这个属性保证了部分聚合结果足以计算出合并的结果,所以我们不必存储和处理任何额外的数据。 支持以下聚合函数: diff --git a/docs/zh/sql-reference/data-types/tuple.md b/docs/zh/sql-reference/data-types/tuple.md index 004c80ff916..38813701c70 100644 --- a/docs/zh/sql-reference/data-types/tuple.md +++ b/docs/zh/sql-reference/data-types/tuple.md @@ -1,7 +1,7 @@ --- slug: /zh/sql-reference/data-types/tuple --- -# Tuple(T1, T2, …) {#tuplet1-t2} +# Tuple(T1, T2, ...) {#tuplet1-t2} 元组,其中每个元素都有单独的 [类型](index.md#data_types)。 diff --git a/docs/zh/sql-reference/functions/array-functions.md b/docs/zh/sql-reference/functions/array-functions.md index d150b94b8af..69db34e4a36 100644 --- a/docs/zh/sql-reference/functions/array-functions.md +++ b/docs/zh/sql-reference/functions/array-functions.md @@ -152,7 +152,7 @@ SELECT range(5), range(1, 5), range(1, 5, 2), range(-1, 5, 2); └─────────────┴─────────────┴────────────────┴─────────────────┘ ``` -## array(x1, …), operator \[x1, …\] {#arrayx1-operator-x1} +## array(x1, ...), operator \[x1, ...\] {#arrayx1-operator-x1} 使用函数的参数作为数组元素创建一个数组。 参数必须是常量,并且具有最小公共类型的类型。必须至少传递一个参数,否则将不清楚要创建哪种类型的数组。也就是说,你不能使用这个函数来创建一个空数组(为此,使用上面描述的’emptyArray  \*’函数)。 @@ -337,7 +337,7 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) 设置为«NULL»的元素将作为普通的元素值处理。 -## arrayCount(\[func,\] arr1, …) {#array-count} +## arrayCount(\[func,\] arr1, ...) 
{#array-count} `func`将arr数组作为参数,其返回结果为非零值的数量。如果未指定“func”,则返回数组中非零元素的数量。 @@ -363,7 +363,7 @@ SELECT countEqual([1, 2, NULL, NULL], NULL) ## arrayEnumerate(arr) {#array_functions-arrayenumerate} -返回 Array \[1, 2, 3, …, length (arr) \] +返回 Array \[1, 2, 3, ..., length (arr) \] 此功能通常与ARRAY JOIN一起使用。它允许在应用ARRAY JOIN后为每个数组计算一次。例如: @@ -403,7 +403,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached) 此功能也可用于高阶函数。例如,您可以使用它来获取与条件匹配的元素的数组索引。 -## arrayEnumerateUniq(arr, …) {#arrayenumerateuniqarr} +## arrayEnumerateUniq(arr, ...) {#arrayenumerateuniqarr} 返回与源数组大小相同的数组,其中每个元素表示与其下标对应的源数组元素在源数组中出现的次数。 例如:arrayEnumerateUniq( \[10,20,10,30 \])=  \[1,1,2,1 \]。 @@ -621,7 +621,7 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res 设置为«NULL»的数组元素作为普通的数组元素值处理。 -## arraySort(\[func,\] arr, …) {#array_functions-reverse-sort} +## arraySort(\[func,\] arr, ...) {#array_functions-reverse-sort} 以升序对`arr`数组的元素进行排序。如果指定了`func`函数,则排序顺序由`func`函数的调用结果决定。如果`func`接受多个参数,那么`arraySort`函数也将解析与`func`函数参数相同数量的数组参数。更详细的示例在`arraySort`的末尾。 @@ -721,7 +721,7 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; !!! 注意 "注意" 为了提高排序效率, 使用了[施瓦茨变换](https://en.wikipedia.org/wiki/Schwartzian_transform)。 -## arrayReverseSort(\[func,\] arr, …) {#array_functions-reverse-sort} +## arrayReverseSort(\[func,\] arr, ...) {#array_functions-reverse-sort} 以降序对`arr`数组的元素进行排序。如果指定了`func`函数,则排序顺序由`func`函数的调用结果决定。如果`func`接受多个参数,那么`arrayReverseSort`函数也将解析与`func`函数参数相同数量的数组作为参数。更详细的示例在`arrayReverseSort`的末尾。 @@ -822,7 +822,7 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; └─────────┘ ``` -## arrayUniq(arr, …) {#arrayuniqarr} +## arrayUniq(arr, ...) {#arrayuniqarr} 如果传递一个参数,则计算数组中不同元素的数量。 如果传递了多个参数,则它计算多个数组中相应位置的不同元素元组的数量。 @@ -1221,7 +1221,7 @@ select arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]); └───────────────────────────────────────────────┘ ``` -## arrayMap(func, arr1, …) {#array-map} +## arrayMap(func, arr1, ...) {#array-map} 将从 `func` 函数的原始应用中获得的数组返回给 `arr` 数组中的每个元素。 @@ -1251,7 +1251,7 @@ SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res 请注意,`arrayMap` 是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayFilter(func, arr1, …) {#array-filter} +## arrayFilter(func, arr1, ...) {#array-filter} 返回一个仅包含 `arr1` 中的元素的数组,其中 `func` 返回的值不是 0。 @@ -1284,7 +1284,7 @@ SELECT 请注意,`arrayFilter`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayFill(func, arr1, …) {#array-fill} +## arrayFill(func, arr1, ...) {#array-fill} 从第一个元素到最后一个元素扫描`arr1`,如果`func`返回0,则用`arr1[i - 1]`替换`arr1[i]`。`arr1`的第一个元素不会被替换。 @@ -1302,7 +1302,7 @@ SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, 请注意,`arrayFill` 是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayReverseFill(func, arr1, …) {#array-reverse-fill} +## arrayReverseFill(func, arr1, ...) {#array-reverse-fill} 从最后一个元素到第一个元素扫描`arr1`,如果`func`返回0,则用`arr1[i + 1]`替换`arr1[i]`。`arr1`的最后一个元素不会被替换。 @@ -1320,7 +1320,7 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 请注意,`arrayReverseFill`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arraySplit(func, arr1, …) {#array-split} +## arraySplit(func, arr1, ...) 
{#array-split} 将 `arr1` 拆分为多个数组。当 `func` 返回 0 以外的值时,数组将在元素的左侧拆分。数组不会在第一个元素之前被拆分。 @@ -1338,7 +1338,7 @@ SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res 请注意,`arraySplit`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayReverseSplit(func, arr1, …) {#array-reverse-split} +## arrayReverseSplit(func, arr1, ...) {#array-reverse-split} 将 `arr1` 拆分为多个数组。当 `func` 返回 0 以外的值时,数组将在元素的右侧拆分。数组不会在最后一个元素之后被拆分。 @@ -1356,37 +1356,37 @@ SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res 请注意,`arrayReverseSplit`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} +## arrayExists(\[func,\] arr1, ...) {#arrayexistsfunc-arr1} 如果 `arr` 中至少有一个元素 `func` 返回 0 以外的值,则返回 1。否则,它返回 0。 请注意,`arrayExists`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您可以将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} +## arrayAll(\[func,\] arr1, ...) {#arrayallfunc-arr1} 如果 `func` 为 `arr` 中的所有元素返回 0 以外的值,则返回 1。否则,它返回 0。 请注意,`arrayAll`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您可以将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayFirst(func, arr1, …) {#array-first} +## arrayFirst(func, arr1, ...) {#array-first} 返回 `arr1` 数组中 `func` 返回非 0 的值的第一个元素。 请注意,`arrayFirst`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayLast(func, arr1, …) {#array-last} +## arrayLast(func, arr1, ...) {#array-last} 返回 `arr1` 数组中的最后一个元素,其中 `func` 返回的值不是 0。 请注意,`arrayLast`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayFirstIndex(func, arr1, …) {#array-first-index} +## arrayFirstIndex(func, arr1, ...) {#array-first-index} 返回 `arr1` 数组中第一个元素的索引,其中 `func` 返回的值不是 0。 请注意,`arrayFirstIndex`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayLastIndex(func, arr1, …) {#array-last-index} +## arrayLastIndex(func, arr1, ...) {#array-last-index} 返回 `arr1` 数组中最后一个元素的索引,其中 `func` 返回的值不是 0。 @@ -1612,7 +1612,7 @@ SELECT arrayAvg(x -> (x * x), [2, 4]) AS res; └─────┘ ``` -## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} +## arrayCumSum(\[func,\] arr1, ...) {#arraycumsumfunc-arr1} 返回源数组中元素的部分和的数组(运行总和)。如果指定了 func 函数,则数组元素的值在求和之前由该函数转换。 diff --git a/docs/zh/sql-reference/functions/date-time-functions.md b/docs/zh/sql-reference/functions/date-time-functions.md index d6493ffe605..18b9f3495c0 100644 --- a/docs/zh/sql-reference/functions/date-time-functions.md +++ b/docs/zh/sql-reference/functions/date-time-functions.md @@ -443,7 +443,7 @@ SELECT toStartOfSecond(dt64, 'Asia/Istanbul'); `toISOWeek()`是一个兼容函数,等效于`toWeek(date,3)`。 下表描述了mode参数的工作方式。 -| Mode | First day of week | Range | Week 1 is the first week … | +| Mode | First day of week | Range | Week 1 is the first week ... 
| |------|-------------------|-------|-------------------------------| | 0 | Sunday | 0-53 | with a Sunday in this year | | 1 | Monday | 0-53 | with 4 or more days this year | diff --git a/docs/zh/sql-reference/functions/higher-order-functions.md b/docs/zh/sql-reference/functions/higher-order-functions.md index 929dc6f3ea7..0e08f88bba1 100644 --- a/docs/zh/sql-reference/functions/higher-order-functions.md +++ b/docs/zh/sql-reference/functions/higher-order-functions.md @@ -15,13 +15,13 @@ slug: /zh/sql-reference/functions/higher-order-functions 除了’arrayMap’和’arrayFilter’以外的所有其他函数,都可以省略第一个参数(lambda函数)。在这种情况下,默认返回数组元素本身。 -### arrayMap(func, arr1, …) {#higher_order_functions-array-map} +### arrayMap(func, arr1, ...) {#higher_order_functions-array-map} 将arr 将从’func’函数的原始应用程序获得的数组返回到’arr’数组中的每个元素。 返回从原始应用程序获得的数组 ‘func’ 函数中的每个元素 ‘arr’ 阵列。 -### arrayFilter(func, arr1, …) {#arrayfilterfunc-arr1} +### arrayFilter(func, arr1, ...) {#arrayfilterfunc-arr1} 返回一个仅包含以下元素的数组 ‘arr1’ 对于哪个 ‘func’ 返回0以外的内容。 @@ -48,31 +48,31 @@ SELECT │ [2] │ └─────┘ -### arrayCount(\[func,\] arr1, …) {#arraycountfunc-arr1} +### arrayCount(\[func,\] arr1, ...) {#arraycountfunc-arr1} 返回数组arr中非零元素的数量,如果指定了’func’,则通过’func’的返回值确定元素是否为非零元素。 -### arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} +### arrayExists(\[func,\] arr1, ...) {#arrayexistsfunc-arr1} 返回数组’arr’中是否存在非零元素,如果指定了’func’,则使用’func’的返回值确定元素是否为非零元素。 -### arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} +### arrayAll(\[func,\] arr1, ...) {#arrayallfunc-arr1} 返回数组’arr’中是否存在为零的元素,如果指定了’func’,则使用’func’的返回值确定元素是否为零元素。 -### arraySum(\[func,\] arr1, …) {#arraysumfunc-arr1} +### arraySum(\[func,\] arr1, ...) {#arraysumfunc-arr1} 计算arr数组的总和,如果指定了’func’,则通过’func’的返回值计算数组的总和。 -### arrayFirst(func, arr1, …) {#arrayfirstfunc-arr1} +### arrayFirst(func, arr1, ...) {#arrayfirstfunc-arr1} 返回数组中第一个匹配的元素,函数使用’func’匹配所有元素,直到找到第一个匹配的元素。 -### arrayFirstIndex(func, arr1, …) {#arrayfirstindexfunc-arr1} +### arrayFirstIndex(func, arr1, ...) {#arrayfirstindexfunc-arr1} 返回数组中第一个匹配的元素的下标索引,函数使用’func’匹配所有元素,直到找到第一个匹配的元素。 -### arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} +### arrayCumSum(\[func,\] arr1, ...) {#arraycumsumfunc-arr1} 返回源数组部分数据的总和,如果指定了`func`函数,则使用`func`的返回值计算总和。 @@ -98,7 +98,7 @@ SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res │ [1,2,0,1] │ └───────────┘ -### arraySort(\[func,\] arr1, …) {#arraysortfunc-arr1} +### arraySort(\[func,\] arr1, ...) {#arraysortfunc-arr1} 返回升序排序`arr1`的结果。如果指定了`func`函数,则排序顺序由`func`的结果决定。 @@ -124,7 +124,7 @@ SELECT arraySort([1, nan, 2, NULL, 3, nan, 4, NULL]) │ [1,2,3,4,nan,nan,NULL,NULL] │ └───────────────────────────────────────────────┘ -### arrayReverseSort(\[func,\] arr1, …) {#arrayreversesortfunc-arr1} +### arrayReverseSort(\[func,\] arr1, ...) {#arrayreversesortfunc-arr1} 返回降序排序`arr1`的结果。如果指定了`func`函数,则排序顺序由`func`的结果决定。 diff --git a/docs/zh/sql-reference/functions/in-functions.md b/docs/zh/sql-reference/functions/in-functions.md index 346e076310e..9858159a495 100644 --- a/docs/zh/sql-reference/functions/in-functions.md +++ b/docs/zh/sql-reference/functions/in-functions.md @@ -10,10 +10,10 @@ sidebar_label: IN 运算符 请参阅[IN 运算符](../../sql-reference/operators/in.md#select-in-operators)部分。 -## tuple(x, y, …), 运算符 (x, y, …) {#tuplex-y-operator-x-y} +## tuple(x, y, ...), 运算符 (x, y, ...) 
{#tuplex-y-operator-x-y} 函数用于对多个列进行分组。 -对于具有类型T1,T2,…的列,它返回包含这些列的元组(T1,T2,…)。 执行该函数没有任何成本。 +对于具有类型T1,T2,...的列,它返回包含这些列的元组(T1,T2,...)。 执行该函数没有任何成本。 元组通常用作IN运算符的中间参数值,或用于创建lambda函数的形参列表。 元组不能写入表。 ## tupleElement(tuple, n), 运算符 x.N {#tupleelementtuple-n-operator-x-n} diff --git a/docs/zh/sql-reference/functions/json-functions.md b/docs/zh/sql-reference/functions/json-functions.md index 52ec0ed1535..f07de564847 100644 --- a/docs/zh/sql-reference/functions/json-functions.md +++ b/docs/zh/sql-reference/functions/json-functions.md @@ -56,7 +56,7 @@ slug: /zh/sql-reference/functions/json-functions 以下函数基于[simdjson](https://github.com/lemire/simdjson),专为更复杂的JSON解析要求而设计。但上述假设2仍然适用。 -## JSONHas(json\[, indices_or_keys\]…) {#jsonhasjson-indices-or-keys} +## JSONHas(json\[, indices_or_keys\]...) {#jsonhasjson-indices-or-keys} 如果JSON中存在该值,则返回`1`。 @@ -83,7 +83,7 @@ slug: /zh/sql-reference/functions/json-functions select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' -## JSONLength(json\[, indices_or_keys\]…) {#jsonlengthjson-indices-or-keys} +## JSONLength(json\[, indices_or_keys\]...) {#jsonlengthjson-indices-or-keys} 返回JSON数组或JSON对象的长度。 @@ -94,7 +94,7 @@ slug: /zh/sql-reference/functions/json-functions select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 -## JSONType(json\[, indices_or_keys\]…) {#jsontypejson-indices-or-keys} +## JSONType(json\[, indices_or_keys\]...) {#jsontypejson-indices-or-keys} 返回JSON值的类型。 @@ -106,13 +106,13 @@ slug: /zh/sql-reference/functions/json-functions select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' -## JSONExtractUInt(json\[, indices_or_keys\]…) {#jsonextractuintjson-indices-or-keys} +## JSONExtractUInt(json\[, indices_or_keys\]...) {#jsonextractuintjson-indices-or-keys} -## JSONExtractInt(json\[, indices_or_keys\]…) {#jsonextractintjson-indices-or-keys} +## JSONExtractInt(json\[, indices_or_keys\]...) {#jsonextractintjson-indices-or-keys} -## JSONExtractFloat(json\[, indices_or_keys\]…) {#jsonextractfloatjson-indices-or-keys} +## JSONExtractFloat(json\[, indices_or_keys\]...) {#jsonextractfloatjson-indices-or-keys} -## JSONExtractBool(json\[, indices_or_keys\]…) {#jsonextractbooljson-indices-or-keys} +## JSONExtractBool(json\[, indices_or_keys\]...) {#jsonextractbooljson-indices-or-keys} 解析JSON并提取值。这些函数类似于`visitParam*`函数。 @@ -124,7 +124,7 @@ slug: /zh/sql-reference/functions/json-functions select JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 -## JSONExtractString(json\[, indices_or_keys\]…) {#jsonextractstringjson-indices-or-keys} +## JSONExtractString(json\[, indices_or_keys\]...) 
{#jsonextractstringjson-indices-or-keys} 解析JSON并提取字符串。此函数类似于`visitParamExtractString`函数。 @@ -140,11 +140,11 @@ slug: /zh/sql-reference/functions/json-functions select JSONExtractString('{"abc":"\\u263"}', 'abc') = '' select JSONExtractString('{"abc":"hello}', 'abc') = '' -## JSONExtract(json\[, indices_or_keys…\], Return_type) {#jsonextractjson-indices-or-keys-return-type} +## JSONExtract(json\[, indices_or_keys...\], Return_type) {#jsonextractjson-indices-or-keys-return-type} 解析JSON并提取给定ClickHouse数据类型的值。 -这是以前的`JSONExtract函数的变体。 这意味着`JSONExtract(…, ‘String’)`返回与`JSONExtractString()`返回完全相同。`JSONExtract(…, ‘Float64’)`返回于`JSONExtractFloat()\`返回完全相同。 +这是以前的`JSONExtract函数的变体。 这意味着`JSONExtract(..., ‘String’)`返回与`JSONExtractString()`返回完全相同。`JSONExtract(..., ‘Float64’)`返回于`JSONExtractFloat()\`返回完全相同。 示例: @@ -156,7 +156,7 @@ slug: /zh/sql-reference/functions/json-functions SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Thursday' SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' -## JSONExtractKeysAndValues(json\[, indices_or_keys…\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} +## JSONExtractKeysAndValues(json\[, indices_or_keys...\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} 从JSON中解析键值对,其中值是给定的ClickHouse数据类型。 @@ -164,7 +164,7 @@ slug: /zh/sql-reference/functions/json-functions SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; -## JSONExtractRaw(json\[, indices_or_keys\]…) {#jsonextractrawjson-indices-or-keys} +## JSONExtractRaw(json\[, indices_or_keys\]...) {#jsonextractrawjson-indices-or-keys} 返回JSON的部分。 diff --git a/docs/zh/sql-reference/functions/other-functions.md b/docs/zh/sql-reference/functions/other-functions.md index 2eeaad63694..9c28ff867c5 100644 --- a/docs/zh/sql-reference/functions/other-functions.md +++ b/docs/zh/sql-reference/functions/other-functions.md @@ -90,7 +90,7 @@ SELECT 'some-file-name' AS a, basename(a) 将一个常量列变为一个非常量列。 在ClickHouse中,非常量列和常量列在内存中的表示方式不同。尽管函数对于常量列和非常量总是返回相同的结果,但它们的工作方式可能完全不同(执行不同的代码)。此函数用于调试这种行为。 -## ignore(…) {#ignore} +## ignore(...) {#ignore} 接受任何参数,包括`NULL`。始终返回0。 但是,函数的参数总是被计算的。该函数可以用于基准测试。 diff --git a/docs/zh/sql-reference/functions/string-functions.md b/docs/zh/sql-reference/functions/string-functions.md index d1914839d7c..c28735c7dc7 100644 --- a/docs/zh/sql-reference/functions/string-functions.md +++ b/docs/zh/sql-reference/functions/string-functions.md @@ -95,7 +95,7 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') 以Unicode字符为单位反转UTF-8编码的字符串。如果字符串不是UTF-8编码,则可能获取到一个非预期的结果(不会抛出异常)。 -## format(pattern, s0, s1, …) {#formatpattern-s0-s1} +## format(pattern, s0, s1, ...) {#formatpattern-s0-s1} 使用常量字符串`pattern`格式化其他参数。`pattern`字符串中包含由大括号`{}`包围的«替换字段»。 未被包含在大括号中的任何内容都被视为文本内容,它将原样保留在返回值中。 如果你需要在文本内容中包含一个大括号字符,它可以通过加倍来转义:`{{ '{{' }}`和`{{ '{{' }} '}}' }}`。 字段名称可以是数字(从零开始)或空(然后将它们视为连续数字) @@ -113,11 +113,11 @@ SELECT format('{} {}', 'Hello', 'World') └───────────────────────────────────┘ ``` -## concat(s1, s2, …) {#concat-s1-s2} +## concat(s1, s2, ...) {#concat-s1-s2} 将参数中的多个字符串拼接,不带分隔符。 -## concatAssumeInjective(s1, s2, …) {#concatassumeinjectives1-s2} +## concatAssumeInjective(s1, s2, ...) 
{#concatassumeinjectives1-s2} 与[concat](#concat-s1-s2)相同,区别在于,你需要保证concat(s1, s2, s3) -\> s4是单射的,它将用于GROUP BY的优化。 diff --git a/docs/zh/sql-reference/functions/string-search-functions.md b/docs/zh/sql-reference/functions/string-search-functions.md index 972fd84e2a1..8ada76eeeda 100644 --- a/docs/zh/sql-reference/functions/string-search-functions.md +++ b/docs/zh/sql-reference/functions/string-search-functions.md @@ -204,7 +204,7 @@ SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']); **语法** ```sql -multiSearchFirstPosition(haystack, [needle1, needle2, …, needleN]) +multiSearchFirstPosition(haystack, [needle1, needle2, ..., needleN]) ``` ## multiSearchFirstIndex @@ -216,7 +216,7 @@ multiSearchFirstPosition(haystack, [needle1, needle2, …, needleN]) **语法** ```sql -multiSearchFirstIndex(haystack, \[needle1, needle2, …, needlen\]) +multiSearchFirstIndex(haystack, \[needle1, needle2, ..., needlen\]) ``` ## multiSearchAny {#multisearchany} @@ -229,7 +229,7 @@ multiSearchFirstIndex(haystack, \[needle1, needle2, …, n **语法** ```sql -multiSearchAny(haystack, [needle1, needle2, …, needleN]) +multiSearchAny(haystack, [needle1, needle2, ..., needleN]) ``` ## match {#match} @@ -273,7 +273,7 @@ Hyperscan 通常容易受到正则表达式拒绝服务 (ReDoS) 攻击。有关 **语法** ```sql -multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAny(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiMatchAnyIndex @@ -283,7 +283,7 @@ multiMatchAny(haystack, \[pattern1, pattern2, …, pattern **语法** ```sql -multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAnyIndex(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiMatchAllIndices @@ -293,7 +293,7 @@ multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, pa **语法** ```sql -multiMatchAllIndices(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAllIndices(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAny @@ -307,7 +307,7 @@ multiMatchAllIndices(haystack, \[pattern1, pattern2, …, **语法** ```sql -multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, …, patternn\]) +multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAnyIndex @@ -317,7 +317,7 @@ multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern21, pattern2, …, patternn\]) +multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAllIndices @@ -327,7 +327,7 @@ multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2 **语法** ```sql -multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, …, patternn\]) +multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## extract diff --git a/docs/zh/sql-reference/functions/url-functions.md b/docs/zh/sql-reference/functions/url-functions.md index 44880b6ca1a..e7a0354c0bf 100644 --- a/docs/zh/sql-reference/functions/url-functions.md +++ b/docs/zh/sql-reference/functions/url-functions.md @@ -11,7 +11,7 @@ slug: /zh/sql-reference/functions/url-functions ### 协议 {#protocol} -返回URL的协议。例如: http、ftp、mailto、magnet… +返回URL的协议。例如: http、ftp、mailto、magnet... ### 域 {#domain} diff --git a/docs/zh/sql-reference/statements/alter/delete.md b/docs/zh/sql-reference/statements/alter/delete.md index 5eb77c35a93..f0b41c4e214 100644 --- a/docs/zh/sql-reference/statements/alter/delete.md +++ b/docs/zh/sql-reference/statements/alter/delete.md @@ -4,7 +4,7 @@ sidebar_position: 39 sidebar_label: DELETE --- -# ALTER TABLE … DELETE 语句 {#alter-mutations} +# ALTER TABLE ... 
DELETE 语句 {#alter-mutations} ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr diff --git a/docs/zh/sql-reference/statements/alter/index.md b/docs/zh/sql-reference/statements/alter/index.md index e173837a16c..2286dcccd13 100644 --- a/docs/zh/sql-reference/statements/alter/index.md +++ b/docs/zh/sql-reference/statements/alter/index.md @@ -38,7 +38,7 @@ sidebar_label: ALTER ## Mutations 突变 {#mutations} -用来操作表数据的ALTER查询是通过一种叫做“突变”的机制来实现的,最明显的是[ALTER TABLE … DELETE](../../../sql-reference/statements/alter/delete.md)和[ALTER TABLE … UPDATE](../../../sql-reference/statements/alter/update.md)。它们是异步的后台进程,类似于[MergeTree](../../../engines/table-engines/mergetree-family/index.md)表的合并,产生新的“突变”版本的部件。 +用来操作表数据的ALTER查询是通过一种叫做“突变”的机制来实现的,最明显的是[ALTER TABLE ... DELETE](../../../sql-reference/statements/alter/delete.md)和[ALTER TABLE ... UPDATE](../../../sql-reference/statements/alter/update.md)。它们是异步的后台进程,类似于[MergeTree](../../../engines/table-engines/mergetree-family/index.md)表的合并,产生新的“突变”版本的部件。 diff --git a/docs/zh/sql-reference/statements/alter/update.md b/docs/zh/sql-reference/statements/alter/update.md index 97b2b43d889..7cf37401dc5 100644 --- a/docs/zh/sql-reference/statements/alter/update.md +++ b/docs/zh/sql-reference/statements/alter/update.md @@ -4,7 +4,7 @@ sidebar_position: 40 sidebar_label: UPDATE --- -# ALTER TABLE … UPDATE 语句 {#alter-table-update-statements} +# ALTER TABLE ... UPDATE 语句 {#alter-table-update-statements} ``` sql ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr diff --git a/docs/zh/sql-reference/statements/alter/view.md b/docs/zh/sql-reference/statements/alter/view.md index 34a612803c1..a19d918612a 100644 --- a/docs/zh/sql-reference/statements/alter/view.md +++ b/docs/zh/sql-reference/statements/alter/view.md @@ -4,9 +4,9 @@ sidebar_position: 50 sidebar_label: VIEW --- -# ALTER TABLE … MODIFY QUERY 语句 {#alter-modify-query} +# ALTER TABLE ... MODIFY QUERY 语句 {#alter-modify-query} -当使用`ALTER TABLE … MODIFY QUERY`语句创建一个[物化视图](../create/view.md#materialized)时,可以修改`SELECT`查询。当物化视图在没有 `TO [db.]name` 的情况下创建时使用它。必须启用 `allow_experimental_alter_materialized_view_structure`设置。 +当使用`ALTER TABLE ... MODIFY QUERY`语句创建一个[物化视图](../create/view.md#materialized)时,可以修改`SELECT`查询。当物化视图在没有 `TO [db.]name` 的情况下创建时使用它。必须启用 `allow_experimental_alter_materialized_view_structure`设置。 如果一个物化视图使用`TO [db.]name`,你必须先 [DETACH](../detach.mdx) 视图。用[ALTER TABLE](index.md)修改目标表,然后 [ATTACH](../attach.mdx)之前分离的(`DETACH`)视图。 diff --git a/docs/zh/sql-reference/statements/create/view.md b/docs/zh/sql-reference/statements/create/view.md index bce0994ecd2..49a1d66bdf1 100644 --- a/docs/zh/sql-reference/statements/create/view.md +++ b/docs/zh/sql-reference/statements/create/view.md @@ -55,7 +55,7 @@ ClickHouse 中的物化视图更像是插入触发器。 如果视图查询中 如果指定`POPULATE`,则在创建视图时将现有表数据插入到视图中,就像创建一个`CREATE TABLE ... 
AS SELECT ...`一样。 否则,查询仅包含创建视图后插入表中的数据。 我们**不建议**使用POPULATE,因为在创建视图期间插入表中的数据不会插入其中。 -`SELECT` 查询可以包含`DISTINCT`、`GROUP BY`、`ORDER BY`、`LIMIT`……请注意,相应的转换是在每个插入数据块上独立执行的。 例如,如果设置了`GROUP BY`,则在插入期间聚合数据,但仅在插入数据的单个数据包内。 数据不会被进一步聚合。 例外情况是使用独立执行数据聚合的`ENGINE`,例如`SummingMergeTree`。 +`SELECT` 查询可以包含`DISTINCT`、`GROUP BY`、`ORDER BY`、`LIMIT`...请注意,相应的转换是在每个插入数据块上独立执行的。 例如,如果设置了`GROUP BY`,则在插入期间聚合数据,但仅在插入数据的单个数据包内。 数据不会被进一步聚合。 例外情况是使用独立执行数据聚合的`ENGINE`,例如`SummingMergeTree`。 在物化视图上执行[ALTER](../../../sql-reference/statements/alter/index.md)查询有局限性,因此可能不方便。 如果物化视图使用构造`TO [db.]name`,你可以`DETACH`视图,为目标表运行`ALTER`,然后`ATTACH`先前分离的(`DETACH`)视图。 diff --git a/docs/zh/sql-reference/statements/insert-into.md b/docs/zh/sql-reference/statements/insert-into.md index f80c0a8a8ea..a08a78b6f1d 100644 --- a/docs/zh/sql-reference/statements/insert-into.md +++ b/docs/zh/sql-reference/statements/insert-into.md @@ -68,7 +68,7 @@ SELECT * FROM insert_select_testtable; INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` -例如,下面的查询所使用的输入格式就与上面INSERT … VALUES的中使用的输入格式相同: +例如,下面的查询所使用的输入格式就与上面INSERT ... VALUES的中使用的输入格式相同: ``` sql INSERT INTO [TABLE] [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... diff --git a/docs/zh/sql-reference/statements/select/limit.md b/docs/zh/sql-reference/statements/select/limit.md index 2bbf2949707..795f3f4ecd1 100644 --- a/docs/zh/sql-reference/statements/select/limit.md +++ b/docs/zh/sql-reference/statements/select/limit.md @@ -13,11 +13,11 @@ sidebar_label: LIMIT 如果没有 [ORDER BY](../../../sql-reference/statements/select/order-by.md) 子句显式排序结果,结果的行选择可能是任意的和非确定性的。 -## LIMIT … WITH TIES 修饰符 {#limit-with-ties} +## LIMIT ... WITH TIES 修饰符 {#limit-with-ties} 如果为 `LIMIT n[,m]` 设置了 `WITH TIES` ,并且声明了 `ORDER BY expr_list`, 除了得到无修饰符的结果(正常情况下的 `limit n`, 前n行数据), 还会返回与第`n`行具有相同排序字段的行(即如果第n+1行的字段与第n行 拥有相同的排序字段,同样返回该结果. -此修饰符可以与: [ORDER BY … WITH FILL modifier](../../../sql-reference/statements/select/order-by.md#orderby-with-fill) 组合使用. +此修饰符可以与: [ORDER BY ... WITH FILL modifier](../../../sql-reference/statements/select/order-by.md#orderby-with-fill) 组合使用. 例如以下查询: diff --git a/docs/zh/sql-reference/statements/select/order-by.md b/docs/zh/sql-reference/statements/select/order-by.md index 3286fc9f9e7..2f2d9a4959c 100644 --- a/docs/zh/sql-reference/statements/select/order-by.md +++ b/docs/zh/sql-reference/statements/select/order-by.md @@ -89,7 +89,7 @@ SELECT a, b, c FROM t ORDER BY a, b, c ## ORDER BY Expr WITH FILL Modifier {#orderby-with-fill} -此修饰符可以与 [LIMIT … WITH TIES modifier](../../../sql-reference/statements/select/limit.md#limit-with-ties) 进行组合使用. +此修饰符可以与 [LIMIT ... WITH TIES modifier](../../../sql-reference/statements/select/limit.md#limit-with-ties) 进行组合使用. 可以在`ORDER BY expr`之后用可选的`FROM expr`,`TO expr`和`STEP expr`参数来设置`WITH FILL`修饰符。 所有`expr`列的缺失值将被顺序填充,而其他列将被填充为默认值。 diff --git a/docs/zh/sql-reference/table-functions/file.md b/docs/zh/sql-reference/table-functions/file.md index 28682255738..fa1ec12f7df 100644 --- a/docs/zh/sql-reference/table-functions/file.md +++ b/docs/zh/sql-reference/table-functions/file.md @@ -114,7 +114,7 @@ FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') **示例** -从名为 `file000`, `file001`, … , `file999`的文件中查询数据: +从名为 `file000`, `file001`, ... 
, `file999`的文件中查询数据: ``` sql SELECT count(*) diff --git a/docs/zh/sql-reference/table-functions/hdfs.md b/docs/zh/sql-reference/table-functions/hdfs.md index b10b10ae2d2..f8320d8d0bb 100644 --- a/docs/zh/sql-reference/table-functions/hdfs.md +++ b/docs/zh/sql-reference/table-functions/hdfs.md @@ -84,7 +84,7 @@ FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value U **示例** -从名为 `file000`, `file001`, … , `file999`的文件中查询数据: +从名为 `file000`, `file001`, ... , `file999`的文件中查询数据: ``` sql SELECT count(*) diff --git a/docs/zh/sql-reference/table-functions/s3.md b/docs/zh/sql-reference/table-functions/s3.md index f7384a7526e..4f2c7299d95 100644 --- a/docs/zh/sql-reference/table-functions/s3.md +++ b/docs/zh/sql-reference/table-functions/s3.md @@ -99,7 +99,7 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefi !!! warning "Warning" 如果文件列表中包含有从零开头的数字范围,请对每个数字分别使用带括号的结构,或者使用`?`。 -计算名为 `file-000.csv`, `file-001.csv`, … , `file-999.csv` 文件的总行数: +计算名为 `file-000.csv`, `file-001.csv`, ... , `file-999.csv` 文件的总行数: ``` sql SELECT count(*) diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index 0d91de2dad8..4640882f2be 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -162,7 +162,7 @@ if (ARCH_AMD64 AND OS_LINUX AND NOT OS_ANDROID) set (HARMFUL_LIB harmful) endif () -target_link_libraries (clickhouse PRIVATE clickhouse_common_io string_utils ${HARMFUL_LIB}) +target_link_libraries (clickhouse PRIVATE clickhouse_common_io ${HARMFUL_LIB}) target_include_directories (clickhouse PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) if (ENABLE_CLICKHOUSE_KEEPER) diff --git a/programs/client/CMakeLists.txt b/programs/client/CMakeLists.txt index e160355ef7b..f8ef8ccaf65 100644 --- a/programs/client/CMakeLists.txt +++ b/programs/client/CMakeLists.txt @@ -10,7 +10,6 @@ set (CLICKHOUSE_CLIENT_LINK clickhouse_common_io clickhouse_functions clickhouse_parsers - string_utils ) if (TARGET ch_rust::skim) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 01ed7d70b38..efe23d57478 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1178,7 +1178,7 @@ void Client::processConfig() pager = config().getString("pager", ""); - setDefaultFormatsFromConfiguration(); + setDefaultFormatsAndCompressionFromConfiguration(); global_context->setClientName(std::string(DEFAULT_CLIENT_NAME)); global_context->setQueryKindInitial(); diff --git a/programs/format/Format.cpp b/programs/format/Format.cpp index d4b975ce1e8..1b91e7ceaf3 100644 --- a/programs/format/Format.cpp +++ b/programs/format/Format.cpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/programs/git-import/git-import.cpp b/programs/git-import/git-import.cpp index eaf85df67b1..5430c4b0a42 100644 --- a/programs/git-import/git-import.cpp +++ b/programs/git-import/git-import.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 267b725b02b..dba5c2b7d2a 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -182,6 +182,11 @@ std::string Keeper::getDefaultConfigFileName() const return "keeper_config.xml"; } +bool Keeper::allowTextLog() const +{ + return false; +} + void Keeper::handleCustomArguments(const std::string & arg, [[maybe_unused]] const std::string & value) // NOLINT { if (arg == "force-recovery") diff --git a/programs/keeper/Keeper.h b/programs/keeper/Keeper.h index 
f889ffa595b..c449c40b610 100644 --- a/programs/keeper/Keeper.h +++ b/programs/keeper/Keeper.h @@ -65,6 +65,8 @@ protected: std::string getDefaultConfigFileName() const override; + bool allowTextLog() const override; + private: Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const; diff --git a/programs/keeper/clickhouse-keeper.cpp b/programs/keeper/clickhouse-keeper.cpp index be2686d936b..f2f91930ac0 100644 --- a/programs/keeper/clickhouse-keeper.cpp +++ b/programs/keeper/clickhouse-keeper.cpp @@ -1,4 +1,4 @@ -#include +#include #include "config_tools.h" diff --git a/programs/library-bridge/ExternalDictionaryLibraryUtils.h b/programs/library-bridge/ExternalDictionaryLibraryUtils.h index e6bf8f2a4c3..2eb44022742 100644 --- a/programs/library-bridge/ExternalDictionaryLibraryUtils.h +++ b/programs/library-bridge/ExternalDictionaryLibraryUtils.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 6d1ebf8d30c..4d5cfb09e6a 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -607,7 +607,7 @@ void LocalServer::processConfig() if (config().has("macros")) global_context->setMacros(std::make_unique(config(), "macros", log)); - setDefaultFormatsFromConfiguration(); + setDefaultFormatsAndCompressionFromConfiguration(); /// Sets external authenticators config (LDAP, Kerberos). global_context->setExternalAuthenticatorsConfig(config()); diff --git a/programs/main.cpp b/programs/main.cpp index 4bb73399719..bc8476e4ce4 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -15,7 +15,7 @@ #include "config_tools.h" -#include +#include #include #include diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 8035f053b41..688ae1a1143 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/programs/odbc-bridge/validateODBCConnectionString.cpp b/programs/odbc-bridge/validateODBCConnectionString.cpp index 6c6e11162b4..72c3c9bddca 100644 --- a/programs/odbc-bridge/validateODBCConnectionString.cpp +++ b/programs/odbc-bridge/validateODBCConnectionString.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include "validateODBCConnectionString.h" diff --git a/programs/server/CMakeLists.txt b/programs/server/CMakeLists.txt index 81440b03690..76d201cc924 100644 --- a/programs/server/CMakeLists.txt +++ b/programs/server/CMakeLists.txt @@ -13,7 +13,6 @@ set (CLICKHOUSE_SERVER_LINK clickhouse_parsers clickhouse_storages_system clickhouse_table_functions - string_utils ${LINK_RESOURCE_LIB} diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 9c9476d1aa7..223bc1f77e7 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1476,6 +1476,8 @@ try global_context->setMaxTableSizeToDrop(new_server_settings.max_table_size_to_drop); global_context->setMaxPartitionSizeToDrop(new_server_settings.max_partition_size_to_drop); global_context->setMaxTableNumToWarn(new_server_settings.max_table_num_to_warn); + global_context->setMaxViewNumToWarn(new_server_settings.max_view_num_to_warn); + global_context->setMaxDictionaryNumToWarn(new_server_settings.max_dictionary_num_to_warn); global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn); 
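/// [Editorial sketch, not part of the patch: the two new settings follow the pattern of
/// the surrounding *_num_to_warn thresholds -- they only produce entries in system.warnings,
/// they do not limit anything. A hypothetical consumer on the Context side:
///
///     if (current_view_count > max_view_num_to_warn)
///         warnings.emplace_back(fmt::format("The number of attached views is more than {}", max_view_num_to_warn));
///
/// where current_view_count and warnings are assumed names, not taken from this diff.]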
global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn); diff --git a/src/Access/User.cpp b/src/Access/User.cpp index ef5cf722113..6a296706baf 100644 --- a/src/Access/User.cpp +++ b/src/Access/User.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index 908ff780c62..1f9a977bab6 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/AggregateFunctions/AggregateFunctionsArgMinArgMax.cpp b/src/AggregateFunctions/AggregateFunctionsArgMinArgMax.cpp index e8f40120152..9608ca26f37 100644 --- a/src/AggregateFunctions/AggregateFunctionsArgMinArgMax.cpp +++ b/src/AggregateFunctions/AggregateFunctionsArgMinArgMax.cpp @@ -14,7 +14,7 @@ struct Settings; namespace ErrorCodes { -extern const int CORRUPTED_DATA; +extern const int INCORRECT_DATA; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int LOGICAL_ERROR; } @@ -198,7 +198,7 @@ public: this->data(place).value().read(buf, *serialization_val, arena); if (unlikely(this->data(place).value().has() != this->data(place).result().has())) throw Exception( - ErrorCodes::CORRUPTED_DATA, + ErrorCodes::INCORRECT_DATA, "Invalid state of the aggregate function {}: has_value ({}) != has_result ({})", getName(), this->data(place).value().has(), diff --git a/src/AggregateFunctions/Combinators/AggregateFunctionCombinatorFactory.cpp b/src/AggregateFunctions/Combinators/AggregateFunctionCombinatorFactory.cpp index a42e4177ac5..428f7168826 100644 --- a/src/AggregateFunctions/Combinators/AggregateFunctionCombinatorFactory.cpp +++ b/src/AggregateFunctions/Combinators/AggregateFunctionCombinatorFactory.cpp @@ -1,6 +1,6 @@ #include "AggregateFunctionCombinatorFactory.h" -#include +#include namespace DB { diff --git a/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp b/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp index 9bd044dd89c..70aa1a41548 100644 --- a/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp +++ b/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp @@ -42,7 +42,7 @@ private: return; const auto & storage = table_node ? table_node->getStorage() : table_function_node->getStorage(); - bool is_final_supported = storage && storage->supportsFinal(); + bool is_final_supported = storage && !storage->isRemote() && storage->supportsFinal(); if (!is_final_supported) return; diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index dae17e18b85..132feed778b 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -4617,6 +4617,36 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher( std::unordered_set table_expression_column_names_to_skip; + QueryTreeNodesWithNames result; + + if (matcher_node_typed.getMatcherType() == MatcherNodeType::COLUMNS_LIST) + { + auto identifiers = matcher_node_typed.getColumnsIdentifiers(); + result.reserve(identifiers.size()); + + for (const auto & identifier : identifiers) + { + auto resolve_result = tryResolveIdentifier(IdentifierLookup{identifier, IdentifierLookupContext::EXPRESSION}, scope); + if (!resolve_result.isResolved()) + throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, + "Unknown identifier '{}' inside COLUMNS matcher. 
In scope {}", + identifier.getFullName(), scope.dump()); + + // TODO: Introduce IdentifierLookupContext::COLUMN and get rid of this check + auto * resolved_column = resolve_result.resolved_identifier->as(); + if (!resolved_column) + throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, + "Identifier '{}' inside COLUMNS matcher must resolve into a column, but got {}. In scope {}", + identifier.getFullName(), + resolve_result.resolved_identifier->getNodeTypeName(), + scope.scope_node->formatASTForErrorMessage()); + result.emplace_back(resolve_result.resolved_identifier, resolved_column->getColumnName()); + } + return result; + } + + result.resize(matcher_node_typed.getColumnsIdentifiers().size()); + for (auto & table_expression : table_expressions_stack) { bool table_expression_in_resolve_process = nearest_query_scope->table_expressions_in_resolve_process.contains(table_expression.get()); @@ -4784,8 +4814,6 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher( table_expressions_column_nodes_with_names_stack.push_back(std::move(matched_column_nodes_with_names)); } - QueryTreeNodesWithNames result; - for (auto & table_expression_column_nodes_with_names : table_expressions_column_nodes_with_names_stack) { for (auto && table_expression_column_node_with_name : table_expression_column_nodes_with_names) diff --git a/src/Analyzer/QueryTreePassManager.cpp b/src/Analyzer/QueryTreePassManager.cpp index 51f1fb6cc2f..f7919b6422c 100644 --- a/src/Analyzer/QueryTreePassManager.cpp +++ b/src/Analyzer/QueryTreePassManager.cpp @@ -192,7 +192,7 @@ void QueryTreePassManager::run(QueryTreeNodePtr query_tree_node) void QueryTreePassManager::runOnlyResolve(QueryTreeNodePtr query_tree_node) { // Run only QueryAnalysisPass and GroupingFunctionsResolvePass passes. 
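// [Editorial sketch: judging from the signature below, run(node, up_to_pass_index)
// presumably executes the first N registered passes in order, e.g.:
//
//     void QueryTreePassManager::run(QueryTreeNodePtr query_tree_node, size_t up_to_pass_index)
//     {
//         for (size_t i = 0; i < up_to_pass_index; ++i)   // loop body is an assumption
//             passes[i]->run(query_tree_node, getContext());
//     }
//
// so bumping the index from 2 to 3 widens the resolve-only stage by exactly the pass
// that this patch inserts third in addQueryTreePasses() further down.]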
- run(query_tree_node, 2); + run(query_tree_node, 3); } void QueryTreePassManager::run(QueryTreeNodePtr query_tree_node, size_t up_to_pass_index) @@ -249,6 +249,7 @@ void addQueryTreePasses(QueryTreePassManager & manager, bool only_analyze) { manager.addPass(std::make_unique(only_analyze)); manager.addPass(std::make_unique()); + manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); @@ -294,7 +295,6 @@ void addQueryTreePasses(QueryTreePassManager & manager, bool only_analyze) manager.addPass(std::make_unique()); - manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 8c0989b8202..8f32c918c61 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Bridge/IBridge.cpp b/src/Bridge/IBridge.cpp index 6da2b7c06da..c25d7bd2fed 100644 --- a/src/Bridge/IBridge.cpp +++ b/src/Bridge/IBridge.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index da17bc1f41f..4e8946facda 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -307,7 +307,6 @@ endif() target_link_libraries (clickhouse_common_io PRIVATE - string_utils widechar_width ${LINK_LIBRARIES_ONLY_ON_X86_64} PUBLIC @@ -320,7 +319,6 @@ target_link_libraries (clickhouse_common_io target_link_libraries (clickhouse_compression PUBLIC - string_utils pcg_random clickhouse_parsers PRIVATE @@ -410,7 +408,6 @@ dbms_target_link_libraries ( clickhouse_parsers ch_contrib::lz4 Poco::JSON - string_utils PUBLIC boost::system clickhouse_common_io @@ -645,7 +642,6 @@ if (ENABLE_TESTS) dbms clickhouse_common_config clickhouse_common_zookeeper - string_utils hilite_comparator) if (TARGET ch_contrib::simdjson) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 4441d884754..f8391c64d5a 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -18,9 +18,10 @@ #include #include #include -#include +#include #include #include +#include #include #include #include @@ -643,6 +644,9 @@ try bool extras_into_stdout = need_render_progress || logs_into_stdout; bool select_only_into_file = select_into_file && !select_into_file_and_stdout; + if (!out_file_buf && default_output_compression_method != CompressionMethod::None) + out_file_buf = wrapWriteBufferWithCompressionMethod(out_buf, default_output_compression_method, 3, 0); + /// It is not clear how to write progress and logs /// intermixed with data with parallel formatting. /// It may increase code complexity significantly. 
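/// [Editorial note on the hunk above: out_buf is the plain stdout WriteBuffer; once the
/// hunk below derives default_output_compression_method from the name of the file that
/// stdout is redirected to, the buffer gets wrapped so query results are compressed
/// transparently. The meaning of the literals is an assumption: 3 looks like a default
/// compression level and 0 like "use the default zstd window". Conceptually:
///
///     out_file_buf = wrapWriteBufferWithCompressionMethod(out_buf, default_output_compression_method, /*level*/ 3, /*zstd_window_log*/ 0);
/// ]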
@@ -735,7 +739,7 @@ bool ClientBase::isRegularFile(int fd) return fstat(fd, &file_stat) == 0 && S_ISREG(file_stat.st_mode); } -void ClientBase::setDefaultFormatsFromConfiguration() +void ClientBase::setDefaultFormatsAndCompressionFromConfiguration() { if (config().has("output-format")) { @@ -759,6 +763,10 @@ void ClientBase::setDefaultFormatsFromConfiguration() default_output_format = *format_from_file_name; else default_output_format = "TSV"; + + std::optional file_name = tryGetFileNameFromFileDescriptor(STDOUT_FILENO); + if (file_name) + default_output_compression_method = chooseCompressionMethod(*file_name, ""); } else if (is_interactive) { diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 64cbdbe8989..7a0489641c8 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -190,7 +190,7 @@ protected: /// Adjust some settings after command line options and config had been processed. void adjustSettings(); - void setDefaultFormatsFromConfiguration(); + void setDefaultFormatsAndCompressionFromConfiguration(); void initTTYBuffer(ProgressOption progress); @@ -224,6 +224,7 @@ protected: String pager; String default_output_format; /// Query results output format. + CompressionMethod default_output_compression_method = CompressionMethod::None; String default_input_format; /// Tables' format for clickhouse-local. bool select_into_file = false; /// If writing result INTO OUTFILE. It affects progress rendering. diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 6e626c22527..19cd8cc4ee5 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 8d5c246c48c..1e94240dd4c 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -1289,4 +1289,14 @@ size_t ColumnArray::getNumberOfDimensions() const return 1 + nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion. 
} +void ColumnArray::takeDynamicStructureFromSourceColumns(const Columns & source_columns) +{ + Columns nested_source_columns; + nested_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + nested_source_columns.push_back(assert_cast(*source_column).getDataPtr()); + + data->takeDynamicStructureFromSourceColumns(nested_source_columns); +} + } diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index 230d8830265..53eb5166df8 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -175,6 +175,9 @@ public: size_t getNumberOfDimensions() const; + bool hasDynamicStructure() const override { return getData().hasDynamicStructure(); } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + private: WrappedPtr data; WrappedPtr offsets; diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index 6763410b46d..934adf07cf4 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -122,6 +122,9 @@ public: UInt64 getNumberOfDefaultRows() const override { throwMustBeDecompressed(); } void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); } + bool hasDynamicStructure() const override { throwMustBeDecompressed(); } + void takeDynamicStructureFromSourceColumns(const Columns &) override { throwMustBeDecompressed(); } + protected: size_t rows; size_t bytes; diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h index 4a3d40ca0d2..c2c0fa3027c 100644 --- a/src/Columns/ColumnConst.h +++ b/src/Columns/ColumnConst.h @@ -306,6 +306,8 @@ public: T getValue() const { return static_cast(getField().safeGet()); } bool isCollationSupported() const override { return data->isCollationSupported(); } + + bool hasDynamicStructure() const override { return data->hasDynamicStructure(); } }; ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value); diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp new file mode 100644 index 00000000000..3c147b6f123 --- /dev/null +++ b/src/Columns/ColumnDynamic.cpp @@ -0,0 +1,758 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int PARAMETER_OUT_OF_BOUND; +} + + +ColumnDynamic::ColumnDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) +{ + /// Create empty Variant. 
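/// [Editorial note: an "empty Variant" is Variant() with no alternative types yet;
/// concrete variants are appended lazily by addNewVariant() and
/// updateVariantInfoAndExpandVariantColumn() below as values of new types arrive.]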
+ variant_info.variant_type = std::make_shared<DataTypeVariant>(DataTypes{}); + variant_info.variant_name = variant_info.variant_type->getName(); + variant_column = variant_info.variant_type->createColumn(); +} + +ColumnDynamic::ColumnDynamic( + MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_) + : variant_column(std::move(variant_column_)) + , variant_info(variant_info_) + , max_dynamic_types(max_dynamic_types_) + , statistics(statistics_) +{ +} + +ColumnDynamic::MutablePtr ColumnDynamic::create(MutableColumnPtr variant_column, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_) +{ + VariantInfo variant_info; + variant_info.variant_type = variant_type; + variant_info.variant_name = variant_type->getName(); + const auto & variants = assert_cast<const DataTypeVariant &>(*variant_type).getVariants(); + variant_info.variant_names.reserve(variants.size()); + variant_info.variant_name_to_discriminator.reserve(variants.size()); + for (ColumnVariant::Discriminator discr = 0; discr != variants.size(); ++discr) + { + const auto & variant_name = variant_info.variant_names.emplace_back(variants[discr]->getName()); + variant_info.variant_name_to_discriminator[variant_name] = discr; + } + + return create(std::move(variant_column), variant_info, max_dynamic_types_, statistics_); +} + +bool ColumnDynamic::addNewVariant(const DB::DataTypePtr & new_variant) +{ + /// Check if we already have such variant. + if (variant_info.variant_name_to_discriminator.contains(new_variant->getName())) + return true; + + /// Check if we reached maximum number of variants. + if (variant_info.variant_names.size() >= max_dynamic_types) + { + /// ColumnDynamic can have max_dynamic_types number of variants only when it has String as a variant. + /// Otherwise we won't be able to cast new variants to Strings. + if (!variant_info.variant_name_to_discriminator.contains("String")) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Maximum number of variants reached, but no String variant exists"); + + return false; + } + + /// If we have (max_dynamic_types - 1) number of variants and don't have String variant, we can add only String variant. 
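/// [Worked example, editorial: with max_dynamic_types = 3 and current variants
/// {Int64, Array(Int64)}, adding Float64 is refused because it would take the last
/// slot, which must stay available for the String fallback; adding String succeeds.
/// Once the column is full, addNewVariant() returns false and callers cast the value
/// to String instead (see insert()/insertFrom() below).]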
+ if (variant_info.variant_names.size() == max_dynamic_types - 1 && new_variant->getName() != "String" && !variant_info.variant_name_to_discriminator.contains("String")) + return false; + + const DataTypes & current_variants = assert_cast(*variant_info.variant_type).getVariants(); + DataTypes all_variants = current_variants; + all_variants.push_back(new_variant); + auto new_variant_type = std::make_shared(all_variants); + updateVariantInfoAndExpandVariantColumn(new_variant_type); + return true; +} + +void ColumnDynamic::addStringVariant() +{ + if (!addNewVariant(std::make_shared())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot add String variant to Dynamic column, it's a bug"); +} + +void ColumnDynamic::updateVariantInfoAndExpandVariantColumn(const DB::DataTypePtr & new_variant_type) +{ + const DataTypes & current_variants = assert_cast(variant_info.variant_type.get())->getVariants(); + const DataTypes & new_variants = assert_cast(new_variant_type.get())->getVariants(); + + Names new_variant_names; + new_variant_names.reserve(new_variants.size()); + std::unordered_map new_variant_name_to_discriminator; + new_variant_name_to_discriminator.reserve(new_variants.size()); + std::vector> new_variant_columns_and_discriminators_to_add; + new_variant_columns_and_discriminators_to_add.reserve(new_variants.size() - current_variants.size()); + std::vector current_to_new_discriminators; + current_to_new_discriminators.resize(current_variants.size()); + + for (ColumnVariant::Discriminator discr = 0; discr != new_variants.size(); ++discr) + { + const auto & name = new_variant_names.emplace_back(new_variants[discr]->getName()); + new_variant_name_to_discriminator[name] = discr; + + auto current_it = variant_info.variant_name_to_discriminator.find(name); + if (current_it == variant_info.variant_name_to_discriminator.end()) + new_variant_columns_and_discriminators_to_add.emplace_back(new_variants[discr]->createColumn(), discr); + else + current_to_new_discriminators[current_it->second] = discr; + } + + variant_info.variant_type = new_variant_type; + variant_info.variant_name = new_variant_type->getName(); + variant_info.variant_names = new_variant_names; + variant_info.variant_name_to_discriminator = new_variant_name_to_discriminator; + assert_cast(*variant_column).extend(current_to_new_discriminators, std::move(new_variant_columns_and_discriminators_to_add)); + /// Clear mappings cache because now with new Variant we will have new mappings. + variant_mappings_cache.clear(); +} + +std::vector * ColumnDynamic::combineVariants(const DB::ColumnDynamic::VariantInfo & other_variant_info) +{ + /// Check if we already have global discriminators mapping for other Variant in cache. + /// It's used to not calculate the same mapping each call of insertFrom with the same columns. + auto cache_it = variant_mappings_cache.find(other_variant_info.variant_name); + if (cache_it != variant_mappings_cache.end()) + return &cache_it->second; + + /// Check if we already tried to combine these variants but failed due to max_dynamic_types limit. 
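/// [Editorial note: combineVariants() memoizes both outcomes per source Variant name --
/// successful discriminator mappings in variant_mappings_cache, failed attempts in
/// variants_with_failed_combination -- so a row-by-row insertFrom() loop pays the
/// combination cost once per distinct source Variant type, not once per row.]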
+ if (variants_with_failed_combination.contains(other_variant_info.variant_name)) + return nullptr; + + const DataTypes & other_variants = assert_cast(*other_variant_info.variant_type).getVariants(); + + size_t num_new_variants = 0; + for (size_t i = 0; i != other_variants.size(); ++i) + { + if (!variant_info.variant_name_to_discriminator.contains(other_variant_info.variant_names[i])) + ++num_new_variants; + } + + /// If we have new variants we need to update current variant info and extend Variant column + if (num_new_variants) + { + const DataTypes & current_variants = assert_cast(*variant_info.variant_type).getVariants(); + + /// We cannot combine Variants if total number of variants exceeds max_dynamic_types. + if (current_variants.size() + num_new_variants > max_dynamic_types) + { + /// Remember that we cannot combine our variant with this one, so we will not try to do it again. + variants_with_failed_combination.insert(other_variant_info.variant_name); + return nullptr; + } + + /// We cannot combine Variants if total number of variants reaches max_dynamic_types and we don't have String variant. + if (current_variants.size() + num_new_variants == max_dynamic_types && !variant_info.variant_name_to_discriminator.contains("String") && !other_variant_info.variant_name_to_discriminator.contains("String")) + { + variants_with_failed_combination.insert(other_variant_info.variant_name); + return nullptr; + } + + DataTypes all_variants = current_variants; + all_variants.insert(all_variants.end(), other_variants.begin(), other_variants.end()); + auto new_variant_type = std::make_shared(all_variants); + updateVariantInfoAndExpandVariantColumn(new_variant_type); + } + + /// Create a global discriminators mapping for other variant. + std::vector other_to_new_discriminators; + other_to_new_discriminators.reserve(other_variants.size()); + for (size_t i = 0; i != other_variants.size(); ++i) + other_to_new_discriminators.push_back(variant_info.variant_name_to_discriminator[other_variant_info.variant_names[i]]); + + /// Save mapping to cache to not calculate it again for the same Variants. + auto [it, _] = variant_mappings_cache.emplace(other_variant_info.variant_name, std::move(other_to_new_discriminators)); + return &it->second; +} + +void ColumnDynamic::insert(const DB::Field & x) +{ + /// Check if we can insert field without Variant extension. + if (variant_column->tryInsert(x)) + return; + + /// If we cannot insert field into current variant column, extend it with new variant for this field from its type. + if (addNewVariant(applyVisitor(FieldToDataType(), x))) + { + /// Now we should be able to insert this field into extended variant column. + variant_column->insert(x); + } + else + { + /// We reached maximum number of variants and couldn't add new variant. + /// This case should be really rare in real use cases. + /// We should always be able to add String variant and cast inserted value to String. + addStringVariant(); + variant_column->insert(toString(x)); + } +} + +bool ColumnDynamic::tryInsert(const DB::Field & x) +{ + /// We can insert any value into Dynamic column. + insert(x); + return true; +} + + +void ColumnDynamic::insertFrom(const DB::IColumn & src_, size_t n) +{ + const auto & dynamic_src = assert_cast(src_); + + /// Check if we have the same variants in both columns. 
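/// [Editorial note: variant_name is the fully serialized type name, e.g.
/// "Variant(Int64, String)", so equal names imply an identical set and order of
/// variants and the row can be copied without remapping discriminators.]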
+ if (variant_info.variant_name == dynamic_src.variant_info.variant_name) + { + variant_column->insertFrom(*dynamic_src.variant_column, n); + return; + } + + auto & variant_col = assert_cast(*variant_column); + + /// If variants are different, we need to extend our variant with new variants. + if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) + { + variant_col.insertFrom(*dynamic_src.variant_column, n, *global_discriminators_mapping); + return; + } + + /// We cannot combine 2 Variant types as total number of variants exceeds the limit. + /// We need to insert single value, try to add only corresponding variant. + const auto & src_variant_col = assert_cast(*dynamic_src.variant_column); + auto src_global_discr = src_variant_col.globalDiscriminatorAt(n); + + /// NULL doesn't require Variant extension. + if (src_global_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + insertDefault(); + return; + } + + auto variant_type = assert_cast(*dynamic_src.variant_info.variant_type).getVariants()[src_global_discr]; + if (addNewVariant(variant_type)) + { + auto discr = variant_info.variant_name_to_discriminator[dynamic_src.variant_info.variant_names[src_global_discr]]; + variant_col.insertIntoVariantFrom(discr, src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(n)); + return; + } + + /// We reached maximum number of variants and couldn't add new variant. + /// We should always be able to add String variant and cast inserted value to String. + addStringVariant(); + auto tmp_variant_column = src_variant_col.getVariantByGlobalDiscriminator(src_global_discr).cloneEmpty(); + tmp_variant_column->insertFrom(src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(n)); + auto tmp_string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared()); + auto string_variant_discr = variant_info.variant_name_to_discriminator["String"]; + variant_col.insertIntoVariantFrom(string_variant_discr, *tmp_string_column, 0); +} + +void ColumnDynamic::insertRangeFrom(const DB::IColumn & src_, size_t start, size_t length) +{ + if (start + length > src_.size()) + throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Parameter out of bound in ColumnDynamic::insertRangeFrom method. " + "[start({}) + length({}) > src.size()({})]", start, length, src_.size()); + + const auto & dynamic_src = assert_cast(src_); + + /// Check if we have the same variants in both columns. + if (variant_info.variant_names == dynamic_src.variant_info.variant_names) + { + variant_column->insertRangeFrom(*dynamic_src.variant_column, start, length); + return; + } + + auto & variant_col = assert_cast(*variant_column); + + /// If variants are different, we need to extend our variant with new variants. + if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) + { + variant_col.insertRangeFrom(*dynamic_src.variant_column, start, length, *global_discriminators_mapping); + return; + } + + /// We cannot combine 2 Variant types as total number of variants exceeds the limit. + /// In this case we will add most frequent variants from this range and insert them as usual, + /// all other variants will be converted to String. 
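/// [Worked example, editorial: destination holds {Date, Int64, String} with
/// max_dynamic_types = 4, and the source range is mostly Float64 plus a few UUID rows.
/// One slot is free and Float64 is the most frequent new variant, so Float64 is
/// added and the UUID rows are cast to String on insertion.]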
+ /// TODO: instead of keeping all current variants and just adding new most frequent variants + /// from source columns we can also try to replace rarest existing variants with frequent + /// variants from source column (so we will avoid casting new frequent variants to String + /// and keeping rare existing ones). It will require rewriting of existing data in Variant + /// column but will improve usability of Dynamic column for example during squashing blocks + /// during insert. + + const auto & src_variant_column = dynamic_src.getVariantColumn(); + + /// Calculate ranges for each variant in current range. + std::vector> variants_ranges(dynamic_src.variant_info.variant_names.size(), {0, 0}); + /// If we insert the whole column, no need to iterate through the range, we can just take variant sizes. + if (start == 0 && length == dynamic_src.size()) + { + for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i) + variants_ranges[i] = {0, src_variant_column.getVariantByGlobalDiscriminator(i).size()}; + } + /// Otherwise we need to iterate through discriminators and calculate the range for each variant. + else + { + const auto & local_discriminators = src_variant_column.getLocalDiscriminators(); + const auto & offsets = src_variant_column.getOffsets(); + size_t end = start + length; + for (size_t i = start; i != end; ++i) + { + auto discr = src_variant_column.globalDiscriminatorByLocal(local_discriminators[i]); + if (discr != ColumnVariant::NULL_DISCRIMINATOR) + { + if (!variants_ranges[discr].second) + variants_ranges[discr].first = offsets[i]; + ++variants_ranges[discr].second; + } + } + } + + const auto & src_variants = assert_cast(*dynamic_src.variant_info.variant_type).getVariants(); + /// List of variants that will be converted to String. + std::vector variants_to_convert_to_string; + /// Mapping from global discriminators of src_variant to the new variant we will create. + std::vector other_to_new_discriminators; + other_to_new_discriminators.reserve(dynamic_src.variant_info.variant_names.size()); + + /// Check if we cannot add any more new variants. In this case we will convert all new variants to String. + if (variant_info.variant_names.size() == max_dynamic_types || (variant_info.variant_names.size() == max_dynamic_types - 1 && !variant_info.variant_name_to_discriminator.contains("String"))) + { + addStringVariant(); + for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i) + { + auto it = variant_info.variant_name_to_discriminator.find(dynamic_src.variant_info.variant_names[i]); + if (it == variant_info.variant_name_to_discriminator.end()) + { + variants_to_convert_to_string.push_back(i); + other_to_new_discriminators.push_back(variant_info.variant_name_to_discriminator["String"]); + } + else + { + other_to_new_discriminators.push_back(it->second); + } + } + } + /// We still can add some new variants, but not all of them. Let's choose the most frequent variants in specified range. 
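/// [Editorial summary of the else-branch below: (1) take the per-variant row counts
/// for [start, start + length) from variants_ranges, (2) sort the candidate new
/// variants by count descending, (3) append them while slots remain, reserving a
/// slot for String if it is not present yet, and (4) route everything that did not
/// fit through the String fallback discriminator.]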
+    else
+    {
+        std::vector<std::pair<size_t, ColumnVariant::Discriminator>> new_variants_with_sizes;
+        new_variants_with_sizes.reserve(dynamic_src.variant_info.variant_names.size());
+        for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i)
+        {
+            const auto & variant_name = dynamic_src.variant_info.variant_names[i];
+            if (variant_name != "String" && !variant_info.variant_name_to_discriminator.contains(variant_name))
+                new_variants_with_sizes.emplace_back(variants_ranges[i].second, i);
+        }
+
+        std::sort(new_variants_with_sizes.begin(), new_variants_with_sizes.end(), std::greater<>());
+        DataTypes new_variants = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariants();
+        if (!variant_info.variant_name_to_discriminator.contains("String"))
+            new_variants.push_back(std::make_shared<DataTypeString>());
+
+        for (const auto & [_, discr] : new_variants_with_sizes)
+        {
+            if (new_variants.size() != max_dynamic_types)
+                new_variants.push_back(src_variants[discr]);
+            else
+                variants_to_convert_to_string.push_back(discr);
+        }
+
+        auto new_variant_type = std::make_shared<DataTypeVariant>(new_variants);
+        updateVariantInfoAndExpandVariantColumn(new_variant_type);
+        auto string_variant_discriminator = variant_info.variant_name_to_discriminator.at("String");
+        for (const auto & variant_name : dynamic_src.variant_info.variant_names)
+        {
+            auto it = variant_info.variant_name_to_discriminator.find(variant_name);
+            if (it == variant_info.variant_name_to_discriminator.end())
+                other_to_new_discriminators.push_back(string_variant_discriminator);
+            else
+                other_to_new_discriminators.push_back(it->second);
+        }
+    }
+
+    /// Convert to String all variants that couldn't be added.
+    std::unordered_map<ColumnVariant::Discriminator, ColumnPtr> variants_converted_to_string;
+    variants_converted_to_string.reserve(variants_to_convert_to_string.size());
+    for (auto discr : variants_to_convert_to_string)
+    {
+        auto [variant_start, variant_length] = variants_ranges[discr];
+        const auto & variant = src_variant_column.getVariantPtrByGlobalDiscriminator(discr);
+        if (variant_start == 0 && variant_length == variant->size())
+            variants_converted_to_string[discr] = castColumn(ColumnWithTypeAndName(variant, src_variants[discr], ""), std::make_shared<DataTypeString>());
+        else
+            variants_converted_to_string[discr] = castColumn(ColumnWithTypeAndName(variant->cut(variant_start, variant_length), src_variants[discr], ""), std::make_shared<DataTypeString>());
+    }
+
+    const auto & src_local_discriminators = src_variant_column.getLocalDiscriminators();
+    const auto & src_offsets = src_variant_column.getOffsets();
+    const auto & src_variant_columns = src_variant_column.getVariants();
+    size_t end = start + length;
+    for (size_t i = start; i != end; ++i)
+    {
+        auto local_discr = src_local_discriminators[i];
+        if (local_discr == ColumnVariant::NULL_DISCRIMINATOR)
+        {
+            variant_col.insertDefault();
+        }
+        else
+        {
+            auto global_discr = src_variant_column.globalDiscriminatorByLocal(local_discr);
+            auto to_global_discr = other_to_new_discriminators[global_discr];
+            auto it = variants_converted_to_string.find(global_discr);
+            if (it == variants_converted_to_string.end())
+            {
+                variant_col.insertIntoVariantFrom(to_global_discr, *src_variant_columns[local_discr], src_offsets[i]);
+            }
+            else
+            {
+                variant_col.insertIntoVariantFrom(to_global_discr, *it->second, src_offsets[i] - variants_ranges[global_discr].first);
+            }
+        }
+    }
+}
+
+void ColumnDynamic::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length)
+{
+    const auto & dynamic_src = assert_cast<const ColumnDynamic &>(src_);
+
+    /// Check if we have the same variants in both columns.
+    if (variant_info.variant_names == dynamic_src.variant_info.variant_names)
+    {
+        variant_column->insertManyFrom(*dynamic_src.variant_column, position, length);
+        return;
+    }
+
+    auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
+
+    /// If variants are different, we need to extend our variant with new variants.
+    if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info))
+    {
+        variant_col.insertManyFrom(*dynamic_src.variant_column, position, length, *global_discriminators_mapping);
+        return;
+    }
+
+    /// We cannot combine 2 Variant types as the total number of variants exceeds the limit.
+    /// We need to insert a single value, so try to add only the corresponding variant.
+    const auto & src_variant_col = assert_cast<const ColumnVariant &>(*dynamic_src.variant_column);
+    auto src_global_discr = src_variant_col.globalDiscriminatorAt(position);
+    if (src_global_discr == ColumnVariant::NULL_DISCRIMINATOR)
+    {
+        insertDefault();
+        return;
+    }
+
+    auto variant_type = assert_cast<const DataTypeVariant &>(*dynamic_src.variant_info.variant_type).getVariants()[src_global_discr];
+    if (addNewVariant(variant_type))
+    {
+        auto discr = variant_info.variant_name_to_discriminator[dynamic_src.variant_info.variant_names[src_global_discr]];
+        variant_col.insertManyIntoVariantFrom(discr, src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(position), length);
+        return;
+    }
+
+    addStringVariant();
+    auto tmp_variant_column = src_variant_col.getVariantByGlobalDiscriminator(src_global_discr).cloneEmpty();
+    tmp_variant_column->insertFrom(src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(position));
+    auto tmp_string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared<DataTypeString>());
+    auto string_variant_discr = variant_info.variant_name_to_discriminator["String"];
+    variant_col.insertManyIntoVariantFrom(string_variant_discr, *tmp_string_column, 0, length);
+}
+
+
+StringRef ColumnDynamic::serializeValueIntoArena(size_t n, DB::Arena & arena, const char *& begin) const
+{
+    /// We cannot use Variant serialization here as it serializes discriminator + value,
+    /// but Dynamic doesn't have a fixed mapping discriminator <-> variant type,
+    /// as different Dynamic columns can have different Variants.
+    /// Instead, we serialize null bit + variant type name (size + bytes) + value.
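+    /// The resulting layout is:
+    ///   NULL:     [null_bit = 1]
+    ///   non-NULL: [null_bit = 0][name size (size_t)][name bytes][serialized value of the variant]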
+    const auto & variant_col = assert_cast<const ColumnVariant &>(*variant_column);
+    auto discr = variant_col.globalDiscriminatorAt(n);
+    StringRef res;
+    UInt8 null_bit = discr == ColumnVariant::NULL_DISCRIMINATOR;
+    if (null_bit)
+    {
+        char * pos = arena.allocContinue(sizeof(UInt8), begin);
+        memcpy(pos, &null_bit, sizeof(UInt8));
+        res.data = pos;
+        res.size = sizeof(UInt8);
+        return res;
+    }
+
+    const auto & variant_name = variant_info.variant_names[discr];
+    size_t variant_name_size = variant_name.size();
+    char * pos = arena.allocContinue(sizeof(UInt8) + sizeof(size_t) + variant_name.size(), begin);
+    memcpy(pos, &null_bit, sizeof(UInt8));
+    memcpy(pos + sizeof(UInt8), &variant_name_size, sizeof(size_t));
+    memcpy(pos + sizeof(UInt8) + sizeof(size_t), variant_name.data(), variant_name.size());
+    res.data = pos;
+    res.size = sizeof(UInt8) + sizeof(size_t) + variant_name.size();
+
+    auto value_ref = variant_col.getVariantByGlobalDiscriminator(discr).serializeValueIntoArena(variant_col.offsetAt(n), arena, begin);
+    res.data = value_ref.data - res.size;
+    res.size += value_ref.size;
+    return res;
+}
+
+const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos)
+{
+    auto & variant_col = assert_cast<ColumnVariant &>(*variant_column);
+    UInt8 null_bit = unalignedLoad<UInt8>(pos);
+    pos += sizeof(UInt8);
+    if (null_bit)
+    {
+        insertDefault();
+        return pos;
+    }
+
+    /// Read the variant type name.
+    const size_t variant_name_size = unalignedLoad<size_t>(pos);
+    pos += sizeof(variant_name_size);
+    String variant_name;
+    variant_name.resize(variant_name_size);
+    memcpy(variant_name.data(), pos, variant_name_size);
+    pos += variant_name_size;
+    /// If we already have such a variant, just deserialize the value into the corresponding variant column.
+    auto it = variant_info.variant_name_to_discriminator.find(variant_name);
+    if (it != variant_info.variant_name_to_discriminator.end())
+    {
+        auto discr = it->second;
+        return variant_col.deserializeVariantAndInsertFromArena(discr, pos);
+    }
+
+    /// If we don't have such a variant, add it.
+    auto variant_type = DataTypeFactory::instance().get(variant_name);
+    if (likely(addNewVariant(variant_type)))
+    {
+        auto discr = variant_info.variant_name_to_discriminator[variant_name];
+        return variant_col.deserializeVariantAndInsertFromArena(discr, pos);
+    }
+
+    /// We reached the maximum number of variants and couldn't add a new one.
+    /// We should always be able to add a String variant and cast the inserted value to String.
+    addStringVariant();
+    /// Create a temporary column of this variant type and deserialize the value into it.
+    auto tmp_variant_column = variant_type->createColumn();
+    pos = tmp_variant_column->deserializeAndInsertFromArena(pos);
+    /// Cast the temporary column to String and insert this value into the String variant.
+    auto str_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared<DataTypeString>());
+    variant_col.insertIntoVariantFrom(variant_info.variant_name_to_discriminator["String"], *str_column, 0);
+    return pos;
+}
+
+const char * ColumnDynamic::skipSerializedInArena(const char * pos) const
+{
+    UInt8 null_bit = unalignedLoad<UInt8>(pos);
+    pos += sizeof(UInt8);
+    if (null_bit)
+        return pos;
+
+    const size_t variant_name_size = unalignedLoad<size_t>(pos);
+    pos += sizeof(variant_name_size);
+    String variant_name;
+    variant_name.resize(variant_name_size);
+    memcpy(variant_name.data(), pos, variant_name_size);
+    pos += variant_name_size;
+    auto tmp_variant_column = DataTypeFactory::instance().get(variant_name)->createColumn();
+    return tmp_variant_column->skipSerializedInArena(pos);
+}
+
+void ColumnDynamic::updateHashWithValue(size_t n, SipHash & hash) const
+{
+    const auto & variant_col = assert_cast<const ColumnVariant &>(*variant_column);
+    auto discr = variant_col.globalDiscriminatorAt(n);
+    if (discr == ColumnVariant::NULL_DISCRIMINATOR)
+    {
+        hash.update(discr);
+        return;
+    }
+
+    hash.update(variant_info.variant_names[discr]);
+    variant_col.getVariantByGlobalDiscriminator(discr).updateHashWithValue(variant_col.offsetAt(n), hash);
+}
+
+int ColumnDynamic::compareAt(size_t n, size_t m, const DB::IColumn & rhs, int nan_direction_hint) const
+{
+    const auto & left_variant = assert_cast<const ColumnVariant &>(*variant_column);
+    const auto & right_dynamic = assert_cast<const ColumnDynamic &>(rhs);
+    const auto & right_variant = assert_cast<const ColumnVariant &>(*right_dynamic.variant_column);
+
+    auto left_discr = left_variant.globalDiscriminatorAt(n);
+    auto right_discr = right_variant.globalDiscriminatorAt(m);
+
+    /// Check if we have NULLs and return the result based on nan_direction_hint.
+    if (left_discr == ColumnVariant::NULL_DISCRIMINATOR && right_discr == ColumnVariant::NULL_DISCRIMINATOR)
+        return 0;
+    else if (left_discr == ColumnVariant::NULL_DISCRIMINATOR)
+        return nan_direction_hint;
+    else if (right_discr == ColumnVariant::NULL_DISCRIMINATOR)
+        return -nan_direction_hint;
+
+    /// If rows have different types, we compare type names.
+    if (variant_info.variant_names[left_discr] != right_dynamic.variant_info.variant_names[right_discr])
+        return variant_info.variant_names[left_discr] < right_dynamic.variant_info.variant_names[right_discr] ? -1 : 1;
+
+    /// If rows have the same types, compare the actual values from the corresponding variants.
+    return left_variant.getVariantByGlobalDiscriminator(left_discr).compareAt(left_variant.offsetAt(n), right_variant.offsetAt(m), right_variant.getVariantByGlobalDiscriminator(right_discr), nan_direction_hint);
+}
+
+ColumnPtr ColumnDynamic::compress() const
+{
+    ColumnPtr variant_compressed = variant_column->compress();
+    size_t byte_size = variant_compressed->byteSize();
+    return ColumnCompressed::create(size(), byte_size,
+        [my_variant_compressed = std::move(variant_compressed), my_variant_info = variant_info, my_max_dynamic_types = max_dynamic_types, my_statistics = statistics]() mutable
+        {
+            return ColumnDynamic::create(my_variant_compressed->decompress(), my_variant_info, my_max_dynamic_types, my_statistics);
+        });
+}
+
+void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source_columns)
+{
+    if (!empty())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "takeDynamicStructureFromSourceColumns should be called only on empty Dynamic column");
+
+    /// During serialization of the Dynamic column in MergeTree, all Dynamic columns
+    /// in a single part must have the same structure (the same variants). During the merge,
+    /// the resulting column is constructed by inserting from the source columns,
+    /// but it may happen that the resulting column doesn't receive rows from all source parts
+    /// but only from a subset of them; as a result, some variants could be missing
+    /// and the structures of the resulting columns may differ.
+    /// To solve this problem, before the merge we create an empty resulting column and use this method
+    /// to take the dynamic structure from all source columns, even if we won't insert
+    /// rows from some of them.
+
+    /// We want to construct the resulting variant with the most frequent variants from the source columns and convert the rarest
+    /// variants to a single String variant if we exceed the limit of variants.
+    /// First, collect all variants from all source columns and calculate their total sizes.
+    std::unordered_map<String, size_t> total_sizes;
+    DataTypes all_variants;
+
+    for (const auto & source_column : source_columns)
+    {
+        const auto & source_dynamic = assert_cast<const ColumnDynamic &>(*source_column);
+        const auto & source_variant_column = source_dynamic.getVariantColumn();
+        const auto & source_variant_info = source_dynamic.getVariantInfo();
+        const auto & source_variants = assert_cast<const DataTypeVariant &>(*source_variant_info.variant_type).getVariants();
+        /// During deserialization from MergeTree we will have variant size statistics from the whole data part.
+        const auto & source_statistics = source_dynamic.getStatistics();
+        for (size_t i = 0; i != source_variants.size(); ++i)
+        {
+            const auto & variant_name = source_variant_info.variant_names[i];
+            auto it = total_sizes.find(variant_name);
+            /// Add this variant to the list of all variants if we didn't see it yet.
+            if (it == total_sizes.end())
+            {
+                all_variants.push_back(source_variants[i]);
+                it = total_sizes.emplace(variant_name, 0).first;
+            }
+            auto statistics_it = source_statistics.data.find(variant_name);
+            size_t size = statistics_it == source_statistics.data.end() ? source_variant_column.getVariantByGlobalDiscriminator(i).size() : statistics_it->second;
+            it->second += size;
+        }
+    }
+
+    DataTypePtr result_variant_type;
+    /// Check if the number of all variants exceeds the limit.
+    if (all_variants.size() > max_dynamic_types || (all_variants.size() == max_dynamic_types && !total_sizes.contains("String")))
+    {
+        /// Create a list of variants with their sizes and sort it.
+        std::vector<std::pair<size_t, DataTypePtr>> variants_with_sizes;
+        variants_with_sizes.reserve(all_variants.size());
+        for (const auto & variant : all_variants)
+            variants_with_sizes.emplace_back(total_sizes[variant->getName()], variant);
+        std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater<>());
+
+        /// Take the first max_dynamic_types variants from the sorted list.
+        DataTypes result_variants;
+        result_variants.reserve(max_dynamic_types);
+        /// Add the String variant in advance.
+        result_variants.push_back(std::make_shared<DataTypeString>());
+        for (const auto & [_, variant] : variants_with_sizes)
+        {
+            if (result_variants.size() == max_dynamic_types)
+                break;
+
+            if (variant->getName() != "String")
+                result_variants.push_back(variant);
+        }
+
+        result_variant_type = std::make_shared<DataTypeVariant>(result_variants);
+    }
+    else
+    {
+        result_variant_type = std::make_shared<DataTypeVariant>(all_variants);
+    }
+
+    /// Now we have the resulting Variant and can fill the variant info.
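+    /// E.g. if the collected variants are Int64, String and Float64, the resulting type becomes
+    /// Variant(Float64, Int64, String): DataTypeVariant keeps its variants ordered by name
+    /// (as the tests below rely on), so the loop below assigns the global discriminators
+    /// 0, 1, 2 in that sorted order.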
+    variant_info.variant_type = result_variant_type;
+    variant_info.variant_name = result_variant_type->getName();
+    const auto & result_variants = assert_cast<const DataTypeVariant &>(*result_variant_type).getVariants();
+    variant_info.variant_names.clear();
+    variant_info.variant_names.reserve(result_variants.size());
+    variant_info.variant_name_to_discriminator.clear();
+    variant_info.variant_name_to_discriminator.reserve(result_variants.size());
+    statistics.data.clear();
+    statistics.data.reserve(result_variants.size());
+    statistics.source = Statistics::Source::MERGE;
+    for (size_t i = 0; i != result_variants.size(); ++i)
+    {
+        auto variant_name = result_variants[i]->getName();
+        variant_info.variant_names.push_back(variant_name);
+        variant_info.variant_name_to_discriminator[variant_name] = i;
+        statistics.data[variant_name] = total_sizes[variant_name];
+    }
+
+    variant_column = variant_info.variant_type->createColumn();
+
+    /// Now we have the resulting Variant that will be used in all merged columns.
+    /// Variants can also contain Dynamic columns inside, so we should collect
+    /// all source variants that will be used in the resulting merged column
+    /// and call takeDynamicStructureFromSourceColumns on all resulting variants.
+    std::vector<Columns> variants_source_columns;
+    variants_source_columns.resize(variant_info.variant_names.size());
+    for (const auto & source_column : source_columns)
+    {
+        const auto & source_dynamic_column = assert_cast<const ColumnDynamic &>(*source_column);
+        const auto & source_variant_info = source_dynamic_column.getVariantInfo();
+        for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
+        {
+            /// Try to find this variant in the current source column.
+            auto it = source_variant_info.variant_name_to_discriminator.find(variant_info.variant_names[i]);
+            if (it != source_variant_info.variant_name_to_discriminator.end())
+                variants_source_columns[i].push_back(source_dynamic_column.getVariantColumn().getVariantPtrByGlobalDiscriminator(it->second));
+        }
+    }
+
+    auto & variant_col = getVariantColumn();
+    for (size_t i = 0; i != variant_info.variant_names.size(); ++i)
+        variant_col.getVariantByGlobalDiscriminator(i).takeDynamicStructureFromSourceColumns(variants_source_columns[i]);
+}
+
+void ColumnDynamic::applyNullMap(const ColumnVector<UInt8>::Container & null_map)
+{
+    assert_cast<ColumnVariant &>(*variant_column).applyNullMap(null_map);
+}
+
+void ColumnDynamic::applyNegatedNullMap(const ColumnVector<UInt8>::Container & null_map)
+{
+    assert_cast<ColumnVariant &>(*variant_column).applyNegatedNullMap(null_map);
+}
+
+}
diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h
new file mode 100644
index 00000000000..27ad0dd583f
--- /dev/null
+++ b/src/Columns/ColumnDynamic.h
@@ -0,0 +1,365 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+/**
+ * Column for storing Dynamic type values.
+ * The Dynamic column allows inserting and storing values of any data type inside.
+ * Inside it stores:
+ *  - a Variant column with all inserted values of different types;
+ *  - information about the currently stored variants.
+ *
+ * When new values are inserted into a Dynamic column, the internal Variant
+ * type and column are extended if the inserted value has a new type.
+ */
+class ColumnDynamic final : public COWHelper<IColumnHelper<ColumnDynamic>, ColumnDynamic>
+{
+public:
+    ///
+    struct Statistics
+    {
+        enum class Source
+        {
+            READ,  /// Statistics were loaded into the column during reading from MergeTree.
+            MERGE, /// Statistics were calculated during a merge of several MergeTree parts.
+        };
+
+        /// Source of the statistics.
+ Source source; + /// Statistics data: (variant name) -> (total variant size in data part). + std::unordered_map data; + }; + +private: + friend class COWHelper, ColumnDynamic>; + + struct VariantInfo + { + DataTypePtr variant_type; + /// Name of the whole variant to not call getName() every time. + String variant_name; + /// Names of variants to not call getName() every time on variants. + Names variant_names; + /// Mapping (variant name) -> (global discriminator). + /// It's used during variant extension. + std::unordered_map variant_name_to_discriminator; + }; + + explicit ColumnDynamic(size_t max_dynamic_types_); + ColumnDynamic(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {}); + +public: + /** Create immutable column using immutable arguments. This arguments may be shared with other columns. + * Use IColumn::mutate in order to make mutable column and mutate shared nested columns. + */ + using Base = COWHelper, ColumnDynamic>; + static Ptr create(const ColumnPtr & variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {}) + { + return ColumnDynamic::create(variant_column_->assumeMutable(), variant_info_, max_dynamic_types_, statistics_); + } + + static MutablePtr create(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {}) + { + return Base::create(std::move(variant_column_), variant_info_, max_dynamic_types_, statistics_); + } + + static MutablePtr create(MutableColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_ = {}); + + static ColumnPtr create(ColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_ = {}) + { + return create(variant_column_->assumeMutable(), variant_type, max_dynamic_types_, statistics_); + } + + static MutablePtr create(size_t max_dynamic_types_) + { + return Base::create(max_dynamic_types_); + } + + std::string getName() const override { return "Dynamic(max_types=" + std::to_string(max_dynamic_types) + ")"; } + + const char * getFamilyName() const override + { + return "Dynamic"; + } + + TypeIndex getDataType() const override + { + return TypeIndex::Dynamic; + } + + MutableColumnPtr cloneEmpty() const override + { + /// Keep current dynamic structure + return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types, statistics); + } + + MutableColumnPtr cloneResized(size_t size) const override + { + return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types, statistics); + } + + size_t size() const override + { + return variant_column->size(); + } + + Field operator[](size_t n) const override + { + return (*variant_column)[n]; + } + + void get(size_t n, Field & res) const override + { + variant_column->get(n, res); + } + + bool isDefaultAt(size_t n) const override + { + return variant_column->isDefaultAt(n); + } + + bool isNullAt(size_t n) const override + { + return variant_column->isNullAt(n); + } + + StringRef getDataAt(size_t n) const override + { + return variant_column->getDataAt(n); + } + + void insertData(const char * pos, size_t length) override + { + variant_column->insertData(pos, length); + } + + void insert(const Field & x) override; + bool tryInsert(const Field & x) override; + void insertFrom(const IColumn & src_, size_t n) override; + void 
insertRangeFrom(const IColumn & src, size_t start, size_t length) override; + void insertManyFrom(const IColumn & src, size_t position, size_t length) override; + + void insertDefault() override + { + variant_column->insertDefault(); + } + + void insertManyDefaults(size_t length) override + { + variant_column->insertManyDefaults(length); + } + + void popBack(size_t n) override + { + variant_column->popBack(n); + } + + StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; + const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; + + void updateHashWithValue(size_t n, SipHash & hash) const override; + + void updateWeakHash32(WeakHash32 & hash) const override + { + variant_column->updateWeakHash32(hash); + } + + void updateHashFast(SipHash & hash) const override + { + variant_column->updateHashFast(hash); + } + + ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override + { + return create(variant_column->filter(filt, result_size_hint), variant_info, max_dynamic_types); + } + + void expand(const Filter & mask, bool inverted) override + { + variant_column->expand(mask, inverted); + } + + ColumnPtr permute(const Permutation & perm, size_t limit) const override + { + return create(variant_column->permute(perm, limit), variant_info, max_dynamic_types); + } + + ColumnPtr index(const IColumn & indexes, size_t limit) const override + { + return create(variant_column->index(indexes, limit), variant_info, max_dynamic_types); + } + + ColumnPtr replicate(const Offsets & replicate_offsets) const override + { + return create(variant_column->replicate(replicate_offsets), variant_info, max_dynamic_types); + } + + MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override + { + MutableColumns scattered_variant_columns = variant_column->scatter(num_columns, selector); + MutableColumns scattered_columns; + scattered_columns.reserve(num_columns); + for (auto & scattered_variant_column : scattered_variant_columns) + scattered_columns.emplace_back(create(std::move(scattered_variant_column), variant_info, max_dynamic_types)); + + return scattered_columns; + } + + int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override; + + bool hasEqualValues() const override + { + return variant_column->hasEqualValues(); + } + + void getExtremes(Field & min, Field & max) const override + { + variant_column->getExtremes(min, max); + } + + void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, + size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override + { + variant_column->getPermutation(direction, stability, limit, nan_direction_hint, res); + } + + void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, + size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override + { + variant_column->updatePermutation(direction, stability, limit, nan_direction_hint, res, equal_ranges); + } + + void reserve(size_t n) override + { + variant_column->reserve(n); + } + + void ensureOwnership() override + { + variant_column->ensureOwnership(); + } + + size_t byteSize() const override + { + return variant_column->byteSize(); + } + + size_t byteSizeAt(size_t n) const override + { + return variant_column->byteSizeAt(n); + } + + size_t allocatedBytes() const 
override + { + return variant_column->allocatedBytes(); + } + + void protect() override + { + variant_column->protect(); + } + + void forEachSubcolumn(MutableColumnCallback callback) override + { + callback(variant_column); + } + + void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override + { + callback(*variant_column); + variant_column->forEachSubcolumnRecursively(callback); + } + + bool structureEquals(const IColumn & rhs) const override + { + if (const auto * rhs_concrete = typeid_cast(&rhs)) + return max_dynamic_types == rhs_concrete->max_dynamic_types; + return false; + } + + ColumnPtr compress() const override; + + double getRatioOfDefaultRows(double sample_ratio) const override + { + return variant_column->getRatioOfDefaultRows(sample_ratio); + } + + UInt64 getNumberOfDefaultRows() const override + { + return variant_column->getNumberOfDefaultRows(); + } + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override + { + variant_column->getIndicesOfNonDefaultRows(indices, from, limit); + } + + void finalize() override + { + variant_column->finalize(); + } + + bool isFinalized() const override + { + return variant_column->isFinalized(); + } + + /// Apply null map to a nested Variant column. + void applyNullMap(const ColumnVector::Container & null_map); + void applyNegatedNullMap(const ColumnVector::Container & null_map); + + const VariantInfo & getVariantInfo() const { return variant_info; } + + const ColumnPtr & getVariantColumnPtr() const { return variant_column; } + ColumnPtr & getVariantColumnPtr() { return variant_column; } + + const ColumnVariant & getVariantColumn() const { return assert_cast(*variant_column); } + ColumnVariant & getVariantColumn() { return assert_cast(*variant_column); } + + bool addNewVariant(const DataTypePtr & new_variant); + void addStringVariant(); + + bool hasDynamicStructure() const override { return true; } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + + const Statistics & getStatistics() const { return statistics; } + + size_t getMaxDynamicTypes() const { return max_dynamic_types; } + +private: + /// Combine current variant with the other variant and return global discriminators mapping + /// from other variant to the combined one. It's used for inserting from + /// different variants. + /// Returns nullptr if maximum number of variants is reached and the new variant cannot be created. + std::vector * combineVariants(const VariantInfo & other_variant_info); + + void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type); + + WrappedPtr variant_column; + /// Store the type of current variant with some additional information. + VariantInfo variant_info; + /// The maximum number of different types that can be stored in this Dynamic column. + /// If exceeded, all new variants will be converted to String. + size_t max_dynamic_types; + + /// Size statistics of each variants from MergeTree data part. + /// Used in takeDynamicStructureFromSourceColumns and set during deserialization. + Statistics statistics; + + /// Cache (Variant name) -> (global discriminators mapping from this variant to current variant in Dynamic column). + /// Used to avoid mappings recalculation in combineVariants for the same Variant types. + std::unordered_map> variant_mappings_cache; + /// Cache of Variant types that couldn't be combined with current variant in Dynamic column. 
+ /// Used to avoid checking if combination is possible for the same Variant types. + std::unordered_set variants_with_failed_combination; +}; + +} diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 57e8ba685b4..eecea1a273f 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -312,4 +312,13 @@ ColumnPtr ColumnMap::compress() const }); } +void ColumnMap::takeDynamicStructureFromSourceColumns(const Columns & source_columns) +{ + Columns nested_source_columns; + nested_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + nested_source_columns.push_back(assert_cast(*source_column).getNestedColumnPtr()); + nested->takeDynamicStructureFromSourceColumns(nested_source_columns); +} + } diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index 60aa69e7bf6..52165d0d74e 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -104,6 +104,9 @@ public: ColumnTuple & getNestedData() { return assert_cast(getNestedColumn().getData()); } ColumnPtr compress() const override; + + bool hasDynamicStructure() const override { return nested->hasDynamicStructure(); } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; }; } diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 30e62548ad6..dd9387d96b1 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -868,6 +868,15 @@ ColumnPtr ColumnNullable::getNestedColumnWithDefaultOnNull() const return res; } +void ColumnNullable::takeDynamicStructureFromSourceColumns(const Columns & source_columns) +{ + Columns nested_source_columns; + nested_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + nested_source_columns.push_back(assert_cast(*source_column).getNestedColumnPtr()); + nested_column->takeDynamicStructureFromSourceColumns(nested_source_columns); +} + ColumnPtr makeNullable(const ColumnPtr & column) { if (isColumnNullable(*column)) @@ -924,4 +933,23 @@ ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column) return column; } +ColumnPtr removeNullable(const ColumnPtr & column) +{ + if (const auto * column_nullable = typeid_cast(column.get())) + return column_nullable->getNestedColumnPtr(); + return column; +} + +ColumnPtr removeNullableOrLowCardinalityNullable(const ColumnPtr & column) +{ + if (const auto * column_low_cardinality = typeid_cast(column.get())) + { + if (!column_low_cardinality->nestedIsNullable()) + return column; + return column_low_cardinality->cloneWithDefaultOnNull(); + } + + return removeNullable(column); +} + } diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index c7ebb6ed7b6..266c188db25 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -190,6 +190,9 @@ public: /// Check that size of null map equals to size of nested column. 
void checkConsistency() const; + bool hasDynamicStructure() const override { return nested_column->hasDynamicStructure(); } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + private: WrappedPtr nested_column; WrappedPtr null_map; @@ -211,4 +214,7 @@ ColumnPtr makeNullableSafe(const ColumnPtr & column); ColumnPtr makeNullableOrLowCardinalityNullable(const ColumnPtr & column); ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column); +ColumnPtr removeNullable(const ColumnPtr & column); +ColumnPtr removeNullableOrLowCardinalityNullable(const ColumnPtr & column); + } diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index 3a63d2bffc5..49947be312d 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -801,6 +801,15 @@ ColumnSparse::Iterator ColumnSparse::getIterator(size_t n) const return Iterator(offsets_data, _size, current_offset, n); } +void ColumnSparse::takeDynamicStructureFromSourceColumns(const Columns & source_columns) +{ + Columns values_source_columns; + values_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + values_source_columns.push_back(assert_cast(*source_column).getValuesPtr()); + values->takeDynamicStructureFromSourceColumns(values_source_columns); +} + ColumnPtr recursiveRemoveSparse(const ColumnPtr & column) { if (!column) diff --git a/src/Columns/ColumnSparse.h b/src/Columns/ColumnSparse.h index c1bd614102c..7d3200da35f 100644 --- a/src/Columns/ColumnSparse.h +++ b/src/Columns/ColumnSparse.h @@ -148,6 +148,9 @@ public: size_t sizeOfValueIfFixed() const override { return values->sizeOfValueIfFixed() + values->sizeOfValueIfFixed(); } bool isCollationSupported() const override { return values->isCollationSupported(); } + bool hasDynamicStructure() const override { return values->hasDynamicStructure(); } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + size_t getNumberOfTrailingDefaults() const { return offsets->empty() ? 
_size : _size - getOffsetsData().back() - 1; diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index 2393fcf92fd..31734edced4 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -572,6 +572,34 @@ bool ColumnTuple::isCollationSupported() const return false; } +bool ColumnTuple::hasDynamicStructure() const +{ + for (const auto & column : columns) + { + if (column->hasDynamicStructure()) + return true; + } + return false; +} + +void ColumnTuple::takeDynamicStructureFromSourceColumns(const Columns & source_columns) +{ + std::vector nested_source_columns; + nested_source_columns.resize(columns.size()); + for (size_t i = 0; i != columns.size(); ++i) + nested_source_columns[i].reserve(source_columns.size()); + + for (const auto & source_column : source_columns) + { + const auto & nsource_columns = assert_cast(*source_column).getColumns(); + for (size_t i = 0; i != nsource_columns.size(); ++i) + nested_source_columns[i].push_back(nsource_columns[i]); + } + + for (size_t i = 0; i != columns.size(); ++i) + columns[i]->takeDynamicStructureFromSourceColumns(nested_source_columns[i]); +} + ColumnPtr ColumnTuple::compress() const { diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index 5b626155754..65103fa8c49 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -114,6 +114,9 @@ public: const ColumnPtr & getColumnPtr(size_t idx) const { return columns[idx]; } ColumnPtr & getColumnPtr(size_t idx) { return columns[idx]; } + bool hasDynamicStructure() const override; + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + private: int compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator=nullptr) const; diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index 31e9b0964f4..ec47f5dfa74 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include @@ -452,16 +451,18 @@ bool ColumnVariant::tryInsert(const DB::Field & x) return false; } -void ColumnVariant::insertFrom(const IColumn & src_, size_t n) +void ColumnVariant::insertFromImpl(const DB::IColumn & src_, size_t n, const std::vector * global_discriminators_mapping) { + const size_t num_variants = variants.size(); const ColumnVariant & src = assert_cast(src_); - const size_t num_variants = variants.size(); - if (src.variants.size() != num_variants) + if (!global_discriminators_mapping && src.variants.size() != num_variants) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); - /// Remember that src column can have different local variants order. 
- Discriminator global_discr = src.globalDiscriminatorAt(n); + Discriminator src_global_discr = src.globalDiscriminatorAt(n); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; Discriminator local_discr = localDiscriminatorByGlobal(global_discr); getLocalDiscriminators().push_back(local_discr); if (local_discr == NULL_DISCRIMINATOR) @@ -471,25 +472,15 @@ void ColumnVariant::insertFrom(const IColumn & src_, size_t n) else { getOffsets().push_back(variants[local_discr]->size()); - variants[local_discr]->insertFrom(src.getVariantByGlobalDiscriminator(global_discr), src.offsetAt(n)); + variants[local_discr]->insertFrom(src.getVariantByGlobalDiscriminator(src_global_discr), src.offsetAt(n)); } } -void ColumnVariant::insertIntoVariant(const DB::Field & x, Discriminator global_discr) -{ - if (global_discr > variants.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator: {}. The number of variants is {}", size_t(global_discr), variants.size()); - auto & variant = getVariantByGlobalDiscriminator(global_discr); - variant.insert(x); - getLocalDiscriminators().push_back(localDiscriminatorByGlobal(global_discr)); - getOffsets().push_back(variant.size() - 1); -} - -void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length) +void ColumnVariant::insertRangeFromImpl(const DB::IColumn & src_, size_t start, size_t length, const std::vector * global_discriminators_mapping) { const size_t num_variants = variants.size(); const auto & src = assert_cast(src_); - if (src.variants.size() != num_variants) + if (!global_discriminators_mapping && src.variants.size() != num_variants) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); if (start + length > src.getLocalDiscriminators().size()) @@ -507,7 +498,12 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l /// In this case we can simply call insertRangeFrom on this single variant. if (auto non_empty_src_local_discr = src.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) { - auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(*non_empty_src_local_discr)); + Discriminator src_global_discr = src.globalDiscriminatorByLocal(*non_empty_src_local_discr); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; + + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); size_t offset = variants[local_discr]->size(); variants[local_discr]->insertRangeFrom(*src.variants[*non_empty_src_local_discr], start, length); getLocalDiscriminators().resize_fill(local_discriminators->size() + length, local_discr); @@ -522,7 +518,7 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l /// collect ranges we need to insert for all variants and update offsets. 
/// nested_ranges[i].first - offset in src.variants[i] /// nested_ranges[i].second - length in src.variants[i] - std::vector> nested_ranges(num_variants, {0, 0}); + std::vector> nested_ranges(src.variants.size(), {0, 0}); auto & offsets_data = getOffsets(); offsets_data.reserve(offsets_data.size() + length); auto & local_discriminators_data = getLocalDiscriminators(); @@ -533,7 +529,11 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l { /// We insert from src.variants[src_local_discr] to variants[local_discr] Discriminator src_local_discr = src_local_discriminators_data[i]; - Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr)); + Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); local_discriminators_data.push_back(local_discr); if (local_discr == NULL_DISCRIMINATOR) { @@ -553,22 +553,29 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l for (size_t src_local_discr = 0; src_local_discr != nested_ranges.size(); ++src_local_discr) { auto [nested_start, nested_length] = nested_ranges[src_local_discr]; - auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr)); + Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); if (nested_length) variants[local_discr]->insertRangeFrom(*src.variants[src_local_discr], nested_start, nested_length); } } -void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length) +void ColumnVariant::insertManyFromImpl(const DB::IColumn & src_, size_t position, size_t length, const std::vector * global_discriminators_mapping) { const size_t num_variants = variants.size(); const auto & src = assert_cast(src_); - if (src.variants.size() != num_variants) + if (!global_discriminators_mapping && src.variants.size() != num_variants) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); - /// Remember that src column can have different local variants order. 
 Discriminator src_local_discr = src.localDiscriminatorAt(position);
-    Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr));
+    Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr);
+    Discriminator global_discr = src_global_discr;
+    if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR)
+        global_discr = (*global_discriminators_mapping)[src_global_discr];
+    Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
 
     auto & local_discriminators_data = getLocalDiscriminators();
     local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr);
@@ -588,6 +595,72 @@
     }
 }
 
+void ColumnVariant::insertFrom(const IColumn & src_, size_t n)
+{
+    insertFromImpl(src_, n, nullptr);
+}
+
+void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length)
+{
+    insertRangeFromImpl(src_, start, length, nullptr);
+}
+
+void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length)
+{
+    insertManyFromImpl(src_, position, length, nullptr);
+}
+
+void ColumnVariant::insertFrom(const DB::IColumn & src_, size_t n, const std::vector<Discriminator> & global_discriminators_mapping)
+{
+    insertFromImpl(src_, n, &global_discriminators_mapping);
+}
+
+void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector<Discriminator> & global_discriminators_mapping)
+{
+    insertRangeFromImpl(src_, start, length, &global_discriminators_mapping);
+}
+
+void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length, const std::vector<Discriminator> & global_discriminators_mapping)
+{
+    insertManyFromImpl(src_, position, length, &global_discriminators_mapping);
+}
+
+void ColumnVariant::insertIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t n)
+{
+    Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
+    getLocalDiscriminators().push_back(local_discr);
+    getOffsets().push_back(variants[local_discr]->size());
+    variants[local_discr]->insertFrom(src_, n);
+}
+
+void ColumnVariant::insertRangeIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t start, size_t length)
+{
+    Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
+    auto & local_discriminators_data = getLocalDiscriminators();
+    local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr);
+    auto & offsets_data = getOffsets();
+    size_t offset = variants[local_discr]->size();
+    offsets_data.reserve(offsets_data.size() + length);
+    for (size_t i = 0; i != length; ++i)
+        offsets_data.push_back(offset + i);
+
+    variants[local_discr]->insertRangeFrom(src_, start, length);
+}
+
+void ColumnVariant::insertManyIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t position, size_t length)
+{
+    Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
+    auto & local_discriminators_data = getLocalDiscriminators();
+    local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr);
+    auto & offsets_data = getOffsets();
+    size_t offset = variants[local_discr]->size();
+    offsets_data.reserve(offsets_data.size() + length);
+    for (size_t i = 0; i != length; ++i)
+        offsets_data.push_back(offset + i);
+
+    variants[local_discr]->insertManyFrom(src_, position, length);
+}
+
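+/// Example (illustrative): for a column of type Variant(Float64, Int64, String), calling
+/// insertManyIntoVariantFrom(2, src, 0, 3) appends src[0] three times to the String variant
+/// (global discriminator 2) and records the matching local discriminators with consecutive offsets.
+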
void ColumnVariant::insertDefault() { getLocalDiscriminators().push_back(NULL_DISCRIMINATOR); @@ -678,6 +751,14 @@ const char * ColumnVariant::deserializeAndInsertFromArena(const char * pos) return variants[local_discr]->deserializeAndInsertFromArena(pos); } +const char * ColumnVariant::deserializeVariantAndInsertFromArena(DB::ColumnVariant::Discriminator global_discr, const char * pos) +{ + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); + getLocalDiscriminators().push_back(local_discr); + getOffsets().push_back(variants[local_discr]->size()); + return variants[local_discr]->deserializeAndInsertFromArena(pos); +} + const char * ColumnVariant::skipSerializedInArena(const char * pos) const { Discriminator global_discr = unalignedLoad(pos); @@ -1426,4 +1507,54 @@ void ColumnVariant::applyNullMapImpl(const ColumnVector::Container & null } } +void ColumnVariant::extend(const std::vector & old_to_new_global_discriminators, std::vector> && new_variants_and_discriminators) +{ + /// Update global discriminators for current variants. + for (Discriminator & global_discr : local_to_global_discriminators) + global_discr = old_to_new_global_discriminators[global_discr]; + + /// Add new variants. + variants.reserve(variants.size() + new_variants_and_discriminators.size()); + local_to_global_discriminators.reserve(local_to_global_discriminators.size() + new_variants_and_discriminators.size()); + for (auto & new_variant_and_discriminator : new_variants_and_discriminators) + { + variants.emplace_back(std::move(new_variant_and_discriminator.first)); + local_to_global_discriminators.push_back(new_variant_and_discriminator.second); + } + + /// Update global -> local discriminators matching. + global_to_local_discriminators.resize(local_to_global_discriminators.size()); + for (Discriminator local_discr = 0; local_discr != local_to_global_discriminators.size(); ++local_discr) + global_to_local_discriminators[local_to_global_discriminators[local_discr]] = local_discr; +} + +bool ColumnVariant::hasDynamicStructure() const +{ + for (const auto & variant : variants) + { + if (variant->hasDynamicStructure()) + return true; + } + + return false; +} + +void ColumnVariant::takeDynamicStructureFromSourceColumns(const Columns & source_columns) +{ + std::vector variants_source_columns; + variants_source_columns.resize(variants.size()); + for (size_t i = 0; i != variants.size(); ++i) + variants_source_columns[i].reserve(source_columns.size()); + + for (const auto & source_column : source_columns) + { + const auto & source_variants = assert_cast(*source_column).variants; + for (size_t i = 0; i != source_variants.size(); ++i) + variants_source_columns[i].push_back(source_variants[i]); + } + + for (size_t i = 0; i != variants.size(); ++i) + variants[i]->takeDynamicStructureFromSourceColumns(variants_source_columns[i]); +} + } diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index 4aa2c9058cc..e5a4498f340 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -175,18 +175,32 @@ public: bool isDefaultAt(size_t n) const override; bool isNullAt(size_t n) const override; StringRef getDataAt(size_t n) const override; + void insertData(const char * pos, size_t length) override; void insert(const Field & x) override; bool tryInsert(const Field & x) override; - void insertIntoVariant(const Field & x, Discriminator global_discr); + void insertFrom(const IColumn & src_, size_t n) override; - void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; 
-    void insertManyFrom(const IColumn & src, size_t position, size_t length) override;
+    void insertRangeFrom(const IColumn & src_, size_t start, size_t length) override;
+    void insertManyFrom(const IColumn & src_, size_t position, size_t length) override;
+
+    /// Methods for insertion from another Variant but with a known mapping between global discriminators.
+    void insertFrom(const IColumn & src_, size_t n, const std::vector<Discriminator> & global_discriminators_mapping);
+    void insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector<Discriminator> & global_discriminators_mapping);
+    void insertManyFrom(const IColumn & src_, size_t position, size_t length, const std::vector<Discriminator> & global_discriminators_mapping);
+
+    /// Methods for insertion into a specific variant.
+    void insertIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t n);
+    void insertRangeIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t start, size_t length);
+    void insertManyIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t position, size_t length);
+
     void insertDefault() override;
     void insertManyDefaults(size_t length) override;
+
     void popBack(size_t n) override;
     StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
     const char * deserializeAndInsertFromArena(const char * pos) override;
+    const char * deserializeVariantAndInsertFromArena(Discriminator global_discr, const char * pos);
     const char * skipSerializedInArena(const char * pos) const override;
     void updateHashWithValue(size_t n, SipHash & hash) const override;
     void updateWeakHash32(WeakHash32 & hash) const override;
@@ -234,6 +248,8 @@ public:
     ColumnPtr & getVariantPtrByLocalDiscriminator(size_t discr) { return variants[discr]; }
     ColumnPtr & getVariantPtrByGlobalDiscriminator(size_t discr) { return variants[global_to_local_discriminators.at(discr)]; }
 
+    const NestedColumns & getVariants() const { return variants; }
+
     const IColumn & getLocalDiscriminatorsColumn() const { return *local_discriminators; }
     IColumn & getLocalDiscriminatorsColumn() { return *local_discriminators; }
 
@@ -282,7 +298,19 @@ public:
     void applyNullMap(const ColumnVector<UInt8>::Container & null_map);
     void applyNegatedNullMap(const ColumnVector<UInt8>::Container & null_map);
 
+    /// Extend the current column with new variants. Changes global discriminators of the current variants
+    /// to new ones according to the mapping, and adds new variants with new global discriminators.
+    /// This extension doesn't rewrite any data, it just adds new empty variants and modifies the global/local discriminators mapping.
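+    /// Example (illustrative): a column with variants (Int64, String) and global discriminators (0, 1),
+    /// extended with the mapping {0 -> 1, 1 -> 2} and one new variant (Float64 with new global discriminator 0),
+    /// becomes a Variant(Float64, Int64, String) column; no existing variant data is rewritten.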
+    void extend(const std::vector<Discriminator> & old_to_new_global_discriminators, std::vector<std::pair<MutableColumnPtr, Discriminator>> && new_variants_and_discriminators);
+
+    bool hasDynamicStructure() const override;
+    void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
+
 private:
+    void insertFromImpl(const IColumn & src_, size_t n, const std::vector<Discriminator> * global_discriminators_mapping);
+    void insertRangeFromImpl(const IColumn & src_, size_t start, size_t length, const std::vector<Discriminator> * global_discriminators_mapping);
+    void insertManyFromImpl(const IColumn & src_, size_t position, size_t length, const std::vector<Discriminator> * global_discriminators_mapping);
+
     void initIdentityGlobalToLocalDiscriminatorsMapping();
 
     template
diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp
index 18974e49760..479fd7de1bc 100644
--- a/src/Columns/IColumn.cpp
+++ b/src/Columns/IColumn.cpp
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <Columns/ColumnDynamic.h>
 #include
 #include
 #include
@@ -461,6 +462,7 @@
 template class IColumnHelper;
 template class IColumnHelper;
 template class IColumnHelper;
 template class IColumnHelper;
+template class IColumnHelper<ColumnDynamic>;
 template class IColumnHelper;
diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h
index cf2693e008c..b49d6f2a66d 100644
--- a/src/Columns/IColumn.h
+++ b/src/Columns/IColumn.h
@@ -534,6 +534,11 @@ public:
         return res;
     }
 
+    /// Checks if the column has dynamic subcolumns.
+    virtual bool hasDynamicStructure() const { return false; }
+    /// For columns with dynamic subcolumns this method takes the dynamic structure from the source columns
+    /// and creates the proper resulting dynamic structure in advance for a merge of these source columns.
+    virtual void takeDynamicStructureFromSourceColumns(const std::vector<Ptr> & /*source_columns*/) {}
 
     /** Some columns can contain another columns inside.
       * So, we have a tree of columns. But not all combinations are possible.
diff --git a/src/Columns/tests/gtest_column_dynamic.cpp b/src/Columns/tests/gtest_column_dynamic.cpp new file mode 100644 index 00000000000..a2862b09de1 --- /dev/null +++ b/src/Columns/tests/gtest_column_dynamic.cpp @@ -0,0 +1,652 @@ +#include +#include +#include +#include + +using namespace DB; + +TEST(ColumnDynamic, CreateEmpty) +{ + auto column = ColumnDynamic::create(255); + ASSERT_TRUE(column->empty()); + ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant()"); + ASSERT_TRUE(column->getVariantInfo().variant_names.empty()); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.empty()); +} + +TEST(ColumnDynamic, InsertDefault) +{ + auto column = ColumnDynamic::create(255); + column->insertDefault(); + ASSERT_TRUE(column->size() == 1); + ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant()"); + ASSERT_TRUE(column->getVariantInfo().variant_names.empty()); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.empty()); + ASSERT_TRUE(column->isNullAt(0)); + ASSERT_EQ((*column)[0], Field(Null())); +} + +TEST(ColumnDynamic, InsertFields) +{ + auto column = ColumnDynamic::create(255); + column->insert(Field(42)); + column->insert(Field(-42)); + column->insert(Field("str1")); + column->insert(Field(Null())); + column->insert(Field(42.42)); + column->insert(Field(43)); + column->insert(Field(-43)); + column->insert(Field("str2")); + column->insert(Field(Null())); + column->insert(Field(43.43)); + ASSERT_TRUE(column->size() == 10); + + ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant(Float64, Int8, String)"); + std::vector expected_names = {"Float64", "Int8", "String"}; + ASSERT_EQ(column->getVariantInfo().variant_names, expected_names); + std::unordered_map expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"String", 2}}; + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + +ColumnDynamic::MutablePtr getDynamicWithManyVariants(size_t num_variants, Field tuple_element = Field(42)) +{ + auto column = ColumnDynamic::create(255); + for (size_t i = 0; i != num_variants; ++i) + { + Tuple tuple; + for (size_t j = 0; j != i + 1; ++j) + tuple.push_back(tuple_element); + column->insert(tuple); + } + + return column; +} + +TEST(ColumnDynamic, InsertFieldsOverflow1) +{ + auto column = getDynamicWithManyVariants(253); + + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 253); + + column->insert(Field(42.42)); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + + column->insert(Field(42)); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + Field field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "42"); + + column->insert(Field(43)); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "43"); + + column->insert(Field("str1")); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + 
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "str1"); + + column->insert(Field(Array({Field(42), Field(43)}))); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)")); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "[42, 43]"); +} + +TEST(ColumnDynamic, InsertFieldsOverflow2) +{ + auto column = getDynamicWithManyVariants(254); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254); + + column->insert(Field("str1")); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + + column->insert(Field(42)); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + Field field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "42"); +} + +ColumnDynamic::MutablePtr getInsertFromColumn(size_t num = 1) +{ + auto column_from = ColumnDynamic::create(255); + for (size_t i = 0; i != num; ++i) + { + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + } + return column_from; +} + +void checkInsertFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector & expected_names, const std::unordered_map & expected_variant_name_to_discriminator) +{ + column_to->insertFrom(*column_from, 0); + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); + auto field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertFrom(*column_from, 1); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42.42); + + column_to->insertFrom(*column_from, 2); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + +TEST(ColumnDynamic, InsertFrom1) +{ + auto column_to = ColumnDynamic::create(255); + checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertFrom2) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str")); + + checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertFrom3) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str")); + column_to->insert(Array({Field(42)})); + + 
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}}); +} + +TEST(ColumnDynamic, InsertFromOverflow1) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertFrom(*column_from, 0); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + auto field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertFrom(*column_from, 1); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "42.42"); + + column_to->insertFrom(*column_from, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); +} + +TEST(ColumnDynamic, InsertFromOverflow2) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertFrom(*column_from, 0); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + auto field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertFrom(*column_from, 1); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "42.42"); +} + +void checkInsertManyFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector & expected_names, const std::unordered_map & expected_variant_name_to_discriminator) +{ + column_to->insertManyFrom(*column_from, 0, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); + auto field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertManyFrom(*column_from, 1, 2); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42.42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42.42); + + column_to->insertManyFrom(*column_from, 2, 2); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, "str"); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + 
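// Editorial note, not part of the original test file: the insertFrom/insertManyFrom
// helpers above all pin down the same overflow rule that getDynamicWithManyVariants
// sets up. A minimal sketch of that rule, assuming the 255-variant limit used
// throughout these tests:
//
//     auto column = getDynamicWithManyVariants(254); // 254 variants already present
//     column->insert(Field("str1"));                 // 255th slot: String variant is added
//     column->insert(Field(42));                     // no slot left for Int8 ...
//     Field f = (*column)[column->size() - 1];       // ... so the value is stored
//     assert(f == Field("42"));                      //     as the String "42"
//
// Once the variant limit is reached, values of any new type are converted to String
// instead of extending the Variant, which is why the overflow tests assert that
// "String" is present in variant_name_to_discriminator while the overflowed type
// ("Int8", "Float64", ...) is absent.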
+TEST(ColumnDynamic, InsertManyFrom1) +{ + auto column_to = ColumnDynamic::create(255); + checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertManyFrom2) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str")); + + checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertManyFrom3) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str")); + column_to->insert(Array({Field(42)})); + + checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}}); +} + +TEST(ColumnDynamic, InsertManyFromOverflow1) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertManyFrom(*column_from, 0, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + auto field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertManyFrom(*column_from, 1, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, "42.42"); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "42.42"); + + column_to->insertManyFrom(*column_from, 2, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, "str"); +} + +TEST(ColumnDynamic, InsertManyFromOverflow2) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertManyFrom(*column_from, 0, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + auto field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertManyFrom(*column_from, 1, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, "42.42"); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 
"42.42"); +} + +void checkInsertRangeFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector & expected_names, const std::unordered_map & expected_variant_name_to_discriminator) +{ + column_to->insertRangeFrom(*column_from, 0, 3); + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); + auto field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42.42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + + column_to->insertRangeFrom(*column_from, 3, 3); + field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42.42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + +TEST(ColumnDynamic, InsertRangeFrom1) +{ + auto column_to = ColumnDynamic::create(255); + checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertRangeFrom2) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str1")); + + checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertRangeFrom3) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str1")); + column_to->insert(Array({Field(42)})); + + checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}}); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow1) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertRangeFrom(*column_from, 0, 4); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 4]; + ASSERT_EQ(field, Field(42)); + field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field(43)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field("42.42")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("str")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow2) +{ + auto column_from = 
ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(42.42)); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertRangeFrom(*column_from, 0, 3); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field(42)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field(43)); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("42.42")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow3) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(42.42)); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insert(Field("Str")); + column_to->insertRangeFrom(*column_from, 0, 3); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field(42)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field(43)); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("42.42")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow4) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(254); + column_to->insertRangeFrom(*column_from, 0, 3); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field("42")); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field("42.42")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("str")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow5) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insert(Field("str")); + column_to->insertRangeFrom(*column_from, 0, 4); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 4]; + ASSERT_EQ(field, Field(42)); + field = (*column_to)[column_to->size() - 3]; + 
ASSERT_EQ(field, Field(43)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field("42.42")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("str")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow6) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(44)); + column_from->insert(Field(42.42)); + column_from->insert(Field(43.43)); + column_from->insert(Field("str")); + column_from->insert(Field(Array({Field(42)}))); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertRangeFrom(*column_from, 2, 5); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)")); + auto field = (*column_to)[column_to->size() - 5]; + + ASSERT_EQ(field, Field("44")); + field = (*column_to)[column_to->size() - 4]; + ASSERT_EQ(field, Field(42.42)); + field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field(43.43)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field("str")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("[42]")); +} + +TEST(ColumnDynamic, SerializeDeserializeFromArena1) +{ + auto column = ColumnDynamic::create(255); + column->insert(Field(42)); + column->insert(Field(42.42)); + column->insert(Field("str")); + column->insert(Field(Null())); + + Arena arena; + const char * pos = nullptr; + auto ref1 = column->serializeValueIntoArena(0, arena, pos); + column->serializeValueIntoArena(1, arena, pos); + column->serializeValueIntoArena(2, arena, pos); + column->serializeValueIntoArena(3, arena, pos); + pos = column->deserializeAndInsertFromArena(ref1.data); + pos = column->deserializeAndInsertFromArena(pos); + pos = column->deserializeAndInsertFromArena(pos); + column->deserializeAndInsertFromArena(pos); + + ASSERT_EQ((*column)[column->size() - 4], 42); + ASSERT_EQ((*column)[column->size() - 3], 42.42); + ASSERT_EQ((*column)[column->size() - 2], "str"); + ASSERT_EQ((*column)[column->size() - 1], Null()); +} + +TEST(ColumnDynamic, SerializeDeserializeFromArena2) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + column_from->insert(Field(Null())); + + Arena arena; + const char * pos = nullptr; + auto ref1 = column_from->serializeValueIntoArena(0, arena, pos); + column_from->serializeValueIntoArena(1, arena, pos); + column_from->serializeValueIntoArena(2, arena, pos); + column_from->serializeValueIntoArena(3, arena, pos); + + auto column_to = ColumnDynamic::create(255); + pos = column_to->deserializeAndInsertFromArena(ref1.data); + pos = column_to->deserializeAndInsertFromArena(pos); + pos = column_to->deserializeAndInsertFromArena(pos); + column_to->deserializeAndInsertFromArena(pos); + + ASSERT_EQ((*column_from)[column_from->size() - 4], 42); + ASSERT_EQ((*column_from)[column_from->size() - 3], 42.42); + ASSERT_EQ((*column_from)[column_from->size() - 2], "str"); + ASSERT_EQ((*column_from)[column_from->size() - 1], Null()); + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), "Variant(Float64, 
Int8, String)"); + std::vector expected_names = {"Float64", "Int8", "String"}; + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + std::unordered_map expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"String", 2}}; + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + +TEST(ColumnDynamic, SerializeDeserializeFromArenaOverflow) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + column_from->insert(Field(Null())); + + Arena arena; + const char * pos = nullptr; + auto ref1 = column_from->serializeValueIntoArena(0, arena, pos); + column_from->serializeValueIntoArena(1, arena, pos); + column_from->serializeValueIntoArena(2, arena, pos); + column_from->serializeValueIntoArena(3, arena, pos); + + auto column_to = getDynamicWithManyVariants(253); + pos = column_to->deserializeAndInsertFromArena(ref1.data); + pos = column_to->deserializeAndInsertFromArena(pos); + pos = column_to->deserializeAndInsertFromArena(pos); + column_to->deserializeAndInsertFromArena(pos); + + ASSERT_EQ((*column_from)[column_from->size() - 4], 42); + ASSERT_EQ((*column_from)[column_from->size() - 3], 42.42); + ASSERT_EQ((*column_from)[column_from->size() - 2], "str"); + ASSERT_EQ((*column_from)[column_from->size() - 1], Null()); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); +} + +TEST(ColumnDynamic, skipSerializedInArena) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + column_from->insert(Field(Null())); + + Arena arena; + const char * pos = nullptr; + auto ref1 = column_from->serializeValueIntoArena(0, arena, pos); + column_from->serializeValueIntoArena(1, arena, pos); + column_from->serializeValueIntoArena(2, arena, pos); + auto ref4 = column_from->serializeValueIntoArena(3, arena, pos); + + const char * end = ref4.data + ref4.size; + auto column_to = ColumnDynamic::create(255); + pos = column_to->skipSerializedInArena(ref1.data); + pos = column_to->skipSerializedInArena(pos); + pos = column_to->skipSerializedInArena(pos); + pos = column_to->skipSerializedInArena(pos); + + ASSERT_EQ(pos, end); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.empty()); + ASSERT_TRUE(column_to->getVariantInfo().variant_names.empty()); +} diff --git a/src/Common/AsyncLoader.cpp b/src/Common/AsyncLoader.cpp index 9607333b9f7..cfb273b9058 100644 --- a/src/Common/AsyncLoader.cpp +++ b/src/Common/AsyncLoader.cpp @@ -30,6 +30,7 @@ namespace ErrorCodes extern const int ASYNC_LOAD_CYCLE; extern const int ASYNC_LOAD_FAILED; extern const int ASYNC_LOAD_CANCELED; + extern const int ASYNC_LOAD_WAIT_FAILED; extern const int LOGICAL_ERROR; } @@ -433,7 +434,7 @@ void AsyncLoader::wait(const LoadJobPtr & job, bool no_throw) std::unique_lock job_lock{job->mutex}; wait(job_lock, job); if (!no_throw && job->load_exception) - std::rethrow_exception(job->load_exception); + throw Exception(ErrorCodes::ASYNC_LOAD_WAIT_FAILED, "Waited job failed: {}", getExceptionMessage(job->load_exception, /* with_stacktrace = */ false)); } void AsyncLoader::remove(const LoadJobSet & jobs) diff 
--git a/src/Common/CMakeLists.txt b/src/Common/CMakeLists.txt index b83c8431f0a..d4802c28f53 100644 --- a/src/Common/CMakeLists.txt +++ b/src/Common/CMakeLists.txt @@ -1,5 +1,3 @@ -add_subdirectory(StringUtils) - if (ENABLE_BENCHMARKS) add_subdirectory(benchmarks) endif() diff --git a/src/Common/Config/CMakeLists.txt b/src/Common/Config/CMakeLists.txt index 009e2456322..09095ef5acc 100644 --- a/src/Common/Config/CMakeLists.txt +++ b/src/Common/Config/CMakeLists.txt @@ -13,8 +13,6 @@ target_link_libraries(clickhouse_common_config clickhouse_common_zookeeper common Poco::XML - PRIVATE - string_utils ) add_library(clickhouse_common_config_no_zookeeper_log ${SRCS}) @@ -23,8 +21,6 @@ target_link_libraries(clickhouse_common_config_no_zookeeper_log clickhouse_common_zookeeper_no_log common Poco::XML - PRIVATE - string_utils ) if (TARGET ch_contrib::yaml_cpp) diff --git a/src/Common/Config/ConfigProcessor.cpp b/src/Common/Config/ConfigProcessor.cpp index 7930ef20153..c9832e8efd5 100644 --- a/src/Common/Config/ConfigProcessor.cpp +++ b/src/Common/Config/ConfigProcessor.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 21b4d114d79..b557edc3e12 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -224,6 +224,8 @@ M(PartsActive, "Active data part, used by current and upcoming SELECTs.") \ M(AttachedDatabase, "Active database, used by current and upcoming SELECTs.") \ M(AttachedTable, "Active table, used by current and upcoming SELECTs.") \ + M(AttachedView, "Active view, used by current and upcoming SELECTs.") \ + M(AttachedDictionary, "Active dictionary, used by current and upcoming SELECTs.") \ M(PartsOutdated, "Not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes.") \ M(PartsDeleting, "Not active data part with identity refcounter, it is deleting right now by a cleaner.") \ M(PartsDeleteOnDestroy, "Part was moved to another disk and should be deleted in own destructor.") \ diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 44c051401ef..d21eaedcc2a 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -600,6 +600,7 @@ M(719, QUERY_CACHE_USED_WITH_SYSTEM_TABLE) \ M(720, USER_EXPIRED) \ M(721, DEPRECATED_FUNCTION) \ + M(722, ASYNC_LOAD_WAIT_FAILED) \ \ M(900, DISTRIBUTED_CACHE_ERROR) \ M(901, CANNOT_USE_DISTRIBUTED_CACHE) \ diff --git a/src/Common/FailPoint.cpp b/src/Common/FailPoint.cpp index 79b0b6a1fe1..5454cba8e2e 100644 --- a/src/Common/FailPoint.cpp +++ b/src/Common/FailPoint.cpp @@ -40,6 +40,7 @@ static struct InitFiu REGULAR(use_delayed_remote_source) \ REGULAR(cluster_discovery_faults) \ REGULAR(replicated_sends_failpoint) \ + REGULAR(stripe_log_sink_write_fallpoint)\ ONCE(smt_commit_merge_mutate_zk_fail_after_op) \ ONCE(smt_commit_merge_mutate_zk_fail_before_op) \ ONCE(smt_commit_write_zk_fail_after_op) \ @@ -58,6 +59,7 @@ static struct InitFiu ONCE(execute_query_calling_empty_set_result_func_on_exception) \ ONCE(receive_timeout_on_table_status_response) + namespace FailPoints { #define M(NAME) extern const char(NAME)[] = #NAME ""; diff --git a/src/Common/FrequencyHolder.h b/src/Common/FrequencyHolder.h index 64207dc5423..d6c32c225bf 100644 --- a/src/Common/FrequencyHolder.h +++ b/src/Common/FrequencyHolder.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Common/HTTPHeaderFilter.cpp 
b/src/Common/HTTPHeaderFilter.cpp index 9ad8dd6fccf..fd02fe1ecef 100644 --- a/src/Common/HTTPHeaderFilter.cpp +++ b/src/Common/HTTPHeaderFilter.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index d3525010419..8c8e2163aad 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -360,6 +360,7 @@ The server successfully detected this situation and will download merged part fr M(QueryProfilerSignalOverruns, "Number of times we drop processing of a query profiler signal due to overrun plus the number of signals that OS has not delivered due to overrun.") \ M(QueryProfilerConcurrencyOverruns, "Number of times we drop processing of a query profiler signal due to too many concurrent query profilers in other threads, which may indicate overload.") \ M(QueryProfilerRuns, "Number of times QueryProfiler had been run.") \ + M(QueryProfilerErrors, "Invalid memory accesses during asynchronous stack unwinding.") \ \ M(CreatedLogEntryForMerge, "Successfully created log entry to merge parts in ReplicatedMergeTree.") \ M(NotCreatedLogEntryForMerge, "Log entry to merge parts in ReplicatedMergeTree is not created due to concurrent log update by another replica.") \ diff --git a/src/Common/ProxyConfigurationResolverProvider.cpp b/src/Common/ProxyConfigurationResolverProvider.cpp index d15b4d98615..1a6dc1090ee 100644 --- a/src/Common/ProxyConfigurationResolverProvider.cpp +++ b/src/Common/ProxyConfigurationResolverProvider.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Common/ProxyListConfigurationResolver.cpp b/src/Common/ProxyListConfigurationResolver.cpp index c9b8923929a..c527c89ea6b 100644 --- a/src/Common/ProxyListConfigurationResolver.cpp +++ b/src/Common/ProxyListConfigurationResolver.cpp @@ -1,6 +1,6 @@ #include -#include +#include #include #include diff --git a/src/Common/QueryProfiler.cpp b/src/Common/QueryProfiler.cpp index 4f72b4aba75..c3affbdd968 100644 --- a/src/Common/QueryProfiler.cpp +++ b/src/Common/QueryProfiler.cpp @@ -12,7 +12,6 @@ #include #include -#include namespace CurrentMetrics { @@ -25,6 +24,7 @@ namespace ProfileEvents extern const Event QueryProfilerSignalOverruns; extern const Event QueryProfilerConcurrencyOverruns; extern const Event QueryProfilerRuns; + extern const Event QueryProfilerErrors; } namespace DB @@ -84,11 +84,29 @@ namespace #endif const auto signal_context = *reinterpret_cast(context); - const StackTrace stack_trace(signal_context); + std::optional stack_trace; + +#if defined(SANITIZER) + constexpr bool sanitizer = true; +#else + constexpr bool sanitizer = false; +#endif + + asynchronous_stack_unwinding = true; + if (sanitizer || 0 == sigsetjmp(asynchronous_stack_unwinding_signal_jump_buffer, 1)) + { + stack_trace.emplace(signal_context); + } + else + { + ProfileEvents::incrementNoTrace(ProfileEvents::QueryProfilerErrors); + } + asynchronous_stack_unwinding = false; + + if (stack_trace) + TraceSender::send(trace_type, *stack_trace, {}); - TraceSender::send(trace_type, stack_trace, {}); ProfileEvents::incrementNoTrace(ProfileEvents::QueryProfilerRuns); - errno = saved_errno; } diff --git a/src/Common/RemoteHostFilter.cpp b/src/Common/RemoteHostFilter.cpp index 815be8902e6..fe7bf878596 100644 --- a/src/Common/RemoteHostFilter.cpp +++ b/src/Common/RemoteHostFilter.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git 
a/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp b/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp index b568b9245ba..01aa7df48d3 100644 --- a/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp +++ b/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/src/Common/SensitiveDataMasker.cpp b/src/Common/SensitiveDataMasker.cpp index 8c29b899841..a9f61a1c786 100644 --- a/src/Common/SensitiveDataMasker.cpp +++ b/src/Common/SensitiveDataMasker.cpp @@ -10,7 +10,7 @@ #include #include -#include +#include #include #ifndef NDEBUG diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp index 6e6f5b42b36..239e957bdfe 100644 --- a/src/Common/StackTrace.cpp +++ b/src/Common/StackTrace.cpp @@ -560,3 +560,7 @@ void StackTrace::dropCache() std::lock_guard lock{stacktrace_cache_mutex}; cacheInstance().clear(); } + + +thread_local bool asynchronous_stack_unwinding = false; +thread_local sigjmp_buf asynchronous_stack_unwinding_signal_jump_buffer; diff --git a/src/Common/StackTrace.h b/src/Common/StackTrace.h index 06028c77034..4ce9a9281f3 100644 --- a/src/Common/StackTrace.h +++ b/src/Common/StackTrace.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef OS_DARWIN // ucontext is not available without _XOPEN_SOURCE @@ -87,3 +88,8 @@ protected: }; std::string signalToErrorMessage(int sig, const siginfo_t & info, const ucontext_t & context); + +/// Special handling for errors during asynchronous stack unwinding, +/// which is used in the Query Profiler. +extern thread_local bool asynchronous_stack_unwinding; +extern thread_local sigjmp_buf asynchronous_stack_unwinding_signal_jump_buffer; diff --git a/src/Common/StringSearcher.h b/src/Common/StringSearcher.h index b3065354f65..d7e706fcd80 100644 --- a/src/Common/StringSearcher.h +++ b/src/Common/StringSearcher.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Common/StringUtils.cpp b/src/Common/StringUtils.cpp new file mode 100644 index 00000000000..18577e64c01 --- /dev/null +++ b/src/Common/StringUtils.cpp @@ -0,0 +1,87 @@ +#include + +#include + +#if USE_MULTITARGET_CODE +#include +#endif + + +namespace impl +{ + +bool startsWith(const std::string & s, const char * prefix, size_t prefix_size) +{ + return s.size() >= prefix_size && 0 == memcmp(s.data(), prefix, prefix_size); +} + +bool endsWith(const std::string & s, const char * suffix, size_t suffix_size) +{ + return s.size() >= suffix_size && 0 == memcmp(s.data() + s.size() - suffix_size, suffix, suffix_size); +} + +} + +DECLARE_DEFAULT_CODE( +bool isAllASCII(const UInt8 * data, size_t size) +{ + UInt8 mask = 0; + for (size_t i = 0; i < size; ++i) + mask |= data[i]; + + return !(mask & 0x80); +}) + +DECLARE_SSE42_SPECIFIC_CODE( +/// Copied from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h +bool isAllASCII(const UInt8 * data, size_t size) +{ + __m128i masks = _mm_setzero_si128(); + + size_t i = 0; + for (; i + 16 <= size; i += 16) + { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(data + i)); + masks = _mm_or_si128(masks, bytes); + } + int mask = _mm_movemask_epi8(masks); + + UInt8 tail_mask = 0; + for (; i < size; i++) + tail_mask |= data[i]; + + mask |= (tail_mask & 0x80); + return !mask; +}) + +DECLARE_AVX2_SPECIFIC_CODE( +bool isAllASCII(const UInt8 * data, size_t size) +{ + __m256i masks = _mm256_setzero_si256(); + + size_t i = 0; + for (; i + 32 <= size; i += 32) + { + __m256i bytes =
_mm256_loadu_si256(reinterpret_cast(data + i)); + masks = _mm256_or_si256(masks, bytes); + } + int mask = _mm256_movemask_epi8(masks); + + UInt8 tail_mask = 0; + for (; i < size; i++) + tail_mask |= data[i]; + + mask |= (tail_mask & 0x80); + return !mask; +}) + +bool isAllASCII(const UInt8 * data, size_t size) +{ +#if USE_MULTITARGET_CODE + if (isArchSupported(DB::TargetArch::AVX2)) + return TargetSpecific::AVX2::isAllASCII(data, size); + if (isArchSupported(DB::TargetArch::SSE42)) + return TargetSpecific::SSE42::isAllASCII(data, size); +#endif + return TargetSpecific::Default::isAllASCII(data, size); +} diff --git a/src/Common/StringUtils/StringUtils.h b/src/Common/StringUtils.h similarity index 95% rename from src/Common/StringUtils/StringUtils.h rename to src/Common/StringUtils.h index 4958ecc9476..fe5fc3c058f 100644 --- a/src/Common/StringUtils/StringUtils.h +++ b/src/Common/StringUtils.h @@ -7,8 +7,10 @@ #include #include +#include -namespace detail + +namespace impl { bool startsWith(const std::string & s, const char * prefix, size_t prefix_size); bool endsWith(const std::string & s, const char * suffix, size_t suffix_size); @@ -17,12 +19,12 @@ namespace detail inline bool startsWith(const std::string & s, const std::string & prefix) { - return detail::startsWith(s, prefix.data(), prefix.size()); + return impl::startsWith(s, prefix.data(), prefix.size()); } inline bool endsWith(const std::string & s, const std::string & suffix) { - return detail::endsWith(s, suffix.data(), suffix.size()); + return impl::endsWith(s, suffix.data(), suffix.size()); } @@ -30,12 +32,12 @@ inline bool endsWith(const std::string & s, const std::string & suffix) /// string that is known at compile time. inline bool startsWith(const std::string & s, const char * prefix) { - return detail::startsWith(s, prefix, strlen(prefix)); + return impl::startsWith(s, prefix, strlen(prefix)); } inline bool endsWith(const std::string & s, const char * suffix) { - return detail::endsWith(s, suffix, strlen(suffix)); + return impl::endsWith(s, suffix, strlen(suffix)); } /// Given an integer, return the adequate suffix for @@ -315,6 +317,9 @@ inline void trim(std::string & str, char c = ' ') trimLeft(str, c); } +/// If all characters in the string are ASCII, return true +bool isAllASCII(const UInt8 * data, size_t size); + constexpr bool containsGlobs(const std::string & str) { return str.find_first_of("*?{") != std::string::npos; diff --git a/src/Common/StringUtils/CMakeLists.txt b/src/Common/StringUtils/CMakeLists.txt deleted file mode 100644 index 57c196d335c..00000000000 --- a/src/Common/StringUtils/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -# These files are located in separate library, because they are used by separate products -# in places when no dependency on whole "dbms" library is possible. - -include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") - -add_headers_and_sources(clickhouse_common_stringutils .) 
- -add_library(string_utils ${clickhouse_common_stringutils_headers} ${clickhouse_common_stringutils_sources}) diff --git a/src/Common/StringUtils/StringUtils.cpp b/src/Common/StringUtils/StringUtils.cpp deleted file mode 100644 index 8a0b25dbdad..00000000000 --- a/src/Common/StringUtils/StringUtils.cpp +++ /dev/null @@ -1,17 +0,0 @@ -#include "StringUtils.h" - - -namespace detail -{ - -bool startsWith(const std::string & s, const char * prefix, size_t prefix_size) -{ - return s.size() >= prefix_size && 0 == memcmp(s.data(), prefix, prefix_size); -} - -bool endsWith(const std::string & s, const char * suffix, size_t suffix_size) -{ - return s.size() >= suffix_size && 0 == memcmp(s.data() + s.size() - suffix_size, suffix, suffix_size); -} - -} diff --git a/src/Common/TLDListsHolder.cpp b/src/Common/TLDListsHolder.cpp index c3991b86983..413d221090e 100644 --- a/src/Common/TLDListsHolder.cpp +++ b/src/Common/TLDListsHolder.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Common/UTF8Helpers.cpp b/src/Common/UTF8Helpers.cpp index b8f5c000e75..8c8c8e8327b 100644 --- a/src/Common/UTF8Helpers.cpp +++ b/src/Common/UTF8Helpers.cpp @@ -1,14 +1,9 @@ -#include -#include #include +#include #include #include -#if USE_MULTITARGET_CODE -#include -#endif - namespace DB { namespace UTF8 @@ -208,7 +203,6 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l } - size_t computeWidth(const UInt8 * data, size_t size, size_t prefix) noexcept { return computeWidthImpl(data, size, prefix, 0); @@ -219,71 +213,5 @@ size_t computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, s return computeWidthImpl(data, size, prefix, limit); } - -DECLARE_DEFAULT_CODE( -bool isAllASCII(const UInt8 * data, size_t size) -{ - UInt8 mask = 0; - for (size_t i = 0; i < size; ++i) - mask |= data[i]; - - return !(mask & 0x80); -}) - -DECLARE_SSE42_SPECIFIC_CODE( -/// Copy from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h -bool isAllASCII(const UInt8 * data, size_t size) -{ - __m128i masks = _mm_setzero_si128(); - - size_t i = 0; - for (; i + 16 <= size; i += 16) - { - __m128i bytes = _mm_loadu_si128(reinterpret_cast(data + i)); - masks = _mm_or_si128(masks, bytes); - } - int mask = _mm_movemask_epi8(masks); - - UInt8 tail_mask = 0; - for (; i < size; i++) - tail_mask |= data[i]; - - mask |= (tail_mask & 0x80); - return !mask; -}) - -DECLARE_AVX2_SPECIFIC_CODE( -bool isAllASCII(const UInt8 * data, size_t size) -{ - __m256i masks = _mm256_setzero_si256(); - - size_t i = 0; - for (; i + 32 <= size; i += 32) - { - __m256i bytes = _mm256_loadu_si256(reinterpret_cast(data + i)); - masks = _mm256_or_si256(masks, bytes); - } - int mask = _mm256_movemask_epi8(masks); - - UInt8 tail_mask = 0; - for (; i < size; i++) - tail_mask |= data[i]; - - mask |= (tail_mask & 0x80); - return !mask; -}) - -bool isAllASCII(const UInt8* data, size_t size) -{ -#if USE_MULTITARGET_CODE - if (isArchSupported(TargetArch::AVX2)) - return TargetSpecific::AVX2::isAllASCII(data, size); - if (isArchSupported(TargetArch::SSE42)) - return TargetSpecific::SSE42::isAllASCII(data, size); -#endif - return TargetSpecific::Default::isAllASCII(data, size); -} - - } } diff --git a/src/Common/UTF8Helpers.h b/src/Common/UTF8Helpers.h index 933b62c7b63..b09d92bd731 100644 --- a/src/Common/UTF8Helpers.h +++ b/src/Common/UTF8Helpers.h @@ -136,10 +136,6 @@ size_t computeWidth(const UInt8 * data, size_t size, size_t prefix = 0) noexcept */ size_t 
computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept; - -/// If all the characters in the string are ASCII, return true. -bool isAllASCII(const UInt8* data, size_t size); - } } diff --git a/src/Common/Volnitsky.h b/src/Common/Volnitsky.h index 6513bdb8bc3..3a148983790 100644 --- a/src/Common/Volnitsky.h +++ b/src/Common/Volnitsky.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Common/ZooKeeper/CMakeLists.txt b/src/Common/ZooKeeper/CMakeLists.txt index aa06375bd6a..8b6c420e565 100644 --- a/src/Common/ZooKeeper/CMakeLists.txt +++ b/src/Common/ZooKeeper/CMakeLists.txt @@ -12,8 +12,6 @@ target_link_libraries (clickhouse_common_zookeeper clickhouse_common_io clickhouse_compression common - PRIVATE - string_utils ) # for examples -- no logging (to avoid extra dependencies) @@ -23,8 +21,6 @@ target_link_libraries (clickhouse_common_zookeeper_no_log clickhouse_common_io clickhouse_compression common - PRIVATE - string_utils ) if (ENABLE_EXAMPLES) add_subdirectory(examples) diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp index d02ad4523ad..51ad2e7c830 100644 --- a/src/Common/ZooKeeper/TestKeeper.cpp +++ b/src/Common/ZooKeeper/TestKeeper.cpp @@ -1,7 +1,7 @@ #include "Common/ZooKeeper/IKeeper.h" #include #include -#include +#include #include #include diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index c62c5d0c143..be490d0bfc1 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -19,7 +19,7 @@ #include #include "Common/ZooKeeper/IKeeper.h" #include -#include +#include #include #include diff --git a/src/Common/ZooKeeper/ZooKeeperArgs.cpp b/src/Common/ZooKeeper/ZooKeeperArgs.cpp index 40bd9d79a03..a581b6a7f38 100644 --- a/src/Common/ZooKeeper/ZooKeeperArgs.cpp +++ b/src/Common/ZooKeeper/ZooKeeperArgs.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Common/ZooKeeper/examples/CMakeLists.txt b/src/Common/ZooKeeper/examples/CMakeLists.txt index c5a93f2701e..678b302a512 100644 --- a/src/Common/ZooKeeper/examples/CMakeLists.txt +++ b/src/Common/ZooKeeper/examples/CMakeLists.txt @@ -7,7 +7,6 @@ clickhouse_add_executable(zkutil_test_commands_new_lib zkutil_test_commands_new_ target_link_libraries(zkutil_test_commands_new_lib PRIVATE clickhouse_common_zookeeper_no_log clickhouse_compression - string_utils dbms) clickhouse_add_executable(zkutil_test_async zkutil_test_async.cpp) diff --git a/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp b/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp index 414006d48a4..25d66b94b46 100644 --- a/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp +++ b/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Common/escapeForFileName.cpp b/src/Common/escapeForFileName.cpp index a1f9bff28d0..2fe23245f49 100644 --- a/src/Common/escapeForFileName.cpp +++ b/src/Common/escapeForFileName.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include namespace DB diff --git a/src/Common/examples/CMakeLists.txt b/src/Common/examples/CMakeLists.txt index 90a238c9800..73e1396fb35 100644 --- a/src/Common/examples/CMakeLists.txt +++ b/src/Common/examples/CMakeLists.txt @@ -87,3 +87,6 @@ if (ENABLE_SSL) clickhouse_add_executable (encrypt_decrypt encrypt_decrypt.cpp) 
target_link_libraries (encrypt_decrypt PRIVATE dbms) endif() + +clickhouse_add_executable (check_pointer_valid check_pointer_valid.cpp) +target_link_libraries (check_pointer_valid PRIVATE clickhouse_common_io) diff --git a/src/Common/examples/check_pointer_valid.cpp b/src/Common/examples/check_pointer_valid.cpp new file mode 100644 index 00000000000..e59ebf43327 --- /dev/null +++ b/src/Common/examples/check_pointer_valid.cpp @@ -0,0 +1,53 @@ +#include +#include +#include +#include + + +/// This example demonstrates how it is possible to check whether a pointer to memory is readable, using a signal handler. + +thread_local bool checking_pointer = false; +thread_local jmp_buf signal_jump_buffer; + + +void signalHandler(int sig, siginfo_t *, void *) +{ + if (checking_pointer && sig == SIGSEGV) + siglongjmp(signal_jump_buffer, 1); +} + +bool isPointerValid(const void * ptr) +{ + checking_pointer = true; + if (0 == sigsetjmp(signal_jump_buffer, 1)) + { + char res; + memcpy(&res, ptr, 1); + __asm__ __volatile__("" :: "r"(res) : "memory"); + checking_pointer = false; + return true; + } + else + { + checking_pointer = false; + return false; + } +} + +int main(int, char **) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = signalHandler; + sa.sa_flags = SA_SIGINFO; + + if (sigemptyset(&sa.sa_mask) + || sigaddset(&sa.sa_mask, SIGSEGV) + || sigaction(SIGSEGV, &sa, nullptr)) + return 1; + + std::cerr << isPointerValid(reinterpret_cast(0x123456789)) << "\n"; + std::cerr << isPointerValid(&sa) << "\n"; + + return 0; +} diff --git a/src/Common/format.h b/src/Common/format.h index 27018f64064..3dbb88b4089 100644 --- a/src/Common/format.h +++ b/src/Common/format.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Common/formatIPv6.cpp b/src/Common/formatIPv6.cpp index 86e33beb7c3..341b3715d30 100644 --- a/src/Common/formatIPv6.cpp +++ b/src/Common/formatIPv6.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/src/Common/formatIPv6.h b/src/Common/formatIPv6.h index 3451eda6b3c..bb83e0381ef 100644 --- a/src/Common/formatIPv6.h +++ b/src/Common/formatIPv6.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include constexpr size_t IPV4_BINARY_LENGTH = 4; constexpr size_t IPV6_BINARY_LENGTH = 16; diff --git a/src/Common/getMappedArea.cpp b/src/Common/getMappedArea.cpp index 4f40c604c6a..79191d68fb9 100644 --- a/src/Common/getMappedArea.cpp +++ b/src/Common/getMappedArea.cpp @@ -3,7 +3,7 @@ #if defined(OS_LINUX) -#include +#include #include #include #include diff --git a/src/Common/getMultipleKeysFromConfig.cpp b/src/Common/getMultipleKeysFromConfig.cpp index 7cf49fcc34d..6d6589a45a3 100644 --- a/src/Common/getMultipleKeysFromConfig.cpp +++ b/src/Common/getMultipleKeysFromConfig.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Common/tests/gtest_async_loader.cpp b/src/Common/tests/gtest_async_loader.cpp index 174997ddf14..9fda58b9008 100644 --- a/src/Common/tests/gtest_async_loader.cpp +++ b/src/Common/tests/gtest_async_loader.cpp @@ -35,6 +35,7 @@ namespace DB::ErrorCodes extern const int ASYNC_LOAD_CYCLE; extern const int ASYNC_LOAD_FAILED; extern const int ASYNC_LOAD_CANCELED; + extern const int ASYNC_LOAD_WAIT_FAILED; } struct Initializer { @@ -262,7 +263,8 @@ TEST(AsyncLoader, CancelPendingJob) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); +
ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } } @@ -288,7 +290,8 @@ TEST(AsyncLoader, CancelPendingTask) } catch (Exception & e) { - ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } try @@ -298,7 +301,8 @@ TEST(AsyncLoader, CancelPendingTask) } catch (Exception & e) { - ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } } @@ -325,7 +329,8 @@ TEST(AsyncLoader, CancelPendingDependency) } catch (Exception & e) { - ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } try @@ -335,7 +340,8 @@ TEST(AsyncLoader, CancelPendingDependency) } catch (Exception & e) { - ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } } @@ -451,8 +457,9 @@ TEST(AsyncLoader, JobFailure) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_FAILED); - ASSERT_TRUE(e.message().find(error_message) != String::npos); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains(error_message)); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_FAILED")); } } @@ -489,8 +496,9 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); - ASSERT_TRUE(e.message().find(error_message) != String::npos); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); + ASSERT_TRUE(e.message().contains(error_message)); } try { @@ -499,8 +507,9 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); - ASSERT_TRUE(e.message().find(error_message) != String::npos); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); + ASSERT_TRUE(e.message().contains(error_message)); } } @@ -531,7 +540,8 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } try { @@ -540,7 +550,8 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } } diff --git a/src/Common/tryGetFileNameByFileDescriptor.cpp b/src/Common/tryGetFileNameByFileDescriptor.cpp new file mode 100644 index 00000000000..47e81050388 --- /dev/null +++ b/src/Common/tryGetFileNameByFileDescriptor.cpp @@ -0,0 +1,33 @@ +#include + +#ifdef OS_LINUX +# include +#elif defined(OS_DARWIN) +# include +#endif + +#include + + +namespace DB +{ +std::optional tryGetFileNameFromFileDescriptor(int fd) +{ +#ifdef OS_LINUX + std::string proc_path = fmt::format("/proc/self/fd/{}", fd); + char file_path[PATH_MAX] = {'\0'}; + if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1) + return file_path; + return std::nullopt; +#elif defined(OS_DARWIN) + char 
file_path[PATH_MAX] = {'\0'}; + if (fcntl(fd, F_GETPATH, file_path) != -1) + return file_path; + return std::nullopt; +#else + (void)fd; + return std::nullopt; +#endif +} + +} diff --git a/src/Common/tryGetFileNameByFileDescriptor.h b/src/Common/tryGetFileNameByFileDescriptor.h new file mode 100644 index 00000000000..c38ccb4f851 --- /dev/null +++ b/src/Common/tryGetFileNameByFileDescriptor.h @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + +namespace DB +{ +/// Supports only Linux/macOS. On other platforms, returns nullopt. +std::optional tryGetFileNameFromFileDescriptor(int fd); +} diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 25254e10441..28902bc8591 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include "Coordination/KeeperFeatureFlags.h" #include diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index df5c2e9e0c8..9bcd0608bf7 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 524d6ec07c2..ea0b155b22d 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -97,6 +97,8 @@ namespace DB M(UInt64, max_table_size_to_drop, 50000000000lu, "If size of a table is greater than this value (in bytes) than table could not be dropped with any DROP query.", 0) \ M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \ M(UInt64, max_table_num_to_warn, 5000lu, "If number of tables is greater than this value, server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_view_num_to_warn, 10000lu, "If the number of views is greater than this value, the server will create a warning that will be displayed to the user.", 0) \ + M(UInt64, max_dictionary_num_to_warn, 1000lu, "If the number of dictionaries is greater than this value, the server will create a warning that will be displayed to the user.", 0) \ M(UInt64, max_database_num_to_warn, 1000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ M(UInt64, max_part_num_to_warn, 100000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ M(UInt64, concurrent_threads_soft_limit_num, 0, "Sets how many concurrent thread can be allocated before applying CPU pressure.
Zero means unlimited.", 0) \ diff --git a/src/Core/Settings.h index c555b5cb208..e8782e37a4a 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -80,6 +80,7 @@ class IColumn; M(UInt64, connections_with_failover_max_tries, 3, "The maximum number of attempts to connect to replicas.", 0) \ M(UInt64, s3_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to S3 (some implementations does not supports variable size parts).", 0) \ M(UInt64, azure_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to Azure blob storage.", 0) \ + M(UInt64, azure_max_blocks_in_multipart_upload, 50000, "Maximum number of blocks in multipart upload for Azure.", 0) \ M(UInt64, s3_min_upload_part_size, 16*1024*1024, "The minimum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, azure_min_upload_part_size, 16*1024*1024, "The minimum size of part to upload during multipart upload to Azure blob storage.", 0) \ @@ -880,6 +881,7 @@ class IColumn; M(Bool, traverse_shadow_remote_data_paths, false, "Traverse shadow directory when query system.remote_data_paths", 0) \ M(Bool, geo_distance_returns_float64_on_float64_arguments, true, "If all four arguments to `geoDistance`, `greatCircleDistance`, `greatCircleAngle` functions are Float64, return Float64 and use double precision for internal calculations. In previous ClickHouse versions, the functions always returned Float32.", 0) \ M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \ + M(Bool, cast_string_to_dynamic_use_inference, false, "Use type inference during String to Dynamic conversion", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \ @@ -888,6 +890,7 @@ class IColumn; M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ + M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes.
Helps to prevent memory overflows in ANN search indexes.", 0) \ @@ -1078,6 +1081,7 @@ class IColumn; M(Bool, input_format_csv_skip_trailing_empty_lines, false, "Skip trailing empty lines in CSV format", 0) \ M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \ M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \ + M(Bool, input_format_tsv_crlf_end_of_line, false, "If set to true, the file function will read TSV format with \\r\\n line endings instead of \\n.", 0) \ \ M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \ \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 65c8934cb23..be63dcb0b03 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -87,12 +87,16 @@ static std::map sett { {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"}, {"allow_experimental_join_condition", false, false, "Support join with unequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."}, + {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, {"output_format_parquet_use_custom_encoder", false, true, "Enable custom Parquet encoder."}, {"cross_join_min_rows_to_compress", 0, 10000000, "A new setting."}, {"cross_join_min_bytes_to_compress", 0, 1_GiB, "A new setting."}, {"http_max_chunk_size", 0, 0, "Internal limitation"}, {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, {"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"}, + {"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"}, + {"allow_experimental_dynamic_type", false, false, "Add new experimental Dynamic type"}, + {"azure_max_blocks_in_multipart_upload", 50000, 50000, "Maximum number of blocks in multipart upload for Azure."}, }}, {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"}, {"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"}, diff --git a/src/Core/SettingsQuirks.cpp b/src/Core/SettingsQuirks.cpp index 5e7d02dc448..5541cc19653 100644 --- a/src/Core/SettingsQuirks.cpp +++ b/src/Core/SettingsQuirks.cpp @@ -92,7 +92,7 @@ void applySettingsQuirks(Settings & settings, LoggerPtr log) void doSettingsSanityCheckClamp(Settings & current_settings, LoggerPtr log) { - auto getCurrentValue = [&current_settings](const std::string_view name) -> Field + auto get_current_value = [&current_settings](const std::string_view name) -> Field { Field current_value; bool has_current_value = current_settings.tryGet(name, current_value); @@ -100,7 +100,7 @@ void doSettingsSanityCheckClamp(Settings & current_settings, LoggerPtr log) return current_value; }; - UInt64 max_threads = getCurrentValue("max_threads").get<UInt64>(); + UInt64 max_threads = get_current_value("max_threads").get<UInt64>(); UInt64 max_threads_max_value = 256 * getNumberOfPhysicalCPUCores(); if (max_threads > max_threads_max_value) { @@ -109,7 +109,7 @@ void doSettingsSanityCheckClamp(Settings & current_settings, LoggerPtr log) current_settings.set("max_threads", max_threads_max_value); } - 
constexpr UInt64 max_sane_block_rows_size = 4294967296; // 2^32 + static constexpr UInt64 max_sane_block_rows_size = 4294967296; // 2^32 std::unordered_set<String> block_rows_settings{ "max_block_size", "max_insert_block_size", @@ -120,13 +120,21 @@ void doSettingsSanityCheckClamp(Settings & current_settings, LoggerPtr log) "input_format_parquet_max_block_size"}; for (auto const & setting : block_rows_settings) { - auto block_size = getCurrentValue(setting).get<UInt64>(); - if (block_size > max_sane_block_rows_size) + if (auto block_size = get_current_value(setting).get<UInt64>(); + block_size > max_sane_block_rows_size) { if (log) LOG_WARNING(log, "Sanity check: '{}' value is too high ({}). Reduced to {}", setting, block_size, max_sane_block_rows_size); current_settings.set(setting, max_sane_block_rows_size); } } + + if (auto max_block_size = get_current_value("max_block_size").get<UInt64>(); max_block_size == 0) + { + if (log) + LOG_WARNING(log, "Sanity check: 'max_block_size' cannot be 0. Set to default value {}", DEFAULT_BLOCK_SIZE); + current_settings.set("max_block_size", DEFAULT_BLOCK_SIZE); + } } + } diff --git a/src/Core/TypeId.h b/src/Core/TypeId.h index e6c5454b3bb..e4f850cbb59 100644 --- a/src/Core/TypeId.h +++ b/src/Core/TypeId.h @@ -50,6 +50,7 @@ enum class TypeIndex : uint8_t IPv6, JSONPaths, Variant, + Dynamic }; /** diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index fdffca9b4ef..c6c82df2a72 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -144,6 +144,9 @@ static std::atomic_flag fatal_error_printed; */ static void signalHandler(int sig, siginfo_t * info, void * context) { + if (asynchronous_stack_unwinding && sig == SIGSEGV) + siglongjmp(asynchronous_stack_unwinding_signal_jump_buffer, 1); + DENY_ALLOCATIONS_IN_SCOPE; auto saved_errno = errno; /// We must restore previous value of errno in signal handler. @@ -185,6 +188,7 @@ static void signalHandler(int sig, siginfo_t * info, void * context) errno = saved_errno; } + static bool getenvBool(const char * name) { bool res = false; diff --git a/src/DataTypes/DataTypeArray.cpp b/src/DataTypes/DataTypeArray.cpp index 6e5760933eb..806a1577a21 100644 --- a/src/DataTypes/DataTypeArray.cpp +++ b/src/DataTypes/DataTypeArray.cpp @@ -75,6 +75,27 @@ void DataTypeArray::forEachChild(const ChildCallback & callback) const nested->forEachChild(callback); } +std::unique_ptr<IDataType::SubstreamData> DataTypeArray::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const +{ + auto nested_type = assert_cast<const DataTypeArray &>(*data.type).nested; + auto nested_data = std::make_unique<IDataType::SubstreamData>(nested_type->getDefaultSerialization()); + nested_data->type = nested_type; + nested_data->column = data.column ? assert_cast<const ColumnArray &>(*data.column).getDataPtr() : nullptr; + + auto nested_subcolumn_data = nested_type->getSubcolumnData(subcolumn_name, *nested_data, throw_if_null); + if (!nested_subcolumn_data) + return nullptr; + + auto creator = SerializationArray::SubcolumnCreator(data.column ? 
assert_cast(*data.column).getOffsetsPtr() : nullptr); + auto res = std::make_unique(); + res->serialization = creator.create(nested_subcolumn_data->serialization); + res->type = creator.create(nested_subcolumn_data->type); + if (data.column) + res->column = creator.create(nested_subcolumn_data->column); + + return res; +} + static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.size() != 1) diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 4423f137e1a..b242d871c36 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -55,7 +55,12 @@ public: bool textCanContainOnlyValidUTF8() const override { return nested->textCanContainOnlyValidUTF8(); } bool isComparable() const override { return nested->isComparable(); } bool canBeComparedWithCollation() const override { return nested->canBeComparedWithCollation(); } - bool hasDynamicSubcolumns() const override { return nested->hasDynamicSubcolumns(); } + bool hasDynamicSubcolumnsDeprecated() const override { return nested->hasDynamicSubcolumnsDeprecated(); } + + /// Array column doesn't have subcolumns by itself but allows to read subcolumns of nested column. + /// If nested column has dynamic subcolumns, Array of this type should also be able to read these dynamic subcolumns. + bool hasDynamicSubcolumnsData() const override { return nested->hasDynamicSubcolumnsData(); } + std::unique_ptr getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const override; bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { diff --git a/src/DataTypes/DataTypeDynamic.cpp b/src/DataTypes/DataTypeDynamic.cpp new file mode 100644 index 00000000000..c920e69c13b --- /dev/null +++ b/src/DataTypes/DataTypeDynamic.cpp @@ -0,0 +1,144 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int UNEXPECTED_AST_STRUCTURE; +} + +DataTypeDynamic::DataTypeDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) +{ +} + +MutableColumnPtr DataTypeDynamic::createColumn() const +{ + return ColumnDynamic::create(max_dynamic_types); +} + +String DataTypeDynamic::doGetName() const +{ + if (max_dynamic_types == DEFAULT_MAX_DYNAMIC_TYPES) + return "Dynamic"; + return "Dynamic(max_types=" + toString(max_dynamic_types) + ")"; +} + +Field DataTypeDynamic::getDefault() const +{ + return Field(Null()); +} + +SerializationPtr DataTypeDynamic::doGetDefaultSerialization() const +{ + return std::make_shared(max_dynamic_types); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + return std::make_shared(); + + if (arguments->children.size() > 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Dynamic data type can have only one optional argument - the maximum number of dynamic types in a form 'Dynamic(max_types=N)"); + + + const auto * argument = arguments->children[0]->as(); + if (!argument || argument->name != "equals") + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Dynamic data type argument should be in a form 'max_types=N'"); + + auto identifier_name = argument->arguments->children[0]->as()->name(); + if (identifier_name != "max_types") + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, 
"Unexpected identifier: {}. Dynamic data type argument should be in a form 'max_types=N'", identifier_name); + + auto * literal = argument->arguments->children[1]->as(); + + if (!literal || literal->value.getType() != Field::Types::UInt64 || literal->value.get() == 0 || literal->value.get() > 255) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "'max_types' argument for Dynamic type should be a positive integer between 1 and 255"); + + return std::make_shared(literal->value.get()); +} + +void registerDataTypeDynamic(DataTypeFactory & factory) +{ + factory.registerDataType("Dynamic", create); +} + +std::unique_ptr DataTypeDynamic::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const +{ + auto [subcolumn_type_name, subcolumn_nested_name] = Nested::splitName(subcolumn_name); + /// Check if requested subcolumn is a valid data type. + auto subcolumn_type = DataTypeFactory::instance().tryGet(String(subcolumn_type_name)); + if (!subcolumn_type) + { + if (throw_if_null) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Dynamic type doesn't have subcolumn '{}'", subcolumn_type_name); + return nullptr; + } + + std::unique_ptr res = std::make_unique(subcolumn_type->getDefaultSerialization()); + res->type = subcolumn_type; + std::optional discriminator; + if (data.column) + { + /// If column was provided, we should extract subcolumn from Dynamic column. + const auto & dynamic_column = assert_cast(*data.column); + const auto & variant_info = dynamic_column.getVariantInfo(); + /// Check if provided Dynamic column has subcolumn of this type. + auto it = variant_info.variant_name_to_discriminator.find(subcolumn_type->getName()); + if (it != variant_info.variant_name_to_discriminator.end()) + { + discriminator = it->second; + res->column = dynamic_column.getVariantColumn().getVariantPtrByGlobalDiscriminator(*discriminator); + } + } + + /// Extract nested subcolumn of requested dynamic subcolumn if needed. + if (!subcolumn_nested_name.empty()) + { + res = getSubcolumnData(subcolumn_nested_name, *res, throw_if_null); + if (!res) + return nullptr; + } + + res->serialization = std::make_shared(res->serialization, subcolumn_type->getName()); + res->type = makeNullableOrLowCardinalityNullableSafe(res->type); + if (data.column) + { + if (discriminator) + { + /// Provided Dynamic column has subcolumn of this type, we should use VariantSubcolumnCreator to + /// create full subcolumn from variant according to discriminators. + const auto & variant_column = assert_cast(*data.column).getVariantColumn(); + auto creator = SerializationVariantElement::VariantSubcolumnCreator(variant_column.getLocalDiscriminatorsPtr(), "", *discriminator, variant_column.localDiscriminatorByGlobal(*discriminator)); + res->column = creator.create(res->column); + } + else + { + /// Provided Dynamic column doesn't have subcolumn of this type, just create column filled with default values. + auto column = res->type->createColumn(); + column->insertManyDefaults(data.column->size()); + res->column = std::move(column); + } + } + + return res; +} + +} diff --git a/src/DataTypes/DataTypeDynamic.h b/src/DataTypes/DataTypeDynamic.h new file mode 100644 index 00000000000..d5e4c5261ce --- /dev/null +++ b/src/DataTypes/DataTypeDynamic.h @@ -0,0 +1,55 @@ +#pragma once + +#include + +namespace DB +{ + +/// Dynamic type allows to store values of any type inside it and to read +/// subcolumns with any type without knowing all of them in advance. 
+class DataTypeDynamic final : public IDataType +{ +public: + static constexpr bool is_parametric = true; + + explicit DataTypeDynamic(size_t max_dynamic_types_ = DEFAULT_MAX_DYNAMIC_TYPES); + + TypeIndex getTypeId() const override { return TypeIndex::Dynamic; } + const char * getFamilyName() const override { return "Dynamic"; } + + bool isParametric() const override { return true; } + bool canBeInsideNullable() const override { return false; } + bool supportsSparseSerialization() const override { return false; } + bool canBeInsideSparseColumns() const override { return false; } + bool isComparable() const override { return true; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + /// 2 Dynamic types with different max_dynamic_types parameters are considered as different. + bool equals(const IDataType & rhs) const override + { + if (const auto * rhs_dynamic_type = typeid_cast(&rhs)) + return max_dynamic_types == rhs_dynamic_type->max_dynamic_types; + return false; + } + + bool haveSubtypes() const override { return false; } + + bool hasDynamicSubcolumnsData() const override { return true; } + std::unique_ptr getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const override; + + size_t getMaxDynamicTypes() const { return max_dynamic_types; } + +private: + static constexpr size_t DEFAULT_MAX_DYNAMIC_TYPES = 32; + + SerializationPtr doGetDefaultSerialization() const override; + String doGetName() const override; + + size_t max_dynamic_types; +}; + +} + diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 844384f3c95..8c8f9999ada 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -292,6 +292,7 @@ DataTypeFactory::DataTypeFactory() registerDataTypeMap(*this); registerDataTypeObject(*this); registerDataTypeVariant(*this); + registerDataTypeDynamic(*this); } DataTypeFactory & DataTypeFactory::instance() diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index 4727cb3ae5c..86e0203358d 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -100,5 +100,6 @@ void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory); void registerDataTypeObject(DataTypeFactory & factory); void registerDataTypeVariant(DataTypeFactory & factory); +void registerDataTypeDynamic(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index d9f70e1659d..4d7ab63f966 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 7281cca1bb1..4866c3e78cc 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -42,7 +42,7 @@ public: bool isComparable() const override { return key_type->isComparable() && value_type->isComparable(); } bool isParametric() const override { return true; } bool haveSubtypes() const override { return true; } - bool hasDynamicSubcolumns() const override { return nested->hasDynamicSubcolumns(); } + bool hasDynamicSubcolumnsDeprecated() const override { return nested->hasDynamicSubcolumnsDeprecated(); } const DataTypePtr & getKeyType() const { return key_type; } const 
DataTypePtr & getValueType() const { return value_type; } diff --git a/src/DataTypes/DataTypeObject.h b/src/DataTypes/DataTypeObject.h index 937a9091371..c610a1a8ba4 100644 --- a/src/DataTypes/DataTypeObject.h +++ b/src/DataTypes/DataTypeObject.h @@ -36,7 +36,7 @@ public: bool haveSubtypes() const override { return false; } bool equals(const IDataType & rhs) const override; bool isParametric() const override { return true; } - bool hasDynamicSubcolumns() const override { return true; } + bool hasDynamicSubcolumnsDeprecated() const override { return true; } SerializationPtr doGetDefaultSerialization() const override; diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index 5bbd79160d4..6e32ed586ea 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -291,9 +291,9 @@ bool DataTypeTuple::haveMaximumSizeOfValue() const return std::all_of(elems.begin(), elems.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); }); } -bool DataTypeTuple::hasDynamicSubcolumns() const +bool DataTypeTuple::hasDynamicSubcolumnsDeprecated() const { - return std::any_of(elems.begin(), elems.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); }); + return std::any_of(elems.begin(), elems.end(), [](auto && elem) { return elem->hasDynamicSubcolumnsDeprecated(); }); } bool DataTypeTuple::isComparable() const diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index 15561fe4286..fd00fce5a17 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -52,7 +52,7 @@ public: bool isComparable() const override; bool textCanContainOnlyValidUTF8() const override; bool haveMaximumSizeOfValue() const override; - bool hasDynamicSubcolumns() const override; + bool hasDynamicSubcolumnsDeprecated() const override; size_t getMaximumSizeOfValueInMemory() const override; size_t getSizeOfValueInMemory() const override; diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 5989da90937..8a10ca7d06d 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -18,7 +17,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int EMPTY_DATA_PASSED; } @@ -33,6 +31,9 @@ DataTypeVariant::DataTypeVariant(const DataTypes & variants_) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nullable/LowCardinality(Nullable) types are not allowed inside Variant type"); if (type->getTypeId() == TypeIndex::Variant) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nested Variant types are not allowed"); + if (type->getTypeId() == TypeIndex::Dynamic) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dynamic type is not allowed inside Variant type"); + /// Don't use Nothing type as a variant. 
if (!isNothing(type)) name_to_type[type->getName()] = type; @@ -42,9 +43,6 @@ DataTypeVariant::DataTypeVariant(const DataTypes & variants_) for (const auto & [_, type] : name_to_type) variants.push_back(type); - if (variants.empty()) - throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); - if (variants.size() > ColumnVariant::MAX_NESTED_COLUMNS) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Variant type with more than {} nested types is not allowed", ColumnVariant::MAX_NESTED_COLUMNS); } @@ -113,9 +111,16 @@ bool DataTypeVariant::equals(const IDataType & rhs) const return false; for (size_t i = 0; i < size; ++i) + { if (!variants[i]->equals(*rhs_variant.variants[i])) return false; + /// The same data types with different custom names are considered different. + /// For example, UInt8 and Bool. + if ((variants[i]->hasCustomName() || rhs_variant.variants[i]->hasCustomName()) && variants[i]->getName() != rhs_variant.variants[i]->getName()) + return false; + } + return true; } @@ -129,17 +134,15 @@ bool DataTypeVariant::haveMaximumSizeOfValue() const return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); }); } -bool DataTypeVariant::hasDynamicSubcolumns() const +bool DataTypeVariant::hasDynamicSubcolumnsDeprecated() const { - return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); }); + return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumnsDeprecated(); }); } -std::optional<ColumnVariant::Discriminator> DataTypeVariant::tryGetVariantDiscriminator(const IDataType & type) const +std::optional<ColumnVariant::Discriminator> DataTypeVariant::tryGetVariantDiscriminator(const String & type_name) const { - String type_name = type.getName(); for (size_t i = 0; i != variants.size(); ++i) { - /// We don't use equals here, because it doesn't respect custom type names. if (variants[i]->getName() == type_name) return i; } @@ -183,7 +186,7 @@ void DataTypeVariant::forEachChild(const DB::IDataType::ChildCallback & callback static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.empty()) - throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); + return std::make_shared<DataTypeVariant>(DataTypes{}); DataTypes nested_types; nested_types.reserve(arguments->children.size()); diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h index ab471d37b2f..5ba1c3b40be 100644 --- a/src/DataTypes/DataTypeVariant.h +++ b/src/DataTypes/DataTypeVariant.h @@ -46,14 +46,14 @@ public: bool haveSubtypes() const override { return true; } bool textCanContainOnlyValidUTF8() const override; bool haveMaximumSizeOfValue() const override; - bool hasDynamicSubcolumns() const override; + bool hasDynamicSubcolumnsDeprecated() const override; size_t getMaximumSizeOfValueInMemory() const override; const DataTypePtr & getVariant(size_t i) const { return variants[i]; } const DataTypes & getVariants() const { return variants; } /// Check if Variant has provided type in the list of variants and return its discriminator. 
- std::optional tryGetVariantDiscriminator(const IDataType & type) const; + std::optional tryGetVariantDiscriminator(const String & type_name) const; void forEachChild(const ChildCallback & callback) const override; diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 344b81be960..1c9715bbf53 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -101,14 +101,12 @@ void IDataType::forEachSubcolumn( data.serialization->enumerateStreams(settings, callback_with_data, data); } -template -Ptr IDataType::getForSubcolumn( +std::unique_ptr IDataType::getSubcolumnData( std::string_view subcolumn_name, const SubstreamData & data, - Ptr SubstreamData::*member, - bool throw_if_null) const + bool throw_if_null) { - Ptr res; + std::unique_ptr res; ISerialization::StreamCallback callback_with_data = [&](const auto & subpath) { @@ -120,7 +118,29 @@ Ptr IDataType::getForSubcolumn( auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); /// Create data from path only if it's requested subcolumn. if (name == subcolumn_name) - res = ISerialization::createFromPath(subpath, prefix_len).*member; + { + res = std::make_unique(ISerialization::createFromPath(subpath, prefix_len)); + } + /// Check if this subcolumn is a prefix of requested subcolumn and it can create dynamic subcolumns. + else if (subcolumn_name.starts_with(name + ".") && subpath[i].data.type && subpath[i].data.type->hasDynamicSubcolumnsData()) + { + auto dynamic_subcolumn_name = subcolumn_name.substr(name.size() + 1); + auto dynamic_subcolumn_data = subpath[i].data.type->getDynamicSubcolumnData(dynamic_subcolumn_name, subpath[i].data, false); + if (dynamic_subcolumn_data) + { + /// Create requested subcolumn using dynamic subcolumn data. + auto tmp_subpath = subpath; + if (tmp_subpath[i].creator) + { + dynamic_subcolumn_data->type = tmp_subpath[i].creator->create(dynamic_subcolumn_data->type); + dynamic_subcolumn_data->column = tmp_subpath[i].creator->create(dynamic_subcolumn_data->column); + dynamic_subcolumn_data->serialization = tmp_subpath[i].creator->create(dynamic_subcolumn_data->serialization); + } + + tmp_subpath[i].data = *dynamic_subcolumn_data; + res = std::make_unique(ISerialization::createFromPath(tmp_subpath, prefix_len)); + } + } } subpath[i].visited = true; } @@ -130,8 +150,11 @@ Ptr IDataType::getForSubcolumn( settings.position_independent_encoding = false; data.serialization->enumerateStreams(settings, callback_with_data, data); + if (!res && data.type->hasDynamicSubcolumnsData()) + return data.type->getDynamicSubcolumnData(subcolumn_name, data, throw_if_null); + if (!res && throw_if_null) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, data.type->getName()); return res; } @@ -141,34 +164,51 @@ bool IDataType::hasSubcolumn(std::string_view subcolumn_name) const return tryGetSubcolumnType(subcolumn_name) != nullptr; } +bool IDataType::hasDynamicSubcolumns() const +{ + if (hasDynamicSubcolumnsData()) + return true; + + bool has_dynamic_subcolumns = false; + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); + auto callback = [&](const SubstreamPath &, const String &, const SubstreamData & subcolumn_data) + { + has_dynamic_subcolumns |= subcolumn_data.type->hasDynamicSubcolumnsData(); + }; + forEachSubcolumn(callback, data); + return has_dynamic_subcolumns; +} + DataTypePtr 
IDataType::tryGetSubcolumnType(std::string_view subcolumn_name) const { auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, false); + auto subcolumn_data = getSubcolumnData(subcolumn_name, data, false); + return subcolumn_data ? subcolumn_data->type : nullptr; } DataTypePtr IDataType::getSubcolumnType(std::string_view subcolumn_name) const { auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, true); + return getSubcolumnData(subcolumn_name, data, true)->type; } ColumnPtr IDataType::tryGetSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const { - auto data = SubstreamData(getDefaultSerialization()).withColumn(column); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, false); + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()).withColumn(column); + auto subcolumn_data = getSubcolumnData(subcolumn_name, data, false); + return subcolumn_data ? subcolumn_data->column : nullptr; } ColumnPtr IDataType::getSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const { - auto data = SubstreamData(getDefaultSerialization()).withColumn(column); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, true); + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()).withColumn(column); + return getSubcolumnData(subcolumn_name, data, true)->column; } SerializationPtr IDataType::getSubcolumnSerialization(std::string_view subcolumn_name, const SerializationPtr & serialization) const { - auto data = SubstreamData(serialization); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::serialization, true); + auto data = SubstreamData(serialization).withType(getPtr()); + return getSubcolumnData(subcolumn_name, data, true)->serialization; } Names IDataType::getSubcolumnNames() const @@ -323,6 +363,7 @@ bool isMap(TYPE data_type) {return WhichDataType(data_type).isMap(); } \ bool isInterval(TYPE data_type) {return WhichDataType(data_type).isInterval(); } \ bool isObject(TYPE data_type) { return WhichDataType(data_type).isObject(); } \ bool isVariant(TYPE data_type) { return WhichDataType(data_type).isVariant(); } \ +bool isDynamic(TYPE data_type) { return WhichDataType(data_type).isDynamic(); } \ bool isNothing(TYPE data_type) { return WhichDataType(data_type).isNothing(); } \ \ bool isColumnedAsNumber(TYPE data_type) \ diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index eaf798a3017..46c30240ef8 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -11,6 +11,12 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + + class ReadBuffer; class WriteBuffer; @@ -311,8 +317,13 @@ public: /// Strings, Numbers, Date, DateTime, Nullable virtual bool canBeInsideLowCardinality() const { return false; } - /// Object, Array(Object), Tuple(..., Object, ...) - virtual bool hasDynamicSubcolumns() const { return false; } + /// Checks for deprecated Object type usage recursively: Object, Array(Object), Tuple(..., Object, ...) + virtual bool hasDynamicSubcolumnsDeprecated() const { return false; } + + /// Checks if column has dynamic subcolumns. + virtual bool hasDynamicSubcolumns() const; + /// Checks if column can create dynamic subcolumns data and getDynamicSubcolumnData can be called. 
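+ /// (Illustrative note, inferred from this patch rather than quoted from it: Dynamic and Array(Dynamic)
+ /// return true here, since Array forwards to its nested type, while Tuple(Dynamic) returns false and is
+ /// covered by hasDynamicSubcolumns() above, which enumerates the subcolumns and finds the Dynamic element.)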
+ virtual bool hasDynamicSubcolumnsData() const { return false; } /// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column. static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint); @@ -329,16 +340,25 @@ protected: mutable SerializationPtr custom_serialization; public: + bool hasCustomName() const { return static_cast(custom_name.get()); } const IDataTypeCustomName * getCustomName() const { return custom_name.get(); } const ISerialization * getCustomSerialization() const { return custom_serialization.get(); } -private: - template - Ptr getForSubcolumn( +protected: + static std::unique_ptr getSubcolumnData( std::string_view subcolumn_name, const SubstreamData & data, - Ptr SubstreamData::*member, - bool throw_if_null) const; + bool throw_if_null); + + virtual std::unique_ptr getDynamicSubcolumnData( + std::string_view /*subcolumn_name*/, + const SubstreamData & /*data*/, + bool throw_if_null) const + { + if (throw_if_null) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDynamicSubcolumnData() is not implemented for type {}", getName()); + return nullptr; + } }; @@ -423,6 +443,7 @@ struct WhichDataType constexpr bool isLowCardinality() const { return idx == TypeIndex::LowCardinality; } constexpr bool isVariant() const { return idx == TypeIndex::Variant; } + constexpr bool isDynamic() const { return idx == TypeIndex::Dynamic; } }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) @@ -483,6 +504,7 @@ bool isMap(TYPE data_type); \ bool isInterval(TYPE data_type); \ bool isObject(TYPE data_type); \ bool isVariant(TYPE data_type); \ +bool isDynamic(TYPE data_type); \ bool isNothing(TYPE data_type); \ \ bool isColumnedAsNumber(TYPE data_type); \ diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index 6a56f885503..650559d21d9 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include "Columns/IColumn.h" #include diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 7c671fcf44f..6993523bcb7 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -177,7 +177,7 @@ static std::pair convertObjectColumnToTuple( static std::pair recursivlyConvertDynamicColumnToTuple( const ColumnPtr & column, const DataTypePtr & type) { - if (!type->hasDynamicSubcolumns()) + if (!type->hasDynamicSubcolumnsDeprecated()) return {column, type}; if (const auto * type_object = typeid_cast(type.get())) @@ -243,7 +243,7 @@ void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & sto { for (auto & column : block) { - if (!column.type->hasDynamicSubcolumns()) + if (!column.type->hasDynamicSubcolumnsDeprecated()) continue; std::tie(column.column, column.type) @@ -417,7 +417,7 @@ static DataTypePtr getLeastCommonTypeForTuple( static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl( const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths) { - if (!type_in_storage->hasDynamicSubcolumns()) + if (!type_in_storage->hasDynamicSubcolumnsDeprecated()) return type_in_storage; if (isObject(type_in_storage)) @@ -459,7 +459,7 @@ DataTypePtr getLeastCommonTypeForDynamicColumns( DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage) { - if (!type_in_storage->hasDynamicSubcolumns()) + if (!type_in_storage->hasDynamicSubcolumnsDeprecated()) return 
type_in_storage; if (isObject(type_in_storage)) @@ -494,7 +494,7 @@ bool hasDynamicSubcolumns(const ColumnsDescription & columns) return std::any_of(columns.begin(), columns.end(), [](const auto & column) { - return column.type->hasDynamicSubcolumns(); + return column.type->hasDynamicSubcolumnsDeprecated(); }); } @@ -1065,7 +1065,7 @@ Field FieldVisitorFoldDimension::operator()(const Null & x) const void setAllObjectsToDummyTupleType(NamesAndTypesList & columns) { for (auto & column : columns) - if (column.type->hasDynamicSubcolumns()) + if (column.type->hasDynamicSubcolumnsDeprecated()) column.type = createConcreteEmptyDynamicColumn(column.type); } diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 3e3b1b96740..6599d8adef1 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -194,7 +194,7 @@ ColumnsDescription getConcreteObjectColumns( /// dummy column will be removed. for (const auto & column : storage_columns) { - if (column.type->hasDynamicSubcolumns()) + if (column.type->hasDynamicSubcolumnsDeprecated()) types_in_entries[column.name].push_back(createConcreteEmptyDynamicColumn(column.type)); } @@ -204,7 +204,7 @@ ColumnsDescription getConcreteObjectColumns( for (const auto & column : entry_columns) { auto storage_column = storage_columns.tryGetPhysical(column.name); - if (storage_column && storage_column->type->hasDynamicSubcolumns()) + if (storage_column && storage_column->type->hasDynamicSubcolumnsDeprecated()) types_in_entries[column.name].push_back(column.type); } } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index a3a28f8091c..dbe27a5f3f6 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -196,6 +196,8 @@ String getNameForSubstreamPath( stream_name += ".variant_offsets"; else if (it->type == Substream::VariantElement) stream_name += "." + it->variant_element_name; + else if (it->type == SubstreamType::DynamicStructure) + stream_name += ".dynamic_structure"; } return stream_name; @@ -271,6 +273,23 @@ ColumnPtr ISerialization::getFromSubstreamsCache(SubstreamsCache * cache, const return it == cache->end() ? nullptr : it->second; } +void ISerialization::addToSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path, DeserializeBinaryBulkStatePtr state) +{ + if (!cache || path.empty()) + return; + + cache->emplace(getSubcolumnNameForStream(path), state); +} + +ISerialization::DeserializeBinaryBulkStatePtr ISerialization::getFromSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path) +{ + if (!cache || path.empty()) + return nullptr; + + auto it = cache->find(getSubcolumnNameForStream(path)); + return it == cache->end() ? 
nullptr : it->second; +} + bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) { for (const auto & elem : path) diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index ebaa26d19a6..914ff9cf4a2 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -99,6 +99,19 @@ public: using SubcolumnCreatorPtr = std::shared_ptr<const ISubcolumnCreator>; + struct SerializeBinaryBulkState + { + virtual ~SerializeBinaryBulkState() = default; + }; + + struct DeserializeBinaryBulkState + { + virtual ~DeserializeBinaryBulkState() = default; + }; + + using SerializeBinaryBulkStatePtr = std::shared_ptr<SerializeBinaryBulkState>; + using DeserializeBinaryBulkStatePtr = std::shared_ptr<DeserializeBinaryBulkState>; + struct SubstreamData { SubstreamData() = default; @@ -125,10 +138,22 @@ public: return *this; } + SubstreamData & withDeserializeState(DeserializeBinaryBulkStatePtr deserialize_state_) + { + deserialize_state = std::move(deserialize_state_); + return *this; + } + SerializationPtr serialization; DataTypePtr type; ColumnPtr column; SerializationInfoPtr serialization_info; + + /// For types with dynamic subcolumns deserialize state contains information + /// about current dynamic structure. And this information can be useful + /// when we call enumerateStreams after deserializeBinaryBulkStatePrefix + /// to enumerate dynamic streams. + DeserializeBinaryBulkStatePtr deserialize_state; }; struct Substream @@ -160,6 +185,9 @@ public: VariantElements, VariantElement, + DynamicData, + DynamicStructure, + Regular, }; @@ -218,19 +246,6 @@ public: using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>; using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>; - struct SerializeBinaryBulkState - { - virtual ~SerializeBinaryBulkState() = default; - }; - - struct DeserializeBinaryBulkState - { - virtual ~DeserializeBinaryBulkState() = default; - }; - - using SerializeBinaryBulkStatePtr = std::shared_ptr<SerializeBinaryBulkState>; - using DeserializeBinaryBulkStatePtr = std::shared_ptr<DeserializeBinaryBulkState>; - struct SerializeBinaryBulkSettings { OutputStreamGetter getter; @@ -240,6 +255,14 @@ public: bool low_cardinality_use_single_dictionary_for_part = true; bool position_independent_encoding = true; + + enum class DynamicStatisticsMode + { + NONE, /// Don't write statistics. + PREFIX, /// Write statistics in prefix. + SUFFIX, /// Write statistics in suffix. + }; + DynamicStatisticsMode dynamic_write_statistics = DynamicStatisticsMode::NONE; }; struct DeserializeBinaryBulkSettings @@ -256,6 +279,8 @@ public: /// If not zero, may be used to avoid reallocations while reading column of String type. double avg_value_size_hint = 0; + + bool dynamic_read_statistics = false; }; /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark. @@ -270,10 +295,13 @@ public: SerializeBinaryBulkSettings & /*settings*/, SerializeBinaryBulkStatePtr & /*state*/) const {} + using SubstreamsDeserializeStatesCache = std::unordered_map<String, DeserializeBinaryBulkStatePtr>; + /// Call before deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr. virtual void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & /*settings*/, - DeserializeBinaryBulkStatePtr & /*state*/) const {} + DeserializeBinaryBulkStatePtr & /*state*/, + SubstreamsDeserializeStatesCache * /*cache*/) const {} /** 'offset' and 'limit' are used to specify range. * limit = 0 - means no limit. 
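A rough caller-side sketch of the extended prefix API (illustrative only, not code from this patch; stream_for is a hypothetical helper that resolves a substream path to its ReadBuffer):

    ISerialization::DeserializeBinaryBulkSettings settings;
    settings.getter = [&](const ISerialization::SubstreamPath & path) -> ReadBuffer * { return stream_for(path); }; /// stream_for is assumed, not from this patch
    settings.dynamic_read_statistics = true; /// also read the per-variant sizes of Dynamic columns
    ISerialization::SubstreamsDeserializeStatesCache cache;
    ISerialization::DeserializeBinaryBulkStatePtr state;
    serialization->deserializeBinaryBulkStatePrefix(settings, state, &cache);
    /// Another serialization that shares the same DynamicStructure substream can now reuse
    /// the cached state instead of re-reading the structure prefix.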
@@ -393,6 +421,9 @@ public: static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column); static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path); + static void addToSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path, DeserializeBinaryBulkStatePtr state); + static DeserializeBinaryBulkStatePtr getFromSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path); + static bool isSpecialCompressionAllowed(const SubstreamPath & path); static size_t getArrayLevel(const SubstreamPath & path); diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp index bab7c1d4cf2..55f7641e058 100644 --- a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -146,10 +146,10 @@ void SerializationAggregateFunction::serializeTextEscaped(const IColumn & column } -void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String s; - readEscapedString(s, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(s, istr) : readEscapedString(s, istr); deserializeFromString(function, column, s, version); } diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index e8aab615849..ac7b8f4d084 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -254,7 +254,8 @@ void SerializationArray::enumerateStreams( auto next_data = SubstreamData(nested) .withType(type_array ? type_array->getNestedType() : nullptr) .withColumn(column_array ? 
column_array->getDataPtr() : nullptr) - .withSerializationInfo(data.serialization_info); + .withSerializationInfo(data.serialization_info) + .withDeserializeState(data.deserialize_state); nested->enumerateStreams(settings, callback, next_data); settings.path.pop_back(); @@ -284,10 +285,11 @@ void SerializationArray::serializeBinaryBulkStateSuffix( void SerializationArray::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { settings.path.push_back(Substream::ArrayElements); - nested->deserializeBinaryBulkStatePrefix(settings, state); + nested->deserializeBinaryBulkStatePrefix(settings, state, cache); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index 82f5e8bce45..c3353f0c251 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -55,7 +55,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, @@ -71,7 +72,6 @@ public: DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; -private: struct SubcolumnCreator : public ISubcolumnCreator { const ColumnPtr offsets; diff --git a/src/DataTypes/Serializations/SerializationBool.cpp b/src/DataTypes/Serializations/SerializationBool.cpp index b63f25ddc35..a71c8a91cef 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -242,8 +242,10 @@ void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & is { if (istr.eof()) throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); - - deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); + if (settings.tsv.crlf_end_of_line_input) + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || *buf.position() == '\r'; }); + else + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } bool SerializationBool::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp index 938fd050173..1ba16f8492e 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp @@ -75,7 +75,7 @@ void SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column, void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String str; - readEscapedString(str, istr); + settings.tsv.crlf_end_of_line_input ? 
readEscapedStringCRLF(str, istr) : readEscapedString(str, istr); deserializeFromString(*this, column, str, settings); } diff --git a/src/DataTypes/Serializations/SerializationDynamic.cpp b/src/DataTypes/Serializations/SerializationDynamic.cpp new file mode 100644 index 00000000000..6351ff0ca0b --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDynamic.cpp @@ -0,0 +1,644 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; + extern const int LOGICAL_ERROR; +} + +struct SerializeBinaryBulkStateDynamic : public ISerialization::SerializeBinaryBulkState +{ + SerializationDynamic::DynamicStructureSerializationVersion structure_version; + DataTypePtr variant_type; + Names variant_names; + SerializationPtr variant_serialization; + ISerialization::SerializeBinaryBulkStatePtr variant_state; + + /// Variants statistics. Map (Variant name) -> (Variant size). + ColumnDynamic::Statistics statistics = { .source = ColumnDynamic::Statistics::Source::READ, .data = {} }; + + explicit SerializeBinaryBulkStateDynamic(UInt64 structure_version_) : structure_version(structure_version_) {} +}; + +struct DeserializeBinaryBulkStateDynamic : public ISerialization::DeserializeBinaryBulkState +{ + SerializationPtr variant_serialization; + ISerialization::DeserializeBinaryBulkStatePtr variant_state; + ISerialization::DeserializeBinaryBulkStatePtr structure_state; +}; + +void SerializationDynamic::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + settings.path.push_back(Substream::DynamicStructure); + callback(settings.path); + settings.path.pop_back(); + + const auto * column_dynamic = data.column ? &assert_cast(*data.column) : nullptr; + const auto * deserialize_state = data.deserialize_state ? checkAndGetState(data.deserialize_state) : nullptr; + + /// If column is nullptr and we don't have deserialize state yet, nothing to enumerate as we don't have any variants. + if (!column_dynamic && !deserialize_state) + return; + + const auto & variant_type = column_dynamic ? column_dynamic->getVariantInfo().variant_type : checkAndGetState(deserialize_state->structure_state)->variant_type; + auto variant_serialization = variant_type->getDefaultSerialization(); + + settings.path.push_back(Substream::DynamicData); + auto variant_data = SubstreamData(variant_serialization) + .withType(variant_type) + .withColumn(column_dynamic ? column_dynamic->getVariantColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info) + .withDeserializeState(deserialize_state ? 
deserialize_state->variant_state : nullptr); + settings.path.back().data = variant_data; + variant_serialization->enumerateStreams(settings, callback, variant_data); + settings.path.pop_back(); +} + +SerializationDynamic::DynamicStructureSerializationVersion::DynamicStructureSerializationVersion(UInt64 version) : value(static_cast(version)) +{ + checkVersion(version); +} + +void SerializationDynamic::DynamicStructureSerializationVersion::checkVersion(UInt64 version) +{ + if (version != VariantTypeName) + throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid version for Dynamic structure serialization."); +} + +void SerializationDynamic::serializeBinaryBulkStatePrefix( + const DB::IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const auto & column_dynamic = assert_cast(column); + const auto & variant_info = column_dynamic.getVariantInfo(); + + settings.path.push_back(Substream::DynamicStructure); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state prefix"); + + /// Write structure serialization version. + UInt64 structure_version = DynamicStructureSerializationVersion::Value::VariantTypeName; + writeBinaryLittleEndian(structure_version, *stream); + auto dynamic_state = std::make_shared(structure_version); + + dynamic_state->variant_type = variant_info.variant_type; + dynamic_state->variant_names = variant_info.variant_names; + const auto & variant_column = column_dynamic.getVariantColumn(); + + /// Write internal Variant type name. + writeStringBinary(dynamic_state->variant_type->getName(), *stream); + + /// Write statistics in prefix if needed. + if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::PREFIX) + { + const auto & statistics = column_dynamic.getStatistics(); + for (size_t i = 0; i != variant_info.variant_names.size(); ++i) + { + size_t size = 0; + /// Check if we can use statistics stored in the column. There are 2 possible sources + /// of this statistics: + /// - statistics calculated during merge of some data parts (Statistics::Source::MERGE) + /// - statistics read from the data part during deserialization of Dynamic column (Statistics::Source::READ). + /// We can rely only on statistics calculated during the merge, because column with statistics that was read + /// during deserialization from some data part could be filtered/limited/transformed/etc and so the statistics can be outdated. + if (!statistics.data.empty() && statistics.source == ColumnDynamic::Statistics::Source::MERGE) + size = statistics.data.at(variant_info.variant_names[i]); + /// Otherwise we can use only variant sizes from current column. 
+ else + size = variant_column.getVariantByGlobalDiscriminator(i).size(); + writeVarUInt(size, *stream); + } + } + + dynamic_state->variant_serialization = dynamic_state->variant_type->getDefaultSerialization(); + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->serializeBinaryBulkStatePrefix(variant_column, settings, dynamic_state->variant_state); + settings.path.pop_back(); + + state = std::move(dynamic_state); +} + +void SerializationDynamic::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const +{ + DeserializeBinaryBulkStatePtr structure_state = deserializeDynamicStructureStatePrefix(settings, cache); + if (!structure_state) + return; + + auto dynamic_state = std::make_shared(); + dynamic_state->structure_state = structure_state; + dynamic_state->variant_serialization = checkAndGetState(structure_state)->variant_type->getDefaultSerialization(); + + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_state->variant_state, cache); + settings.path.pop_back(); + + state = std::move(dynamic_state); +} + +ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeDynamicStructureStatePrefix( + DeserializeBinaryBulkSettings & settings, SubstreamsDeserializeStatesCache * cache) +{ + settings.path.push_back(Substream::DynamicStructure); + + DeserializeBinaryBulkStatePtr state = nullptr; + if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path)) + { + state = cached_state; + } + else if (auto * structure_stream = settings.getter(settings.path)) + { + /// Read structure serialization version. + UInt64 structure_version; + readBinaryLittleEndian(structure_version, *structure_stream); + auto structure_state = std::make_shared(structure_version); + /// Read internal Variant type name. + String data_type_name; + readStringBinary(data_type_name, *structure_stream); + structure_state->variant_type = DataTypeFactory::instance().get(data_type_name); + const auto * variant_type = typeid_cast(structure_state->variant_type.get()); + if (!variant_type) + throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect type of Dynamic nested column, expected Variant, got {}", structure_state->variant_type->getName()); + + /// Read statistics. + if (settings.dynamic_read_statistics) + { + const auto & variants = variant_type->getVariants(); + size_t variant_size; + for (const auto & variant : variants) + { + readVarUInt(variant_size, *structure_stream); + structure_state->statistics.data[variant->getName()] = variant_size; + } + } + + state = structure_state; + addToSubstreamsDeserializeStatesCache(cache, settings.path, state); + } + + settings.path.pop_back(); + return state; +} + +void SerializationDynamic::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const +{ + auto * dynamic_state = checkAndGetState(state); + settings.path.push_back(Substream::DynamicStructure); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state prefix"); + + /// Write statistics in suffix if needed. 
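+ /// (Illustrative sketch of the DynamicStructure substream layout, inferred from this serialization code:
+ /// [UInt64 structure version][String internal Variant type name][VarUInt size per variant, in variant_names order],
+ /// where the sizes are written either in the prefix above or here in the suffix, while the variant data itself
+ /// goes to the separate DynamicData substreams.)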
+ if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::SUFFIX) + { + for (const auto & variant_name : dynamic_state->variant_names) + writeVarUInt(dynamic_state->statistics.data[variant_name], *stream); + } + + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->serializeBinaryBulkStateSuffix(settings, dynamic_state->variant_state); + settings.path.pop_back(); +} + +void SerializationDynamic::serializeBinaryBulkWithMultipleStreams( + const DB::IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const auto & column_dynamic = assert_cast(column); + auto * dynamic_state = checkAndGetState(state); + const auto & variant_info = column_dynamic.getVariantInfo(); + const auto * variant_column = &column_dynamic.getVariantColumn(); + + if (!variant_info.variant_type->equals(*dynamic_state->variant_type)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of internal columns of Dynamic. Expected: {}, Got: {}", dynamic_state->variant_type->getName(), variant_info.variant_type->getName()); + + settings.path.push_back(Substream::DynamicData); + assert_cast(*dynamic_state->variant_serialization) + .serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(*variant_column, offset, limit, settings, dynamic_state->variant_state, dynamic_state->statistics.data); + settings.path.pop_back(); +} + +void SerializationDynamic::deserializeBinaryBulkWithMultipleStreams( + DB::ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + if (!state) + return; + + auto mutable_column = column->assumeMutable(); + auto * dynamic_state = checkAndGetState(state); + auto * structure_state = checkAndGetState(dynamic_state->structure_state); + + if (mutable_column->empty()) + mutable_column = ColumnDynamic::create(structure_state->variant_type->createColumn(), structure_state->variant_type, max_dynamic_types, structure_state->statistics); + + auto & column_dynamic = assert_cast(*mutable_column); + const auto & variant_info = column_dynamic.getVariantInfo(); + if (!variant_info.variant_type->equals(*structure_state->variant_type)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of internal columns of Dynamic. 
Expected: {}, Got: {}", structure_state->variant_type->getName(), variant_info.variant_type->getName()); + + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->deserializeBinaryBulkWithMultipleStreams(column_dynamic.getVariantColumnPtr(), limit, settings, dynamic_state->variant_state, cache); + settings.path.pop_back(); + + column = std::move(mutable_column); +} + +void SerializationDynamic::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + UInt8 null_bit = field.isNull(); + writeBinary(null_bit, ostr); + if (null_bit) + return; + + auto field_type = applyVisitor(FieldToDataType(), field); + auto field_type_name = field_type->getName(); + writeVarUInt(field_type_name.size(), ostr); + writeString(field_type_name, ostr); + field_type->getDefaultSerialization()->serializeBinary(field, ostr, settings); +} + +void SerializationDynamic::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + UInt8 null_bit; + readBinary(null_bit, istr); + if (null_bit) + { + field = Null(); + return; + } + + size_t field_type_name_size; + readVarUInt(field_type_name_size, istr); + String field_type_name(field_type_name_size, 0); + istr.readStrict(field_type_name.data(), field_type_name_size); + auto field_type = DataTypeFactory::instance().get(field_type_name); + field_type->getDefaultSerialization()->deserializeBinary(field, istr, settings); +} + +void SerializationDynamic::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + const auto & variant_info = dynamic_column.getVariantInfo(); + const auto & variant_column = dynamic_column.getVariantColumn(); + auto global_discr = variant_column.globalDiscriminatorAt(row_num); + + UInt8 null_bit = global_discr == ColumnVariant::NULL_DISCRIMINATOR; + writeBinary(null_bit, ostr); + if (null_bit) + return; + + const auto & variant_type = assert_cast(*variant_info.variant_type).getVariant(global_discr); + const auto & variant_type_name = variant_info.variant_names[global_discr]; + writeVarUInt(variant_type_name.size(), ostr); + writeString(variant_type_name, ostr); + variant_type->getDefaultSerialization()->serializeBinary(variant_column.getVariantByGlobalDiscriminator(global_discr), variant_column.offsetAt(row_num), ostr, settings); +} + +template +static void deserializeVariant( + ColumnVariant & variant_column, + const DataTypePtr & variant_type, + ColumnVariant::Discriminator global_discr, + ReadBuffer & istr, + DeserializeFunc deserialize) +{ + auto & variant = variant_column.getVariantByGlobalDiscriminator(global_discr); + deserialize(*variant_type->getDefaultSerialization(), variant, istr); + variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(global_discr)); + variant_column.getOffsets().push_back(variant.size() - 1); +} + +void SerializationDynamic::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto & dynamic_column = assert_cast(column); + UInt8 null_bit; + readBinary(null_bit, istr); + if (null_bit) + { + dynamic_column.insertDefault(); + return; + } + + size_t variant_type_name_size; + readVarUInt(variant_type_name_size, istr); + String variant_type_name(variant_type_name_size, 0); + istr.readStrict(variant_type_name.data(), variant_type_name_size); + + const auto & variant_info = dynamic_column.getVariantInfo(); + auto it = 
variant_info.variant_name_to_discriminator.find(variant_type_name); + if (it != variant_info.variant_name_to_discriminator.end()) + { + const auto & variant_type = assert_cast(*variant_info.variant_type).getVariant(it->second); + deserializeVariant(dynamic_column.getVariantColumn(), variant_type, it->second, istr, [&settings](const ISerialization & serialization, IColumn & variant, ReadBuffer & buf){ serialization.deserializeBinary(variant, buf, settings); }); + return; + } + + /// We don't have this variant yet. Let's try to add it. + auto variant_type = DataTypeFactory::instance().get(variant_type_name); + if (dynamic_column.addNewVariant(variant_type)) + { + auto discr = variant_info.variant_name_to_discriminator.at(variant_type_name); + deserializeVariant(dynamic_column.getVariantColumn(), variant_type, discr, istr, [&settings](const ISerialization & serialization, IColumn & variant, ReadBuffer & buf){ serialization.deserializeBinary(variant, buf, settings); }); + return; + } + + /// We reached maximum number of variants and couldn't add new variant. + /// This case should be really rare in real use cases. + /// We should always be able to add String variant and insert value as String. + dynamic_column.addStringVariant(); + auto tmp_variant_column = variant_type->createColumn(); + variant_type->getDefaultSerialization()->deserializeBinary(*tmp_variant_column, istr, settings); + auto string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared()); + auto & variant_column = dynamic_column.getVariantColumn(); + variant_column.insertIntoVariantFrom(variant_info.variant_name_to_discriminator.at("String"), *string_column, 0); +} + +void SerializationDynamic::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextCSV(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +template +static void deserializeTextImpl( + IColumn & column, + ReadBuffer & istr, + const FormatSettings & settings, + ReadFieldFunc read_field, + FormatSettings::EscapingRule escaping_rule, + TryDeserializeVariantFunc try_deserialize_variant, + DeserializeVariant deserialize_variant) +{ + auto & dynamic_column = assert_cast(column); + auto & variant_column = dynamic_column.getVariantColumn(); + const auto & variant_info = dynamic_column.getVariantInfo(); + String field = read_field(istr); + auto field_buf = std::make_unique(field); + JSONInferenceInfo json_info; + auto variant_type = tryInferDataTypeByEscapingRule(field, settings, escaping_rule, &json_info); + if (escaping_rule == FormatSettings::EscapingRule::JSON) + transformFinalInferredJSONTypeIfNeeded(variant_type, settings, &json_info); + + if (checkIfTypeIsComplete(variant_type) && dynamic_column.addNewVariant(variant_type)) + { + auto discr = variant_info.variant_name_to_discriminator.at(variant_type->getName()); + deserializeVariant(dynamic_column.getVariantColumn(), variant_type, discr, *field_buf, deserialize_variant); + return; + } + + /// We couldn't infer type or add new variant. Try to insert field into current variants. + field_buf = std::make_unique(field); + if (try_deserialize_variant(*variant_info.variant_type->getDefaultSerialization(), variant_column, *field_buf)) + return; + + /// We couldn't insert field into any existing variant, add String variant and read value as String. 
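To make the fallback order above concrete, here is a minimal self-contained sketch of the same three-step cascade; the types and helpers are simplified stand-ins for illustration, not the real ClickHouse API:

```cpp
#include <functional>
#include <optional>
#include <string>
#include <vector>

/// Simplified stand-in for ColumnDynamic's bookkeeping of variant type names.
struct DynamicSketch
{
    std::vector<std::string> variants;
    size_t max_variants = 32; /// illustrative limit, stands in for max_dynamic_types

    /// Mirrors ColumnDynamic::addNewVariant: succeeds if the type is already
    /// present or if there is still room for one more variant.
    bool addNewVariant(const std::string & type_name)
    {
        for (const auto & v : variants)
            if (v == type_name)
                return true;
        if (variants.size() >= max_variants)
            return false;
        variants.push_back(type_name);
        return true;
    }
};

/// The same cascade as deserializeTextImpl:
/// 1. infer a type from the raw field and try to register it as a new variant;
/// 2. otherwise, try to parse the field with the variants that already exist;
/// 3. otherwise, fall back to String, which can always be added.
std::string insertTextField(
    DynamicSketch & column,
    const std::string & field,
    const std::function<std::optional<std::string>(const std::string &)> & infer_type,
    const std::function<bool(const std::string &)> & try_existing_variants)
{
    if (auto inferred = infer_type(field); inferred && column.addNewVariant(*inferred))
        return *inferred;

    if (try_existing_variants(field))
        return "<one of the existing variants>";

    column.addNewVariant("String");
    return "String";
}
```

The code that follows additionally re-quotes the raw field when the Quoted escaping rule is in effect, so that the String fallback can still deserialize it as a quoted value.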
+ dynamic_column.addStringVariant(); + + if (escaping_rule == FormatSettings::EscapingRule::Quoted && (field.size() < 2 || field.front() != '\'' || field.back() != '\'')) + field = "'" + field + "'"; + + field_buf = std::make_unique(field); + auto string_discr = variant_info.variant_name_to_discriminator.at("String"); + deserializeVariant(dynamic_column.getVariantColumn(), std::make_shared(), string_discr, *field_buf, deserialize_variant); +} + +void SerializationDynamic::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [&settings](ReadBuffer & buf) + { + String field; + readCSVField(field, buf, settings.csv); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeTextCSV(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextCSV(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::CSV, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextCSV(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextEscaped(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [](ReadBuffer & buf) + { + String field; + readEscapedString(field, buf); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeTextEscaped(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextEscaped(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Escaped, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextEscaped(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextQuoted(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [](ReadBuffer & buf) + { + String field; + readQuotedField(field, buf); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return 
serialization.tryDeserializeTextQuoted(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextQuoted(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Quoted, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextQuoted(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextJSON(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [&settings](ReadBuffer & buf) + { + String field; + readJSONField(field, buf, settings.json); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeTextJSON(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextJSON(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::JSON, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextJSON(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextRaw(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [](ReadBuffer & buf) + { + String field; + readString(field, buf); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeTextRaw(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextRaw(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Raw, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextRaw(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextRaw(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + 
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeText(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [](ReadBuffer & buf) + { + String field; + readStringUntilEOF(field, buf); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeWholeText(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeWholeText(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Raw, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeWholeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeWholeText(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextXML(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +} diff --git a/src/DataTypes/Serializations/SerializationDynamic.h b/src/DataTypes/Serializations/SerializationDynamic.h new file mode 100644 index 00000000000..001a3cf87ce --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDynamic.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class SerializationDynamicElement; + +class SerializationDynamic : public ISerialization +{ +public: + explicit SerializationDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) + { + } + + struct DynamicStructureSerializationVersion + { + enum Value + { + VariantTypeName = 1, + }; + + Value value; + + static void checkVersion(UInt64 version); + + explicit DynamicStructureSerializationVersion(UInt64 version); + }; + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; + + static DeserializeBinaryBulkStatePtr deserializeDynamicStructureStatePrefix( + DeserializeBinaryBulkSettings & settings, + SubstreamsDeserializeStatesCache * cache); + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const 
FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + +private: + friend SerializationDynamicElement; + + struct DeserializeBinaryBulkStateDynamicStructure : public ISerialization::DeserializeBinaryBulkState + { + DynamicStructureSerializationVersion structure_version; + DataTypePtr variant_type; + ColumnDynamic::Statistics statistics = {.source = ColumnDynamic::Statistics::Source::READ, .data = {}}; + + explicit DeserializeBinaryBulkStateDynamicStructure(UInt64 structure_version_) : structure_version(structure_version_) {} + }; + + size_t max_dynamic_types; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.cpp b/src/DataTypes/Serializations/SerializationDynamicElement.cpp new file mode 100644 index 00000000000..dafd6d663b0 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDynamicElement.cpp @@ -0,0 +1,119 @@ +#include 
+#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + + +struct DeserializeBinaryBulkStateDynamicElement : public ISerialization::DeserializeBinaryBulkState +{ + ISerialization::DeserializeBinaryBulkStatePtr structure_state; + SerializationPtr variant_serialization; + ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; +}; + +void SerializationDynamicElement::enumerateStreams( + DB::ISerialization::EnumerateStreamsSettings & settings, + const DB::ISerialization::StreamCallback & callback, + const DB::ISerialization::SubstreamData & data) const +{ + settings.path.push_back(Substream::DynamicStructure); + callback(settings.path); + settings.path.pop_back(); + + /// If we didn't deserialize prefix yet, we don't know if we actually have this variant in Dynamic column, + /// so we cannot enumerate variant streams. + if (!data.deserialize_state) + return; + + auto * deserialize_state = checkAndGetState(data.deserialize_state); + /// If we don't have this variant, no need to enumerate streams for it as we won't read from any stream. + if (!deserialize_state->variant_serialization) + return; + + settings.path.push_back(Substream::DynamicData); + auto variant_data = SubstreamData(deserialize_state->variant_serialization) + .withType(data.type) + .withColumn(data.column) + .withSerializationInfo(data.serialization_info) + .withDeserializeState(deserialize_state->variant_element_state); + deserialize_state->variant_serialization->enumerateStreams(settings, callback, variant_data); + settings.path.pop_back(); +} + +void SerializationDynamicElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationDynamicElement"); +} + +void SerializationDynamicElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationDynamicElement"); +} + +void SerializationDynamicElement::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const +{ + DeserializeBinaryBulkStatePtr structure_state = SerializationDynamic::deserializeDynamicStructureStatePrefix(settings, cache); + if (!structure_state) + return; + + auto dynamic_element_state = std::make_shared(); + dynamic_element_state->structure_state = std::move(structure_state); + const auto & variant_type = checkAndGetState(dynamic_element_state->structure_state)->variant_type; + /// Check if we actually have required element in the Variant. 
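The branch that follows hinges on a single lookup: is the requested element type among the variants stored in the serialized structure? A hedged sketch of that decision with stand-in types (the real code wraps nested_serialization into a SerializationVariantElement when the discriminator is found):

```cpp
#include <cstddef>
#include <optional>
#include <string>
#include <vector>

/// Stand-in for DataTypeVariant::tryGetVariantDiscriminator: returns the position
/// of the variant whose type name matches, or nothing if the type is not stored.
std::optional<size_t> tryGetVariantDiscriminator(
    const std::vector<std::string> & variant_names,
    const std::string & dynamic_element_name)
{
    for (size_t i = 0; i != variant_names.size(); ++i)
        if (variant_names[i] == dynamic_element_name)
            return i;
    return std::nullopt;
}

/// Models the two outcomes of deserializeBinaryBulkStatePrefix: with a discriminator
/// the element is later read from its variant substream; without one the whole
/// subcolumn is filled with default values instead.
bool willReadRealValues(const std::vector<std::string> & variant_names, const std::string & name)
{
    return tryGetVariantDiscriminator(variant_names, name).has_value();
}
```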
+ if (auto global_discr = assert_cast(*variant_type).tryGetVariantDiscriminator(dynamic_element_name)) + { + settings.path.push_back(Substream::DynamicData); + dynamic_element_state->variant_serialization = std::make_shared(nested_serialization, dynamic_element_name, *global_discr); + dynamic_element_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_element_state->variant_element_state, cache); + settings.path.pop_back(); + } + + state = std::move(dynamic_element_state); +} + +void SerializationDynamicElement::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationDynamicElement"); +} + +void SerializationDynamicElement::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & result_column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + if (!state) + return; + + auto * dynamic_element_state = checkAndGetState(state); + + if (dynamic_element_state->variant_serialization) + { + settings.path.push_back(Substream::DynamicData); + dynamic_element_state->variant_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, dynamic_element_state->variant_element_state, cache); + settings.path.pop_back(); + } + else + { + auto mutable_column = result_column->assumeMutable(); + mutable_column->insertManyDefaults(limit); + result_column = std::move(mutable_column); + } +} + +} diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.h b/src/DataTypes/Serializations/SerializationDynamicElement.h new file mode 100644 index 00000000000..2ddc3324139 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDynamicElement.h @@ -0,0 +1,58 @@ +#pragma once + +#include + +namespace DB +{ + + +/// Serialization for Dynamic element when we read it as a subcolumn. +class SerializationDynamicElement final : public SerializationWrapper +{ +private: + /// To be able to deserialize Dynamic element as a subcolumn + /// we need its type name and global discriminator. 
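+    /// If the stored Dynamic structure does not contain this type at all, no variant
+    /// serialization is created and the whole subcolumn is filled with default values
+    /// (see deserializeBinaryBulkWithMultipleStreams in SerializationDynamicElement.cpp).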
+ String dynamic_element_name; + +public: + SerializationDynamicElement(const SerializationPtr & nested_, const String & dynamic_element_name_) + : SerializationWrapper(nested_) + , dynamic_element_name(dynamic_element_name_) + { + } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp index d72442eec99..6d36c6a9a96 100644 --- a/src/DataTypes/Serializations/SerializationEnum.cpp +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -29,7 +29,7 @@ void SerializationEnum::deserializeTextEscaped(IColumn & column, ReadBuffe { /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. std::string field_name; - readEscapedString(field_name, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field_name, istr) : readEscapedString(field_name, istr); assert_cast(column).getData().push_back(ref_enum_values.getValue(StringRef(field_name), true)); } } diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp index 481ae2a6165..f919dc16d33 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.cpp +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -10,8 +10,10 @@ #include #include +#include "Common/PODArray.h" #include #include +#include "base/types.h" namespace DB { @@ -183,14 +185,17 @@ static inline bool tryRead(const SerializationFixedString & self, IColumn & colu } -void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); + read(*this, column, [&istr, &settings](ColumnFixedString::Chars & data) + { + settings.tsv.crlf_end_of_line_input ? 
readEscapedStringInto<PaddedPODArray<UInt8>,true>(data, istr) : readEscapedStringInto<PaddedPODArray<UInt8>,false>(data, istr); + }); } bool SerializationFixedString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); return true; }); + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto<PaddedPODArray<UInt8>,false>(data, istr); return true; }); } diff --git a/src/DataTypes/Serializations/SerializationInterval.cpp b/src/DataTypes/Serializations/SerializationInterval.cpp index c4ef34b4325..ef96ad4729f 100644 --- a/src/DataTypes/Serializations/SerializationInterval.cpp +++ b/src/DataTypes/Serializations/SerializationInterval.cpp @@ -68,9 +68,9 @@ void SerializationInterval::deserializeBinaryBulk(IColumn & column, ReadBuffer & } void SerializationInterval::deserializeBinaryBulkStatePrefix( - DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const { - dispatch(&ISerialization::deserializeBinaryBulkStatePrefix, FormatSettings::IntervalOutputFormat::Numeric, settings, state); + dispatch(&ISerialization::deserializeBinaryBulkStatePrefix, FormatSettings::IntervalOutputFormat::Numeric, settings, state, cache); } diff --git a/src/DataTypes/Serializations/SerializationInterval.h b/src/DataTypes/Serializations/SerializationInterval.h index a4e6c204e4f..368aff4f0c3 100644 --- a/src/DataTypes/Serializations/SerializationInterval.h +++ b/src/DataTypes/Serializations/SerializationInterval.h @@ -34,7 +34,10 @@ public: void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - void deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const override; + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void deserializeBinaryBulkWithMultipleStreams( ColumnPtr & column, size_t limit, diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 2d2be195098..2b88c5d68d6 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -267,7 +267,8 @@ void SerializationLowCardinality::serializeBinaryBulkStateSuffix( void SerializationLowCardinality::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * /*cache*/) const { settings.path.push_back(Substream::DictionaryKeys); auto * stream = settings.getter(settings.path); diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index d2c3a95c702..aa64e956a64 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -33,7 +33,8 @@ public: void 
deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 7b6f87baf2e..70fe5182ade 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -398,7 +398,8 @@ void SerializationMap::enumerateStreams( auto next_data = SubstreamData(nested) .withType(data.type ? assert_cast(*data.type).getNestedType() : nullptr) .withColumn(data.column ? assert_cast(*data.column).getNestedColumnPtr() : nullptr) - .withSerializationInfo(data.serialization_info); + .withSerializationInfo(data.serialization_info) + .withDeserializeState(data.deserialize_state); nested->enumerateStreams(settings, callback, next_data); } @@ -420,9 +421,10 @@ void SerializationMap::serializeBinaryBulkStateSuffix( void SerializationMap::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { - nested->deserializeBinaryBulkStatePrefix(settings, state); + nested->deserializeBinaryBulkStatePrefix(settings, state, cache); } diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index 3e27ef1b04a..cfcde445c1f 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -51,7 +51,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp index 2792827e690..07f5f9ea7ed 100644 --- a/src/DataTypes/Serializations/SerializationNamed.cpp +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -54,10 +54,11 @@ void SerializationNamed::serializeBinaryBulkStateSuffix( void SerializationNamed::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { addToPath(settings.path); - nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state, cache); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationNamed.h b/src/DataTypes/Serializations/SerializationNamed.h index 0633ba2ea6f..bb2161e40e6 100644 --- a/src/DataTypes/Serializations/SerializationNamed.h +++ b/src/DataTypes/Serializations/SerializationNamed.h @@ -36,7 +36,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, 
diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 4d31451f92d..e72dd3a42f5 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -95,10 +95,11 @@ void SerializationNullable::serializeBinaryBulkStateSuffix( void SerializationNullable::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { settings.path.push_back(Substream::NullableElements); - nested->deserializeBinaryBulkStatePrefix(settings, state); + nested->deserializeBinaryBulkStatePrefix(settings, state, cache); settings.path.pop_back(); } @@ -286,7 +287,7 @@ bool SerializationNullable::tryDeserializeNullRaw(DB::ReadBuffer & istr, const D } template -ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null) +ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null) { static constexpr bool throw_exception = std::is_same_v; @@ -319,10 +320,10 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, /// Check if we have enough data in buffer to check if it's a null. if (istr.available() > null_representation.size()) { - auto check_for_null = [&null_representation](ReadBuffer & buf) + auto check_for_null = [&null_representation, &settings](ReadBuffer & buf) { auto * pos = buf.position(); - if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n')) + if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n' || (settings.tsv.crlf_end_of_line_input && *buf.position() == '\r'))) return true; buf.position() = pos; return false; @@ -334,14 +335,14 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, /// Use PeekableReadBuffer to make a checkpoint before checking null /// representation and rollback if check was failed. 
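The checkpoint-and-rollback pattern described in the comment above can be illustrated without the real PeekableReadBuffer API; a minimal sketch over a plain byte view (all names here are illustrative, not ClickHouse's):

```cpp
#include <cstddef>
#include <string_view>

/// Returns true and advances *pos past the null token if the input at *pos is
/// exactly the null representation followed by EOF or a field/row delimiter;
/// otherwise restores *pos (the "rollback to checkpoint" of the real code).
bool checkForNull(std::string_view data, size_t * pos, std::string_view null_repr, bool crlf)
{
    const size_t checkpoint = *pos; /// remember where we started
    if (*pos + null_repr.size() <= data.size()
        && data.compare(*pos, null_repr.size(), null_repr) == 0)
    {
        const size_t after = *pos + null_repr.size();
        if (after == data.size() || data[after] == '\t' || data[after] == '\n'
            || (crlf && data[after] == '\r')) /// '\r' only with crlf_end_of_line_input
        {
            *pos = after;
            return true;
        }
    }
    *pos = checkpoint; /// rollback: the bytes we looked at are not a null
    return false;
}
```

Accepting '\r' only when the CRLF setting is on matches the new settings.tsv.crlf_end_of_line_input checks in this hunk, including the extra guard against '\r' inside a custom null representation.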
PeekableReadBuffer peekable_buf(istr, true); - auto check_for_null = [&null_representation](ReadBuffer & buf_) + auto check_for_null = [&null_representation, &settings](ReadBuffer & buf_) { auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); - if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n')) - return true; + if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || (settings.tsv.crlf_end_of_line_input && *buf.position() == '\r'))) + return true; buf.rollbackToCheckpoint(); return false; }; @@ -371,7 +372,10 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " - "containing '\\t' or '\\n' may not work correctly for large input."); + "containing '\\t' or '\\n' may not work correctly for large input."); + if (settings.tsv.crlf_end_of_line_input && null_representation.find('\r') != std::string::npos) + throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " + "containing '\\r' may not work correctly for large input."); WriteBufferFromOwnString parsed_value; if constexpr (escaped) diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index 37858ccdefd..f7d2d2eadf0 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -29,7 +29,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp index 67bf7af7799..c6c87b5aa7b 100644 --- a/src/DataTypes/Serializations/SerializationObject.cpp +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -104,9 +104,9 @@ void SerializationObject::deserializeWholeText(IColumn & column, ReadBuf } template -void SerializationObject::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationObject::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - deserializeTextImpl(column, [&](String & s) { readEscapedString(s, istr); }); + deserializeTextImpl(column, [&](String & s) { settings.tsv.crlf_end_of_line_input ? 
readEscapedStringCRLF(s, istr) : readEscapedString(s, istr); }); } template @@ -210,7 +210,8 @@ void SerializationObject::serializeBinaryBulkStateSuffix( template void SerializationObject::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { checkSerializationIsSupported(settings); if (state) @@ -258,7 +259,7 @@ void SerializationObject::deserializeBinaryBulkStatePrefix( } settings.path.push_back(Substream::ObjectData); - state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state); + state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state, cache); settings.path.pop_back(); state = std::move(state_object); diff --git a/src/DataTypes/Serializations/SerializationObject.h b/src/DataTypes/Serializations/SerializationObject.h index 39e1c514640..4cb7d0ab6a8 100644 --- a/src/DataTypes/Serializations/SerializationObject.h +++ b/src/DataTypes/Serializations/SerializationObject.h @@ -41,7 +41,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp index 4d7514271ad..73488d308bb 100644 --- a/src/DataTypes/Serializations/SerializationSparse.cpp +++ b/src/DataTypes/Serializations/SerializationSparse.cpp @@ -152,7 +152,7 @@ void SerializationSparse::enumerateStreams( const StreamCallback & callback, const SubstreamData & data) const { - const auto * column_sparse = data.column ? &assert_cast(*data.column) : nullptr; + const auto * column_sparse = data.column ? typeid_cast(data.column.get()) : nullptr; size_t column_size = column_sparse ? column_sparse->size() : 0; settings.path.push_back(Substream::SparseOffsets); @@ -170,7 +170,7 @@ void SerializationSparse::enumerateStreams( auto next_data = SubstreamData(nested) .withType(data.type) - .withColumn(column_sparse ? column_sparse->getValuesPtr() : nullptr) + .withColumn(column_sparse ? 
column_sparse->getValuesPtr() : data.column) .withSerializationInfo(data.serialization_info); nested->enumerateStreams(settings, callback, next_data); @@ -242,12 +242,13 @@ void SerializationSparse::serializeBinaryBulkStateSuffix( void SerializationSparse::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { auto state_sparse = std::make_shared(); settings.path.push_back(Substream::SparseElements); - nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested); + nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested, cache); settings.path.pop_back(); state = std::move(state_sparse); diff --git a/src/DataTypes/Serializations/SerializationSparse.h b/src/DataTypes/Serializations/SerializationSparse.h index b1ed7b613f0..a55856bacf0 100644 --- a/src/DataTypes/Serializations/SerializationSparse.h +++ b/src/DataTypes/Serializations/SerializationSparse.h @@ -43,7 +43,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; /// Allows to write ColumnSparse and other columns in sparse serialization. void serializeBinaryBulkWithMultipleStreams( diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index 8abaa3bd5ea..9e39ab23709 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -147,7 +147,6 @@ void SerializationString::serializeBinaryBulk(const IColumn & column, WriteBuffe } } - template static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit) { @@ -324,14 +323,17 @@ bool SerializationString::tryDeserializeWholeText(IColumn & column, ReadBuffer & return read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); return true; }); } -void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) + { + settings.tsv.crlf_end_of_line_input ? 
readEscapedStringInto<PaddedPODArray<UInt8>,true>(data, istr) : readEscapedStringInto<PaddedPODArray<UInt8>,false>(data, istr); + }); } bool SerializationString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - return read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); return true; }); + return read(column, [&](ColumnString::Chars & data) { readEscapedStringInto<PaddedPODArray<UInt8>,true>(data, istr); return true; }); } void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 632a019d2d9..ef0a75fac40 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -549,26 +549,6 @@ bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & is return tryDeserializeText(column, rb, settings, true); } -void SerializationTuple::enumerateStreams( - EnumerateStreamsSettings & settings, - const StreamCallback & callback, - const SubstreamData & data) const -{ - const auto * type_tuple = data.type ? &assert_cast<const DataTypeTuple &>(*data.type) : nullptr; - const auto * column_tuple = data.column ? &assert_cast<const ColumnTuple &>(*data.column) : nullptr; - const auto * info_tuple = data.serialization_info ? &assert_cast<const SerializationInfoTuple &>(*data.serialization_info) : nullptr; - - for (size_t i = 0; i < elems.size(); ++i) - { - auto next_data = SubstreamData(elems[i]) - .withType(type_tuple ? type_tuple->getElement(i) : nullptr) - .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) - .withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr); - - elems[i]->enumerateStreams(settings, callback, next_data); - } -} - struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState { std::vector<ISerialization::SerializeBinaryBulkStatePtr> states; }; @@ -579,6 +559,27 @@ struct DeserializeBinaryBulkStateTuple : public ISerialization::DeserializeBinar { std::vector<ISerialization::DeserializeBinaryBulkStatePtr> states; }; +void SerializationTuple::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_tuple = data.type ? &assert_cast<const DataTypeTuple &>(*data.type) : nullptr; + const auto * column_tuple = data.column ? &assert_cast<const ColumnTuple &>(*data.column) : nullptr; + const auto * info_tuple = data.serialization_info ? &assert_cast<const SerializationInfoTuple &>(*data.serialization_info) : nullptr; + const auto * tuple_deserialize_state = data.deserialize_state ? checkAndGetState<DeserializeBinaryBulkStateTuple>(data.deserialize_state) : nullptr; + + for (size_t i = 0; i < elems.size(); ++i) + { + auto next_data = SubstreamData(elems[i]) + .withType(type_tuple ? type_tuple->getElement(i) : nullptr) + .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) + .withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr) + .withDeserializeState(tuple_deserialize_state ? 
tuple_deserialize_state->states[i] : nullptr); + + elems[i]->enumerateStreams(settings, callback, next_data); + } +} void SerializationTuple::serializeBinaryBulkStatePrefix( const IColumn & column, @@ -606,13 +607,14 @@ void SerializationTuple::serializeBinaryBulkStateSuffix( void SerializationTuple::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { auto tuple_state = std::make_shared(); tuple_state->states.resize(elems.size()); for (size_t i = 0; i < elems.size(); ++i) - elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); + elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i], cache); state = std::move(tuple_state); } diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index d9c63a05217..810673d8b21 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -53,7 +53,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 300686ff8d3..b386fd8ab45 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -28,6 +28,16 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } +struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState +{ + std::vector states; +}; + +struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState +{ + std::vector states; +}; + void SerializationVariant::enumerateStreams( EnumerateStreamsSettings & settings, const StreamCallback & callback, @@ -35,6 +45,7 @@ void SerializationVariant::enumerateStreams( { const auto * type_variant = data.type ? &assert_cast(*data.type) : nullptr; const auto * column_variant = data.column ? &assert_cast(*data.column) : nullptr; + const auto * variant_deserialize_state = data.deserialize_state ? checkAndGetState(data.deserialize_state) : nullptr; auto discriminators_serialization = std::make_shared(std::make_shared>(), "discr", SubstreamType::NamedVariantDiscriminators); auto local_discriminators = column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr; @@ -59,7 +70,8 @@ void SerializationVariant::enumerateStreams( auto variant_data = SubstreamData(variants[i]) .withType(type_variant ? type_variant->getVariant(i) : nullptr) .withColumn(column_variant ? column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr) - .withSerializationInfo(data.serialization_info); + .withSerializationInfo(data.serialization_info) + .withDeserializeState(variant_deserialize_state ? 
variant_deserialize_state->states[i] : nullptr); addVariantElementToPath(settings.path, i); settings.path.back().data = variant_data; @@ -70,16 +82,6 @@ void SerializationVariant::enumerateStreams( settings.path.pop_back(); } -struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState -{ - std::vector states; -}; - -struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState -{ - std::vector states; -}; - void SerializationVariant::serializeBinaryBulkStatePrefix( const IColumn & column, SerializeBinaryBulkSettings & settings, @@ -123,7 +125,8 @@ void SerializationVariant::serializeBinaryBulkStateSuffix( void SerializationVariant::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { auto variant_state = std::make_shared(); variant_state->states.resize(variants.size()); @@ -132,7 +135,7 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix( for (size_t i = 0; i < variants.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i]); + variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i], cache); settings.path.pop_back(); } @@ -141,12 +144,13 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix( } -void SerializationVariant::serializeBinaryBulkWithMultipleStreams( +void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics( const IColumn & column, size_t offset, size_t limit, SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const + SerializeBinaryBulkStatePtr & state, + std::unordered_map & variants_statistics) const { const ColumnVariant & col = assert_cast(column); if (const size_t size = col.size(); limit == 0 || offset + limit > size) @@ -185,6 +189,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( { addVariantElementToPath(settings.path, i); variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); + variants_statistics[variant_names[i]] += col.getVariantByGlobalDiscriminator(i).size(); settings.path.pop_back(); } settings.path.pop_back(); @@ -205,6 +210,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( addVariantElementToPath(settings.path, non_empty_global_discr); /// We can use the same offset/limit as for whole Variant column variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->states[non_empty_global_discr]); + variants_statistics[variant_names[non_empty_global_discr]] += limit; settings.path.pop_back(); settings.path.pop_back(); return; @@ -244,12 +250,23 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( variant_offsets_and_limits[i].second, settings, variant_state->states[i]); + variants_statistics[variant_names[i]] += variant_offsets_and_limits[i].second; settings.path.pop_back(); } } settings.path.pop_back(); } +void SerializationVariant::serializeBinaryBulkWithMultipleStreams( + const DB::IColumn & column, + size_t offset, + size_t limit, + DB::ISerialization::SerializeBinaryBulkSettings & settings, + DB::ISerialization::SerializeBinaryBulkStatePtr & state) const +{ + std::unordered_map tmp_statistics; + 
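+    /// This plain override keeps the virtual ISerialization interface unchanged:
+    /// per-variant row counts are accumulated into a local map and discarded here,
+    /// while callers that need the counts invoke
+    /// serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics directly.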
serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(column, offset, limit, settings, state, tmp_statistics); +} void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( ColumnPtr & column, @@ -599,14 +616,14 @@ void SerializationVariant::serializeTextEscaped(const IColumn & column, size_t r bool SerializationVariant::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String field; - readEscapedString(field, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field, istr) : readEscapedString(field, istr); return tryDeserializeTextEscapedImpl(column, field, settings); } void SerializationVariant::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String field; - readEscapedString(field, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field, istr) : readEscapedString(field, istr); if (!tryDeserializeTextEscapedImpl(column, field, settings)) throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse escaped value of type {} here: {}", variant_name, field); } diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h index 3f53dcf1339..b6aa1534538 100644 --- a/src/DataTypes/Serializations/SerializationVariant.h +++ b/src/DataTypes/Serializations/SerializationVariant.h @@ -59,7 +59,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, @@ -68,6 +69,14 @@ public: SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; + void serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state, + std::unordered_map & variants_statistics) const; + void deserializeBinaryBulkWithMultipleStreams( ColumnPtr & column, size_t limit, diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 7d4487fe6da..1f9a81ac671 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB { @@ -11,34 +12,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -void SerializationVariantElement::enumerateStreams( - DB::ISerialization::EnumerateStreamsSettings & settings, - const DB::ISerialization::StreamCallback & callback, - const DB::ISerialization::SubstreamData & data) const -{ - /// We will need stream for discriminators during deserialization. 
- settings.path.push_back(Substream::VariantDiscriminators); - callback(settings.path); - settings.path.pop_back(); - - addVariantToPath(settings.path); - settings.path.back().data = data; - nested_serialization->enumerateStreams(settings, callback, data); - removeVariantFromPath(settings.path); -} - -void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const -{ - throw Exception( - ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement"); -} - -void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const -{ - throw Exception( - ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement"); -} - struct DeserializeBinaryBulkStateVariantElement : public ISerialization::DeserializeBinaryBulkState { /// During deserialization discriminators and variant streams can be shared. @@ -55,12 +28,47 @@ struct DeserializeBinaryBulkStateVariantElement : public ISerialization::Deseria ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; }; -void SerializationVariantElement::deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const +void SerializationVariantElement::enumerateStreams( + DB::ISerialization::EnumerateStreamsSettings & settings, + const DB::ISerialization::StreamCallback & callback, + const DB::ISerialization::SubstreamData & data) const +{ + /// We will need stream for discriminators during deserialization. + settings.path.push_back(Substream::VariantDiscriminators); + callback(settings.path); + settings.path.pop_back(); + + const auto * deserialize_state = data.deserialize_state ? checkAndGetState(data.deserialize_state) : nullptr; + addVariantToPath(settings.path); + auto nested_data = SubstreamData(nested_serialization) + .withType(data.type ? removeNullableOrLowCardinalityNullable(data.type) : nullptr) + .withColumn(data.column ? removeNullableOrLowCardinalityNullable(data.column) : nullptr) + .withSerializationInfo(data.serialization_info) + .withDeserializeState(deserialize_state ? 
deserialize_state->variant_element_state : nullptr); + settings.path.back().data = nested_data; + nested_serialization->enumerateStreams(settings, callback, nested_data); + removeVariantFromPath(settings.path); +} + +void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement"); +} + +void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement"); +} + +void SerializationVariantElement::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const { auto variant_element_state = std::make_shared<DeserializeBinaryBulkStateVariantElement>(); addVariantToPath(settings.path); - nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state, cache); removeVariantFromPath(settings.path); state = std::move(variant_element_state); diff --git a/src/DataTypes/Serializations/SerializationVariantElement.h b/src/DataTypes/Serializations/SerializationVariantElement.h index aafecf43d39..0ce0a72e250 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.h +++ b/src/DataTypes/Serializations/SerializationVariantElement.h @@ -43,7 +43,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, @@ -59,12 +60,6 @@ public: DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; -private: - friend SerializationVariant; - - void addVariantToPath(SubstreamPath & path) const; - void removeVariantFromPath(SubstreamPath & path) const; - struct VariantSubcolumnCreator : public ISubcolumnCreator { const ColumnPtr local_discriminators; @@ -82,6 +77,11 @@ private: ColumnPtr create(const ColumnPtr & prev) const override; SerializationPtr create(const SerializationPtr & prev) const override; }; +private: + friend SerializationVariant; + + void addVariantToPath(SubstreamPath & path) const; + void removeVariantFromPath(SubstreamPath & path) const; }; } diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index bde52bb8096..ecef533d7e0 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -29,9 +29,10 @@ void SerializationWrapper::serializeBinaryBulkStateSuffix( void SerializationWrapper::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { - nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state, cache); } void SerializationWrapper::serializeBinaryBulkWithMultipleStreams( diff --git
a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index 6c5e2046062..882f17bba0a 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -36,7 +36,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp b/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp index fc7432d5bf6..c6337a31fce 100644 --- a/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp +++ b/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp @@ -49,7 +49,7 @@ TEST(SerializationObject, FromString) settings.position_independent_encoding = false; settings.getter = [&in](const auto &) { return &in; }; - serialization->deserializeBinaryBulkStatePrefix(settings, state); + serialization->deserializeBinaryBulkStatePrefix(settings, state, nullptr); serialization->deserializeBinaryBulkWithMultipleStreams(result_column, column_string->size(), settings, state, nullptr); } diff --git a/src/DataTypes/Utils.cpp b/src/DataTypes/Utils.cpp index 2f29d57d454..e7e69e379af 100644 --- a/src/DataTypes/Utils.cpp +++ b/src/DataTypes/Utils.cpp @@ -224,6 +224,7 @@ bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_typ case TypeIndex::Nothing: case TypeIndex::JSONPaths: case TypeIndex::Variant: + case TypeIndex::Dynamic: return false; } diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index 0977bea362c..a71b19d6c92 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace DB @@ -256,6 +257,24 @@ DataTypePtr getLeastSupertype(const DataTypes & types) return types[0]; } + /// If one of the types is Dynamic, the supertype is Dynamic + { + bool have_dynamic = false; + size_t max_dynamic_types = 0; + + for (const auto & type : types) + { + if (const auto * dynamic_type = typeid_cast<const DataTypeDynamic *>(type.get())) + { + have_dynamic = true; + max_dynamic_types = std::max(max_dynamic_types, dynamic_type->getMaxDynamicTypes()); + } + } + + if (have_dynamic) + return std::make_shared<DataTypeDynamic>(max_dynamic_types); + } + /// Recursive rules /// If there are Nothing types, skip them diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index fb1b3ee626b..e72834eddbe 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -1,6 +1,14 @@ +#include + +#include +#include +#include +#include +#include +#include +#include #include #include -#include #include #include #include @@ -10,13 +18,7 @@ #include #include #include -#include -#include -#include -#include -#include -#include namespace fs = std::filesystem; diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 161be35f129..5cb4198e1a2 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -326,31 +327,36 @@ void DatabaseOnDisk::dropTable(ContextPtr local_context, const String & table_na StoragePtr table = detachTable(local_context, table_name); - /// This is possible for Lazy database.
- if (!table) - return; - bool renamed = false; try { fs::rename(table_metadata_path, table_metadata_path_drop); renamed = true; - table->drop(); - table->is_dropped = true; - - fs::path table_data_dir(local_context->getPath() + table_data_path_relative); - if (fs::exists(table_data_dir)) - (void)fs::remove_all(table_data_dir); + // The table might not be loaded for the Lazy database engine. + if (table) + { + table->drop(); + table->is_dropped = true; + } } catch (...) { LOG_WARNING(log, getCurrentExceptionMessageAndPattern(/* with_stacktrace */ true)); - attachTable(local_context, table_name, table, table_data_path_relative); + if (table) + attachTable(local_context, table_name, table, table_data_path_relative); if (renamed) fs::rename(table_metadata_path_drop, table_metadata_path); throw; } + for (const auto & [disk_name, disk] : getContext()->getDisksMap()) + { + if (disk->isReadOnly() || !disk->exists(table_data_path_relative)) + continue; + + LOG_INFO(log, "Removing data directory from disk {} with path {} for dropped table {}", disk_name, table_data_path_relative, table_name); + disk->removeRecursive(table_data_path_relative); + } (void)fs::remove(table_metadata_path_drop); } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 78d502ec2c7..cc946fc22c4 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -929,6 +929,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep query_context->setSetting("allow_experimental_hash_functions", 1); query_context->setSetting("allow_experimental_object_type", 1); query_context->setSetting("allow_experimental_variant_type", 1); + query_context->setSetting("allow_experimental_dynamic_type", 1); query_context->setSetting("allow_experimental_annoy_index", 1); query_context->setSetting("allow_experimental_usearch_index", 1); query_context->setSetting("allow_experimental_bigint_types", 1); diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index fc75f8e44b9..5fee14ecc2a 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -1,4 +1,10 @@ #include + +#include +#include +#include +#include +#include #include #include #include @@ -8,17 +14,8 @@ #include #include #include -#include -#include -#include +#include #include -#include -#include - -namespace CurrentMetrics -{ - extern const Metric AttachedTable; -} namespace DB @@ -263,7 +260,7 @@ StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_n res = it->second; tables.erase(it); res->is_detached = true; - CurrentMetrics::sub(CurrentMetrics::AttachedTable, 1); + CurrentMetrics::sub(getAttachedCounterForStorage(res), 1); auto table_id = res->getStorageID(); if (table_id.hasUUID()) @@ -304,7 +301,7 @@ void DatabaseWithOwnTablesBase::attachTableUnlocked(const String & table_name, c /// It is important to reset is_detached here since in case of RENAME in /// non-Atomic database the is_detached is set to true before RENAME.
table->is_detached = false; - CurrentMetrics::add(CurrentMetrics::AttachedTable, 1); + CurrentMetrics::add(getAttachedCounterForStorage(table), 1); } void DatabaseWithOwnTablesBase::shutdown() diff --git a/src/Dictionaries/CMakeLists.txt b/src/Dictionaries/CMakeLists.txt index 569acd9231a..783835356e6 100644 --- a/src/Dictionaries/CMakeLists.txt +++ b/src/Dictionaries/CMakeLists.txt @@ -39,7 +39,6 @@ target_link_libraries(clickhouse_dictionaries Poco::Data Poco::MongoDB Poco::Redis - string_utils ) target_link_libraries(clickhouse_dictionaries PUBLIC ch_contrib::abseil_swiss_tables) diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index 0b6bdea60a3..c2f2f4a8532 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include diff --git a/src/Dictionaries/FileDictionarySource.cpp b/src/Dictionaries/FileDictionarySource.cpp index 16a4ecaee75..fde46fb27f0 100644 --- a/src/Dictionaries/FileDictionarySource.cpp +++ b/src/Dictionaries/FileDictionarySource.cpp @@ -1,6 +1,6 @@ #include "FileDictionarySource.h" #include -#include +#include #include #include #include diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index e3eea71cd9a..f0b56cbf529 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -721,11 +721,10 @@ public: if (!block.checkCheckSum()) { std::string calculated_check_sum = std::to_string(block.calculateCheckSum()); - std::string check_sum = std::to_string(block.getCheckSum()); + std::string expected_check_sum = std::to_string(block.getCheckSum()); throw Exception(ErrorCodes::CORRUPTED_DATA, - "Cache data corrupted. Checksum validation failed. Calculated {} in block {}", - calculated_check_sum, - check_sum); + "Cache data corrupted. Checksum validation failed. Calculated {} expected in block {}, in file {}", + calculated_check_sum, expected_check_sum, file_path); } func(blocks_to_fetch[block_to_fetch_index], block.getBlockData()); diff --git a/src/Disks/DiskEncryptedTransaction.h b/src/Disks/DiskEncryptedTransaction.h index 94bd8ef5378..26f4c979bf8 100644 --- a/src/Disks/DiskEncryptedTransaction.h +++ b/src/Disks/DiskEncryptedTransaction.h @@ -244,6 +244,13 @@ public: return delegate_transaction->writeFile(wrapped_path, buf_size, mode, settings); } + /// Truncate file to the target size. + void truncateFile(const std::string & src_path, size_t target_size) override + { + auto wrapped_path = wrappedPath(src_path); + delegate_transaction->truncateFile(wrapped_path, target_size); + } + private: diff --git a/src/Disks/FakeDiskTransaction.h b/src/Disks/FakeDiskTransaction.h index f83642eee56..69f08de2517 100644 --- a/src/Disks/FakeDiskTransaction.h +++ b/src/Disks/FakeDiskTransaction.h @@ -2,10 +2,16 @@ #include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + /// Fake disk transaction implementation. /// Just execute all operations immediately, commit is noop operation. /// No support for atomicity and rollback. 
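A minimal usage sketch of the new transactional truncate added below (hedged: `shrinkFile`, `disk` and `path` are hypothetical names introduced here; `IDisk::createTransaction()` is the existing factory returning a DiskTransactionPtr):

// Hypothetical caller: shrink a file through a disk transaction, so the
// metadata rewrite and blob removal happen together on commit.
void shrinkFile(DB::IDisk & disk, const std::string & path, size_t target_size)
{
    auto tx = disk.createTransaction();   // DiskTransactionPtr
    tx->truncateFile(path, target_size);  // throws NOT_IMPLEMENTED on transactions that don't support it (as in FakeDiskTransaction below)
    tx->commit();                         // executes the queued operations
}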
@@ -134,6 +140,11 @@ public: disk.createHardLink(src_path, dst_path); } + void truncateFile(const std::string & /* src_path */, size_t /* target_size */) override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Operation `truncateFile` is not implemented"); + } + private: IDisk & disk; }; diff --git a/src/Disks/IDiskTransaction.h b/src/Disks/IDiskTransaction.h index 49fcdde1a4f..fc84281baea 100644 --- a/src/Disks/IDiskTransaction.h +++ b/src/Disks/IDiskTransaction.h @@ -128,6 +128,9 @@ public: /// Create hardlink from `src_path` to `dst_path`. virtual void createHardLink(const std::string & src_path, const std::string & dst_path) = 0; + + /// Truncate file to the target size. + virtual void truncateFile(const std::string & src_path, size_t target_size) = 0; }; using DiskTransactionPtr = std::shared_ptr<IDiskTransaction>; diff --git a/src/Disks/IVolume.cpp b/src/Disks/IVolume.cpp index d763c55c4aa..e6be0f36193 100644 --- a/src/Disks/IVolume.cpp +++ b/src/Disks/IVolume.cpp @@ -1,6 +1,6 @@ #include "IVolume.h" -#include +#include #include #include diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index a535b007541..bae58f0b9c6 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -257,6 +257,7 @@ std::unique_ptr getAzureBlobStorageSettings(const Po settings->max_upload_part_size = config.getUInt64(config_prefix + ".max_upload_part_size", context->getSettings().azure_max_upload_part_size); settings->max_single_part_copy_size = config.getUInt64(config_prefix + ".max_single_part_copy_size", context->getSettings().azure_max_single_part_copy_size); settings->use_native_copy = config.getBool(config_prefix + ".use_native_copy", false); + settings->max_blocks_in_multipart_upload = config.getUInt64(config_prefix + ".max_blocks_in_multipart_upload", 50000); settings->max_unexpected_write_error_retries = config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", context->getSettings().azure_max_unexpected_write_error_retries); settings->max_inflight_parts_for_one_file = config.getUInt64(config_prefix + ".max_inflight_parts_for_one_file", context->getSettings().azure_max_inflight_parts_for_one_file); settings->strict_upload_part_size = config.getUInt64(config_prefix + ".strict_upload_part_size", context->getSettings().azure_strict_upload_part_size); diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index f12ebb68dbb..c3062def763 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -63,6 +63,7 @@ struct AzureObjectStorageSettings bool use_native_copy = false; size_t max_unexpected_write_error_retries = 4; size_t max_inflight_parts_for_one_file = 20; + size_t max_blocks_in_multipart_upload = 50000; size_t strict_upload_part_size = 0; size_t upload_part_size_multiply_factor = 2; size_t upload_part_size_multiply_parts_count_threshold = 500; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index f6980d1e8f1..d4ff9bc0b79 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -133,6 +133,14 @@ void DiskObjectStorage::moveFile(const String & from_path, const String & to_pat transaction->commit(); } +void
DiskObjectStorage::truncateFile(const String & path, size_t size) +{ + LOG_TEST(log, "Truncate file operation {} to size : {}", path, size); + auto transaction = createObjectStorageTransaction(); + transaction->truncateFile(path, size); + transaction->commit(); +} + void DiskObjectStorage::copyFile( /// NOLINT const String & from_file_path, IDisk & to_disk, diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index fe8a5e2844a..2a27ddf89a7 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -84,6 +84,8 @@ public: void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; + void truncateFile(const String & path, size_t size) override; + MetadataStoragePtr getMetadataStorage() override { return metadata_storage; } UInt32 getRefCount(const String & path) const override; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index 19b8b51384f..44854633d65 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -15,6 +15,7 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_FORMAT; + extern const int LOGICAL_ERROR; } void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) @@ -207,6 +208,18 @@ void DiskObjectStorageMetadata::addObject(ObjectStorageKey key, size_t size) keys_with_meta.emplace_back(std::move(key), ObjectMetadata{size, {}, {}}); } +ObjectKeyWithMetadata DiskObjectStorageMetadata::popLastObject() +{ + if (keys_with_meta.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't pop last object from metadata {}. 
Metadata already empty", metadata_file_path); + + ObjectKeyWithMetadata object = std::move(keys_with_meta.back()); + keys_with_meta.pop_back(); + total_size -= object.metadata.size_bytes; + + return object; +} + bool DiskObjectStorageMetadata::getWriteFullObjectKeySetting() { #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h index 729d93af10d..4f45f5b7ddf 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h @@ -52,6 +52,7 @@ public: void addObject(ObjectStorageKey key, size_t size); + ObjectKeyWithMetadata popLastObject(); void deserialize(ReadBuffer & buf); void deserializeFromString(const std::string & data); diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index 4e364e44624..e7c85bea1c6 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -559,6 +559,54 @@ struct CopyFileObjectStorageOperation final : public IDiskObjectStorageOperation } }; +struct TruncateFileObjectStorageOperation final : public IDiskObjectStorageOperation +{ + std::string path; + size_t size; + + TruncateFileOperationOutcomePtr truncate_outcome; + + TruncateFileObjectStorageOperation( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const std::string & path_, + size_t size_) + : IDiskObjectStorageOperation(object_storage_, metadata_storage_) + , path(path_) + , size(size_) + {} + + std::string getInfoForLog() const override + { + return fmt::format("TruncateFileObjectStorageOperation (path: {}, size: {})", path, size); + } + + void execute(MetadataTransactionPtr tx) override + { + if (metadata_storage.exists(path)) + { + if (!metadata_storage.isFile(path)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} is not a file", path); + + truncate_outcome = tx->truncateFile(path, size); + } + } + + void undo() override + { + + } + + void finalize() override + { + if (!truncate_outcome) + return; + + if (!truncate_outcome->objects_to_remove.empty()) + object_storage.removeObjectsIfExist(truncate_outcome->objects_to_remove); + } +}; + } void DiskObjectStorageTransaction::createDirectory(const std::string & path) @@ -598,6 +646,13 @@ void DiskObjectStorageTransaction::moveFile(const String & from_path, const Stri })); } +void DiskObjectStorageTransaction::truncateFile(const String & path, size_t size) +{ + operations_to_execute.emplace_back( + std::make_unique<TruncateFileObjectStorageOperation>(object_storage, metadata_storage, path, size) + ); +} + void DiskObjectStorageTransaction::replaceFile(const std::string & from_path, const std::string & to_path) { auto operation = std::make_unique(object_storage, metadata_storage, from_path, to_path); diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h index 67044751b84..23f66990d54 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h @@ -92,6 +92,8 @@ public: void createFile(const String & path) override; + void truncateFile(const String & path, size_t size) override; + void copyFile(const std::string & from_file_path, const std::string & to_file_path, const ReadSettings & read_settings, const WriteSettings &) override; /// writeFile is a difficult function for transactions.
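On an object-storage disk, truncate never rewrites blob contents: the metadata transaction pops whole trailing objects until the requested size is reached, and the dropped blobs are deleted in finalize(). A simplified sketch of that invariant (mirroring TruncateMetadataFileOperation::execute below; `metadata`, `outcome`, `path` and `target_size` are as in that operation):

// Drop whole trailing blobs; a target size that would split a blob is a logical error.
while (metadata->getTotalSizeBytes() > target_size)
{
    auto object = metadata->popLastObject();
    outcome->objects_to_remove.emplace_back(object.key.serialize(), path, object.metadata.size_bytes);
}
if (metadata->getTotalSizeBytes() != target_size)
    throw Exception(ErrorCodes::LOGICAL_ERROR, "File {} can't be truncated to size {}", path, target_size);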
diff --git a/src/Disks/ObjectStorages/IMetadataStorage.h b/src/Disks/ObjectStorages/IMetadataStorage.h index 168160f61a6..bed24849ed6 100644 --- a/src/Disks/ObjectStorages/IMetadataStorage.h +++ b/src/Disks/ObjectStorages/IMetadataStorage.h @@ -31,7 +31,15 @@ struct UnlinkMetadataFileOperationOutcome UInt32 num_hardlinks = std::numeric_limits<UInt32>::max(); }; +struct TruncateFileOperationOutcome +{ + StoredObjects objects_to_remove; +}; + + using UnlinkMetadataFileOperationOutcomePtr = std::shared_ptr<UnlinkMetadataFileOperationOutcome>; +using TruncateFileOperationOutcomePtr = std::shared_ptr<TruncateFileOperationOutcome>; + /// Tries to provide some "transactions" interface, which allow /// to execute (commit) operations simultaneously. We don't provide @@ -143,6 +151,11 @@ public: return nullptr; } + virtual TruncateFileOperationOutcomePtr truncateFile(const std::string & /* path */, size_t /* size */) + { + throwNotImplemented(); + } + virtual ~IMetadataTransaction() = default; protected: diff --git a/src/Disks/ObjectStorages/MetadataStorageFactory.cpp b/src/Disks/ObjectStorages/MetadataStorageFactory.cpp index 4a3e8a37d28..ab7c2069b43 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFactory.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFactory.cpp @@ -99,8 +99,10 @@ void registerMetadataStorageFromDisk(MetadataStorageFactory & factory) { auto metadata_path = config.getString(config_prefix + ".metadata_path", fs::path(Context::getGlobalContextInstance()->getPath()) / "disks" / name / ""); + auto metadata_keep_free_space_bytes = config.getUInt64(config_prefix + ".metadata_keep_free_space_bytes", 0); + fs::create_directories(metadata_path); - auto metadata_disk = std::make_shared<DiskLocal>(name + "-metadata", metadata_path, 0, config, config_prefix); + auto metadata_disk = std::make_shared<DiskLocal>(name + "-metadata", metadata_path, metadata_keep_free_space_bytes, config, config_prefix); auto key_compatibility_prefix = getObjectKeyCompatiblePrefix(*object_storage, config, config_prefix); return std::make_shared<MetadataStorageFromDisk>(metadata_disk, key_compatibility_prefix); }); diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp index ab952888419..493470982cb 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp @@ -259,4 +259,12 @@ UnlinkMetadataFileOperationOutcomePtr MetadataStorageFromDiskTransaction::unlink return result; } +TruncateFileOperationOutcomePtr MetadataStorageFromDiskTransaction::truncateFile(const std::string & path, size_t target_size) +{ + auto operation = std::make_unique<TruncateMetadataFileOperation>(path, target_size, metadata_storage, *metadata_storage.getDisk()); + auto result = operation->outcome; + addOperation(std::move(operation)); + return result; +} + } diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDisk.h b/src/Disks/ObjectStorages/MetadataStorageFromDisk.h index df16bf76a3c..8096b3b4565 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDisk.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromDisk.h @@ -129,6 +129,8 @@ public: UnlinkMetadataFileOperationOutcomePtr unlinkMetadata(const std::string & path) override; + TruncateFileOperationOutcomePtr truncateFile(const std::string & src_path, size_t target_size) override; + }; diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp index 194a735f64f..79d1f4a1f7c 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp +++
b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp @@ -4,9 +4,12 @@ #include #include #include +#include +#include #include #include #include +#include namespace fs = std::filesystem; @@ -14,6 +17,11 @@ namespace fs = std::filesystem; namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + static std::string getTempFileName(const std::string & dir) { return fs::path(dir) / getRandomASCIIString(32); } @@ -341,6 +349,35 @@ void UnlinkMetadataFileOperation::undo(std::unique_lock<SharedMutex> & lock) outcome->num_hardlinks++; } +void TruncateMetadataFileOperation::execute(std::unique_lock<SharedMutex> & metadata_lock) +{ + if (metadata_storage.exists(path)) + { + auto metadata = metadata_storage.readMetadataUnlocked(path, metadata_lock); + while (metadata->getTotalSizeBytes() > target_size) + { + auto object_key_with_metadata = metadata->popLastObject(); + outcome->objects_to_remove.emplace_back(object_key_with_metadata.key.serialize(), path, object_key_with_metadata.metadata.size_bytes); + } + if (metadata->getTotalSizeBytes() != target_size) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "File {} can't be truncated to size {}", path, target_size); + } + LOG_TEST(getLogger("TruncateMetadataFileOperation"), "Going to remove {} blobs.", outcome->objects_to_remove.size()); + + write_operation = std::make_unique<WriteFileOperation>(path, disk, metadata->serializeToString()); + + write_operation->execute(metadata_lock); + } +} + +void TruncateMetadataFileOperation::undo(std::unique_lock<SharedMutex> & lock) +{ + if (write_operation) + write_operation->undo(lock); +} + + void SetReadonlyFileOperation::execute(std::unique_lock<SharedMutex> & metadata_lock) { auto metadata = metadata_storage.readMetadataUnlocked(path, metadata_lock); diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h index 3df29833f44..26f9f6460a4 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h @@ -282,4 +282,34 @@ private: std::unique_ptr<WriteFileOperation> write_operation; }; +struct TruncateMetadataFileOperation final : public IMetadataOperation +{ + const TruncateFileOperationOutcomePtr outcome = std::make_shared<TruncateFileOperationOutcome>(); + + TruncateMetadataFileOperation( + const std::string & path_, + size_t target_size_, + const MetadataStorageFromDisk & metadata_storage_, + IDisk & disk_) + : path(path_) + , target_size(target_size_) + , metadata_storage(metadata_storage_) + , disk(disk_) + { + } + + void execute(std::unique_lock<SharedMutex> & metadata_lock) override; + + void undo(std::unique_lock<SharedMutex> & lock) override; + +private: + std::string path; + size_t target_size; + + const MetadataStorageFromDisk & metadata_storage; + IDisk & disk; + + std::unique_ptr<WriteFileOperation> write_operation; +}; + } diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index cc53054c775..c83b9247b99 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -306,7 +306,6 @@ void registerAzureObjectStorage(ObjectStorageFactory & factory) bool /* skip_access_check */) -> ObjectStoragePtr { AzureBlobStorageEndpoint endpoint = processAzureBlobStorageEndpoint(config, config_prefix); - std::string endpoint_string = endpoint.getEndpoint(); return createObjectStorage( ObjectStorageType::Azure, config, config_prefix, name, diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 043e5b8ef8c..adbdd9d13aa 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index c3114eb0b6f..35913613326 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -4,7 +4,7 @@ #if USE_AWS_S3 -#include +#include #include #include #include diff --git a/src/Disks/VolumeJBOD.cpp b/src/Disks/VolumeJBOD.cpp index a0c71583a22..d0e9d32ff5e 100644 --- a/src/Disks/VolumeJBOD.cpp +++ b/src/Disks/VolumeJBOD.cpp @@ -1,6 +1,6 @@ #include "VolumeJBOD.h" -#include +#include #include #include #include diff --git a/src/Formats/CapnProtoSchema.cpp b/src/Formats/CapnProtoSchema.cpp index 559047a6f8d..6076dae4157 100644 --- a/src/Formats/CapnProtoSchema.cpp +++ b/src/Formats/CapnProtoSchema.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 3edade639df..89a7a31d033 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -76,7 +76,7 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca /// Empty field, just skip spaces break; case FormatSettings::EscapingRule::Escaped: - readEscapedStringInto(out, buf); + readEscapedStringInto(out, buf); break; case FormatSettings::EscapingRule::Quoted: readQuotedFieldInto(out, buf); diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 3199445864d..90630d30c20 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -15,7 +16,7 @@ #include #include #include -#include +#include #include @@ -202,6 +203,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header; format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines; format_settings.tsv.allow_variable_number_of_columns = settings.input_format_tsv_allow_variable_number_of_columns; + format_settings.tsv.crlf_end_of_line_input = settings.input_format_tsv_crlf_end_of_line; format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; format_settings.values.allow_data_after_semicolon = settings.input_format_values_allow_data_after_semicolon; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; @@ -693,21 +695,12 @@ String FormatFactory::getFormatFromFileName(String file_name) std::optional FormatFactory::tryGetFormatFromFileDescriptor(int fd) { -#ifdef OS_LINUX - std::string proc_path = fmt::format("/proc/self/fd/{}", fd); - char file_path[PATH_MAX] = {'\0'}; - if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1) - return tryGetFormatFromFileName(file_path); + std::optional file_name = tryGetFileNameFromFileDescriptor(fd); + + if (file_name) + return tryGetFormatFromFileName(*file_name); + return std::nullopt; -#elif defined(OS_DARWIN) - char file_path[PATH_MAX] = {'\0'}; - if (fcntl(fd, F_GETPATH, file_path) != -1) - return tryGetFormatFromFileName(file_path); - return std::nullopt; 
-#else - (void)fd; - return std::nullopt; -#endif } String FormatFactory::getFormatFromFileDescriptor(int fd) diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index f29fc51af6a..7287baef290 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -44,9 +44,9 @@ struct FormatSettings String column_names_for_schema_inference{}; String schema_inference_hints{}; - bool try_infer_integers = false; - bool try_infer_dates = false; - bool try_infer_datetimes = false; + bool try_infer_integers = true; + bool try_infer_dates = true; + bool try_infer_datetimes = true; bool try_infer_exponent_floats = false; enum class DateTimeInputFormat : uint8_t @@ -361,6 +361,7 @@ struct FormatSettings bool try_detect_header = true; bool skip_trailing_empty_lines = false; bool allow_variable_number_of_columns = false; + bool crlf_end_of_line_input = false; } tsv{}; struct diff --git a/src/Formats/NativeReader.cpp b/src/Formats/NativeReader.cpp index 8286b24d0a6..39915b0735e 100644 --- a/src/Formats/NativeReader.cpp +++ b/src/Formats/NativeReader.cpp @@ -93,7 +93,7 @@ void NativeReader::readData(const ISerialization & serialization, ColumnPtr & co ISerialization::DeserializeBinaryBulkStatePtr state; - serialization.deserializeBinaryBulkStatePrefix(settings, state); + serialization.deserializeBinaryBulkStatePrefix(settings, state, nullptr); serialization.deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state, nullptr); if (column->size() != rows) diff --git a/src/Formats/StructureToCapnProtoSchema.cpp b/src/Formats/StructureToCapnProtoSchema.cpp index 99298fadee1..cd45b19d3c0 100644 --- a/src/Formats/StructureToCapnProtoSchema.cpp +++ b/src/Formats/StructureToCapnProtoSchema.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Formats/StructureToProtobufSchema.cpp b/src/Formats/StructureToProtobufSchema.cpp index 178c0ae3cc2..9fd02969adb 100644 --- a/src/Formats/StructureToProtobufSchema.cpp +++ b/src/Formats/StructureToProtobufSchema.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 751e8cf5103..c52b00150ec 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -31,7 +31,7 @@ extract_into_parent_list(clickhouse_functions_headers dbms_headers add_library(clickhouse_functions_obj OBJECT ${clickhouse_functions_headers} ${clickhouse_functions_sources}) if (OMIT_HEAVY_DEBUG_SYMBOLS) target_compile_options(clickhouse_functions_obj PRIVATE "-g0") - set_source_files_properties(${DBMS_FUNCTIONS} PROPERTIES COMPILE_FLAGS "-g0") + set_source_files_properties(${DBMS_FUNCTIONS} DIRECTORY .. 
PROPERTIES COMPILE_FLAGS "-g0") endif() list (APPEND OBJECT_LIBS $) diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index aa0e1b04835..6beb8be830a 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include #include #include diff --git a/src/Functions/FunctionHelpers.cpp b/src/Functions/FunctionHelpers.cpp index d85bb0e7060..3b057779ffe 100644 --- a/src/Functions/FunctionHelpers.cpp +++ b/src/Functions/FunctionHelpers.cpp @@ -21,6 +21,8 @@ namespace ErrorCodes const ColumnConst * checkAndGetColumnConstStringOrFixedString(const IColumn * column) { + if (!column) + return {}; if (!isColumnConst(*column)) return {}; diff --git a/src/Functions/FunctionTokens.h b/src/Functions/FunctionTokens.h index c80152bc71d..d6cf6a24983 100644 --- a/src/Functions/FunctionTokens.h +++ b/src/Functions/FunctionTokens.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index beb7e6feb47..44d0b750af9 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -38,11 +39,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -65,7 +68,6 @@ #include #include - namespace DB { @@ -574,7 +576,7 @@ ColumnUInt8::MutablePtr copyNullMap(ColumnPtr col) template struct ConvertImplGenericToString { - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/, const ContextPtr & context) { static_assert(std::is_same_v || std::is_same_v, "Can be used only to serialize to ColumnString or ColumnFixedString"); @@ -595,7 +597,7 @@ struct ConvertImplGenericToString auto & write_buffer = write_helper.getWriteBuffer(); - FormatSettings format_settings; + FormatSettings format_settings = context ? getFormatSettings(context) : FormatSettings{}; auto serialization = type.getDefaultSerialization(); for (size_t row = 0; row < size; ++row) { @@ -1819,7 +1821,7 @@ struct ConvertImpl template struct ConvertImplGenericFromString { - static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) + static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count, const ContextPtr & context) { const IColumn & column_from = *arguments[0].column; const IDataType & data_type_to = *result_type; @@ -1827,7 +1829,7 @@ struct ConvertImplGenericFromString auto serialization = data_type_to.getDefaultSerialization(); const auto * null_map = column_nullable ? 
&column_nullable->getNullMapData() : nullptr; - executeImpl(column_from, *res, *serialization, input_rows_count, null_map, result_type.get()); + executeImpl(column_from, *res, *serialization, input_rows_count, null_map, result_type.get(), context); return res; } @@ -1837,11 +1839,12 @@ struct ConvertImplGenericFromString const ISerialization & serialization_from, size_t input_rows_count, const PaddedPODArray * null_map, - const IDataType * result_type) + const IDataType * result_type, + const ContextPtr & context) { column_to.reserve(input_rows_count); - FormatSettings format_settings; + FormatSettings format_settings = context ? getFormatSettings(context) : FormatSettings{}; for (size_t i = 0; i < input_rows_count; ++i) { if (null_map && (*null_map)[i]) @@ -2298,7 +2301,7 @@ private: if constexpr (std::is_same_v) { if (from_type->getCustomSerialization()) - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context); } bool done = false; @@ -2331,7 +2334,7 @@ private: /// Generic conversion of any type to String. if (std::is_same_v) { - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context); } else throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", @@ -3287,8 +3290,17 @@ private: if (checkAndGetDataType(from_type.get())) { if (cast_type == CastType::accurateOrNull) - return &ConvertImplGenericFromString::execute; - return &ConvertImplGenericFromString::execute; + { + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; + } + + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; } return createWrapper(from_type, to_type, requested_result_is_nullable); @@ -3451,7 +3463,10 @@ private: /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; } else if (const auto * agg_type = checkAndGetDataType(from_type_untyped.get())) { @@ -3494,7 +3509,10 @@ private: /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; } DataTypePtr from_type_holder; @@ -3585,7 +3603,10 @@ private: /// Conversion from String through parsing. 
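/// (For example, CAST('42' AS Variant(String, UInt64)) goes through parsing and may place the value
/// into the UInt64 variant, while a cast to the single-variant Variant(String) inserts the string directly.)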
if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; } const auto * from_type = checkAndGetDataType(from_type_untyped.get()); @@ -3934,9 +3955,9 @@ private: } else if (checkAndGetDataType(from_type.get())) { - return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) { - auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count)->assumeMutable(); + auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count, context)->assumeMutable(); res->finalize(); return res; }; @@ -4053,9 +4074,9 @@ private: casted_variant_columns.reserve(variant_types.size()); for (size_t i = 0; i != variant_types.size(); ++i) { - auto variant_col = column_variant.getVariantPtrByLocalDiscriminator(i); + auto variant_col = column_variant.getVariantPtrByGlobalDiscriminator(i); ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], "" }}; - const auto & variant_wrapper = variant_wrappers[column_variant.globalDiscriminatorByLocal(i)]; + const auto & variant_wrapper = variant_wrappers[i]; casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size())); } @@ -4065,11 +4086,11 @@ private: res->reserve(input_rows_count); for (size_t i = 0; i != input_rows_count; ++i) { - auto local_discr = local_discriminators[i]; - if (local_discr == ColumnVariant::NULL_DISCRIMINATOR) + auto global_discr = column_variant.globalDiscriminatorByLocal(local_discriminators[i]); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) res->insertDefault(); else - res->insertFrom(*casted_variant_columns[local_discr], column_variant.offsetAt(i)); + res->insertFrom(*casted_variant_columns[global_discr], column_variant.offsetAt(i)); } return res; @@ -4109,8 +4130,8 @@ private: args[0].type = removeNullable(removeLowCardinality(args[0].type)); if (cast_type == CastType::accurateOrNull) - return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count); - return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count); + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count, context); + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count, context); }; } @@ -4127,7 +4148,7 @@ private: }; } - auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(*removeNullableOrLowCardinalityNullable(from_type)); + auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(removeNullableOrLowCardinalityNullable(from_type)->getName()); /// Cast String to Variant through parsing if it's not Variant(String). 
if (isStringOrFixedString(removeNullable(removeLowCardinality(from_type))) && (!variant_discr_opt || to_variant.getVariants().size() > 1)) return createStringToVariantWrapper(); @@ -4239,6 +4260,284 @@ private: return createColumnToVariantWrapper(from_type, assert_cast(*to_type)); } + WrapperType createDynamicToColumnWrapper(const DataTypePtr &) const + { + return [this] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * col_nullable, size_t input_rows_count) -> ColumnPtr + { + const auto & column_dynamic = assert_cast(*arguments.front().column.get()); + const auto & variant_info = column_dynamic.getVariantInfo(); + auto variant_wrapper = createVariantToColumnWrapper(assert_cast(*variant_info.variant_type), result_type); + ColumnsWithTypeAndName args = {ColumnWithTypeAndName(column_dynamic.getVariantColumnPtr(), variant_info.variant_type, "")}; + return variant_wrapper(args, result_type, col_nullable, input_rows_count); + }; + } + + WrapperType createStringToDynamicThroughParsingWrapper() const + { + return [&](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + auto column = arguments[0].column->convertToFullColumnIfLowCardinality(); + auto args = arguments; + args[0].column = column; + + const ColumnNullable * column_nullable = nullptr; + if (isColumnNullable(*args[0].column)) + { + column_nullable = assert_cast(args[0].column.get()); + args[0].column = column_nullable->getNestedColumnPtr(); + } + + args[0].type = removeNullable(removeLowCardinality(args[0].type)); + + if (cast_type == CastType::accurateOrNull) + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count, context); + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count, context); + }; + } + + std::pair getReducedVariant( + const ColumnVariant & variant_column, + const DataTypePtr & variant_type, + const std::unordered_map & variant_name_to_discriminator, + size_t max_result_num_variants, + const ColumnDynamic::Statistics & statistics = {}) const + { + const auto & variant_types = assert_cast(*variant_type).getVariants(); + /// First check if we don't exceed the limit in current Variant column. + if (variant_types.size() < max_result_num_variants || (variant_types.size() == max_result_num_variants && variant_name_to_discriminator.contains("String"))) + return {variant_column.getPtr(), variant_type}; + + /// We want to keep the most frequent variants and convert to string the rarest. + std::vector> variant_sizes; + variant_sizes.reserve(variant_types.size()); + std::optional old_string_discriminator; + /// List of variants that should be converted to a single String variant. + std::vector variants_to_convert_to_string; + for (size_t i = 0; i != variant_types.size(); ++i) + { + /// String variant won't be removed. + String variant_name = variant_types[i]->getName(); + + if (variant_name == "String") + { + old_string_discriminator = i; + /// For simplicity, add this variant to the list that will be converted to string, + /// so we will process it with other variants when constructing the new String variant. 
+ variants_to_convert_to_string.push_back(i); + } + else + { + size_t size = 0; + if (statistics.data.empty()) + size = variant_column.getVariantByGlobalDiscriminator(i).size(); + else + size = statistics.data.at(variant_name); + variant_sizes.emplace_back(size, i); + } + } + + /// Sort variants by sizes, so we will keep the most frequent. + std::sort(variant_sizes.begin(), variant_sizes.end(), std::greater()); + + DataTypes remaining_variants; + remaining_variants.reserve(max_result_num_variants); + /// Add String variant in advance. + remaining_variants.push_back(std::make_shared()); + for (auto [_, discr] : variant_sizes) + { + if (remaining_variants.size() != max_result_num_variants) + remaining_variants.push_back(variant_types[discr]); + else + variants_to_convert_to_string.push_back(discr); + } + + auto reduced_variant = std::make_shared(remaining_variants); + const auto & new_variants = reduced_variant->getVariants(); + /// To construct reduced variant column we will need mapping from old to new discriminators. + std::vector old_to_new_discriminators_mapping; + old_to_new_discriminators_mapping.resize(variant_types.size()); + ColumnVariant::Discriminator string_variant_discriminator = 0; + for (size_t i = 0; i != new_variants.size(); ++i) + { + String variant_name = new_variants[i]->getName(); + if (variant_name == "String") + { + string_variant_discriminator = i; + for (auto discr : variants_to_convert_to_string) + old_to_new_discriminators_mapping[discr] = i; + } + else + { + auto old_discr = variant_name_to_discriminator.at(variant_name); + old_to_new_discriminators_mapping[old_discr] = i; + } + } + + /// Convert all reduced variants to String. + std::unordered_map variants_converted_to_string; + variants_converted_to_string.reserve(variants_to_convert_to_string.size()); + size_t string_variant_size = 0; + for (auto discr : variants_to_convert_to_string) + { + auto string_type = std::make_shared(); + auto string_wrapper = prepareUnpackDictionaries(variant_types[discr], string_type); + auto column_to_convert = ColumnWithTypeAndName(variant_column.getVariantPtrByGlobalDiscriminator(discr), variant_types[discr], ""); + ColumnsWithTypeAndName args = {column_to_convert}; + auto variant_string_column = string_wrapper(args, string_type, nullptr, column_to_convert.column->size()); + string_variant_size += variant_string_column->size(); + variants_converted_to_string[discr] = variant_string_column; + } + + /// Create new discriminators and offsets and fill new String variant according to old discriminators. 
+ auto string_variant = ColumnString::create(); + string_variant->reserve(string_variant_size); + auto new_discriminators_column = variant_column.getLocalDiscriminatorsPtr()->cloneEmpty(); + auto & new_discriminators_data = assert_cast(*new_discriminators_column).getData(); + new_discriminators_data.reserve(variant_column.size()); + auto new_offsets = variant_column.getOffsetsPtr()->cloneEmpty(); + auto & new_offsets_data = assert_cast(*new_offsets).getData(); + new_offsets_data.reserve(variant_column.size()); + const auto & old_local_discriminators = variant_column.getLocalDiscriminators(); + const auto & old_offsets = variant_column.getOffsets(); + for (size_t i = 0; i != old_local_discriminators.size(); ++i) + { + auto old_discr = variant_column.globalDiscriminatorByLocal(old_local_discriminators[i]); + + if (old_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + new_discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + new_offsets_data.push_back(0); + continue; + } + + auto new_discr = old_to_new_discriminators_mapping[old_discr]; + new_discriminators_data.push_back(new_discr); + if (new_discr != string_variant_discriminator) + { + new_offsets_data.push_back(old_offsets[i]); + } + else + { + new_offsets_data.push_back(string_variant->size()); + string_variant->insertFrom(*variants_converted_to_string[old_discr], old_offsets[i]); + } + } + + /// Create new list of variant columns. + Columns new_variant_columns; + new_variant_columns.resize(new_variants.size()); + for (size_t i = 0; i != variant_types.size(); ++i) + { + auto new_discr = old_to_new_discriminators_mapping[i]; + if (new_discr != string_variant_discriminator) + new_variant_columns[new_discr] = variant_column.getVariantPtrByGlobalDiscriminator(i); + } + new_variant_columns[string_variant_discriminator] = std::move(string_variant); + return {ColumnVariant::create(std::move(new_discriminators_column), std::move(new_offsets), new_variant_columns), reduced_variant}; + } + + WrapperType createVariantToDynamicWrapper(const DataTypePtr & from_type, const DataTypeDynamic & dynamic_type) const + { + const auto & from_variant_type = assert_cast(*from_type); + size_t max_dynamic_types = dynamic_type.getMaxDynamicTypes(); + const auto & variants = from_variant_type.getVariants(); + std::unordered_map variant_name_to_discriminator; + variant_name_to_discriminator.reserve(variants.size()); + for (size_t i = 0; i != variants.size(); ++i) + variant_name_to_discriminator[variants[i]->getName()] = i; + + return [from_type, max_dynamic_types, variant_name_to_discriminator, this] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & variant_column = assert_cast(*arguments.front().column); + auto [reduced_variant_column, reduced_variant_type] = getReducedVariant(variant_column, from_type, variant_name_to_discriminator, max_dynamic_types); + return ColumnDynamic::create(reduced_variant_column, reduced_variant_type, max_dynamic_types); + }; + } + + WrapperType createColumnToDynamicWrapper(const DataTypePtr & from_type, const DataTypeDynamic & dynamic_type) const + { + if (const auto * variant_type = typeid_cast(from_type.get())) + return createVariantToDynamicWrapper(from_type, dynamic_type); + + if (dynamic_type.getMaxDynamicTypes() == 1) + { + DataTypePtr string_type = std::make_shared(); + if (from_type->isNullable()) + string_type = makeNullable(string_type); + auto string_wrapper = prepareUnpackDictionaries(from_type, string_type); + auto variant_type = 
std::make_shared(DataTypes{removeNullable(string_type)}); + auto variant_wrapper = createColumnToVariantWrapper(string_type, *variant_type); + return [string_wrapper, variant_wrapper, string_type, variant_type, max_dynamic_types=dynamic_type.getMaxDynamicTypes()] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * col_nullable, size_t input_rows_count) -> ColumnPtr + { + auto string_column = string_wrapper(arguments, string_type, col_nullable, input_rows_count); + auto column = ColumnWithTypeAndName(string_column, string_type, ""); + ColumnsWithTypeAndName args = {column}; + auto variant_column = variant_wrapper(args, variant_type, nullptr, string_column->size()); + return ColumnDynamic::create(variant_column, variant_type, max_dynamic_types); + }; + } + + if (context && context->getSettingsRef().cast_string_to_dynamic_use_inference && isStringOrFixedString(removeNullable(removeLowCardinality(from_type)))) + return createStringToDynamicThroughParsingWrapper(); + + auto variant_type = std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(from_type)}); + auto variant_wrapper = createColumnToVariantWrapper(from_type, *variant_type); + return [variant_wrapper, variant_type, max_dynamic_types=dynamic_type.getMaxDynamicTypes()] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * col_nullable, size_t input_rows_count) -> ColumnPtr + { + auto variant_res = variant_wrapper(arguments, variant_type, col_nullable, input_rows_count); + return ColumnDynamic::create(variant_res, variant_type, max_dynamic_types); + }; + } + + WrapperType createDynamicToDynamicWrapper(const DataTypeDynamic & from_dynamic, const DataTypeDynamic & to_dynamic) const + { + size_t from_max_types = from_dynamic.getMaxDynamicTypes(); + size_t to_max_types = to_dynamic.getMaxDynamicTypes(); + if (from_max_types == to_max_types) + return createIdentityWrapper(from_dynamic.getPtr()); + + if (to_max_types > from_max_types) + { + return [to_max_types] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & column_dynamic = assert_cast(*arguments[0].column); + return ColumnDynamic::create(column_dynamic.getVariantColumnPtr(), column_dynamic.getVariantInfo(), to_max_types); + }; + } + + return [to_max_types, this] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & column_dynamic = assert_cast(*arguments[0].column); + auto [reduced_variant_column, reduced_variant_type] = getReducedVariant( + column_dynamic.getVariantColumn(), + column_dynamic.getVariantInfo().variant_type, + column_dynamic.getVariantInfo().variant_name_to_discriminator, + to_max_types, + column_dynamic.getStatistics()); + return ColumnDynamic::create(reduced_variant_column, reduced_variant_type, to_max_types); + }; + } + + /// Wrapper for conversion to/from Dynamic type + WrapperType createDynamicWrapper(const DataTypePtr & from_type, const DataTypePtr & to_type) const + { + if (const auto * from_dynamic = checkAndGetDataType(from_type.get())) + { + if (const auto * to_dynamic = checkAndGetDataType(to_type.get())) + return createDynamicToDynamicWrapper(*from_dynamic, *to_dynamic); + + return createDynamicToColumnWrapper(to_type); + } + + return createColumnToDynamicWrapper(from_type, *checkAndGetDataType(to_type.get())); + } + template WrapperType createEnumWrapper(const DataTypePtr & from_type, const DataTypeEnum * to_type) const { @@ -4418,8 +4717,11 @@ 
private: WrapperType prepareUnpackDictionaries(const DataTypePtr & from_type, const DataTypePtr & to_type) const { - /// Conversion from/to Variant data type is processed in a special way. + /// Conversion from/to Variant/Dynamic data type is processed in a special way. /// We don't need to remove LowCardinality/Nullable. + if (isDynamic(to_type) || isDynamic(from_type)) + return createDynamicWrapper(from_type, to_type); + if (isVariant(to_type) || isVariant(from_type)) return createVariantWrapper(from_type, to_type); @@ -4733,7 +5035,7 @@ private: if (to_type->getCustomSerialization() && to_type->getCustomName()) { - ret = [this, requested_result_is_nullable]( + ret = [requested_result_is_nullable, this]( ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, @@ -4744,9 +5046,9 @@ private: wrapped_result_type = makeNullable(result_type); if (this->cast_type == CastType::accurateOrNull) return ConvertImplGenericFromString::execute( - arguments, wrapped_result_type, column_nullable, input_rows_count); + arguments, wrapped_result_type, column_nullable, input_rows_count, context); return ConvertImplGenericFromString::execute( - arguments, wrapped_result_type, column_nullable, input_rows_count); + arguments, wrapped_result_type, column_nullable, input_rows_count, context); }; return true; } @@ -4782,9 +5084,9 @@ private: } else if (from_type->getCustomSerialization()) { - ret = [](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + ret = [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr { - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context); }; return true; } diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 3d8a11319c4..27717ea3611 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -49,6 +49,8 @@ #include #include +#include + namespace DB { @@ -75,17 +77,29 @@ namespace impl ColumnPtr key0; ColumnPtr key1; bool is_const; + const ColumnArray::Offsets * offsets{}; size_t size() const { assert(key0 && key1); assert(key0->size() == key1->size()); + assert(offsets == nullptr || offsets->size() == key0->size()); + if (offsets != nullptr) + return offsets->back(); return key0->size(); } SipHashKey getKey(size_t i) const { if (is_const) i = 0; + if (offsets != nullptr) + { + const auto *const begin = offsets->begin(); + const auto * upper = std::upper_bound(begin, offsets->end(), i); + if (upper == offsets->end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "offset {} not found in function SipHashKeyColumns::getKey", i); + i = upper - begin; + } const auto & key0data = assert_cast(*key0).getData(); const auto & key1data = assert_cast(*key1).getData(); return {key0data[i], key1data[i]}; @@ -1112,7 +1126,15 @@ private: typename ColumnVector::Container vec_temp(nested_size); bool nested_is_first = true; - executeForArgument(key_cols, nested_type, nested_column, vec_temp, nested_is_first); + + if constexpr (Keyed) + { + KeyColumnsType key_cols_tmp{key_cols}; + key_cols_tmp.offsets = &offsets; + executeForArgument(key_cols_tmp, nested_type, nested_column, vec_temp, nested_is_first); + } + else + executeForArgument(key_cols, nested_type, nested_column, vec_temp, nested_is_first); const 
size_t size = offsets.size(); diff --git a/src/Functions/FunctionsProgrammingClassification.cpp b/src/Functions/FunctionsProgrammingClassification.cpp index a93e1d9a87d..c01e47ad0d7 100644 --- a/src/Functions/FunctionsProgrammingClassification.cpp +++ b/src/Functions/FunctionsProgrammingClassification.cpp @@ -2,7 +2,7 @@ #if USE_NLP -#include +#include #include #include diff --git a/src/Functions/FunctionsTonalityClassification.cpp b/src/Functions/FunctionsTonalityClassification.cpp index 3de38d99c88..a9321819a26 100644 --- a/src/Functions/FunctionsTonalityClassification.cpp +++ b/src/Functions/FunctionsTonalityClassification.cpp @@ -2,7 +2,7 @@ #if USE_NLP -#include +#include #include #include diff --git a/src/Functions/JSONPath/Parsers/ParserJSONPathMemberAccess.cpp b/src/Functions/JSONPath/Parsers/ParserJSONPathMemberAccess.cpp index 6d5e37623e9..fad822379d4 100644 --- a/src/Functions/JSONPath/Parsers/ParserJSONPathMemberAccess.cpp +++ b/src/Functions/JSONPath/Parsers/ParserJSONPathMemberAccess.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Functions/LowerUpperUTF8Impl.h b/src/Functions/LowerUpperUTF8Impl.h index bb794a0f8ed..eebba7b9d5f 100644 --- a/src/Functions/LowerUpperUTF8Impl.h +++ b/src/Functions/LowerUpperUTF8Impl.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #ifdef __SSE2__ @@ -94,7 +95,7 @@ struct LowerUpperUTF8Impl if (data.empty()) return; - bool all_ascii = UTF8::isAllASCII(data.data(), data.size()); + bool all_ascii = isAllASCII(data.data(), data.size()); if (all_ascii) { LowerUpperImpl::vector(data, offsets, res_data, res_offsets); diff --git a/src/Functions/URL/domain.h b/src/Functions/URL/domain.h index 87f5aeffda7..936fb9d5f00 100644 --- a/src/Functions/URL/domain.h +++ b/src/Functions/URL/domain.h @@ -3,7 +3,7 @@ #include "protocol.h" #include #include -#include +#include namespace DB { diff --git a/src/Functions/URL/netloc.cpp b/src/Functions/URL/netloc.cpp index abfa7ec26fd..d1ca4fa1614 100644 --- a/src/Functions/URL/netloc.cpp +++ b/src/Functions/URL/netloc.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/src/Functions/URL/port.cpp b/src/Functions/URL/port.cpp index 942f6b702fd..c8f50f10a56 100644 --- a/src/Functions/URL/port.cpp +++ b/src/Functions/URL/port.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include diff --git a/src/Functions/URL/protocol.h b/src/Functions/URL/protocol.h index c1d83192835..5e90f538ff1 100644 --- a/src/Functions/URL/protocol.h +++ b/src/Functions/URL/protocol.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/src/Functions/UserDefined/ExternalUserDefinedExecutableFunctionsLoader.cpp b/src/Functions/UserDefined/ExternalUserDefinedExecutableFunctionsLoader.cpp index a4f17aa1201..2c031158c48 100644 --- a/src/Functions/UserDefined/ExternalUserDefinedExecutableFunctionsLoader.cpp +++ b/src/Functions/UserDefined/ExternalUserDefinedExecutableFunctionsLoader.cpp @@ -1,7 +1,7 @@ #include "ExternalUserDefinedExecutableFunctionsLoader.h" #include -#include +#include #include diff --git a/src/Functions/UserDefined/UserDefinedSQLObjectsDiskStorage.cpp b/src/Functions/UserDefined/UserDefinedSQLObjectsDiskStorage.cpp index d874612ad04..b406cc8d317 100644 --- a/src/Functions/UserDefined/UserDefinedSQLObjectsDiskStorage.cpp +++ b/src/Functions/UserDefined/UserDefinedSQLObjectsDiskStorage.cpp @@ -3,7 +3,7 @@ #include "Functions/UserDefined/UserDefinedSQLFunctionFactory.h" #include 
"Functions/UserDefined/UserDefinedSQLObjectType.h" -#include +#include #include #include #include diff --git a/src/Functions/alphaTokens.cpp b/src/Functions/alphaTokens.cpp index 35f434e7498..f4d77f1d654 100644 --- a/src/Functions/alphaTokens.cpp +++ b/src/Functions/alphaTokens.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include namespace DB diff --git a/src/Functions/arrayStringConcat.cpp b/src/Functions/arrayStringConcat.cpp index b787feeeca1..421408c01f2 100644 --- a/src/Functions/arrayStringConcat.cpp +++ b/src/Functions/arrayStringConcat.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include diff --git a/src/Functions/decodeHTMLComponent.cpp b/src/Functions/decodeHTMLComponent.cpp index 4db3c43f946..00a601b77a6 100644 --- a/src/Functions/decodeHTMLComponent.cpp +++ b/src/Functions/decodeHTMLComponent.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Functions/decodeXMLComponent.cpp b/src/Functions/decodeXMLComponent.cpp index a25e67e0e37..cbbe46fcb8c 100644 --- a/src/Functions/decodeXMLComponent.cpp +++ b/src/Functions/decodeXMLComponent.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Functions/dynamicElement.cpp b/src/Functions/dynamicElement.cpp new file mode 100644 index 00000000000..202533dc5c8 --- /dev/null +++ b/src/Functions/dynamicElement.cpp @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; +} + +namespace +{ + +/** Extract element of Dynamic by type name. + * Also the function looks through Arrays: you can get Array of Dynamic elements from Array of Dynamic. + */ +class FunctionDynamicElement : public IFunction +{ +public: + static constexpr auto name = "dynamicElement"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 2; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + const size_t number_of_arguments = arguments.size(); + + if (number_of_arguments != 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 2", + getName(), number_of_arguments); + + size_t count_arrays = 0; + const IDataType * input_type = arguments[0].type.get(); + while (const DataTypeArray * array = checkAndGetDataType(input_type)) + { + input_type = array->getNestedType().get(); + ++count_arrays; + } + + if (!isDynamic(*input_type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant or Array of Variant. 
Actual {}", + getName(), + arguments[0].type->getName()); + + auto return_type = makeNullableOrLowCardinalityNullableSafe(getRequestedType(arguments[1].column)); + + for (; count_arrays; --count_arrays) + return_type = std::make_shared(return_type); + + return return_type; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto & input_arg = arguments[0]; + const IDataType * input_type = input_arg.type.get(); + const IColumn * input_col = input_arg.column.get(); + + bool input_arg_is_const = false; + if (typeid_cast(input_col)) + { + input_col = assert_cast(input_col)->getDataColumnPtr().get(); + input_arg_is_const = true; + } + + Columns array_offsets; + while (const DataTypeArray * array_type = checkAndGetDataType(input_type)) + { + const ColumnArray * array_col = assert_cast(input_col); + + input_type = array_type->getNestedType().get(); + input_col = &array_col->getData(); + array_offsets.push_back(array_col->getOffsetsPtr()); + } + + const ColumnDynamic * input_col_as_dynamic = checkAndGetColumn(input_col); + const DataTypeDynamic * input_type_as_dynamic = checkAndGetDataType(input_type); + if (!input_col_as_dynamic || !input_type_as_dynamic) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Dynamic or array of Dynamics. Actual {}", getName(), input_arg.type->getName()); + + auto type = getRequestedType(arguments[1].column); + auto subcolumn = input_type_as_dynamic->getSubcolumn(type->getName(), input_col_as_dynamic->getPtr()); + return wrapInArraysAndConstIfNeeded(std::move(subcolumn), array_offsets, input_arg_is_const, input_rows_count); + } + +private: + DataTypePtr getRequestedType(const ColumnPtr & type_name_column) const + { + const auto * name_col = checkAndGetColumnConst(type_name_column.get()); + if (!name_col) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument of {} must be a constant String", getName()); + + String element_type_name = name_col->getValue(); + auto element_type = DataTypeFactory::instance().tryGet(element_type_name); + if (!element_type) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument of {} must be a valid type name. Got: {}", getName(), element_type_name); + + return element_type; + } + + ColumnPtr wrapInArraysAndConstIfNeeded(ColumnPtr res, const Columns & array_offsets, bool input_arg_is_const, size_t input_rows_count) const + { + for (auto it = array_offsets.rbegin(); it != array_offsets.rend(); ++it) + res = ColumnArray::create(res, *it); + + if (input_arg_is_const) + res = ColumnConst::create(res, input_rows_count); + + return res; + } +}; + +} + +REGISTER_FUNCTION(DynamicElement) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Extracts a column with specified type from a `Dynamic` column. 
+)", + .syntax{"dynamicElement(dynamic, type_name)"}, + .arguments{ + {"dynamic", "Dynamic column"}, + {"type_name", "The name of the variant type to extract"}}, + .examples{{{ + "Example", + R"( +CREATE TABLE test (d Dynamic) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT d, dynamicType(d), dynamicElement(d, 'String'), dynamicElement(d, 'Int64'), dynamicElement(d, 'Array(Int64)'), dynamicElement(d, 'Date'), dynamicElement(d, 'Array(String)') FROM test;)", + R"( +┌─d─────────────┬─dynamicType(d)─┬─dynamicElement(d, 'String')─┬─dynamicElement(d, 'Int64')─┬─dynamicElement(d, 'Array(Int64)')─┬─dynamicElement(d, 'Date')─┬─dynamicElement(d, 'Array(String)')─┐ +│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ [] │ ᴺᵁᴸᴸ │ [] │ +│ Hello, World! │ String │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ ᴺᵁᴸᴸ │ [] │ +└───────────────┴────────────────┴─────────────────────────────┴────────────────────────────┴───────────────────────────────────┴───────────────────────────┴────────────────────────────────────┘ +)"}}}, + .categories{"Dynamic"}, + }); +} + +} diff --git a/src/Functions/dynamicType.cpp b/src/Functions/dynamicType.cpp new file mode 100644 index 00000000000..e8ca73597d6 --- /dev/null +++ b/src/Functions/dynamicType.cpp @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +namespace +{ + +/// Return String with type name for each row in Dynamic column. +class FunctionDynamicType : public IFunction +{ +public: + static constexpr auto name = "dynamicType"; + static constexpr auto name_for_null = "None"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.empty() || arguments.size() > 1) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1", + getName(), arguments.empty()); + + if (!isDynamic(arguments[0].type.get())) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Dynamic, got {} instead", + getName(), arguments[0].type->getName()); + + return std::make_shared(std::make_shared()); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + const ColumnDynamic * dynamic_column = checkAndGetColumn(arguments[0].column.get()); + if (!dynamic_column) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Dynamic, got {} instead", + getName(), arguments[0].type->getName()); + + const auto & variant_info = dynamic_column->getVariantInfo(); + const auto & variant_column = 
dynamic_column->getVariantColumn(); + auto res = result_type->createColumn(); + String element_type; + for (size_t i = 0; i != input_rows_count; ++i) + { + auto global_discr = variant_column.globalDiscriminatorAt(i); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + element_type = name_for_null; + else + element_type = variant_info.variant_names[global_discr]; + + res->insertData(element_type.data(), element_type.size()); + } + + return res; + } +}; + +} + +REGISTER_FUNCTION(DynamicType) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Returns the variant type name for each row of a `Dynamic` column. If a row contains NULL, it returns 'None' for it. +)", + .syntax = {"dynamicType(dynamic)"}, + .arguments = {{"dynamic", "Dynamic column"}}, + .examples = {{{ + "Example", + R"( +CREATE TABLE test (d Dynamic) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT d, dynamicType(d) FROM test; +)", + R"( +┌─d─────────────┬─dynamicType(d)─┐ +│ ᴺᵁᴸᴸ │ None │ +│ 42 │ Int64 │ +│ Hello, World! │ String │ +│ [1,2,3] │ Array(Int64) │ +└───────────────┴────────────────┘ +)"}}}, + .categories{"Dynamic"}, + }); +} + +} diff --git a/src/Functions/extractAll.cpp b/src/Functions/extractAll.cpp index f0c18bf79b9..5801a7b8f4f 100644 --- a/src/Functions/extractAll.cpp +++ b/src/Functions/extractAll.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Functions/extractTextFromHTML.cpp b/src/Functions/extractTextFromHTML.cpp index 4eefeaa9f86..d9aa004b279 100644 --- a/src/Functions/extractTextFromHTML.cpp +++ b/src/Functions/extractTextFromHTML.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include /** A function to extract text from HTML or XHTML. 
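For reference, the per-row mapping that dynamicType performs above reduces to one loop over global discriminators. A minimal self-contained sketch under assumed stand-ins (the Discriminator alias, NULL_DISCRIMINATOR constant and variant_names vector model the real ColumnVariant/VariantInfo members, they are not the actual API):

#include <cstdint>
#include <limits>
#include <string>
#include <vector>

using Discriminator = uint8_t;
/// Stand-in for ColumnVariant::NULL_DISCRIMINATOR.
constexpr Discriminator NULL_DISCRIMINATOR = std::numeric_limits<Discriminator>::max();

/// Resolve each row's global discriminator to a variant type name;
/// rows holding NULL get the reserved name "None", as dynamicType returns.
std::vector<std::string> rowTypeNames(
    const std::vector<Discriminator> & global_discriminators,
    const std::vector<std::string> & variant_names)
{
    std::vector<std::string> names;
    names.reserve(global_discriminators.size());
    for (auto discr : global_discriminators)
        names.push_back(discr == NULL_DISCRIMINATOR ? "None" : variant_names[discr]);
    return names;
}
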
diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index abd3f036408..7a6d37d810d 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -1157,6 +1158,11 @@ private: variant_column->applyNullMap(assert_cast(*arg_cond.column).getData()); return result_column; } + else if (auto * dynamic_column = typeid_cast(result_column.get())) + { + dynamic_column->applyNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else return ColumnNullable::create(materializeColumnIfConst(result_column), arg_cond.column); } @@ -1200,6 +1206,11 @@ private: variant_column->applyNegatedNullMap(assert_cast(*arg_cond.column).getData()); return result_column; } + else if (auto * dynamic_column = typeid_cast(result_column.get())) + { + dynamic_column->applyNegatedNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else { size_t size = input_rows_count; diff --git a/src/Functions/initcap.cpp b/src/Functions/initcap.cpp index 5460ee06792..6b2958227bc 100644 --- a/src/Functions/initcap.cpp +++ b/src/Functions/initcap.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include namespace DB { diff --git a/src/Functions/isNotNull.cpp b/src/Functions/isNotNull.cpp index dd53c700221..ea95a5c2b1c 100644 --- a/src/Functions/isNotNull.cpp +++ b/src/Functions/isNotNull.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -44,9 +45,10 @@ public: { const ColumnWithTypeAndName & elem = arguments[0]; - if (isVariant(elem.type)) + if (isVariant(elem.type) || isDynamic(elem.type)) { - const auto & discriminators = checkAndGetColumn(*elem.column).getLocalDiscriminators(); + const auto & column_variant = isVariant(elem.type) ? checkAndGetColumn(*elem.column) : checkAndGetColumn(*elem.column).getVariantColumn(); + const auto & discriminators = column_variant.getLocalDiscriminators(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); data.resize(discriminators.size()); diff --git a/src/Functions/isNull.cpp b/src/Functions/isNull.cpp index 7a6dabab7af..a98ff2ab8e8 100644 --- a/src/Functions/isNull.cpp +++ b/src/Functions/isNull.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -46,9 +47,10 @@ public: { const ColumnWithTypeAndName & elem = arguments[0]; - if (isVariant(elem.type)) + if (isVariant(elem.type) || isDynamic(elem.type)) { - const auto & discriminators = checkAndGetColumn(*elem.column).getLocalDiscriminators(); + const auto & column_variant = isVariant(elem.type) ? 
checkAndGetColumn(*elem.column) : checkAndGetColumn(*elem.column).getVariantColumn(); + const auto & discriminators = column_variant.getLocalDiscriminators(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); data.reserve(discriminators.size()); diff --git a/src/Functions/padString.cpp b/src/Functions/padString.cpp index 0922e0ddb8a..8670c837e21 100644 --- a/src/Functions/padString.cpp +++ b/src/Functions/padString.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -237,8 +238,8 @@ namespace void executeForSource(SourceStrings && strings, const ColumnPtr & column_length, const String & pad_string, StringSink & res_sink) const { const auto & chars = strings.getElements(); - bool all_ascii = UTF8::isAllASCII(reinterpret_cast(pad_string.data()), pad_string.size()) - && UTF8::isAllASCII(chars.data(), chars.size()); + bool all_ascii = isAllASCII(reinterpret_cast(pad_string.data()), pad_string.size()) + && isAllASCII(chars.data(), chars.size()); bool is_actually_utf8 = is_utf8 && !all_ascii; if (!is_actually_utf8) diff --git a/src/Functions/reverseUTF8.cpp b/src/Functions/reverseUTF8.cpp index 4ea861919a1..1aee349fa8d 100644 --- a/src/Functions/reverseUTF8.cpp +++ b/src/Functions/reverseUTF8.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "reverse.h" @@ -27,7 +28,7 @@ struct ReverseUTF8Impl ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { - bool all_ascii = UTF8::isAllASCII(data.data(), data.size()); + bool all_ascii = isAllASCII(data.data(), data.size()); if (all_ascii) { ReverseImpl::vector(data, offsets, res_data, res_offsets); diff --git a/src/Functions/soundex.cpp b/src/Functions/soundex.cpp index 0cddfc90f7c..77ddb14a6ec 100644 --- a/src/Functions/soundex.cpp +++ b/src/Functions/soundex.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB diff --git a/src/Functions/splitByChar.cpp b/src/Functions/splitByChar.cpp index d3d5dc9fe4a..52db5623b89 100644 --- a/src/Functions/splitByChar.cpp +++ b/src/Functions/splitByChar.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include diff --git a/src/Functions/splitByNonAlpha.cpp b/src/Functions/splitByNonAlpha.cpp index 4486a33aa88..17ff6cfb0a8 100644 --- a/src/Functions/splitByNonAlpha.cpp +++ b/src/Functions/splitByNonAlpha.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Functions/splitByRegexp.cpp b/src/Functions/splitByRegexp.cpp index 430089f14ee..042db97794d 100644 --- a/src/Functions/splitByRegexp.cpp +++ b/src/Functions/splitByRegexp.cpp @@ -1,9 +1,11 @@ #include +#include +#include #include #include -#include #include -#include +#include +#include #include @@ -102,7 +104,7 @@ public: return false; } - pos += 1; + ++pos; token_end = pos; ++splits; } @@ -148,11 +150,67 @@ public: using FunctionSplitByRegexp = FunctionTokens; +/// Fallback splitByRegexp to splitByChar when its 1st argument is a trivial char for better performance +class SplitByRegexpOverloadResolver : public IFunctionOverloadResolver +{ +public: + static constexpr auto name = "splitByRegexp"; + static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique(context); } + + explicit SplitByRegexpOverloadResolver(ContextPtr context_) + : context(context_) + , split_by_regexp(FunctionSplitByRegexp::create(context)) {} + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return SplitByRegexpImpl::getNumberOfArguments(); 
} + bool isVariadic() const override { return SplitByRegexpImpl::isVariadic(); } + /// ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return SplitByRegexpImpl::getArgumentsThatAreAlwaysConstant(); } + + FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + if (patternIsTrivialChar(arguments)) + return FunctionFactory::instance().getImpl("splitByChar", context)->build(arguments); + else + return std::make_unique( + split_by_regexp, collections::map(arguments, [](const auto & elem) { return elem.type; }), return_type); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + return split_by_regexp->getReturnTypeImpl(arguments); + } + +private: + bool patternIsTrivialChar(const ColumnsWithTypeAndName & arguments) const + { + if (!arguments[0].column.get()) + return false; + const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); + if (!col) + return false; + + String pattern = col->getValue(); + if (pattern.size() == 1) + { + OptimizedRegularExpression re = Regexps::createRegexp(pattern); + + std::string required_substring; + bool is_trivial; + bool required_substring_is_prefix; + re.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); + return is_trivial && required_substring == pattern; + } + return false; + } + + ContextPtr context; + FunctionPtr split_by_regexp; +}; } REGISTER_FUNCTION(SplitByRegexp) { - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/splitByString.cpp b/src/Functions/splitByString.cpp index 5c97f9841e7..e9b70a58eab 100644 --- a/src/Functions/splitByString.cpp +++ b/src/Functions/splitByString.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include diff --git a/src/Functions/splitByWhitespace.cpp b/src/Functions/splitByWhitespace.cpp index cf21a218b15..5bf27f64c17 100644 --- a/src/Functions/splitByWhitespace.cpp +++ b/src/Functions/splitByWhitespace.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include namespace DB diff --git a/src/Functions/substring.cpp b/src/Functions/substring.cpp index 122f83d758b..f1dea7db018 100644 --- a/src/Functions/substring.cpp +++ b/src/Functions/substring.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -149,7 +150,7 @@ public: { if (const ColumnString * col = checkAndGetColumn(column_string.get())) { - bool all_ascii = UTF8::isAllASCII(col->getChars().data(), col->getChars().size()); + bool all_ascii = isAllASCII(col->getChars().data(), col->getChars().size()); if (all_ascii) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, StringSource(*col), input_rows_count); else @@ -159,7 +160,7 @@ public: if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) { StringRef str_ref = col_const->getDataAt(0); - bool all_ascii = UTF8::isAllASCII(reinterpret_cast(str_ref.data), str_ref.size); + bool all_ascii = isAllASCII(reinterpret_cast(str_ref.data), str_ref.size); if (all_ascii) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); else diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 74474cb4b23..15a321bd5b0 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -7,6 +7,7 @@ #include #include 
#include +#include #include #include @@ -129,8 +130,8 @@ namespace res_data.reserve(str_column->getChars().size() / 2); res_offsets.reserve(rows); - bool all_ascii = UTF8::isAllASCII(str_column->getChars().data(), str_column->getChars().size()) - && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); + bool all_ascii = isAllASCII(str_column->getChars().data(), str_column->getChars().size()) + && isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher = !is_utf8 || all_ascii ? nullptr : std::make_unique(delim.data(), delim.size()); @@ -162,8 +163,8 @@ namespace res_data.reserve(str_column->getChars().size() / 2); res_offsets.reserve(rows); - bool all_ascii = UTF8::isAllASCII(str_column->getChars().data(), str_column->getChars().size()) - && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); + bool all_ascii = isAllASCII(str_column->getChars().data(), str_column->getChars().size()) + && isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher = !is_utf8 || all_ascii ? nullptr : std::make_unique(delim.data(), delim.size()); @@ -194,8 +195,8 @@ namespace res_data.reserve(str.size() * rows / 2); res_offsets.reserve(rows); - bool all_ascii = UTF8::isAllASCII(reinterpret_cast(str.data()), str.size()) - && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); + bool all_ascii = isAllASCII(reinterpret_cast(str.data()), str.size()) + && isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher = !is_utf8 || all_ascii ? nullptr : std::make_unique(delim.data(), delim.size()); diff --git a/src/Functions/translate.cpp b/src/Functions/translate.cpp index c7173909029..2df08a5664e 100644 --- a/src/Functions/translate.cpp +++ b/src/Functions/translate.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Functions/variantElement.cpp b/src/Functions/variantElement.cpp index 2744a0dabb8..80d34083d9d 100644 --- a/src/Functions/variantElement.cpp +++ b/src/Functions/variantElement.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -111,61 +112,15 @@ public: throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be Variant or array of Variants. Actual {}", getName(), input_arg.type->getName()); - std::optional variant_global_discr = getVariantGlobalDiscriminator(arguments[1].column, *input_type_as_variant, arguments.size()); + auto variant_discr = getVariantGlobalDiscriminator(arguments[1].column, *input_type_as_variant, arguments.size()); - if (!variant_global_discr.has_value()) + if (!variant_discr) return arguments[2].column; - const auto & variant_type = input_type_as_variant->getVariant(*variant_global_discr); - const auto & variant_column = input_col_as_variant->getVariantPtrByGlobalDiscriminator(*variant_global_discr); - - /// If Variant has only NULLs or our variant doesn't have any real values, - /// just create column with default values and create null mask with 1. 
- if (input_col_as_variant->hasOnlyNulls() || variant_column->empty()) - { - auto res = variant_type->createColumn(); - - if (variant_type->lowCardinality()) - assert_cast(*res).nestedToNullable(); - - res->insertManyDefaults(input_col_as_variant->size()); - if (!variant_type->canBeInsideNullable()) - return wrapInArraysAndConstIfNeeded(std::move(res), array_offsets, input_arg_is_const, input_rows_count); - - auto null_map = ColumnUInt8::create(); - auto & null_map_data = null_map->getData(); - null_map_data.resize_fill(input_col_as_variant->size(), 1); - return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(res), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count); - } - - /// If we extract single non-empty column and have no NULLs, then just return this variant. - if (auto non_empty_local_discr = input_col_as_variant->getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) - { - /// If we were trying to extract some other variant, - /// it would be empty and we would already processed this case above. - chassert(input_col_as_variant->globalDiscriminatorByLocal(*non_empty_local_discr) == variant_global_discr); - return wrapInArraysAndConstIfNeeded(makeNullableOrLowCardinalityNullableSafe(variant_column), array_offsets, input_arg_is_const, input_rows_count); - } - - /// In general case we should calculate null-mask for variant - /// according to the discriminators column and expand - /// variant column by this mask to get a full column (with default values on NULLs) - const auto & local_discriminators = input_col_as_variant->getLocalDiscriminators(); - auto null_map = ColumnUInt8::create(); - auto & null_map_data = null_map->getData(); - null_map_data.reserve(local_discriminators.size()); - auto variant_local_discr = input_col_as_variant->localDiscriminatorByGlobal(*variant_global_discr); - for (auto local_discr : local_discriminators) - null_map_data.push_back(local_discr != variant_local_discr); - - auto expanded_variant_column = IColumn::mutate(variant_column); - if (variant_type->lowCardinality()) - expanded_variant_column = assert_cast(*expanded_variant_column).cloneNullable(); - expanded_variant_column->expand(null_map_data, /*inverted = */ true); - if (variant_type->canBeInsideNullable()) - return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(expanded_variant_column), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count); - return wrapInArraysAndConstIfNeeded(std::move(expanded_variant_column), array_offsets, input_arg_is_const, input_rows_count); + auto variant_column = input_type_as_variant->getSubcolumn(input_type_as_variant->getVariant(*variant_discr)->getName(), input_col_as_variant->getPtr()); + return wrapInArraysAndConstIfNeeded(std::move(variant_column), array_offsets, input_arg_is_const, input_rows_count); } + private: std::optional getVariantGlobalDiscriminator(const ColumnPtr & index_column, const DataTypeVariant & variant_type, size_t argument_size) const { @@ -175,20 +130,16 @@ private: "Second argument to {} with Variant argument must be a constant String", getName()); - String variant_element_name = name_col->getValue(); - auto variant_element_type = DataTypeFactory::instance().tryGet(variant_element_name); - if (variant_element_type) + auto variant_element_name = name_col->getValue(); + if (auto variant_element_type = DataTypeFactory::instance().tryGet(variant_element_name)) { - const auto & variants = variant_type.getVariants(); - for (size_t i = 0; i != variants.size(); ++i) - { - if 
(variants[i]->getName() == variant_element_type->getName()) - return i; - } + if (auto discr = variant_type.tryGetVariantDiscriminator(variant_element_type->getName())) + return discr; } if (argument_size == 2) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} doesn't contain variant with type {}", variant_type.getName(), variant_element_name); + return std::nullopt; } @@ -213,10 +164,10 @@ REGISTER_FUNCTION(VariantElement) Extracts a column with specified type from a `Variant` column. )", .syntax{"variantElement(variant, type_name, [, default_value])"}, - .arguments{{ + .arguments{ {"variant", "Variant column"}, {"type_name", "The name of the variant type to extract"}, - {"default_value", "The default value that will be used if variant doesn't have variant with specified type. Can be any type. Optional"}}}, + {"default_value", "The default value that will be used if variant doesn't have variant with specified type. Can be any type. Optional"}}, .examples{{{ "Example", R"( diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 667e63729ca..8bd436f218c 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -30,6 +30,7 @@ namespace ErrorCodes { extern const int INVALID_CONFIG_PARAMETER; extern const int AZURE_BLOB_STORAGE_ERROR; + extern const int LOGICAL_ERROR; } namespace @@ -94,11 +95,56 @@ namespace void calculatePartSize() { - auto max_upload_part_size = settings->max_upload_part_size; - if (!max_upload_part_size) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_upload_part_size must not be 0"); + if (!total_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. This must not happen"); + + auto max_part_number = settings->max_blocks_in_multipart_upload; + const auto min_upload_part_size = settings->min_upload_part_size; + const auto max_upload_part_size = settings->max_upload_part_size; + + if (!max_part_number) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_blocks_in_multipart_upload must not be 0"); + else if (!min_upload_part_size) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "min_upload_part_size must not be 0"); + else if (max_upload_part_size < min_upload_part_size) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_upload_part_size must not be less than min_upload_part_size"); + + size_t part_size = min_upload_part_size; + auto num_parts = (total_size + part_size - 1) / part_size; + + if (num_parts > max_part_number) + { + part_size = (total_size + max_part_number - 1) / max_part_number; + num_parts = (total_size + part_size - 1) / part_size; + } + + if (part_size > max_upload_part_size) + { + part_size = max_upload_part_size; + num_parts = (total_size + part_size - 1) / part_size; + } + + String error; + if (num_parts < 1) + error = "Number of parts is zero"; + else if (num_parts > max_part_number) + error = fmt::format("Number of parts exceeds {}/{}", num_parts, max_part_number); + else if (part_size < min_upload_part_size) + error = fmt::format("Size of a part is less than {}/{}", part_size, min_upload_part_size); + else if (part_size > max_upload_part_size) + error = fmt::format("Size of a part exceeds {}/{}", part_size, max_upload_part_size); + + if (!error.empty()) + { + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "{} while writing {} bytes to Azure. 
Check max_part_number = {}, " + "min_upload_part_size = {}, max_upload_part_size = {}", + error, total_size, max_part_number, min_upload_part_size, max_upload_part_size); + } + /// We've calculated the size of a normal part (the final part can be smaller). - normal_part_size = max_upload_part_size; + normal_part_size = part_size; } public: @@ -219,21 +265,22 @@ namespace auto block_blob_client = client->GetBlockBlobClient(dest_blob); auto read_buffer = std::make_unique(create_read_buffer(), task.part_offset, task.part_size); - while (!read_buffer->eof()) - { - auto size = read_buffer->available(); - if (size > 0) - { - auto block_id = getRandomASCIIString(64); - Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(read_buffer->position()), size); - block_blob_client.StageBlock(block_id, memory); - task.block_ids.emplace_back(block_id); - read_buffer->ignore(size); - LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, block_id: {}", dest_container_for_logging, dest_blob, block_id); - } - } - std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race - LOG_TRACE(log, "Writing part finished. Container: {}, Blob: {}, Parts: {}", dest_container_for_logging, dest_blob, bg_tasks.size()); + + /// task.part_size is already normalized according to min_upload_part_size and max_upload_part_size. + size_t size_to_stage = task.part_size; + + PODArray memory; + memory.resize(size_to_stage); + WriteBufferFromVector> wb(memory); + + copyData(*read_buffer, wb, size_to_stage); + Azure::Core::IO::MemoryBodyStream stream(reinterpret_cast(memory.data()), size_to_stage); + + const auto & block_id = task.block_ids.emplace_back(getRandomASCIIString(64)); + block_blob_client.StageBlock(block_id, stream); + + LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, block_id: {}, size: {}", + dest_container_for_logging, dest_blob, block_id, size_to_stage); } @@ -300,21 +347,32 @@ void copyAzureBlobStorageFile( if (size < settings->max_single_part_copy_size) { + LOG_TRACE(getLogger("copyAzureBlobStorageFile"), "Copy blob sync {} -> {}", src_blob, dest_blob); block_blob_client_dest.CopyFromUri(source_uri); } else { Azure::Storage::Blobs::StartBlobCopyOperation operation = block_blob_client_dest.StartCopyFromUri(source_uri); - // Wait for the operation to finish, checking for status every 100 second. 
auto copy_response = operation.PollUntilDone(std::chrono::milliseconds(100)); auto properties_model = copy_response.Value; - if (properties_model.CopySource.HasValue()) - { - throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Copy failed"); - } + auto copy_status = properties_model.CopyStatus; + auto copy_status_description = properties_model.CopyStatusDescription; + + if (copy_status.HasValue() && copy_status.Value() == Azure::Storage::Blobs::Models::CopyStatus::Success) + { + LOG_TRACE(getLogger("copyAzureBlobStorageFile"), "Copy of {} to {} finished", properties_model.CopySource.Value(), dest_blob); + } + else + { + if (copy_status.HasValue()) + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Copy from {} to {} failed with status {} description {} (operation is done {})", + src_blob, dest_blob, copy_status.Value().ToString(), copy_status_description.Value(), operation.IsDone()); + else + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Copy from {} to {} didn't complete with success status (operation is done {})", src_blob, dest_blob, operation.IsDone()); + } } } else @@ -322,8 +380,8 @@ void copyAzureBlobStorageFile( LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container_for_logging, src_blob); auto create_read_buffer = [&] { - return std::make_unique(src_client, src_blob, read_settings, settings->max_single_read_retries, - settings->max_single_download_retries); + return std::make_unique( + src_client, src_blob, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); }; UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container_for_logging, dest_blob, settings, schedule, &Poco::Logger::get("copyAzureBlobStorageFile")}; diff --git a/src/IO/HTTPChunkedReadBuffer.cpp b/src/IO/HTTPChunkedReadBuffer.cpp index 41788fa8ce7..b5ac6a9b728 100644 --- a/src/IO/HTTPChunkedReadBuffer.cpp +++ b/src/IO/HTTPChunkedReadBuffer.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 8c83eac5cff..c771fced73a 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -352,7 +352,6 @@ static ReturnType parseComplexEscapeSequence(Vector & s, ReadBuffer & buf) { return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE); } - s.push_back(unhex2(hex_code)); } else if (char_after_backslash == 'N') @@ -608,13 +607,20 @@ static ReturnType parseJSONEscapeSequence(Vector & s, ReadBuffer & buf, bool kee } -template +template void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) { while (!buf.eof()) { - char * next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end()); - + char * next_pos; + if constexpr (support_crlf) + { + next_pos = find_first_symbols<'\t', '\n', '\\','\r'>(buf.position(), buf.buffer().end()); + } + else + { + next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end()); + } appendToStringOrVector(s, buf, next_pos); buf.position() = next_pos; @@ -641,25 +647,46 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) } } } + + if constexpr (support_crlf) + { + if (*buf.position() == '\r') + { + ++buf.position(); + if (!buf.eof() && *buf.position() != '\n') + { + s.push_back('\r'); + continue; + } + return; + } + } } } -template +template void readEscapedStringInto(Vector & s, ReadBuffer & buf) { - 
readEscapedStringIntoImpl(s, buf); + readEscapedStringIntoImpl(s, buf); } void readEscapedString(String & s, ReadBuffer & buf) { s.clear(); - readEscapedStringInto(s, buf); + readEscapedStringInto(s, buf); } -template void readEscapedStringInto>(PaddedPODArray & s, ReadBuffer & buf); -template void readEscapedStringInto(NullOutput & s, ReadBuffer & buf); +void readEscapedStringCRLF(String & s, ReadBuffer & buf) +{ + s.clear(); + readEscapedStringInto(s, buf); +} +template void readEscapedStringInto,false>(PaddedPODArray & s, ReadBuffer & buf); +template void readEscapedStringInto(NullOutput & s, ReadBuffer & buf); +template void readEscapedStringInto,true>(PaddedPODArray & s, ReadBuffer & buf); +template void readEscapedStringInto(NullOutput & s, ReadBuffer & buf); /** If enable_sql_style_quoting == true, * strings like 'abc''def' will be parsed as abc'def. @@ -2069,7 +2096,14 @@ bool tryReadJSONField(String & s, ReadBuffer & buf, const FormatSettings::JSON & void readTSVField(String & s, ReadBuffer & buf) { s.clear(); - readEscapedStringIntoImpl(s, buf); + readEscapedStringIntoImpl(s, buf); } +void readTSVFieldCRLF(String & s, ReadBuffer & buf) +{ + s.clear(); + readEscapedStringIntoImpl(s, buf); +} + + } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 5cf7d3e5b66..ffba4fafb5c 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include @@ -583,6 +583,8 @@ void readString(String & s, ReadBuffer & buf); void readEscapedString(String & s, ReadBuffer & buf); +void readEscapedStringCRLF(String & s, ReadBuffer & buf); + void readQuotedString(String & s, ReadBuffer & buf); void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); @@ -645,7 +647,7 @@ void readStringInto(Vector & s, ReadBuffer & buf); template void readNullTerminated(Vector & s, ReadBuffer & buf); -template +template void readEscapedStringInto(Vector & s, ReadBuffer & buf); template @@ -1901,6 +1903,7 @@ void readJSONField(String & s, ReadBuffer & buf, const FormatSettings::JSON & se bool tryReadJSONField(String & s, ReadBuffer & buf, const FormatSettings::JSON & settings); void readTSVField(String & s, ReadBuffer & buf); +void readTSVFieldCRLF(String & s, ReadBuffer & buf); /** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters). 
* It is assumed that the cursor is located on the `\` symbol diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index 549d0a569c6..cff6fa5ad21 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -316,23 +316,23 @@ namespace num_parts = (total_size + part_size - 1) / part_size; } - if (num_parts < 1 || num_parts > max_part_number || part_size < min_upload_part_size || part_size > max_upload_part_size) - { - String msg; - if (num_parts < 1) - msg = "Number of parts is zero"; - else if (num_parts > max_part_number) - msg = fmt::format("Number of parts exceeds {}", num_parts, max_part_number); - else if (part_size < min_upload_part_size) - msg = fmt::format("Size of a part is less than {}", part_size, min_upload_part_size); - else - msg = fmt::format("Size of a part exceeds {}", part_size, max_upload_part_size); + String error; + if (num_parts < 1) + error = "Number of parts is zero"; + else if (num_parts > max_part_number) + error = fmt::format("Number of parts exceeds {}/{}", num_parts, max_part_number); + else if (part_size < min_upload_part_size) + error = fmt::format("Size of a part is less than {}/{}", part_size, min_upload_part_size); + else if (part_size > max_upload_part_size) + error = fmt::format("Size of a part exceeds {}/{}", part_size, max_upload_part_size); + if (!error.empty()) + { throw Exception( ErrorCodes::INVALID_CONFIG_PARAMETER, "{} while writing {} bytes to S3. Check max_part_number = {}, " "min_upload_part_size = {}, max_upload_part_size = {}", - msg, total_size, max_part_number, min_upload_part_size, max_upload_part_size); + error, total_size, max_part_number, min_upload_part_size, max_upload_part_size); } /// We've calculated the size of a normal part (the final part can be smaller). diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 56e3e0df21b..4583b2bb0ac 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include "config.h" diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index a30e2feb439..d4b2d8ea0dc 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index 70401fdf72d..e046e837689 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index d1652784cc2..3a21d7201a9 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunneeded-internal-declaration" diff --git a/src/Interpreters/Access/InterpreterShowAccessEntitiesQuery.cpp b/src/Interpreters/Access/InterpreterShowAccessEntitiesQuery.cpp index 76979ed86c8..71fc1047cfa 100644 --- a/src/Interpreters/Access/InterpreterShowAccessEntitiesQuery.cpp +++ b/src/Interpreters/Access/InterpreterShowAccessEntitiesQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/Access/InterpreterShowCreateAccessEntityQuery.cpp b/src/Interpreters/Access/InterpreterShowCreateAccessEntityQuery.cpp index 1147d74c146..96d8e55a74c 100644 --- a/src/Interpreters/Access/InterpreterShowCreateAccessEntityQuery.cpp +++ b/src/Interpreters/Access/InterpreterShowCreateAccessEntityQuery.cpp @@ 
-24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp index fafe50c170f..4b10bfd3dcd 100644 --- a/src/Interpreters/Cache/QueryCache.cpp +++ b/src/Interpreters/Cache/QueryCache.cpp @@ -177,6 +177,21 @@ ASTPtr removeQueryCacheSettings(ASTPtr ast) return transformed_ast; } +IAST::Hash calculateAstHash(ASTPtr ast, const String & current_database) +{ + ast = removeQueryCacheSettings(ast); + + /// Hash the AST, it must consider aliases (issue #56258) + SipHash hash; + ast->updateTreeHash(hash, /*ignore_aliases=*/ false); + + /// Also hash the database specified via SQL `USE db`, otherwise identifiers in same query (AST) may mean different columns in different + /// tables (issue #64136) + hash.update(current_database); + + return getSipHash128AsPair(hash); +} + String queryStringFromAST(ASTPtr ast) { WriteBufferFromOwnString buf; @@ -186,17 +201,15 @@ String queryStringFromAST(ASTPtr ast) } -/// Hashing of ASTs must consider aliases (issue #56258) -static constexpr bool ignore_aliases = false; - QueryCache::Key::Key( ASTPtr ast_, + const String & current_database, Block header_, std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, std::chrono::time_point expires_at_, bool is_compressed_) - : ast_hash(removeQueryCacheSettings(ast_)->getTreeHash(ignore_aliases)) + : ast_hash(calculateAstHash(ast_, current_database)) , header(header_) , user_id(user_id_) , current_user_roles(current_user_roles_) @@ -207,8 +220,8 @@ QueryCache::Key::Key( { } -QueryCache::Key::Key(ASTPtr ast_, std::optional user_id_, const std::vector & current_user_roles_) - : QueryCache::Key(ast_, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false) /// dummy values for everything != AST or user name +QueryCache::Key::Key(ASTPtr ast_, const String & current_database, std::optional user_id_, const std::vector & current_user_roles_) + : QueryCache::Key(ast_, current_database, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false) /// dummy values for everything != AST, current database, user name/roles { } diff --git a/src/Interpreters/Cache/QueryCache.h b/src/Interpreters/Cache/QueryCache.h index 814cad37f82..b5b6f477137 100644 --- a/src/Interpreters/Cache/QueryCache.h +++ b/src/Interpreters/Cache/QueryCache.h @@ -88,6 +88,7 @@ public: /// Ctor to construct a Key for writing into query cache. Key(ASTPtr ast_, + const String & current_database, Block header_, std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, @@ -95,7 +96,7 @@ public: bool is_compressed); /// Ctor to construct a Key for reading from query cache (this operation only needs the AST + user name). 
- Key(ASTPtr ast_, std::optional user_id_, const std::vector & current_user_roles_); + Key(ASTPtr ast_, const String & current_database, std::optional user_id_, const std::vector & current_user_roles_); bool operator==(const Key & other) const; }; diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index ec6283df649..59c98491c14 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp index d432488964d..6f9c375c2f5 100644 --- a/src/Interpreters/ClusterDiscovery.cpp +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index e4d979d4078..4bbda982f5b 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -38,7 +38,8 @@ namespace ErrorCodes namespace ClusterProxy { -ContextMutablePtr updateSettingsForCluster(const Cluster & cluster, +ContextMutablePtr updateSettingsAndClientInfoForCluster(const Cluster & cluster, + bool is_remote_function, ContextPtr context, const Settings & settings, const StorageID & main_table, @@ -46,9 +47,17 @@ ContextMutablePtr updateSettingsForCluster(const Cluster & cluster, LoggerPtr log, const DistributedSettings * distributed_settings) { + ClientInfo new_client_info = context->getClientInfo(); Settings new_settings = settings; new_settings.queue_max_wait_ms = Cluster::saturate(new_settings.queue_max_wait_ms, settings.max_execution_time); + /// In case of interserver mode we should reset initial_user for remote() function to use passed user from the query. + if (is_remote_function) + { + const auto & address = cluster.getShardsAddresses().front().front(); + new_client_info.initial_user = address.user; + } + /// If "secret" (in remote_servers) is not in use, /// user on the shard is not the same as the user on the initiator, /// hence per-user limits should not be applied. 
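The ClientInfo adjustment above (resetting initial_user when the query comes from the remote() table function) can be sketched in isolation. A hedged sketch with reduced structs, where ClientInfo and Address carry only the fields this change touches:

#include <string>

struct ClientInfo { std::string initial_user; };
struct Address { std::string user; };

/// For queries issued via remote() in interserver mode, the user configured
/// for the first shard replaces the initiator's initial_user, so the shard
/// authenticates with the user passed in the query.
ClientInfo adjustClientInfo(ClientInfo info, bool is_remote_function, const Address & first_shard_address)
{
    if (is_remote_function)
        info.initial_user = first_shard_address.user;
    return info;
}
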
@@ -168,9 +177,23 @@ ContextMutablePtr updateSettingsForCluster(const Cluster & cluster, auto new_context = Context::createCopy(context); new_context->setSettings(new_settings); + new_context->setClientInfo(new_client_info); return new_context; } +ContextMutablePtr updateSettingsForCluster(const Cluster & cluster, ContextPtr context, const Settings & settings, const StorageID & main_table) +{ + return updateSettingsAndClientInfoForCluster(cluster, + /* is_remote_function= */ false, + context, + settings, + main_table, + /* additional_filter_ast= */ {}, + /* log= */ {}, + /* distributed_settings= */ {}); +} + + static ThrottlerPtr getThrottler(const ContextPtr & context) { const Settings & settings = context->getSettingsRef(); @@ -209,7 +232,8 @@ void executeQuery( const ExpressionActionsPtr & sharding_key_expr, const std::string & sharding_key_column_name, const DistributedSettings & distributed_settings, - AdditionalShardFilterGenerator shard_filter_generator) + AdditionalShardFilterGenerator shard_filter_generator, + bool is_remote_function) { const Settings & settings = context->getSettingsRef(); @@ -222,8 +246,8 @@ void executeQuery( SelectStreamFactory::Shards remote_shards; auto cluster = query_info.getCluster(); - auto new_context = updateSettingsForCluster(*cluster, context, settings, main_table, query_info.additional_filter_ast, log, - &distributed_settings); + auto new_context = updateSettingsAndClientInfoForCluster(*cluster, is_remote_function, context, + settings, main_table, query_info.additional_filter_ast, log, &distributed_settings); if (context->getSettingsRef().allow_experimental_parallel_reading_from_replicas && context->getSettingsRef().allow_experimental_parallel_reading_from_replicas.value != new_context->getSettingsRef().allow_experimental_parallel_reading_from_replicas.value) diff --git a/src/Interpreters/ClusterProxy/executeQuery.h b/src/Interpreters/ClusterProxy/executeQuery.h index 582f8d74fd5..284fea05135 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.h +++ b/src/Interpreters/ClusterProxy/executeQuery.h @@ -38,13 +38,7 @@ class SelectStreamFactory; /// - optimize_skip_unused_shards_nesting /// /// @return new Context with adjusted settings -ContextMutablePtr updateSettingsForCluster(const Cluster & cluster, - ContextPtr context, - const Settings & settings, - const StorageID & main_table, - ASTPtr additional_filter_ast = nullptr, - LoggerPtr log = nullptr, - const DistributedSettings * distributed_settings = nullptr); +ContextMutablePtr updateSettingsForCluster(const Cluster & cluster, ContextPtr context, const Settings & settings, const StorageID & main_table); using AdditionalShardFilterGenerator = std::function; /// Execute a distributed query, creating a query plan, from which the query pipeline can be built. 
@@ -63,7 +57,8 @@ void executeQuery( const ExpressionActionsPtr & sharding_key_expr, const std::string & sharding_key_column_name, const DistributedSettings & distributed_settings, - AdditionalShardFilterGenerator shard_filter_generator); + AdditionalShardFilterGenerator shard_filter_generator, + bool is_remote_function); void executeQueryWithParallelReplicas( diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 1bd9601dd7e..e1d82a8f604 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -160,6 +160,8 @@ namespace CurrentMetrics extern const Metric TablesLoaderForegroundThreadsScheduled; extern const Metric IOWriterThreadsScheduled; extern const Metric AttachedTable; + extern const Metric AttachedView; + extern const Metric AttachedDictionary; extern const Metric AttachedDatabase; extern const Metric PartsActive; } @@ -359,6 +361,8 @@ struct ContextSharedPart : boost::noncopyable /// No lock required for format_schema_path modified only during initialization std::atomic_size_t max_database_num_to_warn = 1000lu; std::atomic_size_t max_table_num_to_warn = 5000lu; + std::atomic_size_t max_view_num_to_warn = 10000lu; + std::atomic_size_t max_dictionary_num_to_warn = 1000lu; std::atomic_size_t max_part_num_to_warn = 100000lu; String format_schema_path; /// Path to a directory that contains schema files used by input formats. String google_protos_path; /// Path to a directory that contains the proto files for the well-known Protobuf types. @@ -935,6 +939,10 @@ Strings Context::getWarnings() const common_warnings = shared->warnings; if (CurrentMetrics::get(CurrentMetrics::AttachedTable) > static_cast(shared->max_table_num_to_warn)) common_warnings.emplace_back(fmt::format("The number of attached tables is more than {}", shared->max_table_num_to_warn)); + if (CurrentMetrics::get(CurrentMetrics::AttachedView) > static_cast(shared->max_view_num_to_warn)) + common_warnings.emplace_back(fmt::format("The number of attached views is more than {}", shared->max_view_num_to_warn)); + if (CurrentMetrics::get(CurrentMetrics::AttachedDictionary) > static_cast(shared->max_dictionary_num_to_warn)) + common_warnings.emplace_back(fmt::format("The number of attached dictionaries is more than {}", shared->max_dictionary_num_to_warn)); if (CurrentMetrics::get(CurrentMetrics::AttachedDatabase) > static_cast(shared->max_database_num_to_warn)) common_warnings.emplace_back(fmt::format("The number of attached databases is more than {}", shared->max_database_num_to_warn)); if (CurrentMetrics::get(CurrentMetrics::PartsActive) > static_cast(shared->max_part_num_to_warn)) @@ -3711,6 +3719,18 @@ void Context::setMaxTableNumToWarn(size_t max_table_to_warn) shared->max_table_num_to_warn= max_table_to_warn; } +void Context::setMaxViewNumToWarn(size_t max_view_to_warn) +{ + SharedLockGuard lock(shared->mutex); + shared->max_view_num_to_warn= max_view_to_warn; +} + +void Context::setMaxDictionaryNumToWarn(size_t max_dictionary_to_warn) +{ + SharedLockGuard lock(shared->mutex); + shared->max_dictionary_num_to_warn= max_dictionary_to_warn; +} + void Context::setMaxDatabaseNumToWarn(size_t max_database_to_warn) { SharedLockGuard lock(shared->mutex); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 7f663773e52..814534f7035 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -861,6 +861,8 @@ public: const HTTPHeaderFilter & getHTTPHeaderFilter() const; void setMaxTableNumToWarn(size_t max_table_to_warn); + void 
setMaxViewNumToWarn(size_t max_view_to_warn); + void setMaxDictionaryNumToWarn(size_t max_dictionary_to_warn); void setMaxDatabaseNumToWarn(size_t max_database_to_warn); void setMaxPartNumToWarn(size_t max_part_to_warn); /// The port that the server listens for executing SQL queries. diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 395218f834f..d80d5cd5b93 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -47,7 +47,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/ExternalLoader.cpp b/src/Interpreters/ExternalLoader.cpp index a636e59fa1a..96405f35f3f 100644 --- a/src/Interpreters/ExternalLoader.cpp +++ b/src/Interpreters/ExternalLoader.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/ExternalLoaderXMLConfigRepository.cpp b/src/Interpreters/ExternalLoaderXMLConfigRepository.cpp index a15f918f457..e404797501d 100644 --- a/src/Interpreters/ExternalLoaderXMLConfigRepository.cpp +++ b/src/Interpreters/ExternalLoaderXMLConfigRepository.cpp @@ -2,7 +2,7 @@ #include -#include +#include #include #include #include diff --git a/src/Interpreters/ITokenExtractor.cpp b/src/Interpreters/ITokenExtractor.cpp index 9c4027dfa0a..1c5d0d4b6d4 100644 --- a/src/Interpreters/ITokenExtractor.cpp +++ b/src/Interpreters/ITokenExtractor.cpp @@ -2,7 +2,7 @@ #include -#include +#include #include #include diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 519cbde588f..61ca606186a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -6,7 +6,7 @@ #include #include "Common/Exception.h" -#include +#include #include #include #include @@ -977,6 +977,13 @@ void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const if (as_create.is_ordinary_view) throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot CREATE a table AS {}, it is a View", qualified_name); + if (as_create.is_materialized_view && as_create.to_table_id) + throw Exception( + ErrorCodes::INCORRECT_QUERY, + "Cannot CREATE a table AS {}, it is a Materialized View without storage. Use \"AS `{}`\" instead", + qualified_name, + as_create.to_table_id.getQualifiedName()); + if (as_create.is_live_view) throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot CREATE a table AS {}, it is a Live View", qualified_name); @@ -1493,7 +1500,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, validateVirtualColumns(*res); - if (!res->supportsDynamicSubcolumns() && hasDynamicSubcolumns(res->getInMemoryMetadataPtr()->getColumns())) + if (!res->supportsDynamicSubcolumnsDeprecated() && hasDynamicSubcolumns(res->getInMemoryMetadataPtr()->getColumns())) { throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot create table with column of type Object, " diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 12677c422b8..128854e87ba 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -552,7 +552,11 @@ BlockIO InterpreterInsertQuery::execute() { /// Change query sample block columns to Nullable to allow inserting nullable columns, where NULL values will be substituted with /// default column values (in AddingDefaultsTransform), so all values will be cast correctly. 
- if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && !isVariant(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name)) + if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) + && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) + && !isVariant(query_columns[col_idx].type) + && !isDynamic(query_columns[col_idx].type) + && output_columns.has(query_columns[col_idx].name)) query_sample_block.setColumn(col_idx, ColumnWithTypeAndName(makeNullableOrLowCardinalityNullable(query_columns[col_idx].column), makeNullableOrLowCardinalityNullable(query_columns[col_idx].type), query_columns[col_idx].name)); } } diff --git a/src/Interpreters/InterserverCredentials.cpp b/src/Interpreters/InterserverCredentials.cpp index c344732a262..1327a2ef388 100644 --- a/src/Interpreters/InterserverCredentials.cpp +++ b/src/Interpreters/InterserverCredentials.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp index 5cda4c982b4..6a3a181ed26 100644 --- a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp +++ b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Interpreters/QueryNormalizer.cpp b/src/Interpreters/QueryNormalizer.cpp index f47635a3c3f..a8639906aad 100644 --- a/src/Interpreters/QueryNormalizer.cpp +++ b/src/Interpreters/QueryNormalizer.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 1ee8ca14b2f..6191eb73fd4 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index 6607df8d9af..9ca521a4ab3 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -462,8 +462,8 @@ void ThreadStatus::initGlobalProfiler([[maybe_unused]] UInt64 global_profiler_re { #if !defined(SANITIZER) && !defined(__APPLE__) /// profilers are useless without trace collector - auto global_context_ptr = global_context.lock(); - if (!global_context_ptr || !global_context_ptr->hasTraceCollector()) + auto context = Context::getGlobalContextInstance(); + if (!context->hasTraceCollector()) return; try diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp index 03df7283992..c21c4d34fa8 100644 --- a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp +++ b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 50c28fbc8b2..a3c5a7ed3ed 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -1188,6 +1188,33 @@ bool TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select } } + /// Check for dynamic subcolumns in unknown required columns.
+ if (!unknown_required_source_columns.empty()) + { + for (const NameAndTypePair & pair : source_columns_ordinary) + { + if (!pair.type->hasDynamicSubcolumns()) + continue; + + for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();) + { + auto [column_name, dynamic_subcolumn_name] = Nested::splitName(*it); + + if (column_name == pair.name) + { + if (auto dynamic_subcolumn_type = pair.type->tryGetSubcolumnType(dynamic_subcolumn_name)) + { + source_columns.emplace_back(*it, dynamic_subcolumn_type); + it = unknown_required_source_columns.erase(it); + continue; + } + } + + ++it; + } + } + } + if (!unknown_required_source_columns.empty()) { constexpr auto format_string = "Missing columns: {} while processing query: '{}', required columns:{}{}"; diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 25085ff4823..9363e3d83eb 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -504,7 +505,7 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID else if (const DataTypeVariant * type_variant = typeid_cast(&type)) { /// If we have type hint and Variant contains such type, no need to convert field. - if (from_type_hint && type_variant->tryGetVariantDiscriminator(*from_type_hint)) + if (from_type_hint && type_variant->tryGetVariantDiscriminator(from_type_hint->getName())) return src; /// Create temporary column and check if we can insert this field to the variant. @@ -513,6 +514,11 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID if (col->tryInsert(src)) return src; } + else if (isDynamic(type)) + { + /// We can insert any field into a Dynamic column. + return src; + } /// Conversion from string by parsing.
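The TreeRewriter hunk above resolves otherwise-unknown identifiers such as d.x against columns whose type has dynamic subcolumns: Nested::splitName separates the column name from the subcolumn path, and tryGetSubcolumnType decides whether the remainder is addressable; convertFieldToType now likewise accepts any field for Dynamic. A toy model of the name split only (single top-level dot; invented helper, not the ClickHouse implementation):

#include <cassert>
#include <string>
#include <utility>

// splitName("d.x.y") -> {"d", "x.y"}: only the first dot separates the column
// from the (possibly nested) subcolumn path.
std::pair<std::string, std::string> splitName(const std::string & s)
{
    auto dot = s.find('.');
    if (dot == std::string::npos)
        return {s, ""};
    return {s.substr(0, dot), s.substr(dot + 1)};
}

int main()
{
    auto [column, subcolumn] = splitName("d.x.y");
    assert(column == "d" && subcolumn == "x.y");
}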
if (src.getType() == Field::Types::String) diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index f1f72a4ea4a..56f08dbb902 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -103,7 +103,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; extern const int QUERY_WAS_CANCELLED; - extern const int INCORRECT_DATA; extern const int SYNTAX_ERROR; extern const int SUPPORT_IS_DISABLED; extern const int INCORRECT_QUERY; @@ -1102,7 +1101,7 @@ static std::tuple executeQueryImpl( { if (can_use_query_cache && settings.enable_reads_from_query_cache) { - QueryCache::Key key(ast, context->getUserID(), context->getCurrentRoles()); + QueryCache::Key key(ast, context->getCurrentDatabase(), context->getUserID(), context->getCurrentRoles()); QueryCache::Reader reader = query_cache->createReader(key); if (reader.hasCacheEntryForKey()) { @@ -1225,7 +1224,7 @@ static std::tuple executeQueryImpl( && (!ast_contains_system_tables || system_table_handling == QueryCacheSystemTableHandling::Save)) { QueryCache::Key key( - ast, res.pipeline.getHeader(), + ast, context->getCurrentDatabase(), res.pipeline.getHeader(), context->getUserID(), context->getCurrentRoles(), settings.query_cache_share_between_users, std::chrono::system_clock::now() + std::chrono::seconds(settings.query_cache_ttl), @@ -1256,34 +1255,6 @@ static std::tuple executeQueryImpl( } } } - // Here we check if our our projections contain force_optimize_projection_name - if (!settings.force_optimize_projection_name.value.empty()) - { - bool found = false; - std::set projections; - { - const auto & access_info = context->getQueryAccessInfo(); - std::lock_guard lock(access_info.mutex); - projections = access_info.projections; - } - - for (const auto &projection : projections) - { - // projection value has structure like: .. - // We need to get only the projection name - size_t last_dot_pos = projection.find_last_of('.'); - std::string projection_name = (last_dot_pos != std::string::npos) ? projection.substr(last_dot_pos + 1) : projection; - if (settings.force_optimize_projection_name.value == projection_name) - { - found = true; - break; - } - } - - if (!found) - throw Exception(ErrorCodes::INCORRECT_DATA, "Projection {} is specified in setting force_optimize_projection_name but not used", - settings.force_optimize_projection_name.value); - } if (process_list_entry) { @@ -1421,7 +1392,16 @@ void executeQuery( const char * begin; const char * end; - istr.nextIfAtEnd(); + try + { + istr.nextIfAtEnd(); + } + catch (...) + { + /// If buffer contains invalid data and we failed to decompress, we still want to have some information about the query in the log. 
+ logQuery("", context, /* internal = */ false, QueryProcessingStage::Complete); + throw; + } size_t max_query_size = context->getSettingsRef().max_query_size; diff --git a/src/Interpreters/misc.h b/src/Interpreters/misc.h index c009808de3f..b77fc5aee1e 100644 --- a/src/Interpreters/misc.h +++ b/src/Interpreters/misc.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Interpreters/parseColumnsListForTableFunction.cpp b/src/Interpreters/parseColumnsListForTableFunction.cpp index 27c364073ae..3529863a623 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.cpp +++ b/src/Interpreters/parseColumnsListForTableFunction.cpp @@ -40,7 +40,7 @@ void validateDataType(const DataTypePtr & type_to_check, const DataTypeValidatio if (!settings.allow_experimental_object_type) { - if (data_type.hasDynamicSubcolumns()) + if (data_type.hasDynamicSubcolumnsDeprecated()) { throw Exception( ErrorCodes::ILLEGAL_COLUMN, @@ -107,6 +107,18 @@ void validateDataType(const DataTypePtr & type_to_check, const DataTypeValidatio } } } + + if (!settings.allow_experimental_dynamic_type) + { + if (data_type.hasDynamicSubcolumns()) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Cannot create column with type '{}' because experimental Dynamic type is not allowed. " + "Set setting allow_experimental_dynamic_type = 1 in order to allow it", + data_type.getName()); + } + } }; validate_callback(*type_to_check); diff --git a/src/Interpreters/parseColumnsListForTableFunction.h b/src/Interpreters/parseColumnsListForTableFunction.h index ffb59bfa457..e2d2bc97ff7 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.h +++ b/src/Interpreters/parseColumnsListForTableFunction.h @@ -21,6 +21,7 @@ struct DataTypeValidationSettings , allow_experimental_variant_type(settings.allow_experimental_variant_type) , allow_suspicious_variant_types(settings.allow_suspicious_variant_types) , validate_nested_types(settings.validate_experimental_and_suspicious_types_inside_nested_types) + , allow_experimental_dynamic_type(settings.allow_experimental_dynamic_type) { } @@ -30,6 +31,7 @@ struct DataTypeValidationSettings bool allow_experimental_variant_type = true; bool allow_suspicious_variant_types = true; bool validate_nested_types = true; + bool allow_experimental_dynamic_type = true; }; void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings & settings); diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index 4b17469f4d7..0bd4b94d999 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -263,7 +263,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log } } #ifndef WITHOUT_TEXT_LOG - if (config.has("text_log")) + if (allowTextLog() && config.has("text_log")) { String text_log_level_str = config.getString("text_log.level", "trace"); int text_log_level = Poco::Logger::parseLevel(text_log_level_str); diff --git a/src/Loggers/Loggers.h b/src/Loggers/Loggers.h index 9eff731a4c5..9923d66ebcb 100644 --- a/src/Loggers/Loggers.h +++ b/src/Loggers/Loggers.h @@ -23,6 +23,10 @@ public: /// Close log files. On next log write files will be reopened. 
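The parseColumnsListForTableFunction hunks above add a second experimental-type gate: the legacy Object type is now checked via hasDynamicSubcolumnsDeprecated(), while the new Dynamic type is controlled by allow_experimental_dynamic_type. A hedged, self-contained model of the two gates (the type and settings structs here are toys; only the predicate meaning follows the diff):

#include <stdexcept>
#include <string>

struct TypeModel { bool dynamic_deprecated = false; bool dynamic = false; std::string name; };
struct ValidationSettings { bool allow_experimental_object_type = true; bool allow_experimental_dynamic_type = true; };

void validate(const TypeModel & t, const ValidationSettings & s)
{
    // Legacy Object type: dynamic subcolumns in the deprecated sense.
    if (!s.allow_experimental_object_type && t.dynamic_deprecated)
        throw std::runtime_error("Object type is not allowed: " + t.name);
    // New Dynamic type: gated by its own setting, as added in this diff.
    if (!s.allow_experimental_dynamic_type && t.dynamic)
        throw std::runtime_error("set allow_experimental_dynamic_type = 1 to use " + t.name);
}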
void closeLogs(Poco::Logger & logger); + virtual ~Loggers() = default; + +protected: + virtual bool allowTextLog() const { return true; } private: Poco::AutoPtr log_file; diff --git a/src/Loggers/OwnSplitChannel.cpp b/src/Loggers/OwnSplitChannel.cpp index fee33781c27..dc51a13e01f 100644 --- a/src/Loggers/OwnSplitChannel.cpp +++ b/src/Loggers/OwnSplitChannel.cpp @@ -107,6 +107,10 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg) [[maybe_unused]] bool push_result = logs_queue->emplace(std::move(columns)); } + auto text_log_locked = text_log.lock(); + if (!text_log_locked) + return; + /// Also log to system.text_log table, if message is not too noisy auto text_log_max_priority_loaded = text_log_max_priority.load(std::memory_order_relaxed); if (text_log_max_priority_loaded && msg.getPriority() <= text_log_max_priority_loaded) @@ -146,10 +150,7 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg) #undef SET_VALUE_IF_EXISTS - std::shared_ptr> text_log_locked{}; - text_log_locked = text_log.lock(); - if (text_log_locked) - text_log_locked->push(std::move(elem)); + text_log_locked->push(std::move(elem)); } #endif } diff --git a/src/Loggers/OwnSplitChannel.h b/src/Loggers/OwnSplitChannel.h index b75554eefc4..7ca27cf6584 100644 --- a/src/Loggers/OwnSplitChannel.h +++ b/src/Loggers/OwnSplitChannel.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include diff --git a/src/Parsers/CMakeLists.txt b/src/Parsers/CMakeLists.txt index 3bc1b3a981f..d5653da7b3a 100644 --- a/src/Parsers/CMakeLists.txt +++ b/src/Parsers/CMakeLists.txt @@ -7,7 +7,7 @@ add_headers_and_sources(clickhouse_parsers ./Kusto) add_headers_and_sources(clickhouse_parsers ./PRQL) add_headers_and_sources(clickhouse_parsers ./Kusto/KustoFunctions) add_library(clickhouse_parsers ${clickhouse_parsers_headers} ${clickhouse_parsers_sources}) -target_link_libraries(clickhouse_parsers PUBLIC clickhouse_common_io clickhouse_common_access string_utils) +target_link_libraries(clickhouse_parsers PUBLIC clickhouse_common_io clickhouse_common_access) if (TARGET ch_rust::prql) target_link_libraries(clickhouse_parsers PRIVATE ch_rust::prql) endif () diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 59b586d46a0..416f696323c 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include "Parsers/CommonParsers.h" diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index ee9e199b9b8..7cdfaf988a3 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include diff --git a/src/Parsers/Kusto/KustoFunctions/KQLAggregationFunctions.cpp b/src/Parsers/Kusto/KustoFunctions/KQLAggregationFunctions.cpp index 16436d38d32..0eb83b8b5ac 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLAggregationFunctions.cpp +++ b/src/Parsers/Kusto/KustoFunctions/KQLAggregationFunctions.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Parsers/Kusto/ParserKQLDateTypeTimespan.cpp b/src/Parsers/Kusto/ParserKQLDateTypeTimespan.cpp index c4f84d576cb..19625f6624d 100644 --- a/src/Parsers/Kusto/ParserKQLDateTypeTimespan.cpp +++ b/src/Parsers/Kusto/ParserKQLDateTypeTimespan.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Parsers/Kusto/ParserKQLStatement.cpp 
b/src/Parsers/Kusto/ParserKQLStatement.cpp index fbf2110e664..e508b69bdff 100644 --- a/src/Parsers/Kusto/ParserKQLStatement.cpp +++ b/src/Parsers/Kusto/ParserKQLStatement.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -62,49 +63,51 @@ bool ParserKQLWithUnionQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & exp bool ParserKQLTableFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { - ParserKQLWithUnionQuery kql_p; - ASTPtr select; - ParserToken s_lparen(TokenType::OpeningRoundBracket); + ParserToken lparen(TokenType::OpeningRoundBracket); - auto begin = pos; - auto paren_count = 0; + ASTPtr string_literal; + ParserStringLiteral parser_string_literal; + + if (!lparen.ignore(pos, expected)) + return false; + + size_t paren_count = 0; String kql_statement; - - if (s_lparen.ignore(pos, expected)) + if (parser_string_literal.parse(pos, string_literal, expected)) { - if (pos->type == TokenType::HereDoc) - { - kql_statement = String(pos->begin + 2, pos->end - 2); - } - else - { - ++paren_count; - auto pos_start = pos; - while (isValidKQLPos(pos)) - { - if (pos->type == TokenType::ClosingRoundBracket) - --paren_count; - if (pos->type == TokenType::OpeningRoundBracket) - ++paren_count; - - if (paren_count == 0) - break; - ++pos; - } - kql_statement = String(pos_start->begin, (--pos)->end); - } - ++pos; - Tokens token_kql(kql_statement.c_str(), kql_statement.c_str() + kql_statement.size()); - IParser::Pos pos_kql(token_kql, pos.max_depth, pos.max_backtracks); - - if (kql_p.parse(pos_kql, select, expected)) - { - node = select; - ++pos; - return true; - } + kql_statement = typeid_cast(*string_literal).value.safeGet(); } - pos = begin; - return false; + else + { + ++paren_count; + auto pos_start = pos; + while (isValidKQLPos(pos)) + { + if (pos->type == TokenType::ClosingRoundBracket) + --paren_count; + if (pos->type == TokenType::OpeningRoundBracket) + ++paren_count; + + if (paren_count == 0) + break; + ++pos; + } + if (!isValidKQLPos(pos)) + { + return false; + } + --pos; + kql_statement = String(pos_start->begin, pos->end); + ++pos; + } + + Tokens token_kql(kql_statement.data(), kql_statement.data() + kql_statement.size()); + IParser::Pos pos_kql(token_kql, pos.max_depth, pos.max_backtracks); + Expected kql_expected; + kql_expected.enable_highlighting = false; + if (!ParserKQLWithUnionQuery().parse(pos_kql, node, kql_expected)) + return false; + ++pos; + return true; } } diff --git a/src/Parsers/Kusto/parseKQLQuery.cpp b/src/Parsers/Kusto/parseKQLQuery.cpp index 34a009873f8..34076168480 100644 --- a/src/Parsers/Kusto/parseKQLQuery.cpp +++ b/src/Parsers/Kusto/parseKQLQuery.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index 9ac6e623803..34855a7ce20 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Parsers/ParserDataType.cpp b/src/Parsers/ParserDataType.cpp index fdd712f2e68..b5bc9f89990 100644 --- a/src/Parsers/ParserDataType.cpp +++ b/src/Parsers/ParserDataType.cpp @@ -1,11 +1,12 @@ #include #include +#include #include #include #include #include -#include +#include namespace DB @@ -14,18 +15,60 @@ namespace DB namespace { +/// Parser of Dynamic type arguments: Dynamic(max_types=N) +class DynamicArgumentsParser : public IParserBase +{ +private: + const char * getName() const override { return "Dynamic data type optional argument"; } + 
bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override + { + ASTPtr identifier; + ParserIdentifier identifier_parser; + if (!identifier_parser.parse(pos, identifier, expected)) + return false; + + if (pos->type != TokenType::Equals) + { + expected.add(pos, "equals operator"); + return false; + } + + ++pos; + + ASTPtr number; + ParserNumber number_parser; + if (!number_parser.parse(pos, number, expected)) + return false; + + node = makeASTFunction("equals", identifier, number); + return true; + } +}; + /// Wrapper to allow mixed lists of nested and normal types. /// Parameters are either: /// - Nested table elements; /// - Enum element in form of 'a' = 1; /// - literal; -/// - another data type (or identifier) +/// - Dynamic type arguments; +/// - another data type (or identifier); class ParserDataTypeArgument : public IParserBase { +public: + explicit ParserDataTypeArgument(std::string_view type_name_) : type_name(type_name_) + { + } + private: const char * getName() const override { return "data type argument"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override { + if (type_name == "Dynamic") + { + DynamicArgumentsParser parser; + return parser.parse(pos, node, expected); + } + ParserNestedTable nested_parser; ParserDataType data_type_parser; ParserAllCollectionsOfLiterals literal_parser(false); @@ -40,6 +83,8 @@ private: || literal_parser.parse(pos, node, expected) || data_type_parser.parse(pos, node, expected); } + + std::string_view type_name; }; } @@ -148,7 +193,7 @@ bool ParserDataType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ++pos; /// Parse optional parameters - ParserList args_parser(std::make_unique(), std::make_unique(TokenType::Comma)); + ParserList args_parser(std::make_unique(type_name), std::make_unique(TokenType::Comma)); ASTPtr expr_list_args; if (!args_parser.parse(pos, expr_list_args, expected)) diff --git a/src/Parsers/formatSettingName.cpp b/src/Parsers/formatSettingName.cpp index efbfffddd7b..59973379167 100644 --- a/src/Parsers/formatSettingName.cpp +++ b/src/Parsers/formatSettingName.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Parsers/obfuscateQueries.cpp b/src/Parsers/obfuscateQueries.cpp index 2ed551851e8..074b6797517 100644 --- a/src/Parsers/obfuscateQueries.cpp +++ b/src/Parsers/obfuscateQueries.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Parsers/parseQuery.cpp b/src/Parsers/parseQuery.cpp index 66bd76687aa..41c51267496 100644 --- a/src/Parsers/parseQuery.cpp +++ b/src/Parsers/parseQuery.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Parsers/queryNormalization.cpp b/src/Parsers/queryNormalization.cpp index 4a9dd8ceb98..4890ad6952d 100644 --- a/src/Parsers/queryNormalization.cpp +++ b/src/Parsers/queryNormalization.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 5382527fcdc..4d67bc1a4e9 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -135,7 +135,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex /// If the key is not found, skip the value. 
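DynamicArgumentsParser above accepts a single identifier '=' number argument, as in Dynamic(max_types=32), and folds the two tokens into an equals() function AST. A toy stand-alone model of the accepted argument form (not the ClickHouse parser; no whitespace handling):

#include <cassert>
#include <cstdlib>
#include <string>

// Recognizes "identifier=number" and returns the two parts.
bool parseDynamicArgument(const std::string & s, std::string & name, long & value)
{
    auto eq = s.find('=');
    if (eq == std::string::npos)
        return false;
    name = s.substr(0, eq);
    value = std::strtol(s.c_str() + eq + 1, nullptr, 10);
    return true;
}

int main()
{
    std::string name;
    long value = 0;
    assert(parseDynamicArgument("max_types=32", name, value));
    assert(name == "max_types" && value == 32);
}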
NullOutput sink; - readEscapedStringInto(sink, *in); + readEscapedStringInto(sink, *in); } else { diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 09f8fa92e5f..6d4dcba9e60 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include "Formats/FormatSettings.h" namespace DB { @@ -28,7 +30,8 @@ static void checkForCarriageReturn(ReadBuffer & in) throw Exception(ErrorCodes::INCORRECT_DATA, "\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row." "\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format." " You must transform your file to Unix format." - "\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r."); + "\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r" + "\nor else enable setting 'input_format_tsv_crlf_end_of_line'"); } TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( @@ -92,7 +95,12 @@ void TabSeparatedFormatReader::skipRowEndDelimiter() if (buf->eof()) return; - if (unlikely(first_row)) + if (format_settings.tsv.crlf_end_of_line_input) + { + if (*buf->position() == '\r') + ++buf->position(); + } + else if (unlikely(first_row)) { checkForCarriageReturn(*buf); first_row = false; @@ -105,14 +113,15 @@ template String TabSeparatedFormatReader::readFieldIntoString() { String field; + bool support_crlf = format_settings.tsv.crlf_end_of_line_input; if (is_raw) readString(field, *buf); else { if constexpr (read_string) - readEscapedString(field, *buf); + support_crlf ? readEscapedStringCRLF(field, *buf) : readEscapedString(field, *buf); else - readTSVField(field, *buf); + support_crlf ? readTSVFieldCRLF(field, *buf) : readTSVField(field, *buf); } return field; } @@ -123,7 +132,7 @@ void TabSeparatedFormatReader::skipField() if (is_raw) readStringInto(out, *buf); else - readEscapedStringInto(out, *buf); + format_settings.tsv.crlf_end_of_line_input ? readEscapedStringInto(out, *buf) : readEscapedStringInto(out, *buf); } void TabSeparatedFormatReader::skipHeaderRow() @@ -155,7 +164,7 @@ bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & t const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) { const bool at_delimiter = !is_last_file_column && !buf->eof() && *buf->position() == '\t'; - const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n'); + const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || (format_settings.tsv.crlf_end_of_line_input && *buf->position() == '\r')); if (format_settings.tsv.empty_as_default && (at_delimiter || at_last_column_line_end)) { @@ -220,7 +229,10 @@ bool TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) try { - assertChar('\n', *buf); + if (!format_settings.tsv.crlf_end_of_line_input) + assertChar('\n', *buf); + else + assertChar('\r', *buf); } catch (const DB::Exception &) { @@ -233,7 +245,10 @@ bool TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) else if (*buf->position() == '\r') { out << "ERROR: Carriage return found where line feed is expected." 
- " It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n"; + " It's like your file has DOS/Windows style line separators. \n" + "You must transform your file to Unix format. \n" + "But if you really need carriage return at end of string value of last column, you need to escape it as \\r \n" + "or else enable setting 'input_format_tsv_crlf_end_of_line'"; } else { @@ -348,7 +363,7 @@ void TabSeparatedFormatReader::skipRow() bool TabSeparatedFormatReader::checkForEndOfRow() { - return buf->eof() || *buf->position() == '\n'; + return buf->eof() || *buf->position() == '\n' || (format_settings.tsv.crlf_end_of_line_input && *buf->position() == '\r'); } TabSeparatedSchemaReader::TabSeparatedSchemaReader( diff --git a/src/Processors/IInflatingTransform.cpp b/src/Processors/IInflatingTransform.cpp index ffa5b55dc76..a59eda0feb2 100644 --- a/src/Processors/IInflatingTransform.cpp +++ b/src/Processors/IInflatingTransform.cpp @@ -45,8 +45,13 @@ IInflatingTransform::Status IInflatingTransform::prepare() { if (input.isFinished()) { - output.finish(); - return Status::Finished; + if (is_finished) + { + output.finish(); + return Status::Finished; + } + is_finished = true; + return Status::Ready; } input.setNeeded(); @@ -73,6 +78,14 @@ void IInflatingTransform::work() generated = true; can_generate = canGenerate(); } + else if (is_finished) + { + if (can_generate || generated || has_input) + throw Exception(ErrorCodes::LOGICAL_ERROR, "IInflatingTransform cannot finish work because it has generated data or has input data"); + + current_chunk = getRemaining(); + generated = !current_chunk.empty(); + } else { if (!has_input) diff --git a/src/Processors/IInflatingTransform.h b/src/Processors/IInflatingTransform.h index 0ad12f6cd65..0cb7fc06cc4 100644 --- a/src/Processors/IInflatingTransform.h +++ b/src/Processors/IInflatingTransform.h @@ -10,13 +10,14 @@ namespace DB /// for (chunk : input_chunks) /// { /// transform.consume(chunk); -/// /// while (transform.canGenerate()) /// { /// transformed_chunk = transform.generate(); /// ... (process transformed chunk) /// } /// } +/// transformed_chunk = transform.getRemaining(); +/// ... (process remaining data) /// class IInflatingTransform : public IProcessor { @@ -32,6 +33,7 @@ protected: virtual void consume(Chunk chunk) = 0; virtual bool canGenerate() = 0; virtual Chunk generate() = 0; + virtual Chunk getRemaining() { return {}; } public: IInflatingTransform(Block input_header, Block output_header); @@ -41,6 +43,9 @@ public: InputPort & getInputPort() { return input; } OutputPort & getOutputPort() { return output; } + + /// canGenerate can flush data when input is finished. + bool is_finished = false; }; } diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp index 3bd0b532d90..857f5040b79 100644 --- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp @@ -70,25 +70,6 @@ static AggregatingSortedAlgorithm::ColumnsDefinition defineColumns( return def; } -static MutableColumns getMergedColumns(const Block & header, const AggregatingSortedAlgorithm::ColumnsDefinition & def) -{ - MutableColumns columns; - columns.resize(header.columns()); - - for (const auto & desc : def.columns_to_simple_aggregate) - { - const auto & type = desc.nested_type ? 
desc.nested_type - : desc.real_type; - columns[desc.column_number] = type->createColumn(); - } - - for (size_t i = 0; i < columns.size(); ++i) - if (!columns[i]) - columns[i] = header.getByPosition(i).type->createColumn(); - - return columns; -} - /// Remove constants and LowCardinality for SimpleAggregateFunction static void preprocessChunk(Chunk & chunk, const AggregatingSortedAlgorithm::ColumnsDefinition & def) { @@ -159,12 +140,24 @@ AggregatingSortedAlgorithm::SimpleAggregateDescription::~SimpleAggregateDescript AggregatingSortedAlgorithm::AggregatingMergedData::AggregatingMergedData( - MutableColumns columns_, UInt64 max_block_size_rows_, UInt64 max_block_size_bytes_, ColumnsDefinition & def_) - : MergedData(std::move(columns_), false, max_block_size_rows_, max_block_size_bytes_), def(def_) + : MergedData(false, max_block_size_rows_, max_block_size_bytes_), def(def_) { +} + +void AggregatingSortedAlgorithm::AggregatingMergedData::initialize(const DB::Block & header, const IMergingAlgorithm::Inputs & inputs) +{ + MergedData::initialize(header, inputs); + + for (const auto & desc : def.columns_to_simple_aggregate) + { + const auto & type = desc.nested_type ? desc.nested_type + : desc.real_type; + columns[desc.column_number] = type->createColumn(); + } + initAggregateDescription(); /// Just to make startGroup() simpler. @@ -267,12 +260,14 @@ AggregatingSortedAlgorithm::AggregatingSortedAlgorithm( size_t max_block_size_bytes_) : IMergingAlgorithmWithDelayedChunk(header_, num_inputs, description_) , columns_definition(defineColumns(header_, description_)) - , merged_data(getMergedColumns(header_, columns_definition), max_block_size_rows_, max_block_size_bytes_, columns_definition) + , merged_data(max_block_size_rows_, max_block_size_bytes_, columns_definition) { } void AggregatingSortedAlgorithm::initialize(Inputs inputs) { + merged_data.initialize(header, inputs); + for (auto & input : inputs) if (input.chunk) preprocessChunk(input.chunk, columns_definition); diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h index db8ee66ab2b..53c103e7038 100644 --- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h @@ -102,11 +102,12 @@ private: public: AggregatingMergedData( - MutableColumns columns_, UInt64 max_block_size_rows_, UInt64 max_block_size_bytes_, ColumnsDefinition & def_); + void initialize(const Block & header, const IMergingAlgorithm::Inputs & inputs) override; + /// Group is a group of rows with the same sorting key. It represents single row in result. /// Algorithm is: start group, add several rows, finish group. /// Then pull chunk when enough groups were added. 
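Stepping back to the IInflatingTransform hunks just above: prepare() now takes one extra Ready turn after the input finishes, so work() can flush getRemaining() instead of silently dropping a trailing partial block. A stand-alone model of the amended contract (invented class, block size chosen arbitrarily; not the ClickHouse processor):

#include <cstddef>
#include <utility>
#include <vector>

struct InflatingModel
{
    static constexpr std::size_t block_size = 4;
    std::vector<int> buffer;

    // consume() accumulates input rows.
    void consume(const std::vector<int> & chunk) { buffer.insert(buffer.end(), chunk.begin(), chunk.end()); }
    // canGenerate()/generate() emit only full blocks.
    bool canGenerate() const { return buffer.size() >= block_size; }
    std::vector<int> generate()
    {
        std::vector<int> out(buffer.begin(), buffer.begin() + block_size);
        buffer.erase(buffer.begin(), buffer.begin() + block_size);
        return out;
    }
    // New in this diff's contract: flush whatever is left once input is finished.
    std::vector<int> getRemaining() { return std::exchange(buffer, {}); }
};

The matching prepare() change is the is_finished flag: the first time the input reports finished, the transform returns Ready once more rather than Finished, giving work() its last chance to pull the remainder.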
diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp index 8948cee217c..07ee8f4ddef 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp @@ -31,8 +31,13 @@ CollapsingSortedAlgorithm::CollapsingSortedAlgorithm( LoggerPtr log_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) - , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_) + : IMergingAlgorithmWithSharedChunks( + header_, + num_inputs, + std::move(description_), + out_row_sources_buf_, + max_row_refs, + std::make_unique(use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_)) , sign_column_number(header_.getPositionByName(sign_column)) , only_positive_sign(only_positive_sign_) , log(log_) @@ -65,7 +70,7 @@ void CollapsingSortedAlgorithm::reportIncorrectData() void CollapsingSortedAlgorithm::insertRow(RowRef & row) { - merged_data.insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); + merged_data->insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); } std::optional CollapsingSortedAlgorithm::insertRows() @@ -90,8 +95,8 @@ std::optional CollapsingSortedAlgorithm::insertRows() if (count_positive >= count_negative) { - if (merged_data.hasEnoughRows()) - res = merged_data.pull(); + if (merged_data->hasEnoughRows()) + res = merged_data->pull(); insertRow(last_positive_row); @@ -121,8 +126,8 @@ std::optional CollapsingSortedAlgorithm::insertRows() IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() { /// Rare case, which may happen when index_granularity is 1, but we needed to insert 2 rows inside insertRows(). - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); /// Take rows in required order and put them into `merged_data`, while the rows are no more than `max_block_size` while (queue.isValid()) @@ -148,8 +153,8 @@ IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() if (key_differs) { /// if there are enough rows and the last one is calculated completely - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); /// We write data for the previous primary key. 
auto res = insertRows(); @@ -220,7 +225,7 @@ IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() return Status(std::move(*res)); } - return Status(merged_data.pull(), true); + return Status(merged_data->pull(), true); } } diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h index be1a3a3bf33..99fd95d82d9 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h @@ -42,8 +42,6 @@ public: Status merge() override; private: - MergedData merged_data; - const size_t sign_column_number; const bool only_positive_sign; diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp index 814625d7aee..2b891592b20 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp @@ -46,8 +46,8 @@ GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( size_t max_block_size_bytes_, Graphite::Params params_, time_t time_of_merge_) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), nullptr, max_row_refs) - , merged_data(header_.cloneEmptyColumns(), false, max_block_size_rows_, max_block_size_bytes_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), nullptr, max_row_refs, std::make_unique(false, max_block_size_rows_, max_block_size_bytes_)) + , graphite_rollup_merged_data(assert_cast(*merged_data)) , params(std::move(params_)) , time_of_merge(time_of_merge_) { @@ -63,7 +63,7 @@ GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( } } - merged_data.allocMemForAggregates(max_size_of_aggregate_state, max_alignment_of_aggregate_state); + graphite_rollup_merged_data.allocMemForAggregates(max_size_of_aggregate_state, max_alignment_of_aggregate_state); columns_definition = defineColumns(header_, params); } @@ -113,7 +113,7 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() const DateLUTImpl & date_lut = timezone ? timezone->getTimeZone() : DateLUT::instance(); - /// Take rows in needed order and put them into `merged_data` until we get `max_block_size` rows. + /// Take rows in needed order and put them into `graphite_rollup_merged_data` until we get `max_block_size` rows. /// /// Variables starting with current_* refer to the rows previously popped from the queue that will /// contribute towards current output row. @@ -142,10 +142,10 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() if (is_new_key) { /// Accumulate the row that has maximum version in the previous group of rows with the same key: - if (merged_data.wasGroupStarted()) + if (graphite_rollup_merged_data.wasGroupStarted()) accumulateRow(current_subgroup_newest_row); - Graphite::RollupRule next_rule = merged_data.currentRule(); + Graphite::RollupRule next_rule = graphite_rollup_merged_data.currentRule(); if (new_path) next_rule = selectPatternForPath(this->params, next_path); @@ -167,15 +167,15 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() if (will_be_new_key) { - if (merged_data.wasGroupStarted()) + if (graphite_rollup_merged_data.wasGroupStarted()) { finishCurrentGroup(); /// We have enough rows - return, but don't advance the loop. 
At the beginning of the /// next call to merge() the same next_cursor will be processed once more and /// the next output row will be created from it. - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (graphite_rollup_merged_data.hasEnoughRows()) + return Status(graphite_rollup_merged_data.pull()); } /// At this point previous row has been fully processed, so we can advance the loop @@ -218,28 +218,28 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() } /// Write result row for the last group. - if (merged_data.wasGroupStarted()) + if (graphite_rollup_merged_data.wasGroupStarted()) { accumulateRow(current_subgroup_newest_row); finishCurrentGroup(); } - return Status(merged_data.pull(), true); + return Status(graphite_rollup_merged_data.pull(), true); } void GraphiteRollupSortedAlgorithm::startNextGroup(SortCursor & cursor, Graphite::RollupRule next_rule) { - merged_data.startNextGroup(cursor->all_columns, cursor->getRow(), next_rule, columns_definition); + graphite_rollup_merged_data.startNextGroup(cursor->all_columns, cursor->getRow(), next_rule, columns_definition); } void GraphiteRollupSortedAlgorithm::finishCurrentGroup() { - merged_data.insertRow(current_time_rounded, current_subgroup_newest_row, columns_definition); + graphite_rollup_merged_data.insertRow(current_time_rounded, current_subgroup_newest_row, columns_definition); } void GraphiteRollupSortedAlgorithm::accumulateRow(RowRef & row) { - merged_data.accumulateRow(row, columns_definition); + graphite_rollup_merged_data.accumulateRow(row, columns_definition); } void GraphiteRollupSortedAlgorithm::GraphiteRollupMergedData::startNextGroup( diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h index a20a6eaf11f..aaa3859efb6 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h @@ -53,7 +53,7 @@ public: { public: using MergedData::MergedData; - ~GraphiteRollupMergedData(); + ~GraphiteRollupMergedData() override; void startNextGroup(const ColumnRawPtrs & raw_columns, size_t row, Graphite::RollupRule next_rule, ColumnsDefinition & def); @@ -72,7 +72,7 @@ public: }; private: - GraphiteRollupMergedData merged_data; + GraphiteRollupMergedData & graphite_rollup_merged_data; const Graphite::Params params; ColumnsDefinition columns_definition; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h index b8e73aec0dc..cf4b8589441 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h @@ -34,9 +34,9 @@ protected: return !lhs.hasEqualSortColumnsWith(rhs); } -private: Block header; +private: /// Inputs currently being merged. 
Inputs current_inputs; SortCursorImpls cursors; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp index c8b69382e89..fe5186736b5 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp @@ -5,7 +5,7 @@ namespace DB { IMergingAlgorithmWithSharedChunks::IMergingAlgorithmWithSharedChunks( - Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs) + Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs, std::unique_ptr merged_data_) : header(std::move(header_)) , description(std::move(description_)) , chunk_allocator(num_inputs + max_row_refs) @@ -13,6 +13,7 @@ IMergingAlgorithmWithSharedChunks::IMergingAlgorithmWithSharedChunks( , sources(num_inputs) , sources_origin_merge_tree_part_level(num_inputs) , out_row_sources_buf(out_row_sources_buf_) + , merged_data(std::move(merged_data_)) { } @@ -28,6 +29,8 @@ static void prepareChunk(Chunk & chunk) void IMergingAlgorithmWithSharedChunks::initialize(Inputs inputs) { + merged_data->initialize(header, inputs); + for (size_t source_num = 0; source_num < inputs.size(); ++source_num) { if (!inputs[source_num].chunk) diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h index 3b4f9e92c5d..bc1aafe93f7 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include namespace DB @@ -10,7 +11,7 @@ class IMergingAlgorithmWithSharedChunks : public IMergingAlgorithm { public: IMergingAlgorithmWithSharedChunks( - Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs); + Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs, std::unique_ptr merged_data_); void initialize(Inputs inputs) override; void consume(Input & input, size_t source_num) override; @@ -25,7 +26,6 @@ private: SortCursorImpls cursors; protected: - struct Source { detail::SharedChunkPtr chunk; @@ -43,6 +43,8 @@ protected: /// If it is not nullptr then it should be populated during execution WriteBuffer * out_row_sources_buf = nullptr; + std::unique_ptr merged_data; + using RowRef = detail::RowRefWithOwnedChunk; void setRowRef(RowRef & row, SortCursor & cursor) { row.set(cursor, sources[cursor.impl->order].chunk); } bool skipLastRowFor(size_t input_number) const { return sources[input_number].skip_last_row; } diff --git a/src/Processors/Merges/Algorithms/MergedData.h b/src/Processors/Merges/Algorithms/MergedData.h index 7ffde835ad0..c5bb074bb0c 100644 --- a/src/Processors/Merges/Algorithms/MergedData.h +++ b/src/Processors/Merges/Algorithms/MergedData.h @@ -1,7 +1,9 @@ #pragma once #include +#include #include +#include #include #include @@ -19,17 +21,40 @@ namespace ErrorCodes class MergedData { public: - explicit MergedData(MutableColumns columns_, bool use_average_block_size_, UInt64 max_block_size_, UInt64 max_block_size_bytes_) - : columns(std::move(columns_)), max_block_size(max_block_size_), max_block_size_bytes(max_block_size_bytes_), 
use_average_block_size(use_average_block_size_) + explicit MergedData(bool use_average_block_size_, UInt64 max_block_size_, UInt64 max_block_size_bytes_) + : max_block_size(max_block_size_), max_block_size_bytes(max_block_size_bytes_), use_average_block_size(use_average_block_size_) { } + virtual void initialize(const Block & header, const IMergingAlgorithm::Inputs & inputs) + { + columns = header.cloneEmptyColumns(); + std::vector source_columns; + source_columns.resize(columns.size()); + for (const auto & input : inputs) + { + if (!input.chunk) + continue; + + const auto & input_columns = input.chunk.getColumns(); + for (size_t i = 0; i != input_columns.size(); ++i) + source_columns[i].push_back(input_columns[i]); + } + + for (size_t i = 0; i != columns.size(); ++i) + { + if (columns[i]->hasDynamicStructure()) + columns[i]->takeDynamicStructureFromSourceColumns(source_columns[i]); + } + } + /// Pull will be called at next prepare call. void flush() { need_flush = true; } void insertRow(const ColumnRawPtrs & raw_columns, size_t row, size_t block_size) { size_t num_columns = raw_columns.size(); + chassert(columns.size() == num_columns); for (size_t i = 0; i < num_columns; ++i) columns[i]->insertFrom(*raw_columns[i], row); @@ -41,6 +66,7 @@ public: void insertRows(const ColumnRawPtrs & raw_columns, size_t start_index, size_t length, size_t block_size) { size_t num_columns = raw_columns.size(); + chassert(columns.size() == num_columns); for (size_t i = 0; i < num_columns; ++i) { if (length == 1) @@ -61,6 +87,7 @@ public: UInt64 num_rows = chunk.getNumRows(); UInt64 num_columns = chunk.getNumColumns(); + chassert(columns.size() == num_columns); auto chunk_columns = chunk.mutateColumns(); /// Here is a special code for constant columns. @@ -69,9 +96,21 @@ public: for (size_t i = 0; i < num_columns; ++i) { if (isColumnConst(*columns[i])) + { columns[i] = columns[i]->cloneResized(num_rows); + } + /// For columns with Dynamic structure we cannot just take column from input chunk because resulting column may have + /// different Dynamic structure (and have some merge statistics after calling takeDynamicStructureFromSourceColumns). + /// We should insert data into the resulting column using insertRangeFrom.
+ else if (columns[i]->hasDynamicStructure()) + { + columns[i] = columns[i]->cloneEmpty(); + columns[i]->insertRangeFrom(*chunk_columns[i], 0, num_rows); + } else + { columns[i] = std::move(chunk_columns[i]); + } } if (rows_size < num_rows) @@ -144,6 +183,8 @@ public: UInt64 totalAllocatedBytes() const { return total_allocated_bytes; } UInt64 maxBlockSize() const { return max_block_size; } + virtual ~MergedData() = default; + protected: MutableColumns columns; diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp index 408d9a16c31..d17a4d859ee 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp @@ -18,7 +18,7 @@ MergingSortedAlgorithm::MergingSortedAlgorithm( WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) : header(std::move(header_)) - , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size_, max_block_size_bytes_) + , merged_data(use_average_block_sizes, max_block_size_, max_block_size_bytes_) , description(description_) , limit(limit_) , out_row_sources_buf(out_row_sources_buf_) @@ -49,6 +49,7 @@ void MergingSortedAlgorithm::addInput() void MergingSortedAlgorithm::initialize(Inputs inputs) { + merged_data.initialize(header, inputs); current_inputs = std::move(inputs); for (size_t source_num = 0; source_num < current_inputs.size(); ++source_num) diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp index 9e5c1249c4e..7b2c7d82a01 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp @@ -41,9 +41,8 @@ ReplacingSortedAlgorithm::ReplacingSortedAlgorithm( bool use_average_block_sizes, bool cleanup_, bool enable_vertical_final_) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) - , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size_rows, max_block_size_bytes), cleanup(cleanup_) - , enable_vertical_final(enable_vertical_final_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs, std::make_unique(use_average_block_sizes, max_block_size_rows, max_block_size_bytes)) + , cleanup(cleanup_), enable_vertical_final(enable_vertical_final_) { if (!is_deleted_column.empty()) is_deleted_column_number = header_.getPositionByName(is_deleted_column); @@ -75,7 +74,7 @@ void ReplacingSortedAlgorithm::insertRow() to_be_emitted.push(std::move(selected_row.owned_chunk)); } else - merged_data.insertRow(*selected_row.all_columns, selected_row.row_num, selected_row.owned_chunk->getNumRows()); + merged_data->insertRow(*selected_row.all_columns, selected_row.row_num, selected_row.owned_chunk->getNumRows()); selected_row.clear(); } @@ -109,8 +108,8 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() if (key_differs) { /// If there are enough rows and the last one is calculated completely - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); /// Write the data for the previous primary key. 
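The MergedData hunks above move column creation from the constructor into initialize(), because a column with dynamic structure must inspect every input before settling on a result layout (takeDynamicStructureFromSourceColumns), and insertChunk() must then copy through insertRangeFrom rather than adopt the input column wholesale. A toy model of the structure-union step (a set union stands in for the real layout and merge statistics; not ClickHouse code):

#include <set>
#include <string>
#include <vector>

struct DynamicColumnModel
{
    std::set<std::string> variants;

    // The result column's set of materialized variants covers every input,
    // decided up front, before any row is merged.
    void takeStructureFrom(const std::vector<DynamicColumnModel> & sources)
    {
        for (const auto & source : sources)
            variants.insert(source.variants.begin(), source.variants.end());
    }
};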
if (!selected_row.empty()) @@ -168,8 +167,8 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() } /// If have enough rows, return block, because it prohibited to overflow requested number of rows. - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); /// We will write the data for the last primary key. if (!selected_row.empty()) @@ -193,7 +192,7 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() return emitChunk(chunk, to_be_emitted.empty()); } - return Status(merged_data.pull(), true); + return Status(merged_data->pull(), true); } void ReplacingSortedAlgorithm::saveChunkForSkippingFinalFromSelectedRow() diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h index 2fbd73c9072..a3ccccf0845 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h @@ -44,8 +44,6 @@ public: Status merge() override; private: - MergedData merged_data; - ssize_t is_deleted_column_number = -1; ssize_t version_column_number = -1; bool cleanup = false; diff --git a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp index 79b5dae2d6e..7329821cf97 100644 --- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -382,39 +382,6 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns( return def; } -static MutableColumns getMergedDataColumns( - const Block & header, - const SummingSortedAlgorithm::ColumnsDefinition & def) -{ - MutableColumns columns; - size_t num_columns = def.column_numbers_not_to_aggregate.size() + def.columns_to_aggregate.size(); - columns.reserve(num_columns); - - for (const auto & desc : def.columns_to_aggregate) - { - // Wrap aggregated columns in a tuple to match function signature - if (!desc.is_agg_func_type && !desc.is_simple_agg_func_type && isTuple(desc.function->getResultType())) - { - size_t tuple_size = desc.column_numbers.size(); - MutableColumns tuple_columns(tuple_size); - for (size_t i = 0; i < tuple_size; ++i) - tuple_columns[i] = header.safeGetByPosition(desc.column_numbers[i]).column->cloneEmpty(); - - columns.emplace_back(ColumnTuple::create(std::move(tuple_columns))); - } - else - { - const auto & type = desc.nested_type ? 
desc.nested_type : desc.real_type; - columns.emplace_back(type->createColumn()); - } - } - - for (const auto & column_number : def.column_numbers_not_to_aggregate) - columns.emplace_back(header.safeGetByPosition(column_number).type->createColumn()); - - return columns; -} - static void preprocessChunk(Chunk & chunk, const SummingSortedAlgorithm::ColumnsDefinition & def) { auto num_rows = chunk.getNumRows(); @@ -504,11 +471,44 @@ static void setRow(Row & row, const ColumnRawPtrs & raw_columns, size_t row_num, } -SummingSortedAlgorithm::SummingMergedData::SummingMergedData( - MutableColumns columns_, UInt64 max_block_size_rows_, UInt64 max_block_size_bytes_, ColumnsDefinition & def_) - : MergedData(std::move(columns_), false, max_block_size_rows_, max_block_size_bytes_) +SummingSortedAlgorithm::SummingMergedData::SummingMergedData(UInt64 max_block_size_rows_, UInt64 max_block_size_bytes_, ColumnsDefinition & def_) + : MergedData(false, max_block_size_rows_, max_block_size_bytes_) , def(def_) { +} + +void SummingSortedAlgorithm::SummingMergedData::initialize(const DB::Block & header, const IMergingAlgorithm::Inputs & inputs) +{ + MergedData::initialize(header, inputs); + + MutableColumns new_columns; + size_t num_columns = def.column_numbers_not_to_aggregate.size() + def.columns_to_aggregate.size(); + new_columns.reserve(num_columns); + + for (const auto & desc : def.columns_to_aggregate) + { + // Wrap aggregated columns in a tuple to match function signature + if (!desc.is_agg_func_type && !desc.is_simple_agg_func_type && isTuple(desc.function->getResultType())) + { + size_t tuple_size = desc.column_numbers.size(); + MutableColumns tuple_columns(tuple_size); + for (size_t i = 0; i < tuple_size; ++i) + tuple_columns[i] = std::move(columns[desc.column_numbers[i]]); + + new_columns.emplace_back(ColumnTuple::create(std::move(tuple_columns))); + } + else + { + const auto & type = desc.nested_type ? 
desc.nested_type : desc.real_type; + new_columns.emplace_back(type->createColumn()); + } + } + + for (const auto & column_number : def.column_numbers_not_to_aggregate) + new_columns.emplace_back(std::move(columns[column_number])); + + columns = std::move(new_columns); + current_row.resize(def.column_names.size()); initAggregateDescription(); @@ -698,12 +698,14 @@ SummingSortedAlgorithm::SummingSortedAlgorithm( size_t max_block_size_bytes) : IMergingAlgorithmWithDelayedChunk(header_, num_inputs, std::move(description_)) , columns_definition(defineColumns(header_, description, column_names_to_sum, partition_key_columns)) - , merged_data(getMergedDataColumns(header_, columns_definition), max_block_size_rows, max_block_size_bytes, columns_definition) + , merged_data(max_block_size_rows, max_block_size_bytes, columns_definition) { } void SummingSortedAlgorithm::initialize(Inputs inputs) { + merged_data.initialize(header, inputs); + for (auto & input : inputs) if (input.chunk) preprocessChunk(input.chunk, columns_definition); diff --git a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h index dbbe4e53a5f..664b171c4b9 100644 --- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h @@ -65,7 +65,9 @@ public: using MergedData::insertRow; public: - SummingMergedData(MutableColumns columns_, UInt64 max_block_size_rows, UInt64 max_block_size_bytes_, ColumnsDefinition & def_); + SummingMergedData(UInt64 max_block_size_rows, UInt64 max_block_size_bytes_, ColumnsDefinition & def_); + + void initialize(const Block & header, const IMergingAlgorithm::Inputs & inputs) override; void startGroup(ColumnRawPtrs & raw_columns, size_t row); void finishGroup(); diff --git a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp index e7a431dc1d0..9f124c6ba18 100644 --- a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp @@ -16,8 +16,7 @@ VersionedCollapsingAlgorithm::VersionedCollapsingAlgorithm( size_t max_block_size_bytes_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, MAX_ROWS_IN_MULTIVERSION_QUEUE) - , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, MAX_ROWS_IN_MULTIVERSION_QUEUE, std::make_unique(use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_)) /// -1 for +1 in FixedSizeDequeWithGaps's internal buffer. 3 is a reasonable minimum size to collapse anything. 
, max_rows_in_queue(std::min(std::max(3, max_block_size_rows_), MAX_ROWS_IN_MULTIVERSION_QUEUE) - 1) , current_keys(max_rows_in_queue) @@ -47,7 +46,7 @@ void VersionedCollapsingAlgorithm::insertGap(size_t gap_size) void VersionedCollapsingAlgorithm::insertRow(size_t skip_rows, const RowRef & row) { - merged_data.insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); + merged_data->insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); insertGap(skip_rows); @@ -104,8 +103,8 @@ IMergingAlgorithm::Status VersionedCollapsingAlgorithm::merge() --num_rows_to_insert; /// It's ok to return here, because we didn't affect queue. - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); } if (current_keys.empty()) @@ -147,13 +146,13 @@ IMergingAlgorithm::Status VersionedCollapsingAlgorithm::merge() insertRow(gap, row); current_keys.popFront(); - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); } /// Write information about last collapsed rows. insertGap(current_keys.frontGap()); - return Status(merged_data.pull(), true); + return Status(merged_data->pull(), true); } } diff --git a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h index d98529b301c..e6d20ddac75 100644 --- a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h +++ b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h @@ -29,8 +29,6 @@ public: Status merge() override; private: - MergedData merged_data; - size_t sign_column_number = 0; const size_t max_rows_in_queue; diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 18f1496d26a..b33a373a970 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -111,8 +111,11 @@ void optimizePrimaryKeyCondition(const Stack & stack); void optimizePrewhere(Stack & stack, QueryPlan::Nodes & nodes); void optimizeReadInOrder(QueryPlan::Node & node, QueryPlan::Nodes & nodes); void optimizeAggregationInOrder(QueryPlan::Node & node, QueryPlan::Nodes &); -bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & nodes, bool allow_implicit_projections); -bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes); + +/// Returns the name of used projection or nullopt if no projection is used. 
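+/// The returned names are accumulated by optimizeTreeSecondPass() (see optimizeTree.cpp below) and checked against the force_optimize_projection_name setting.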
+std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & nodes, bool allow_implicit_projections); +std::optional<String> optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes); + bool addPlansForSets(QueryPlan & plan, QueryPlan::Node & node, QueryPlan::Nodes & nodes); /// Enable memory bound merging of aggregation states for remote queries diff --git a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp index 80923159ddc..2738de1ff5f 100644 --- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp +++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp @@ -46,7 +46,7 @@ QueryPlanOptimizationSettings QueryPlanOptimizationSettings::fromSettings(const settings.optimize_projection = from.optimize_use_projections; settings.force_use_projection = settings.optimize_projection && from.force_optimize_projection; - settings.force_projection_name = from.force_optimize_projection_name; + settings.force_projection_name = settings.optimize_projection ? from.force_optimize_projection_name.value : ""; settings.optimize_use_implicit_projections = settings.optimize_projection && from.optimize_use_implicit_projections; return settings; diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index 915e664ea8f..df9e095af30 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -12,6 +12,7 @@ namespace DB namespace ErrorCodes { + extern const int INCORRECT_DATA; extern const int TOO_MANY_QUERY_PLAN_OPTIMIZATIONS; extern const int PROJECTION_NOT_USED; } @@ -106,7 +107,7 @@ void optimizeTreeFirstPass(const QueryPlanOptimizationSettings & settings, Query void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_settings, QueryPlan::Node & root, QueryPlan::Nodes & nodes) { const size_t max_optimizations_to_apply = optimization_settings.max_optimizations_to_apply; - size_t num_applied_projection = 0; + std::unordered_set<String> applied_projection_names; bool has_reading_from_mt = false; Stack stack; @@ -159,9 +160,11 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s /// Projection optimization relies on PK optimization if (optimization_settings.optimize_projection) - num_applied_projection - += optimizeUseAggregateProjections(*frame.node, nodes, optimization_settings.optimize_use_implicit_projections); - + { + auto applied_projection = optimizeUseAggregateProjections(*frame.node, nodes, optimization_settings.optimize_use_implicit_projections); + if (applied_projection) + applied_projection_names.insert(*applied_projection); + } if (optimization_settings.aggregation_in_order) optimizeAggregationInOrder(*frame.node, nodes); @@ -180,11 +183,11 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s if (optimization_settings.optimize_projection) { /// Projection optimization relies on PK optimization - if (optimizeUseNormalProjections(stack, nodes)) + if (auto applied_projection = optimizeUseNormalProjections(stack, nodes)) { - ++num_applied_projection; + applied_projection_names.insert(*applied_projection); - if (max_optimizations_to_apply && max_optimizations_to_apply < num_applied_projection) + if (max_optimizations_to_apply && max_optimizations_to_apply < applied_projection_names.size()) throw
Exception(ErrorCodes::TOO_MANY_QUERY_PLAN_OPTIMIZATIONS, "Too many projection optimizations applied to query plan. Current limit {}", max_optimizations_to_apply); @@ -201,10 +204,16 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s stack.pop_back(); } - if (optimization_settings.force_use_projection && has_reading_from_mt && num_applied_projection == 0) + if (optimization_settings.force_use_projection && has_reading_from_mt && applied_projection_names.empty()) throw Exception( ErrorCodes::PROJECTION_NOT_USED, "No projection is used when optimize_use_projections = 1 and force_optimize_projection = 1"); + + if (!optimization_settings.force_projection_name.empty() && has_reading_from_mt && !applied_projection_names.contains(optimization_settings.force_projection_name)) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Projection {} is specified in setting force_optimize_projection_name but not used", + optimization_settings.force_projection_name); } void optimizeTreeThirdPass(QueryPlan & plan, QueryPlan::Node & root, QueryPlan::Nodes & nodes) diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp index 30ff9970790..4017670ad14 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp @@ -552,28 +552,28 @@ static QueryPlan::Node * findReadingStep(QueryPlan::Node & node) return nullptr; } -bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & nodes, bool allow_implicit_projections) +std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & nodes, bool allow_implicit_projections) { if (node.children.size() != 1) - return false; + return {}; auto * aggregating = typeid_cast<AggregatingStep *>(node.step.get()); if (!aggregating) - return false; + return {}; if (!aggregating->canUseProjection()) - return false; + return {}; QueryPlan::Node * reading_node = findReadingStep(*node.children.front()); if (!reading_node) - return false; + return {}; auto * reading = typeid_cast<ReadFromMergeTree *>(reading_node->step.get()); if (!reading) - return false; + return {}; if (!canUseProjectionForReadingStep(reading)) - return false; + return {}; std::shared_ptr<PartitionIdToMaxBlock> max_added_blocks = getMaxAddedBlocks(reading); @@ -597,7 +597,7 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & if (ordinary_reading_marks == 0) { reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); - return false; + return {}; } const auto & parts_with_ranges = ordinary_reading_select_result->parts_with_ranges; @@ -631,15 +631,14 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & if (!best_candidate) { reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); - return false; + return {}; } } else { - return false; + return {}; } - Context::QualifiedProjectionName projection_name; chassert(best_candidate != nullptr); QueryPlanStepPtr projection_reading; @@ -654,12 +653,6 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & Pipe pipe(std::make_shared<SourceFromSingleChunk>(std::move(candidates.minmax_projection->block))); projection_reading = std::make_unique<ReadFromPreparedSource>(std::move(pipe)); has_ordinary_parts = false; - - projection_name = Context::QualifiedProjectionName - { - .storage_id = reading->getMergeTreeData().getStorageID(), - .projection_name =
candidates.minmax_projection->candidate.projection->name, - }; } else { @@ -691,12 +684,6 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & projection_reading = std::make_unique<ReadFromPreparedSource>(std::move(pipe)); } - projection_name = Context::QualifiedProjectionName - { - .storage_id = reading->getMergeTreeData().getStorageID(), - .projection_name = best_candidate->projection->name, - }; - has_ordinary_parts = best_candidate->merge_tree_ordinary_select_result_ptr != nullptr; if (has_ordinary_parts) reading->setAnalyzedResult(std::move(best_candidate->merge_tree_ordinary_select_result_ptr)); @@ -746,7 +733,7 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & node.children.push_back(&expr_or_filter_node); } - return true; + return best_candidate->projection->name; } } diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp index 13c6c6b0821..728aaaa6fc4 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp @@ -73,16 +73,16 @@ static bool hasAllRequiredColumns(const ProjectionDescription * projection, cons } -bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) +std::optional<String> optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) { const auto & frame = stack.back(); auto * reading = typeid_cast<ReadFromMergeTree *>(frame.node->step.get()); if (!reading) - return false; + return {}; if (!canUseProjectionForReadingStep(reading)) - return false; + return {}; auto iter = stack.rbegin(); while (std::next(iter) != stack.rend()) @@ -96,7 +96,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) /// Dangling query plan node. This might be generated by StorageMerge. if (iter->node->step.get() == reading) - return false; + return {}; const auto metadata = reading->getStorageMetadata(); const auto & projections = metadata->projections; @@ -107,7 +107,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) normal_projections.push_back(&projection); if (normal_projections.empty()) - return false; + return {}; ContextPtr context = reading->getContext(); auto it = std::find_if(normal_projections.begin(), normal_projections.end(), [&](const auto * projection) @@ -126,7 +126,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) { auto & child = iter->node->children[iter->next_child - 1]; if (!query.build(*child)) - return false; + return {}; if (query.dag) query.dag->removeUnusedActions(); @@ -146,7 +146,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) if (ordinary_reading_marks == 0) { reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); - return false; + return {}; } const auto & parts_with_ranges = ordinary_reading_select_result->parts_with_ranges; @@ -185,7 +185,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) if (!best_candidate) { reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); - return false; + return {}; } auto storage_snapshot = reading->getStorageSnapshot(); @@ -283,8 +283,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) /// Here we remove last steps from stack to be able to optimize again. /// In theory, read-in-order can be applied to projection.
stack.resize(iter.base() - stack.begin()); - - return true; + return best_candidate->projection->name; } } diff --git a/src/Processors/QueryPlan/PartsSplitter.cpp b/src/Processors/QueryPlan/PartsSplitter.cpp index 6d70aa8a60d..ed4b1906635 100644 --- a/src/Processors/QueryPlan/PartsSplitter.cpp +++ b/src/Processors/QueryPlan/PartsSplitter.cpp @@ -624,14 +624,11 @@ SplitPartsRangesResult splitPartsRanges(RangesInDataParts ranges_in_data_parts, } /// Process parts ranges with undefined value at end mark - bool is_intersecting = part_index_start_to_range.size() > 1; + /// The last parts ranges could be non-intersecting only if: (1) there is only one part range left, (2) it belongs to a non-L0 part, + /// and (3) the begin value of this range is larger than the largest end value of all previous ranges. This is too complicated + /// to check, so we just add the last part ranges to the intersecting ranges. for (const auto & [part_range_index, mark_range] : part_index_start_to_range) - { - if (is_intersecting) - add_intersecting_range(part_range_index.part_index, mark_range); - else - add_non_intersecting_range(part_range_index.part_index, mark_range); - } + add_intersecting_range(part_range_index.part_index, mark_range); auto && non_intersecting_ranges_in_data_parts = std::move(non_intersecting_ranges_in_data_parts_builder.getCurrentRangesInDataParts()); auto && intersecting_ranges_in_data_parts = std::move(intersecting_ranges_in_data_parts_builder.getCurrentRangesInDataParts()); diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index e523a2c243c..6f0fa55c349 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1534,25 +1534,7 @@ void ReadFromMergeTree::applyFilters(ActionDAGNodes added_filter_nodes) { if (!indexes) { - /// Analyzer generates unique ColumnIdentifiers like __table1.__partition_id in filter nodes, - /// while key analysis still requires unqualified column names. - std::unordered_map<std::string, ColumnWithTypeAndName> node_name_to_input_node_column; - if (query_info.planner_context) - { - const auto & table_expression_data = query_info.planner_context->getTableExpressionDataOrThrow(query_info.table_expression); - const auto & alias_column_expressions = table_expression_data.getAliasColumnExpressions(); - for (const auto & [column_identifier, column_name] : table_expression_data.getColumnIdentifierToColumnName()) - { - /// ALIAS columns cannot be used in the filter expression without being calculated in ActionsDAG, - /// so they should not be added to the input nodes. - if (alias_column_expressions.contains(column_name)) - continue; - const auto & column = table_expression_data.getColumnOrThrow(column_name); - node_name_to_input_node_column.emplace(column_identifier, ColumnWithTypeAndName(column.type, column_name)); - } - } - - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes, node_name_to_input_node_column); + filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes, query_info.buildNodeNameToInputNodeColumn()); /// NOTE: Currently we store two DAGs for analysis: /// (1) SourceStepWithFilter::filter_nodes, (2) query_info.filter_actions_dag. Make sure they are consistent.
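For illustration, a condensed sketch of how the two optimizer entry points above are consumed in optimizeTreeSecondPass (names follow the diff; the surrounding plumbing is elided, so this is not the verbatim implementation):

    /// Each optimization now reports *which* projection it applied, not just that it applied one.
    std::unordered_set<String> applied_projection_names;

    if (auto applied = optimizeUseAggregateProjections(*frame.node, nodes, optimization_settings.optimize_use_implicit_projections))
        applied_projection_names.insert(*applied);

    if (auto applied = optimizeUseNormalProjections(stack, nodes))
        applied_projection_names.insert(*applied);

    /// This is what makes force_optimize_projection_name checkable at the end of the pass:
    if (!optimization_settings.force_projection_name.empty() && has_reading_from_mt
        && !applied_projection_names.contains(optimization_settings.force_projection_name))
        throw Exception(ErrorCodes::INCORRECT_DATA,
            "Projection {} is specified in setting force_optimize_projection_name but not used",
            optimization_settings.force_projection_name);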
diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp index b845101125b..92c936cdc20 100644 --- a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp +++ b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp @@ -1,6 +1,5 @@ #include #include -#include #include namespace DB diff --git a/src/Processors/QueryPlan/SourceStepWithFilter.cpp b/src/Processors/QueryPlan/SourceStepWithFilter.cpp index ce5a59a92f9..ad0940b90b9 100644 --- a/src/Processors/QueryPlan/SourceStepWithFilter.cpp +++ b/src/Processors/QueryPlan/SourceStepWithFilter.cpp @@ -80,7 +80,7 @@ Block SourceStepWithFilter::applyPrewhereActions(Block block, const PrewhereInfo void SourceStepWithFilter::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes, query_info.buildNodeNameToInputNodeColumn()); } void SourceStepWithFilter::updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) diff --git a/src/Processors/Transforms/ColumnGathererTransform.cpp b/src/Processors/Transforms/ColumnGathererTransform.cpp index b2e8e9bc89e..b6bcec26c0c 100644 --- a/src/Processors/Transforms/ColumnGathererTransform.cpp +++ b/src/Processors/Transforms/ColumnGathererTransform.cpp @@ -32,15 +32,23 @@ ColumnGathererStream::ColumnGathererStream( void ColumnGathererStream::initialize(Inputs inputs) { + Columns source_columns; + source_columns.reserve(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { if (inputs[i].chunk) { sources[i].update(inputs[i].chunk.detachColumns().at(0)); - if (!result_column) - result_column = sources[i].column->cloneEmpty(); + source_columns.push_back(sources[i].column); } } + + if (source_columns.empty()) + return; + + result_column = source_columns[0]->cloneEmpty(); + if (result_column->hasDynamicStructure()) + result_column->takeDynamicStructureFromSourceColumns(source_columns); } IMergingAlgorithm::Status ColumnGathererStream::merge() @@ -52,7 +60,19 @@ IMergingAlgorithm::Status ColumnGathererStream::merge() if (source_to_fully_copy) /// Was set on a previous iteration { Chunk res; - res.addColumn(source_to_fully_copy->column); + /// For columns with Dynamic structure we cannot just take the column from source_to_fully_copy, because the resulting column may have + /// a different Dynamic structure (and some merge statistics after calling takeDynamicStructureFromSourceColumns). + /// We should insert the data into the resulting column using insertRangeFrom.
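+ /// (cloneEmpty() keeps the dynamic structure chosen in initialize() via takeDynamicStructureFromSourceColumns; insertRangeFrom() then converts the source rows into that structure.)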
+ if (result_column->hasDynamicStructure()) + { + auto col = result_column->cloneEmpty(); + col->insertRangeFrom(*source_to_fully_copy->column, 0, source_to_fully_copy->column->size()); + res.addColumn(std::move(col)); + } + else + { + res.addColumn(source_to_fully_copy->column); + } merged_rows += source_to_fully_copy->size; source_to_fully_copy->pos = source_to_fully_copy->size; source_to_fully_copy = nullptr; @@ -96,7 +116,16 @@ IMergingAlgorithm::Status ColumnGathererStream::merge() Chunk res; merged_rows += source_to_fully_copy->column->size(); merged_bytes += source_to_fully_copy->column->allocatedBytes(); - res.addColumn(source_to_fully_copy->column); + if (result_column->hasDynamicStructure()) + { + auto col = result_column->cloneEmpty(); + col->insertRangeFrom(*source_to_fully_copy->column, 0, source_to_fully_copy->column->size()); + res.addColumn(std::move(col)); + } + else + { + res.addColumn(source_to_fully_copy->column); + } source_to_fully_copy->pos = source_to_fully_copy->size; source_to_fully_copy = nullptr; return Status(std::move(res)); diff --git a/src/Processors/Transforms/SquashingChunksTransform.cpp b/src/Processors/Transforms/SquashingChunksTransform.cpp index 0d69b6e0a8d..267490dc89e 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.cpp +++ b/src/Processors/Transforms/SquashingChunksTransform.cpp @@ -56,49 +56,34 @@ void SquashingChunksTransform::work() SimpleSquashingChunksTransform::SimpleSquashingChunksTransform( const Block & header, size_t min_block_size_rows, size_t min_block_size_bytes) - : ISimpleTransform(header, header, true), squashing(min_block_size_rows, min_block_size_bytes) + : IInflatingTransform(header, header), squashing(min_block_size_rows, min_block_size_bytes) { } -void SimpleSquashingChunksTransform::transform(Chunk & chunk) +void SimpleSquashingChunksTransform::consume(Chunk chunk) { - if (!finished) - { - if (auto block = squashing.add(getInputPort().getHeader().cloneWithColumns(chunk.detachColumns()))) - chunk.setColumns(block.getColumns(), block.rows()); - } - else - { - if (chunk.hasRows()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk expected to be empty, otherwise it will be lost"); - - auto block = squashing.add({}); - chunk.setColumns(block.getColumns(), block.rows()); - } + Block current_block = squashing.add(getInputPort().getHeader().cloneWithColumns(chunk.detachColumns())); + squashed_chunk.setColumns(current_block.getColumns(), current_block.rows()); } -IProcessor::Status SimpleSquashingChunksTransform::prepare() +Chunk SimpleSquashingChunksTransform::generate() { - if (!finished && input.isFinished()) - { - if (output.isFinished()) - return Status::Finished; + if (squashed_chunk.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't generate chunk in SimpleSquashingChunksTransform"); - if (!output.canPush()) - return Status::PortFull; + return std::move(squashed_chunk); +} - if (has_output) - { - output.pushData(std::move(output_data)); - has_output = false; - return Status::PortFull; - } +bool SimpleSquashingChunksTransform::canGenerate() +{ + return !squashed_chunk.empty(); +} - finished = true; - /// On the next call to transform() we will return all data buffered in `squashing` (if any) - return Status::Ready; - } - return ISimpleTransform::prepare(); +Chunk SimpleSquashingChunksTransform::getRemaining() +{ + Block current_block = squashing.add({}); + squashed_chunk.setColumns(current_block.getColumns(), current_block.rows()); + return std::move(squashed_chunk); } } diff --git 
a/src/Processors/Transforms/SquashingChunksTransform.h b/src/Processors/Transforms/SquashingChunksTransform.h index f82e9e46a61..8c30a6032e4 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.h +++ b/src/Processors/Transforms/SquashingChunksTransform.h @@ -2,6 +2,7 @@ #include #include +#include <Processors/IInflatingTransform.h> #include namespace DB @@ -29,7 +30,7 @@ private: }; /// Doesn't care about propagating exceptions and thus doesn't throw LOGICAL_ERROR if the following transform closes its input port. -class SimpleSquashingChunksTransform : public ISimpleTransform +class SimpleSquashingChunksTransform : public IInflatingTransform { public: explicit SimpleSquashingChunksTransform(const Block & header, size_t min_block_size_rows, size_t min_block_size_bytes); @@ -37,14 +38,14 @@ public: String getName() const override { return "SimpleSquashingTransform"; } protected: - void transform(Chunk &) override; - - IProcessor::Status prepare() override; + void consume(Chunk chunk) override; + bool canGenerate() override; + Chunk generate() override; + Chunk getRemaining() override; private: SquashingTransform squashing; - - /// When consumption is finished we need to release the final chunk regardless of its size. - bool finished = false; + Chunk squashed_chunk; }; + } diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 5e8ecdca95e..cdcfad4442c 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -414,7 +414,8 @@ std::optional<Chain> generateViewChain( out.getInputHeader(), view_id, nullptr, - std::move(runtime_stats)}); + std::move(runtime_stats), + insert_context}); if (type == QueryViewsLogElement::ViewType::MATERIALIZED) { @@ -590,7 +591,7 @@ Chain buildPushingToViewsChain( static QueryPipeline process(Block block, ViewRuntimeData & view, const ViewsData & views_data) { - const auto & context = views_data.context; + const auto & context = view.context; /// We create a table with the same name as original table and the same alias columns, /// but it will contain single block (that is INSERT-ed into main table). diff --git a/src/Processors/Transforms/buildPushingToViewsChain.h b/src/Processors/Transforms/buildPushingToViewsChain.h index 53aceeda1cc..a1feed91b60 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.h +++ b/src/Processors/Transforms/buildPushingToViewsChain.h @@ -33,6 +33,9 @@ struct ViewRuntimeData /// Info which is needed for query views log. std::unique_ptr runtime_stats; + /// An overridden context bound to this view with the correct SQL security grants. + ContextPtr context; + void setException(std::exception_ptr e) { exception = e; diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index ce80d0c22c6..d1db4cb3951 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -707,11 +707,11 @@ void HTTPHandler::processQuery( /// The data can also be compressed using incompatible internal algorithm. This is indicated by /// 'decompress' query parameter.
std::unique_ptr<ReadBuffer> in_post_maybe_compressed; - bool in_post_compressed = false; + bool is_in_post_compressed = false; if (params.getParsed("decompress", false)) { - in_post_maybe_compressed = std::make_unique<CompressedReadBuffer>(*in_post); - in_post_compressed = true; + in_post_maybe_compressed = std::make_unique<CompressedReadBuffer>(*in_post, /* allow_different_codecs_ = */ false, /* external_data_ = */ true); + is_in_post_compressed = true; } else in_post_maybe_compressed = std::move(in_post); @@ -845,7 +845,7 @@ void HTTPHandler::processQuery( /// If 'http_native_compression_disable_checksumming_on_decompress' setting is turned on, /// checksums of client data compressed with internal algorithm are not checked. - if (in_post_compressed && settings.http_native_compression_disable_checksumming_on_decompress) + if (is_in_post_compressed && settings.http_native_compression_disable_checksumming_on_decompress) static_cast<CompressedReadBuffer &>(*in_post_maybe_compressed).disableChecksumming(); /// Add CORS header if 'add_http_cors_header' setting is turned on send * in Access-Control-Allow-Origin diff --git a/src/Server/HTTPHandlerFactory.h b/src/Server/HTTPHandlerFactory.h index ac18c36e6c9..b4c32366463 100644 --- a/src/Server/HTTPHandlerFactory.h +++ b/src/Server/HTTPHandlerFactory.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Server/HTTPHandlerRequestFilter.h b/src/Server/HTTPHandlerRequestFilter.h index 15e64cf7f48..de1920bd535 100644 --- a/src/Server/HTTPHandlerRequestFilter.h +++ b/src/Server/HTTPHandlerRequestFilter.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index ef6c5f7362c..4879d1a16dc 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1288,7 +1288,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const /// Looks like there is something around default expression for this column (method `getDefault` is not implemented for the data type Object). /// But after ALTER TABLE ADD COLUMN we need to fill existing rows with something (exactly the default value). /// So we don't allow to do it for now. - if (command.data_type->hasDynamicSubcolumns()) + if (command.data_type->hasDynamicSubcolumnsDeprecated()) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Adding a new column of a type which has dynamic subcolumns to an existing table is not allowed.
It has known bugs"); if (virtuals->tryGet(column_name, VirtualsKind::Persistent)) @@ -1366,8 +1366,8 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const const GetColumnsOptions options(GetColumnsOptions::All); const auto old_data_type = all_columns.getColumn(options, column_name).type; - bool new_type_has_object = command.data_type->hasDynamicSubcolumns(); - bool old_type_has_object = old_data_type->hasDynamicSubcolumns(); + bool new_type_has_object = command.data_type->hasDynamicSubcolumnsDeprecated(); + bool old_type_has_object = old_data_type->hasDynamicSubcolumnsDeprecated(); if (new_type_has_object || old_type_has_object) throw Exception( diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 16b89f24243..4cf66649ad1 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -547,7 +547,19 @@ bool ColumnsDescription::hasNested(const String & column_name) const bool ColumnsDescription::hasSubcolumn(const String & column_name) const { - return subcolumns.get<0>().count(column_name); + if (subcolumns.get<0>().count(column_name)) + return true; + + /// Check for dynamic subcolumns + auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); + auto it = columns.get<1>().find(ordinary_column_name); + if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) + { + if (auto dynamic_subcolumn_type = it->type->tryGetSubcolumnType(dynamic_subcolumn_name)) + return true; + } + + return false; } const ColumnDescription & ColumnsDescription::get(const String & column_name) const @@ -644,6 +656,15 @@ std::optional ColumnsDescription::tryGetColumn(const GetColumns return *jt; } + /// Check for dynamic subcolumns. + auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); + it = columns.get<1>().find(ordinary_column_name); + if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) + { + if (auto dynamic_subcolumn_type = it->type->tryGetSubcolumnType(dynamic_subcolumn_name)) + return NameAndTypePair(ordinary_column_name, dynamic_subcolumn_name, it->type, dynamic_subcolumn_type); + } + return {}; } @@ -730,9 +751,19 @@ bool ColumnsDescription::hasAlias(const String & column_name) const bool ColumnsDescription::hasColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const { auto it = columns.get<1>().find(column_name); - return (it != columns.get<1>().end() - && (defaultKindToGetKind(it->default_desc.kind) & kind)) - || hasSubcolumn(column_name); + if ((it != columns.get<1>().end() && (defaultKindToGetKind(it->default_desc.kind) & kind)) || hasSubcolumn(column_name)) + return true; + + /// Check for dynamic subcolumns. 
+ auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); + it = columns.get<1>().find(ordinary_column_name); + if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) + { + if (it->type->hasSubcolumn(dynamic_subcolumn_name)) + return true; + } + + return false; } bool ColumnsDescription::hasColumnOrNested(GetColumnsOptions::Kind kind, const String & column_name) const diff --git a/src/Storages/CompressionCodecSelector.h b/src/Storages/CompressionCodecSelector.h index ad6e943e821..e03d06bacdb 100644 --- a/src/Storages/CompressionCodecSelector.h +++ b/src/Storages/CompressionCodecSelector.h @@ -1,7 +1,7 @@ #pragma once #include #include -#include +#include #include #include #include diff --git a/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.cpp b/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.cpp index 14866c25365..d471c67553d 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.cpp +++ b/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 0f3b03f0955..33bde34b4f9 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -994,7 +994,8 @@ private: void ReadFromHDFS::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index b8faa27d678..9a6e192d49b 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -79,6 +79,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + static ColumnsDescription getTableStructureFromData( const String & format, const String & uri, diff --git a/src/Storages/HDFS/StorageHDFSCluster.h b/src/Storages/HDFS/StorageHDFSCluster.h index 0b5c6242aa9..7a651805982 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.h +++ b/src/Storages/HDFS/StorageHDFSCluster.h @@ -36,6 +36,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } private: diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index b532abc9074..920155bf689 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -1,6 +1,6 @@ #include -#include +#include #include #include #include diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 37613704c6a..1369548c822 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -172,8 +172,10 @@ public: /// This method can return true for readonly engines that return the same rows for reading (such as SystemNumbers) virtual bool supportsTransactions() const { return false; } + /// Returns true if the storage supports storing of data type Object. + virtual bool supportsDynamicSubcolumnsDeprecated() const { return false; } + /// Returns true if the storage supports storing of dynamic subcolumns.
- /// For now it makes sense only for data type Object. virtual bool supportsDynamicSubcolumns() const { return false; } /// Requires squashing small blocks to large for optimal storage. diff --git a/src/Storages/IStorageCluster.cpp b/src/Storages/IStorageCluster.cpp index ab45ce877c2..9c5b29ae265 100644 --- a/src/Storages/IStorageCluster.cpp +++ b/src/Storages/IStorageCluster.cpp @@ -86,7 +86,8 @@ private: void ReadFromCluster::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 463ca07ec57..3e785fd2dd2 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include @@ -793,7 +793,8 @@ void IMergeTreeDataPart::addProjectionPart( projection_parts[projection_name] = std::move(projection_part); } -void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded) +void IMergeTreeDataPart::loadProjections( + bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded, bool only_metadata) { auto metadata_snapshot = storage.getInMemoryMetadataPtr(); for (const auto & projection : metadata_snapshot->projections) @@ -813,7 +814,10 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch try { - part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); + if (only_metadata) + part->loadChecksums(require_columns_checksums); + else + part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); } catch (...) { @@ -2430,6 +2434,38 @@ void IMergeTreeDataPart::setBrokenReason(const String & message, int code) const exception_code = code; } +ColumnPtr IMergeTreeDataPart::getColumnSample(const NameAndTypePair & column) const +{ + const size_t total_mark = getMarksCount(); + /// If column doesn't have dynamic subcolumns or part has no data, just create column using its type. + if (!column.type->hasDynamicSubcolumns() || !total_mark) + return column.type->createColumn(); + + /// Otherwise, read sample column with 0 rows from the part, so it will load dynamic structure.
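+ /// (requesting 0 rows still deserializes the stream prefixes, which is where the dynamic structure is stored)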
+ NamesAndTypesList cols; + cols.emplace_back(column); + + StorageMetadataPtr metadata_ptr = storage.getInMemoryMetadataPtr(); + StorageSnapshotPtr storage_snapshot_ptr = std::make_shared<StorageSnapshot>(storage, metadata_ptr); + + MergeTreeReaderPtr reader = getReader( + cols, + storage_snapshot_ptr, + MarkRanges{MarkRange(0, 1)}, + /*virtual_fields=*/ {}, + /*uncompressed_cache=*/{}, + storage.getContext()->getMarkCache().get(), + std::make_shared<AlterConversions>(), + MergeTreeReaderSettings{}, + ValueSizeMap{}, + ReadBufferFromFileBase::ProfileCallback{}); + + Columns result; + result.resize(1); + reader->readRows(0, 1, false, 0, result); + return result[0]; +} + bool isCompactPart(const MergeTreeDataPartPtr & data_part) { return (data_part && data_part->getType() == MergeTreeDataPartType::Compact); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index f38a80455c4..6d365353c4d 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -166,6 +166,10 @@ public: NameAndTypePair getColumn(const String & name) const; std::optional<NameAndTypePair> tryGetColumn(const String & column_name) const; + /// Get sample column from part. For ordinary columns it just creates column using its type. + /// For columns with dynamic structure it reads sample column with 0 rows from the part. + ColumnPtr getColumnSample(const NameAndTypePair & column) const; + const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; } SerializationPtr getSerialization(const String & column_name) const; @@ -447,7 +451,15 @@ public: bool hasBrokenProjection(const String & projection_name) const; /// Return true, if all projections were loaded successfully and none was marked as broken. - void loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded = false); + void loadProjections( + bool require_columns_checksums, + bool check_consistency, + bool & has_broken_projection, + bool if_not_loaded = false, + bool only_metadata = false); + + /// If checksums.txt exists, reads file's checksums (and sizes) from it + void loadChecksums(bool require); void setBrokenReason(const String & message, int code) const; @@ -673,9 +685,6 @@ private: static void appendFilesOfColumns(Strings & files); - /// If checksums.txt exists, reads file's checksums (and sizes) from it - void loadChecksums(bool require); - static void appendFilesOfChecksums(Strings & files); /// Loads marks index granularity into memory diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 1f7e0a19b3a..4ea4caa302d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -188,6 +189,7 @@ namespace ErrorCodes { extern const int CANNOT_SCHEDULE_TASK; extern const int LIMIT_EXCEEDED; extern const int CANNOT_FORGET_PARTITION; + extern const int DATA_TYPE_CANNOT_BE_USED_IN_KEY; } static void checkSuspiciousIndices(const ASTFunction * index_function) @@ -3873,7 +3875,7 @@ void MergeTreeData::checkPartDynamicColumns(MutableDataPartPtr & part, DataParts continue; auto storage_column = columns.getPhysical(part_column.name); - if (!storage_column.type->hasDynamicSubcolumns()) + if (!storage_column.type->hasDynamicSubcolumnsDeprecated()) continue; auto
concrete_storage_column = object_columns.getPhysical(part_column.name); @@ -8553,6 +8555,16 @@ void MergeTreeData::unloadPrimaryKeys() } } +void MergeTreeData::verifySortingKey(const KeyDescription & sorting_key) +{ + /// Aggregate functions are already forbidden, but SimpleAggregateFunction is not + for (const auto & data_type : sorting_key.data_types) + { + if (dynamic_cast<const DataTypeCustomSimpleAggregateFunction *>(data_type->getCustomName())) + throw Exception(ErrorCodes::DATA_TYPE_CANNOT_BE_USED_IN_KEY, "Column with type {} is not allowed in key expression", data_type->getCustomName()->getName()); + } +} + bool updateAlterConversionsMutations(const MutationCommands & commands, std::atomic & alter_conversions_mutations, bool remove) { for (const auto & command : commands) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index ff93c7c5ae4..fb8f2ec29aa 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -434,6 +434,7 @@ public: bool supportsTTL() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } bool supportsLightweightDelete() const override; @@ -738,6 +739,8 @@ public: const ASTPtr & new_settings, AlterLockHolder & table_lock_holder); + static void verifySortingKey(const KeyDescription & sorting_key); + /// Should be called if part data is suspected to be corrupted. /// Has the ability to check all other parts /// which reside on the same disk of the suspicious part. diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 1605e5cdb9a..e34822ce6df 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -49,16 +49,33 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( for (const auto & column : columns_list) { auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); - addStreams(column, compression); + addStreams(column, nullptr, compression); } } -void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc) +void MergeTreeDataPartWriterCompact::initDynamicStreamsIfNeeded(const Block & block) +{ + if (is_dynamic_streams_initialized) + return; + + is_dynamic_streams_initialized = true; + auto storage_snapshot = std::make_shared<StorageSnapshot>(data_part->storage, metadata_snapshot); + for (const auto & column : columns_list) + { + if (column.type->hasDynamicSubcolumns()) + { + auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); + addStreams(column, block.getByName(column.name).column, compression); + } + } +} + +void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & name_and_type, const ColumnPtr & column, const ASTPtr & effective_codec_desc) { ISerialization::StreamCallback callback = [&](const auto & substream_path) { assert(!substream_path.empty()); - String stream_name = ISerialization::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); /// Shared offsets for Nested type.
if (compressed_streams.contains(stream_name)) @@ -81,7 +98,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, compressed_streams.emplace(stream_name, stream); }; - data_part->getSerialization(column.name)->enumerateStreams(callback, column.type); + data_part->getSerialization(name_and_type.name)->enumerateStreams(callback, name_and_type.type, column); } namespace @@ -138,6 +155,7 @@ void writeColumnSingleGranule( serialize_settings.getter = stream_getter; serialize_settings.position_independent_encoding = true; serialize_settings.low_cardinality_max_dictionary_size = 0; + serialize_settings.dynamic_write_statistics = ISerialization::SerializeBinaryBulkSettings::DynamicStatisticsMode::PREFIX; serialization->serializeBinaryBulkStatePrefix(*column.column, serialize_settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*column.column, from_row, number_of_rows, serialize_settings, state); @@ -148,6 +166,9 @@ void writeColumnSingleGranule( void MergeTreeDataPartWriterCompact::write(const Block & block, const IColumn::Permutation * permutation) { + /// On first block of data initialize streams for dynamic subcolumns. + initDynamicStreamsIfNeeded(block); + /// Fill index granularity for this block /// if it's unknown (in case of insert data or horizontal merge, /// but not in case of vertical merge) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index ddb6178dce6..f35479387f6 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -42,7 +42,9 @@ private: void addToChecksums(MergeTreeDataPartChecksums & checksums); - void addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc); + void addStreams(const NameAndTypePair & name_and_type, const ColumnPtr & column, const ASTPtr & effective_codec_desc); + + void initDynamicStreamsIfNeeded(const Block & block); Block header; @@ -96,6 +98,8 @@ private: /// then finally to 'marks_file'. 
std::unique_ptr marks_compressor; std::unique_ptr marks_source_hashing; + + bool is_dynamic_streams_initialized = false; }; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 6a3b08d4d65..fb7ee9f7fe8 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -93,12 +93,31 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( for (const auto & column : columns_list) { auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); - addStreams(column, compression); + addStreams(column, nullptr, compression); + } +} + +void MergeTreeDataPartWriterWide::initDynamicStreamsIfNeeded(const DB::Block & block) +{ + if (is_dynamic_streams_initialized) + return; + + is_dynamic_streams_initialized = true; + block_sample = block.cloneEmpty(); + auto storage_snapshot = std::make_shared<StorageSnapshot>(data_part->storage, metadata_snapshot); + for (const auto & column : columns_list) + { + if (column.type->hasDynamicSubcolumns()) + { + auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); + addStreams(column, block_sample.getByName(column.name).column, compression); + } } } void MergeTreeDataPartWriterWide::addStreams( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, + const ColumnPtr & column, const ASTPtr & effective_codec_desc) { ISerialization::StreamCallback callback = [&](const auto & substream_path) @@ -106,7 +125,7 @@ void MergeTreeDataPartWriterWide::addStreams( assert(!substream_path.empty()); auto storage_settings = storage.getSettings(); - auto full_stream_name = ISerialization::getFileNameForStream(column, substream_path); + auto full_stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); String stream_name; if (storage_settings->replace_long_file_name_to_hash && full_stream_name.size() > storage_settings->max_file_name_length) @@ -114,6 +133,10 @@ void MergeTreeDataPartWriterWide::addStreams( else stream_name = full_stream_name; + /// Shared offsets for Nested type. + if (column_streams.contains(stream_name)) + return; + auto it = stream_name_to_full_name.find(stream_name); if (it != stream_name_to_full_name.end() && it->second != full_stream_name) throw Exception(ErrorCodes::INCORRECT_FILE_NAME, @@ -121,10 +144,6 @@ void MergeTreeDataPartWriterWide::addStreams( " It is a collision between a filename for one column and a hash of filename for another column or a bug", stream_name, it->second, full_stream_name); - /// Shared offsets for Nested type.
- if (column_streams.contains(stream_name)) - return; - const auto & subtype = substream_path.back().data.type; CompressionCodecPtr compression_codec; @@ -138,7 +157,7 @@ void MergeTreeDataPartWriterWide::addStreams( auto ast = parseQuery(codec_parser, "(" + Poco::toUpper(settings.marks_compression_codec) + ")", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS); CompressionCodecPtr marks_compression_codec = CompressionCodecFactory::instance().get(ast, nullptr); - const auto column_desc = metadata_snapshot->columns.tryGetColumnDescription(GetColumnsOptions(GetColumnsOptions::AllPhysical), column.getNameInStorage()); + const auto column_desc = metadata_snapshot->columns.tryGetColumnDescription(GetColumnsOptions(GetColumnsOptions::AllPhysical), name_and_type.getNameInStorage()); UInt64 max_compress_block_size = 0; if (column_desc) @@ -163,7 +182,7 @@ void MergeTreeDataPartWriterWide::addStreams( }; ISerialization::SubstreamPath path; - data_part->getSerialization(column.name)->enumerateStreams(callback, column.type); + data_part->getSerialization(name_and_type.name)->enumerateStreams(callback, name_and_type.type, column); } const String & MergeTreeDataPartWriterWide::getStreamName( @@ -222,6 +241,9 @@ void MergeTreeDataPartWriterWide::shiftCurrentMark(const Granules & granules_wri void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Permutation * permutation) { + /// On first block of data initialize streams for dynamic subcolumns. + initDynamicStreamsIfNeeded(block); + /// Fill index granularity for this block /// if it's unknown (in case of insert data or horizontal merge, /// but not in case of vertical part of vertical merge) @@ -302,11 +324,12 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm } void MergeTreeDataPartWriterWide::writeSingleMark( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, WrittenOffsetColumns & offset_columns, size_t number_of_rows) { - StreamsWithMarks marks = getCurrentMarksForColumn(column, offset_columns); + auto * sample_column = block_sample.findByName(name_and_type.name); + StreamsWithMarks marks = getCurrentMarksForColumn(name_and_type, sample_column ? 
sample_column->column : nullptr, offset_columns); for (const auto & mark : marks) flushMarkToFile(mark, number_of_rows); } @@ -323,21 +346,22 @@ void MergeTreeDataPartWriterWide::flushMarkToFile(const StreamNameAndMark & stre } StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, + const ColumnPtr & column_sample, WrittenOffsetColumns & offset_columns) { StreamsWithMarks result; - const auto column_desc = metadata_snapshot->columns.tryGetColumnDescription(GetColumnsOptions(GetColumnsOptions::AllPhysical), column.getNameInStorage()); + const auto column_desc = metadata_snapshot->columns.tryGetColumnDescription(GetColumnsOptions(GetColumnsOptions::AllPhysical), name_and_type.getNameInStorage()); UInt64 min_compress_block_size = 0; if (column_desc) if (const auto * value = column_desc->settings.tryGet("min_compress_block_size")) min_compress_block_size = value->safeGet(); if (!min_compress_block_size) min_compress_block_size = settings.min_compress_block_size; - data_part->getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + data_part->getSerialization(name_and_type.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; - auto stream_name = getStreamName(column, substream_path); + auto stream_name = getStreamName(name_and_type, substream_path); /// Don't write offsets more than one time for Nested type. if (is_offsets && offset_columns.contains(stream_name)) @@ -355,7 +379,7 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( stream_with_mark.mark.offset_in_decompressed_block = stream.compressed_hashing.offset(); result.push_back(stream_with_mark); - }); + }, name_and_type.type, column_sample); return result; } @@ -382,7 +406,7 @@ void MergeTreeDataPartWriterWide::writeSingleGranule( return; column_streams.at(stream_name)->compressed_hashing.nextIfAtEnd(); - }); + }, name_and_type.type, column.getPtr()); } /// Column must not be empty. (column.size() !== 0) @@ -424,7 +448,7 @@ void MergeTreeDataPartWriterWide::writeColumn( "We have to add new mark for column, but already have non written mark. 
" "Current mark {}, total marks {}, offset {}", getCurrentMark(), index_granularity.getMarksCount(), rows_written_in_last_mark); - last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, offset_columns); + last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, column.getPtr(), offset_columns); } writeSingleGranule( @@ -453,7 +477,7 @@ void MergeTreeDataPartWriterWide::writeColumn( bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) offset_columns.insert(getStreamName(name_and_type, substream_path)); - }); + }, name_and_type.type, column.getPtr()); } @@ -591,7 +615,6 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai " index granularity size {}, last rows {}", column->size(), mark_num, index_granularity.getMarksCount(), index_granularity_rows); } - } void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove) @@ -622,6 +645,7 @@ void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksum if (!serialization_states.empty()) { serialize_settings.getter = createStreamGetter(*it, written_offset_columns ? *written_offset_columns : offset_columns); + serialize_settings.dynamic_write_statistics = ISerialization::SerializeBinaryBulkSettings::DynamicStatisticsMode::SUFFIX; data_part->getSerialization(it->name)->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]); } @@ -703,17 +727,17 @@ void MergeTreeDataPartWriterWide::finish(bool sync) } void MergeTreeDataPartWriterWide::writeFinalMark( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, WrittenOffsetColumns & offset_columns) { - writeSingleMark(column, offset_columns, 0); + writeSingleMark(name_and_type, offset_columns, 0); /// Memoize information about offsets - data_part->getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + data_part->getSerialization(name_and_type.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) - offset_columns.insert(getStreamName(column, substream_path)); - }); + offset_columns.insert(getStreamName(name_and_type, substream_path)); + }, name_and_type.type, block_sample.getByName(name_and_type.name).column); } static void fillIndexGranularityImpl( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index f5ff323563d..8343144f2e1 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -63,7 +63,8 @@ private: /// Take offsets from column and return as MarkInCompressed file with stream name StreamsWithMarks getCurrentMarksForColumn( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, + const ColumnPtr & column_sample, WrittenOffsetColumns & offset_columns); /// Write mark to disk using stream and rows count @@ -73,18 +74,21 @@ private: /// Write mark for column taking offsets from column stream void writeSingleMark( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, WrittenOffsetColumns & offset_columns, size_t number_of_rows); void writeFinalMark( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, WrittenOffsetColumns & 
offset_columns); void addStreams( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, + const ColumnPtr & column, const ASTPtr & effective_codec_desc); + void initDynamicStreamsIfNeeded(const Block & block); + /// Method for self check (used in debug-build only). Checks that written /// data and corresponding marks are consistent. Otherwise throws logical /// errors. @@ -129,6 +133,10 @@ private: /// How many rows we have already written in the current mark. /// More than zero when incoming blocks are smaller then their granularity. size_t rows_written_in_last_mark = 0; + + Block block_sample; + + bool is_dynamic_streams_initialized = false; }; } diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index daa163d741c..e5821075c3f 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -422,7 +422,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( auto columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); for (auto & column : columns) - if (column.type->hasDynamicSubcolumns()) + if (column.type->hasDynamicSubcolumnsDeprecated()) column.type = block.getByName(column.name).type; auto minmax_idx = std::make_shared(); diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index 3e5cbb34556..86319796435 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -113,7 +113,7 @@ void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr, MergeTreeInd ISerialization::DeserializeBinaryBulkStatePtr state; auto serialization = elem.type->getDefaultSerialization(); - serialization->deserializeBinaryBulkStatePrefix(settings, state); + serialization->deserializeBinaryBulkStatePrefix(settings, state, nullptr); serialization->deserializeBinaryBulkWithMultipleStreams(elem.column, rows_to_read, settings, state, nullptr); } } diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 34fb214a1ce..a2b8f0ad96f 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -196,7 +196,7 @@ void MergeTreeReaderCompact::readPrefix( deserialize_settings.getter = buffer_getter_for_prefix; ISerialization::DeserializeBinaryBulkStatePtr state_for_prefix; - serialization_for_prefix->deserializeBinaryBulkStatePrefix(deserialize_settings, state_for_prefix); + serialization_for_prefix->deserializeBinaryBulkStatePrefix(deserialize_settings, state_for_prefix, nullptr); } SerializationPtr serialization; @@ -206,7 +206,8 @@ void MergeTreeReaderCompact::readPrefix( serialization = getSerializationInPart(name_and_type); deserialize_settings.getter = buffer_getter; - serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name_and_type.name]); + deserialize_settings.dynamic_read_statistics = true; + serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name_and_type.name], nullptr); } catch (Exception & e) { diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 59feb4dda19..b6882fdced9 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -43,6 +43,8 @@ 
MergeTreeReaderWide::MergeTreeReaderWide( mark_ranges_, settings_, avg_value_size_hints_) + , profile_callback(profile_callback_) + , clock_type(clock_type_) , read_without_marks( settings.can_read_part_without_marks && all_mark_ranges.isOneRangeForWholePart(data_part_info_for_read->getMarksCount())) @@ -50,7 +52,7 @@ MergeTreeReaderWide::MergeTreeReaderWide( try { for (size_t i = 0; i < columns_to_read.size(); ++i) - addStreams(columns_to_read[i], serializations[i], profile_callback_, clock_type_); + addStreams(columns_to_read[i], serializations[i]); } catch (...) { @@ -103,9 +105,10 @@ void MergeTreeReaderWide::prefetchForAllColumns( try { auto & cache = caches[columns_to_read[pos].getNameInStorage()]; + auto & deserialize_states_cache = deserialize_states_caches[columns_to_read[pos].getNameInStorage()]; prefetchForColumn( priority, columns_to_read[pos], serializations[pos], from_mark, continue_reading, - current_task_last_mark, cache); + current_task_last_mark, cache, deserialize_states_cache); } catch (Exception & e) { @@ -150,11 +153,12 @@ size_t MergeTreeReaderWide::readRows( { size_t column_size_before_reading = column->size(); auto & cache = caches[column_to_read.getNameInStorage()]; + auto & deserialize_states_cache = deserialize_states_caches[column_to_read.getNameInStorage()]; readData( column_to_read, serializations[pos], column, from_mark, continue_reading, current_task_last_mark, - max_rows_to_read, cache, /* was_prefetched =*/ !prefetched_streams.empty()); + max_rows_to_read, cache, deserialize_states_cache, /* was_prefetched =*/ !prefetched_streams.empty()); /// For elements of Nested, column_size_before_reading may be greater than column size /// if offsets are not empty and were already read, but elements are empty. @@ -202,9 +206,7 @@ size_t MergeTreeReaderWide::readRows( void MergeTreeReaderWide::addStreams( const NameAndTypePair & name_and_type, - const SerializationPtr & serialization, - const ReadBufferFromFileBase::ProfileCallback & profile_callback, - clockid_t clock_type) + const SerializationPtr & serialization) { bool has_any_stream = false; bool has_all_streams = true; @@ -228,43 +230,8 @@ void MergeTreeReaderWide::addStreams( return; } - auto context = data_part_info_for_read->getContext(); - auto * load_marks_threadpool = settings.read_settings.load_marks_asynchronously ? 
&context->getLoadMarksThreadpool() : nullptr; - size_t num_marks_in_part = data_part_info_for_read->getMarksCount(); - - auto marks_loader = std::make_shared( - data_part_info_for_read, - mark_cache, - data_part_info_for_read->getIndexGranularityInfo().getMarksFilePath(*stream_name), - num_marks_in_part, - data_part_info_for_read->getIndexGranularityInfo(), - settings.save_marks_in_cache, - settings.read_settings, - load_marks_threadpool, - /*num_columns_in_mark=*/ 1); - + addStream(substream_path, *stream_name); has_any_stream = true; - auto stream_settings = settings; - stream_settings.is_low_cardinality_dictionary = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; - - auto create_stream = [&]() - { - return std::make_unique( - data_part_info_for_read->getDataPartStorage(), *stream_name, DATA_FILE_EXTENSION, - num_marks_in_part, all_mark_ranges, stream_settings, - uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(*stream_name + DATA_FILE_EXTENSION), - std::move(marks_loader), profile_callback, clock_type); - }; - - if (read_without_marks) - { - streams.emplace(*stream_name, create_stream.operator()()); - } - else - { - marks_loader->startAsyncLoad(); - streams.emplace(*stream_name, create_stream.operator()()); - } }; serialization->enumerateStreams(callback); @@ -273,11 +240,46 @@ void MergeTreeReaderWide::addStreams( partially_read_columns.insert(name_and_type.name); } -static ReadBuffer * getStream( +MergeTreeReaderWide::FileStreams::iterator MergeTreeReaderWide::addStream(const ISerialization::SubstreamPath & substream_path, const String & stream_name) +{ + auto context = data_part_info_for_read->getContext(); + auto * load_marks_threadpool = settings.read_settings.load_marks_asynchronously ? 
&context->getLoadMarksThreadpool() : nullptr; + size_t num_marks_in_part = data_part_info_for_read->getMarksCount(); + + auto marks_loader = std::make_shared( + data_part_info_for_read, + mark_cache, + data_part_info_for_read->getIndexGranularityInfo().getMarksFilePath(stream_name), + num_marks_in_part, + data_part_info_for_read->getIndexGranularityInfo(), + settings.save_marks_in_cache, + settings.read_settings, + load_marks_threadpool, + /*num_columns_in_mark=*/ 1); + + auto stream_settings = settings; + stream_settings.is_low_cardinality_dictionary = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; + + auto create_stream = [&]() + { + return std::make_unique( + data_part_info_for_read->getDataPartStorage(), stream_name, DATA_FILE_EXTENSION, + num_marks_in_part, all_mark_ranges, stream_settings, + uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION), + std::move(marks_loader), profile_callback, clock_type); + }; + + if (read_without_marks) + return streams.emplace(stream_name, create_stream.operator()()).first; + + marks_loader->startAsyncLoad(); + return streams.emplace(stream_name, create_stream.operator()()).first; +} + +ReadBuffer * MergeTreeReaderWide::getStream( bool seek_to_start, const ISerialization::SubstreamPath & substream_path, const MergeTreeDataPartChecksums & checksums, - MergeTreeReaderWide::FileStreams & streams, const NameAndTypePair & name_and_type, size_t from_mark, bool seek_to_mark, @@ -294,7 +296,13 @@ static ReadBuffer * getStream( auto it = streams.find(*stream_name); if (it == streams.end()) - return nullptr; + { + /// If the requested stream was not created earlier, but a file with this path exists, create a stream for it. + /// This may happen while reading columns with dynamic subcolumns, because the complete set of streams is known + /// only after the binary bulk prefix has been deserialized. 
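+ /// (Editorial sketch, not part of the original patch: for a column whose substreams are + /// data-dependent, e.g. a hypothetical `d Dynamic`, the reader learns the full substream + /// set only from the prefix, so the matching .bin streams must be openable on demand here.)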
+ + it = addStream(substream_path, *stream_name); + } MergeTreeReaderStream & stream = *it->second; stream.adjustRightMark(current_task_last_mark); @@ -311,17 +319,19 @@ void MergeTreeReaderWide::deserializePrefix( const SerializationPtr & serialization, const NameAndTypePair & name_and_type, size_t current_task_last_mark, - ISerialization::SubstreamsCache & cache) + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache) { const auto & name = name_and_type.name; if (!deserialize_binary_bulk_state_map.contains(name)) { ISerialization::DeserializeBinaryBulkSettings deserialize_settings; + deserialize_settings.dynamic_read_statistics = true; deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path) { - return getStream(/* seek_to_start = */true, substream_path, data_part_info_for_read->getChecksums(), streams, name_and_type, 0, /* seek_to_mark = */false, current_task_last_mark, cache); + return getStream(/* seek_to_start = */true, substream_path, data_part_info_for_read->getChecksums(), name_and_type, 0, /* seek_to_mark = */false, current_task_last_mark, cache); }; - serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name]); + serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name], &deserialize_states_cache); } } @@ -332,38 +342,48 @@ void MergeTreeReaderWide::prefetchForColumn( size_t from_mark, bool continue_reading, size_t current_task_last_mark, - ISerialization::SubstreamsCache & cache) + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache) { - deserializePrefix(serialization, name_and_type, current_task_last_mark, cache); - - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + deserializePrefix(serialization, name_and_type, current_task_last_mark, cache, deserialize_states_cache); + auto callback = [&](const ISerialization::SubstreamPath & substream_path) { auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(name_and_type, substream_path, data_part_info_for_read->getChecksums()); if (stream_name && !prefetched_streams.contains(*stream_name)) { bool seek_to_mark = !continue_reading && !read_without_marks; - - if (ReadBuffer * buf = getStream(false, substream_path, data_part_info_for_read->getChecksums(), streams, name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache)) + if (ReadBuffer * buf = getStream(false, substream_path, data_part_info_for_read->getChecksums(), name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache)) { buf->prefetch(priority); prefetched_streams.insert(*stream_name); } } - }); + }; + + auto data = ISerialization::SubstreamData(serialization).withType(name_and_type.type).withDeserializeState(deserialize_binary_bulk_state_map[name_and_type.name]); + ISerialization::EnumerateStreamsSettings settings; + serialization->enumerateStreams(settings, callback, data); } void MergeTreeReaderWide::readData( - const NameAndTypePair & name_and_type, const SerializationPtr & serialization, ColumnPtr & column, - size_t from_mark, bool continue_reading, size_t current_task_last_mark, - size_t max_rows_to_read, ISerialization::SubstreamsCache & cache, bool was_prefetched) + const NameAndTypePair & name_and_type, + const SerializationPtr & serialization, + ColumnPtr & column, + size_t from_mark, + bool continue_reading, + size_t 
current_task_last_mark, + size_t max_rows_to_read, + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache, + bool was_prefetched) { double & avg_value_size_hint = avg_value_size_hints[name_and_type.name]; ISerialization::DeserializeBinaryBulkSettings deserialize_settings; deserialize_settings.avg_value_size_hint = avg_value_size_hint; - deserializePrefix(serialization, name_and_type, current_task_last_mark, cache); + deserializePrefix(serialization, name_and_type, current_task_last_mark, cache, deserialize_states_cache); deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path) { @@ -371,7 +391,7 @@ void MergeTreeReaderWide::readData( return getStream( /* seek_to_start = */false, substream_path, - data_part_info_for_read->getChecksums(), streams, + data_part_info_for_read->getChecksums(), name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache); }; diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.h b/src/Storages/MergeTree/MergeTreeReaderWide.h index 9f6bdd79b00..841c2dc567d 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.h +++ b/src/Storages/MergeTree/MergeTreeReaderWide.h @@ -45,14 +45,31 @@ private: void addStreams( const NameAndTypePair & name_and_type, - const SerializationPtr & serialization, - const ReadBufferFromFileBase::ProfileCallback & profile_callback, - clockid_t clock_type); + const SerializationPtr & serialization); + + ReadBuffer * getStream( + bool seek_to_start, + const ISerialization::SubstreamPath & substream_path, + const MergeTreeDataPartChecksums & checksums, + const NameAndTypePair & name_and_type, + size_t from_mark, + bool seek_to_mark, + size_t current_task_last_mark, + ISerialization::SubstreamsCache & cache); + + FileStreams::iterator addStream(const ISerialization::SubstreamPath & substream_path, const String & stream_name); void readData( - const NameAndTypePair & name_and_type, const SerializationPtr & serialization, ColumnPtr & column, - size_t from_mark, bool continue_reading, size_t current_task_last_mark, size_t max_rows_to_read, - ISerialization::SubstreamsCache & cache, bool was_prefetched); + const NameAndTypePair & name_and_type, + const SerializationPtr & serialization, + ColumnPtr & column, + size_t from_mark, + bool continue_reading, + size_t current_task_last_mark, + size_t max_rows_to_read, + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache, + bool was_prefetched); /// Make next readData more simple by calling 'prefetch' of all related ReadBuffers (column streams). 
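+ /// Editorial note (inferred from the MergeTreeReaderWide.cpp changes above, not authoritative): + /// prefetch now also needs the per-column SubstreamsDeserializeStatesCache, since deserializing + /// the prefix is what reveals which substreams exist and hence which buffers to prefetch.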
void prefetchForColumn( @@ -62,17 +79,22 @@ private: size_t from_mark, bool continue_reading, size_t current_task_last_mark, - ISerialization::SubstreamsCache & cache); + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache); void deserializePrefix( const SerializationPtr & serialization, const NameAndTypePair & name_and_type, size_t current_task_last_mark, - ISerialization::SubstreamsCache & cache); + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache); std::unordered_map caches; + std::unordered_map deserialize_states_caches; std::unordered_set prefetched_streams; ssize_t prefetched_from_mark = -1; + ReadBufferFromFileBase::ProfileCallback profile_callback; + clockid_t clock_type; bool read_without_marks = false; }; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 5934756fb95..377fb5a1912 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -60,6 +60,21 @@ static bool checkOperationIsNotCanceled(ActionBlocker & merges_blocker, MergeLis return true; } +static bool haveMutationsOfDynamicColumns(const MergeTreeData::DataPartPtr & data_part, const MutationCommands & commands) +{ + for (const auto & command : commands) + { + if (!command.column_name.empty()) + { + auto column = data_part->tryGetColumn(command.column_name); + if (column && column->type->hasDynamicSubcolumns()) + return true; + } + } + + return false; +} + static UInt64 getExistingRowsCount(const Block & block) { auto column = block.getByName(RowExistsColumn::name).column; @@ -95,7 +110,7 @@ static void splitAndModifyMutationCommands( auto part_columns = part->getColumnsDescription(); const auto & table_columns = metadata_snapshot->getColumns(); - if (!isWidePart(part) || !isFullPartStorage(part->getDataPartStorage())) + if (haveMutationsOfDynamicColumns(part, commands) || !isWidePart(part) || !isFullPartStorage(part->getDataPartStorage())) { NameSet mutated_columns; NameSet dropped_columns; @@ -2249,7 +2264,9 @@ bool MutateTask::prepare() /// All columns from part are changed and may be some more that were missing before in part /// TODO We can materialize compact part without copying data - if (!isWidePart(ctx->source_part) || !isFullPartStorage(ctx->source_part->getDataPartStorage()) + /// Also currently mutations of types with dynamic subcolumns in Wide part are possible only by + /// rewriting the whole part. 
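+ /// (Editorial inference from this patch: the substream files of such columns depend on the + /// stored values, so an in-place mutation could leave stale or missing stream files; + /// rewriting the whole part avoids that hazard.)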
+ if (MutationHelpers::haveMutationsOfDynamicColumns(ctx->source_part, ctx->commands_for_part) || !isWidePart(ctx->source_part) || !isFullPartStorage(ctx->source_part->getDataPartStorage()) || (ctx->interpreter && ctx->interpreter->isAffectingAllColumns())) { /// In case of replicated merge tree with zero copy replication diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartHeader.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartHeader.cpp index 24d907dbad6..9aadc3c3ca7 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartHeader.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartHeader.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index d6c36d12bf5..9a368bd44f5 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index ca8ed9abdb5..a94508ad41f 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -87,6 +87,7 @@ public: bool supportsPrewhere() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } bool supportsSubcolumns() const override { return true; } diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index 5d05ef8ebf2..525960d5314 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -219,7 +219,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( auto file_name = *stream_name + ".bin"; checksums_data.files[file_name] = checksum_compressed_file(data_part_storage, file_name); - }); + }, column.type, data_part->getColumnSample(column)); } } else diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 4244ccccfe0..d234103e52b 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -32,7 +31,6 @@ namespace ErrorCodes extern const int UNKNOWN_STORAGE; extern const int NO_REPLICA_NAME_GIVEN; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int DATA_TYPE_CANNOT_BE_USED_IN_KEY; } @@ -113,16 +111,6 @@ static ColumnsDescription getColumnsDescriptionFromZookeeper(const String & raw_ return ColumnsDescription::parse(zookeeper->get(fs::path(zookeeper_path) / "columns", &columns_stat)); } -static void verifySortingKey(const KeyDescription & sorting_key) -{ - /// Aggregate functions already forbidden, but SimpleAggregateFunction are not - for (const auto & data_type : sorting_key.data_types) - { - if (dynamic_cast(data_type->getCustomName())) - throw Exception(ErrorCodes::DATA_TYPE_CANNOT_BE_USED_IN_KEY, "Column with type {} is not allowed in key expression", data_type->getCustomName()->getName()); - } -} - /// Returns whether a new syntax is used to define a table engine, i.e. MergeTree() PRIMARY KEY ... PARTITION BY ... SETTINGS ... /// instead of MergeTree(MergeTree(date, [sample_key], primary_key). 
static bool isExtendedStorageDef(const ASTCreateQuery & query) @@ -678,8 +666,8 @@ static StoragePtr create(const StorageFactory::Arguments & args) /// column if sorting key will be changed. metadata.sorting_key = KeyDescription::getSortingKeyFromAST( args.storage_def->order_by->ptr(), metadata.columns, context, merging_param_key_arg); - if (!local_settings.allow_suspicious_primary_key) - verifySortingKey(metadata.sorting_key); + if (!local_settings.allow_suspicious_primary_key && args.mode <= LoadingStrictnessLevel::CREATE) + MergeTreeData::verifySortingKey(metadata.sorting_key); /// If primary key explicitly defined, than get it from AST if (args.storage_def->primary_key) @@ -792,8 +780,8 @@ static StoragePtr create(const StorageFactory::Arguments & args) /// column if sorting key will be changed. metadata.sorting_key = KeyDescription::getSortingKeyFromAST(engine_args[arg_num], metadata.columns, context, merging_param_key_arg); - if (!local_settings.allow_suspicious_primary_key) - verifySortingKey(metadata.sorting_key); + if (!local_settings.allow_suspicious_primary_key && args.mode <= LoadingStrictnessLevel::CREATE) + MergeTreeData::verifySortingKey(metadata.sorting_key); /// In old syntax primary_key always equals to sorting key. metadata.primary_key = KeyDescription::getKeyFromAST(engine_args[arg_num], metadata.columns, context); diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp index 7094578a9cc..0baa234e7a3 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp +++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp @@ -155,7 +155,7 @@ std::vector EmbeddedRocksDBBulkSink::squash(Chunk chunk) return {}; } -std::pair EmbeddedRocksDBBulkSink::serializeChunks(const std::vector & input_chunks) const +std::pair EmbeddedRocksDBBulkSink::serializeChunks(std::vector && input_chunks) const { auto serialized_key_column = ColumnString::create(); auto serialized_value_column = ColumnString::create(); @@ -168,7 +168,7 @@ std::pair EmbeddedRocksDBBulkSink::seriali WriteBufferFromVector writer_key(serialized_key_data); WriteBufferFromVector writer_value(serialized_value_data); - for (const auto & chunk : input_chunks) + for (auto && chunk : input_chunks) { const auto & columns = chunk.getColumns(); auto rows = chunk.getNumRows(); @@ -193,13 +193,14 @@ std::pair EmbeddedRocksDBBulkSink::seriali void EmbeddedRocksDBBulkSink::consume(Chunk chunk_) { - std::vector to_written = squash(std::move(chunk_)); + std::vector chunks_to_write = squash(std::move(chunk_)); - if (to_written.empty()) + if (chunks_to_write.empty()) return; - auto [serialized_key_column, serialized_value_column] = serializeChunks(to_written); + auto [serialized_key_column, serialized_value_column] = serializeChunks(std::move(chunks_to_write)); auto sst_file_path = getTemporarySSTFilePath(); + LOG_DEBUG(getLogger("EmbeddedRocksDBBulkSink"), "Writing {} rows to SST file {}", serialized_key_column->size(), sst_file_path); if (auto status = buildSSTFile(sst_file_path, *serialized_key_column, *serialized_value_column); !status.ok()) throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString()); @@ -209,6 +210,7 @@ void EmbeddedRocksDBBulkSink::consume(Chunk chunk_) if (auto status = storage.rocksdb_ptr->IngestExternalFile({sst_file_path}, ingest_options); !status.ok()) throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString()); + LOG_DEBUG(getLogger("EmbeddedRocksDBBulkSink"), "SST file {} has been ingested", 
sst_file_path); if (fs::exists(sst_file_path)) (void)fs::remove(sst_file_path); } @@ -237,4 +239,5 @@ bool EmbeddedRocksDBBulkSink::isEnoughSize(const Chunk & chunk) const { return chunk.getNumRows() >= min_block_size_rows; } + } diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h index 19ce1e3b83e..46193b152ca 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h +++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h @@ -49,7 +49,7 @@ private: bool isEnoughSize(const std::vector & input_chunks) const; bool isEnoughSize(const Chunk & chunk) const; /// Serialize chunks to rocksdb key-value pairs - std::pair serializeChunks(const std::vector & input_chunks) const; + std::pair serializeChunks(std::vector && input_chunks) const; StorageEmbeddedRocksDB & storage; StorageMetadataPtr metadata_snapshot; diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 01417b8977b..c3b7ae64c7e 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -189,6 +189,7 @@ StorageEmbeddedRocksDB::StorageEmbeddedRocksDB(const StorageID & table_id_, , rocksdb_dir(std::move(rocksdb_dir_)) , ttl(ttl_) , read_only(read_only_) + , log(getLogger(fmt::format("StorageEmbeddedRocksDB ({})", getStorageID().getNameForLogs()))) { setInMemoryMetadata(metadata_); setSettings(std::move(settings_)); @@ -316,6 +317,7 @@ void StorageEmbeddedRocksDB::mutate(const MutationCommands & commands, ContextPt void StorageEmbeddedRocksDB::drop() { + std::lock_guard lock(rocksdb_ptr_mx); rocksdb_ptr->Close(); rocksdb_ptr = nullptr; } @@ -463,18 +465,13 @@ void StorageEmbeddedRocksDB::initDB() { rocksdb::DB * db; if (read_only) - { status = rocksdb::DB::OpenForReadOnly(merged, rocksdb_dir, &db); - } else - { status = rocksdb::DB::Open(merged, rocksdb_dir, &db); - } + if (!status.ok()) - { - throw Exception(ErrorCodes::ROCKSDB_ERROR, "Failed to open rocksdb path at: {}: {}", - rocksdb_dir, status.ToString()); - } + throw Exception(ErrorCodes::ROCKSDB_ERROR, "Failed to open rocksdb path at: {}: {}", rocksdb_dir, status.ToString()); + rocksdb_ptr = std::unique_ptr(db); } } @@ -578,7 +575,8 @@ void ReadFromEmbeddedRocksDB::initializePipeline(QueryPipelineBuilder & pipeline void ReadFromEmbeddedRocksDB::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const auto & sample_block = getOutputStream().header; auto primary_key_data_type = sample_block.getByName(storage.primary_key).type; std::tie(keys, all_scan) = getFilterKeys(storage.primary_key, primary_key_data_type, filter_actions_dag, context); @@ -588,8 +586,12 @@ SinkToStoragePtr StorageEmbeddedRocksDB::write( const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context, bool /*async_insert*/) { if (getSettings().optimize_for_bulk_insert) + { + LOG_DEBUG(log, "Using bulk insert"); return std::make_shared(query_context, *this, metadata_snapshot); + } + LOG_DEBUG(log, "Using regular insert"); return std::make_shared(*this, metadata_snapshot); } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index 9fc58ea6b38..61592398954 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -124,5 +124,7 @@ private: bool read_only; 
void initDB(); + + LoggerPtr log; }; } diff --git a/src/Storages/RocksDB/StorageSystemRocksDB.cpp b/src/Storages/RocksDB/StorageSystemRocksDB.cpp index 4406a7c3fd4..5105b190fd9 100644 --- a/src/Storages/RocksDB/StorageSystemRocksDB.cpp +++ b/src/Storages/RocksDB/StorageSystemRocksDB.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index c3a772e532c..16e42e32b8a 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -287,7 +287,8 @@ void ReadFromS3Queue::createIterator(const ActionsDAG::Node * predicate) void ReadFromS3Queue::applyFilters(ActionDAGNodes added_filter_nodes) { - auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index 1f735b47819..fce6736aa07 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -81,6 +81,7 @@ private: void drop() override; bool supportsSubsetOfColumns(const ContextPtr & context_) const; bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } std::shared_ptr createFileIterator(ContextPtr local_context, const ActionsDAG::Node * predicate); std::shared_ptr createSource( diff --git a/src/Storages/SelectQueryInfo.cpp b/src/Storages/SelectQueryInfo.cpp index 665da7fee70..d59ccf0dfaf 100644 --- a/src/Storages/SelectQueryInfo.cpp +++ b/src/Storages/SelectQueryInfo.cpp @@ -13,4 +13,24 @@ bool SelectQueryInfo::isFinal() const return select.final(); } +std::unordered_map SelectQueryInfo::buildNodeNameToInputNodeColumn() const +{ + std::unordered_map node_name_to_input_node_column; + if (planner_context) + { + const auto & table_expression_data = planner_context->getTableExpressionDataOrThrow(table_expression); + const auto & alias_column_expressions = table_expression_data.getAliasColumnExpressions(); + for (const auto & [column_identifier, column_name] : table_expression_data.getColumnIdentifierToColumnName()) + { + /// ALIAS columns cannot be used in the filter expression without being calculated in ActionsDAG, + /// so they should not be added to the input nodes. + if (alias_column_expressions.contains(column_name)) + continue; + const auto & column = table_expression_data.getColumnOrThrow(column_name); + node_name_to_input_node_column.emplace(column_identifier, ColumnWithTypeAndName(column.type, column_name)); + } + } + return node_name_to_input_node_column; +} + } diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 655676812d9..11e2a2fc5e7 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -239,5 +239,11 @@ struct SelectQueryInfo bool merge_tree_enable_remove_parts_from_snapshot_optimization = true; bool isFinal() const; + + /// Analyzer generates unique ColumnIdentifiers like __table1.__partition_id in filter nodes, + /// while key analysis still requires unqualified column names. + /// This function generates a map that maps the unique names to table column names, + /// for the current table (`table_expression`). 
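+ /// Illustrative example (editorial): the identifier `__table1.__partition_id` mentioned + /// above would be mapped to an input node carrying the unqualified column name, which + /// key analysis can then match against the partition key expression.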
+ std::unordered_map buildNodeNameToInputNodeColumn() const; }; } diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index e1c6ec0097c..365f93cc324 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -254,6 +254,10 @@ AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(const ContextPt auto settings_ptr = std::make_unique(); settings_ptr->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; settings_ptr->max_single_read_retries = context_settings.azure_max_single_read_retries; + settings_ptr->strict_upload_part_size = context_settings.azure_strict_upload_part_size; + settings_ptr->max_upload_part_size = context_settings.azure_max_upload_part_size; + settings_ptr->max_blocks_in_multipart_upload = context_settings.azure_max_blocks_in_multipart_upload; + settings_ptr->min_upload_part_size = context_settings.azure_min_upload_part_size; settings_ptr->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); return settings_ptr; @@ -799,7 +803,8 @@ private: void ReadFromAzureBlob::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index b433cd92d68..04003d28035 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -98,6 +98,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsSubsetOfColumns(const ContextPtr & context) const; bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } diff --git a/src/Storages/StorageAzureBlobCluster.h b/src/Storages/StorageAzureBlobCluster.h index eff4d70f1bd..aca9630b8bf 100644 --- a/src/Storages/StorageAzureBlobCluster.h +++ b/src/Storages/StorageAzureBlobCluster.h @@ -35,6 +35,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } private: diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h index 6c15c7e0238..cd6dd7b933f 100644 --- a/src/Storages/StorageBuffer.h +++ b/src/Storages/StorageBuffer.h @@ -89,6 +89,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool /*async_insert*/) override; void startup() override; diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 7b5916c0273..fbb40f8b79f 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -700,7 +700,7 @@ static bool requiresObjectColumns(const ColumnsDescription & all_columns, ASTPtr auto name_in_storage = Nested::splitName(required_column).first; auto column_in_storage = all_columns.tryGetPhysical(name_in_storage); - if (column_in_storage && column_in_storage->type->hasDynamicSubcolumns()) + if (column_in_storage && 
column_in_storage->type->hasDynamicSubcolumnsDeprecated()) return true; } @@ -926,7 +926,8 @@ void StorageDistributed::read( sharding_key_expr, sharding_key_column_name, distributed_settings, - additional_shard_filter_generator); + additional_shard_filter_generator, + /* is_remote_function= */ static_cast(owned_cluster)); /// This is a bug, it is possible only when there is no shards to query, and this is handled earlier. if (!query_plan.isInitialized()) diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 3a7e63aef50..85a8de86953 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -85,6 +85,7 @@ public: bool supportsFinal() const override { return true; } bool supportsPrewhere() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } StoragePolicyPtr getStoragePolicy() const override; diff --git a/src/Storages/StorageDummy.h b/src/Storages/StorageDummy.h index ae9bf2483e1..572dc07b269 100644 --- a/src/Storages/StorageDummy.h +++ b/src/Storages/StorageDummy.h @@ -26,6 +26,7 @@ public: } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } bool canMoveConditionsToPrewhere() const override { diff --git a/src/Storages/StorageFactory.cpp b/src/Storages/StorageFactory.cpp index 307a0aa001a..9d12a1569d8 100644 --- a/src/Storages/StorageFactory.cpp +++ b/src/Storages/StorageFactory.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 7db8fc2500a..51bcc64bceb 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1534,7 +1534,8 @@ private: void ReadFromFile::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index f3c57ba88ed..37da59c3664 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -90,6 +90,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool prefersLargeBlocks() const override; bool parallelizeOutputAfterReading(ContextPtr context) const override; diff --git a/src/Storages/StorageFileCluster.h b/src/Storages/StorageFileCluster.h index 973d595bbf0..f5a4362901e 100644 --- a/src/Storages/StorageFileCluster.h +++ b/src/Storages/StorageFileCluster.h @@ -32,6 +32,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } private: diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index 1ac739f03fd..a5bae0acce5 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include 
#include #include @@ -628,7 +628,7 @@ void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns) const auto * available_type = it->getMapped(); - if (!available_type->hasDynamicSubcolumns() + if (!available_type->hasDynamicSubcolumnsDeprecated() && !column.type->equals(*available_type) && !isCompatibleEnumTypes(available_type, column.type.get())) throw Exception( @@ -676,7 +676,7 @@ void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns, const auto * provided_column_type = it->getMapped(); const auto * available_column_type = jt->getMapped(); - if (!provided_column_type->hasDynamicSubcolumns() + if (!provided_column_type->hasDynamicSubcolumnsDeprecated() && !provided_column_type->equals(*available_column_type) && !isCompatibleEnumTypes(available_column_type, provided_column_type)) throw Exception( @@ -720,7 +720,7 @@ void StorageInMemoryMetadata::check(const Block & block, bool need_all) const listOfColumns(available_columns)); const auto * available_type = it->getMapped(); - if (!available_type->hasDynamicSubcolumns() + if (!available_type->hasDynamicSubcolumnsDeprecated() && !column.type->equals(*available_type) && !isCompatibleEnumTypes(available_type, column.type.get())) throw Exception( diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 25c48de94e1..08e0526550d 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -254,7 +254,7 @@ void LogSource::readData(const NameAndTypePair & name_and_type, ColumnPtr & colu if (!deserialize_states.contains(name)) { settings.getter = create_stream_getter(true); - serialization->deserializeBinaryBulkStatePrefix(settings, deserialize_states[name]); + serialization->deserializeBinaryBulkStatePrefix(settings, deserialize_states[name], nullptr); } settings.getter = create_stream_getter(false); diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index c7c80078efc..5ecd2ec3819 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -32,6 +32,7 @@ public: bool supportsFinal() const override { return getTargetTable()->supportsFinal(); } bool supportsParallelInsert() const override { return getTargetTable()->supportsParallelInsert(); } bool supportsSubcolumns() const override { return getTargetTable()->supportsSubcolumns(); } + bool supportsDynamicSubcolumns() const override { return getTargetTable()->supportsDynamicSubcolumns(); } bool supportsTransactions() const override { return getTargetTable()->supportsTransactions(); } SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 50581aa0d61..5d269cf814d 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -60,6 +60,7 @@ public: bool supportsParallelInsert() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } /// Smaller blocks (e.g. 64K rows) are better for CPU cache. 
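/// Editorial note on the recurring pattern in this patch (an inference, not authoritative): /// the `*Deprecated` pair appears to keep the legacy dynamic-subcolumn path working (via the /// renamed hasDynamicSubcolumnsDeprecated() checks), while the new supportsDynamicSubcolumns() /// gates storages that can read the new dynamic columns; storages such as StorageMemory above /// opt into both: ///     bool supportsDynamicSubcolumnsDeprecated() const override { return true; } // legacy path ///     bool supportsDynamicSubcolumns() const override { return true; }           // new path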
diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 7afa480149f..4c678a1228b 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1622,7 +1622,7 @@ void ReadFromMerge::applyFilters(const QueryPlan & plan, const ActionDAGNodes & void ReadFromMerge::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(added_filter_nodes); filterTablesAndCreateChildrenPlans(); diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index a63ea1e32ef..735c8711a63 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -49,6 +49,7 @@ public: bool supportsSampling() const override { return true; } bool supportsFinal() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } bool supportsPrewhere() const override { return tableSupportsPrewhere(); } std::optional supportedPrewhereColumns() const override; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 9144ef7c0f7..ea698775298 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -333,17 +333,21 @@ void StorageMergeTree::alter( auto table_id = getStorageID(); auto old_storage_settings = getSettings(); + const auto & query_settings = local_context->getSettingsRef(); StorageInMemoryMetadata new_metadata = getInMemoryMetadata(); StorageInMemoryMetadata old_metadata = getInMemoryMetadata(); - auto maybe_mutation_commands = commands.getMutationCommands(new_metadata, local_context->getSettingsRef().materialize_ttl_after_modify, local_context); + auto maybe_mutation_commands = commands.getMutationCommands(new_metadata, query_settings.materialize_ttl_after_modify, local_context); if (!maybe_mutation_commands.empty()) delayMutationOrThrowIfNeeded(nullptr, local_context); Int64 mutation_version = -1; commands.apply(new_metadata, local_context); + if (!query_settings.allow_suspicious_primary_key) + MergeTreeData::verifySortingKey(new_metadata.sorting_key); + /// This alter can be performed at new_metadata level only if (commands.isSettingsAlter()) { @@ -396,7 +400,7 @@ void StorageMergeTree::alter( resetObjectColumnsFromActiveParts(parts_lock); } - if (!maybe_mutation_commands.empty() && local_context->getSettingsRef().alter_sync > 0) + if (!maybe_mutation_commands.empty() && query_settings.alter_sync > 0) waitForMutation(mutation_version, false); } diff --git a/src/Storages/StorageMergeTreeIndex.cpp b/src/Storages/StorageMergeTreeIndex.cpp index 4747232d7f7..0b1ad02f8c9 100644 --- a/src/Storages/StorageMergeTreeIndex.cpp +++ b/src/Storages/StorageMergeTreeIndex.cpp @@ -280,7 +280,8 @@ private: void ReadFromMergeTreeIndex::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); } diff --git a/src/Storages/StorageNull.h b/src/Storages/StorageNull.h index f7ee936db8d..74abf931f8f 100644 --- a/src/Storages/StorageNull.h +++ b/src/Storages/StorageNull.h @@ -48,6 +48,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + SinkToStoragePtr write(const ASTPtr &, const StorageMetadataPtr & 
metadata_snapshot, ContextPtr, bool) override { return std::make_shared(metadata_snapshot->getSampleBlock()); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index df5bbdf9f78..62bfcc223fd 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -590,6 +590,9 @@ void StorageReplicatedMergeTree::waitMutationToFinishOnReplicas( LOG_DEBUG(log, "Waiting for {} to apply mutation {}", replica, mutation_id); zkutil::EventPtr wait_event = std::make_shared(); + constexpr size_t MAX_RETRIES_ON_FAILED_MUTATION = 30; + size_t retries_on_failed_mutation = 0; + while (!partial_shutdown_called) { /// Mutation maybe killed or whole replica was deleted. @@ -637,18 +640,32 @@ void StorageReplicatedMergeTree::waitMutationToFinishOnReplicas( } } - /// If mutation status is empty, than local replica may just not loaded it into memory. - if (mutation_status && !mutation_status->latest_fail_reason.empty()) - { - LOG_DEBUG(log, "Mutation {} is done {} or failed {} (status: '{}')", mutation_id, mutation_status->is_done, !mutation_status->latest_fail_reason.empty(), mutation_status->latest_fail_reason); - break; - } - /// Replica can become inactive, so wait with timeout, if nothing happened -> recheck it if (!wait_event->tryWait(1000)) { LOG_TRACE(log, "Failed to wait for mutation '{}', will recheck", mutation_id); } + + /// If mutation status is empty, then the local replica may just not have loaded it into memory. + if (mutation_status && !mutation_status->latest_fail_reason.empty()) + { + LOG_DEBUG(log, "Mutation {} is done {} or failed {} (status: '{}')", mutation_id, mutation_status->is_done, !mutation_status->latest_fail_reason.empty(), mutation_status->latest_fail_reason); + + /// In some cases latest_fail_reason may be retryable and there's a chance it will be cleared after the next attempt + if (++retries_on_failed_mutation <= MAX_RETRIES_ON_FAILED_MUTATION) + continue; + + if (mutation_status->is_done) + { + LOG_DEBUG(log, "Looks like mutation {} is done, rechecking", mutation_id); + continue; + } + + /// It's still possible that latest_fail_reason will be cleared just before queue.getIncompleteMutationsStatus(...) below, + /// but it's unlikely. 
Anyway, rethrow the exception here to avoid exiting with is_done=false + checkMutationStatus(mutation_status, {mutation_id}); + throw Exception(ErrorCodes::LOGICAL_ERROR, "checkMutationStatus didn't throw when checking status of {}: {}", mutation_id, mutation_status->latest_fail_reason); + } } /// This replica inactive, don't check anything @@ -6027,6 +6044,7 @@ void StorageReplicatedMergeTree::alter( assertNotReadonly(); auto table_id = getStorageID(); + const auto & query_settings = query_context->getSettingsRef(); if (commands.isSettingsAlter()) { @@ -6054,6 +6072,13 @@ void StorageReplicatedMergeTree::alter( return; } + if (!query_settings.allow_suspicious_primary_key) + { + StorageInMemoryMetadata future_metadata = getInMemoryMetadata(); + commands.apply(future_metadata, query_context); + + MergeTreeData::verifySortingKey(future_metadata.sorting_key); + } auto ast_to_str = [](ASTPtr query) -> String { @@ -6186,7 +6211,7 @@ void StorageReplicatedMergeTree::alter( auto maybe_mutation_commands = commands.getMutationCommands( *current_metadata, - query_context->getSettingsRef().materialize_ttl_after_modify, + query_settings.materialize_ttl_after_modify, query_context); bool have_mutation = !maybe_mutation_commands.empty(); @@ -6309,7 +6334,7 @@ void StorageReplicatedMergeTree::alter( { LOG_DEBUG(log, "Metadata changes applied. Will wait for data changes."); merge_selecting_task->schedule(); - waitMutation(*mutation_znode, query_context->getSettingsRef().alter_sync); + waitMutation(*mutation_znode, query_settings.alter_sync); LOG_DEBUG(log, "Data changes applied."); } } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 9768653f3fe..2ce188c203c 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1454,7 +1454,8 @@ void StorageS3::read( void ReadFromStorageS3Step::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 606c677f915..dd6d13b99cb 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -432,6 +432,8 @@ private: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsSubsetOfColumns(const ContextPtr & context) const; bool prefersLargeBlocks() const override; diff --git a/src/Storages/StorageS3Cluster.h b/src/Storages/StorageS3Cluster.h index 802fd3f9139..aa97bc01b02 100644 --- a/src/Storages/StorageS3Cluster.h +++ b/src/Storages/StorageS3Cluster.h @@ -32,6 +32,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } protected: diff --git a/src/Storages/StorageSet.cpp b/src/Storages/StorageSet.cpp index 54218351cf1..205a90423bf 100644 --- a/src/Storages/StorageSet.cpp +++ b/src/Storages/StorageSet.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp index 8b087a4a2bc..aada25168f8 100644 --- a/src/Storages/StorageSnapshot.cpp +++ b/src/Storages/StorageSnapshot.cpp @@ -115,7 +115,7 @@ 
std::optional StorageSnapshot::tryGetColumn(const GetColumnsOpt { const auto & columns = getMetadataForQuery()->getColumns(); auto column = columns.tryGetColumn(options, column_name); - if (column && (!column->type->hasDynamicSubcolumns() || !options.with_extended_objects)) + if (column && (!column->type->hasDynamicSubcolumnsDeprecated() || !options.with_extended_objects)) return column; if (options.with_extended_objects) diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index 48389dccf48..f0c5103d657 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -53,8 +54,13 @@ namespace ErrorCodes extern const int TIMEOUT_EXCEEDED; extern const int CANNOT_RESTORE_TABLE; extern const int NOT_IMPLEMENTED; + extern const int FAULT_INJECTED; } +namespace FailPoints +{ + extern const char stripe_log_sink_write_fallpoint[]; +} /// NOTE: The lock `StorageStripeLog::rwlock` is NOT kept locked while reading, /// because we read ranges of data that do not change. @@ -234,6 +240,11 @@ public: /// Save the new indices. storage.saveIndices(lock); + // An exception might occur while saving the file sizes, e.g. S3::TooManyRequests. + fiu_do_on(FailPoints::stripe_log_sink_write_fallpoint, + { + throw Exception(ErrorCodes::FAULT_INJECTED, "Injecting fault for inserting into StripeLog table"); + }); /// Save the new file sizes. storage.saveFileSizes(lock); diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 8a71a771367..272f771194d 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -1038,7 +1038,8 @@ private: void ReadFromURL::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 5aca3df1513..f550ccb2bc4 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -295,6 +295,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + static FormatSettings getFormatSettingsFromArgs(const StorageFactory::Arguments & args); struct Configuration : public StatelessTableEngineConfiguration diff --git a/src/Storages/StorageURLCluster.h b/src/Storages/StorageURLCluster.h index c80cdec74a2..a6334e7430d 100644 --- a/src/Storages/StorageURLCluster.h +++ b/src/Storages/StorageURLCluster.h @@ -35,6 +35,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } private: diff --git a/src/Storages/System/CMakeLists.txt b/src/Storages/System/CMakeLists.txt index c3a2e726365..899c3d5cf40 100644 --- a/src/Storages/System/CMakeLists.txt +++ b/src/Storages/System/CMakeLists.txt @@ -47,7 +47,6 @@ add_library(clickhouse_storages_system ${storages_system_sources}) target_link_libraries(clickhouse_storages_system PRIVATE dbms common - string_utils clickhouse_common_zookeeper clickhouse_parsers Poco::JSON diff --git a/src/Storages/System/IStorageSystemOneBlock.cpp b/src/Storages/System/IStorageSystemOneBlock.cpp index 
53399654c8d..456b7c4f90b 100644 --- a/src/Storages/System/IStorageSystemOneBlock.cpp +++ b/src/Storages/System/IStorageSystemOneBlock.cpp @@ -91,7 +91,8 @@ void ReadFromSystemOneBlock::initializePipeline(QueryPipelineBuilder & pipeline, void ReadFromSystemOneBlock::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); } diff --git a/src/Storages/System/StorageSystemColumns.cpp b/src/Storages/System/StorageSystemColumns.cpp index 74b44cc0a2d..49da1eba9ec 100644 --- a/src/Storages/System/StorageSystemColumns.cpp +++ b/src/Storages/System/StorageSystemColumns.cpp @@ -342,7 +342,8 @@ private: void ReadFromSystemColumns::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); } diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index 909599c00af..b42b070d518 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -48,6 +48,7 @@ const char * auto_contributors[] { "Alex Cao", "Alex Cheng", "Alex Karo", + "Alex Katsman", "Alex Krash", "Alex Ryndin", "Alex Zatelepin", @@ -101,6 +102,7 @@ const char * auto_contributors[] { "Alexey Korepanov", "Alexey Milovidov", "Alexey Perevyshin", + "Alexey Petrunyaka", "Alexey Tronov", "Alexey Vasiliev", "Alexey Zatelepin", @@ -109,6 +111,7 @@ const char * auto_contributors[] { "AlfVII", "Alfonso Martinez", "Alfred Xu", + "Ali", "Ali Demirci", "Aliaksandr Pliutau", "Aliaksandr Shylau", @@ -250,6 +253,7 @@ const char * auto_contributors[] { "Brian Hunter", "Brokenice0415", "Bulat Gaifullin", + "Caio Ricciuti", "Camden Cheek", "Camilo Sierra", "Carbyn", @@ -384,6 +388,7 @@ const char * auto_contributors[] { "Evgenii Pravda", "Evgeniia Sudarikova", "Evgeniy Gatov", + "Evgeniy Leko", "Evgeniy Udodov", "Evgeny", "Evgeny Konkov", @@ -413,6 +418,7 @@ const char * auto_contributors[] { "Fille", "Flowyi", "Francisco Barón", + "Francisco Javier Jurado Moreno", "Frank Chen", "Frank Zhao", "François Violette", @@ -425,6 +431,7 @@ const char * auto_contributors[] { "G5.Qin", "Gabriel", "Gabriel Archer", + "Gabriel Martinez", "Gagan Arneja", "Gagan Goel", "Gao Qiang", @@ -446,6 +453,7 @@ const char * auto_contributors[] { "Grigory Buteyko", "Grigory Pervakov", "GruffGemini", + "Grégoire Pineau", "Guillaume Tassery", "Guo Wangyang", "Guo Wei (William)", @@ -587,6 +595,7 @@ const char * auto_contributors[] { "Keiji Yoshida", "Ken Chen", "Ken MacInnis", + "KenL", "Kenji Noguchi", "Kerry Clendinning", "Kevin Chiang", @@ -640,6 +649,7 @@ const char * auto_contributors[] { "Leonardo Maciel", "Leonid Krylov", "Leopold Schabel", + "Leticia Webb", "Lev Borodin", "Lewinma", "Li Shuai", @@ -701,6 +711,7 @@ const char * auto_contributors[] { "Masha", "Mathieu Rey", "Matthew Peveler", + "Mattias Naarttijärvi", "Matwey V. 
Kornilov", "Max", "Max Akhmedov", @@ -711,6 +722,7 @@ const char * auto_contributors[] { "MaxTheHuman", "MaxWk", "Maxim Akhmedov", + "Maxim Alexeev", "Maxim Babenko", "Maxim Fedotov", "Maxim Fridental", @@ -739,6 +751,7 @@ const char * auto_contributors[] { "Michael Razuvaev", "Michael Schnerring", "Michael Smitasin", + "Michael Stetsyuk", "Michail Safronov", "Michal Lisowski", "MicrochipQ", @@ -879,6 +892,7 @@ const char * auto_contributors[] { "Pavlo Bashynskiy", "Pawel Rog", "Paweł Kudzia", + "Pazitiff9", "Peignon Melvyn", "Peng Jian", "Peng Liu", @@ -1084,6 +1098,7 @@ const char * auto_contributors[] { "Tom Bombadil", "Tom Risse", "Tomas Barton", + "Tomer Shafir", "Tomáš Hromada", "Tristan", "Tsarkova Anastasia", @@ -1123,6 +1138,7 @@ const char * auto_contributors[] { "Victor Krasnov", "Victor Tarnavsky", "Viktor Taranenko", + "Vinay Suryadevara", "Vincent", "Vincent Bernat", "Vitalii S", @@ -1162,6 +1178,9 @@ const char * auto_contributors[] { "Vladislav Smirnov", "Vladislav V", "Vojtech Splichal", + "Volodya", + "Volodya Giro", + "Volodyachan", "Volodymyr Kuznetsov", "Vsevolod Orlov", "Vxider", @@ -1179,6 +1198,7 @@ const char * auto_contributors[] { "XenoAmess", "Xianda Ke", "Xiang Zhou", + "Xiaofei Hu", "Xin Wang", "Xoel Lopez Barata", "Xudong Zhang", @@ -1224,6 +1244,7 @@ const char * auto_contributors[] { "Zhipeng", "Zhuo Qiu", "Zijie Lu", + "Zimu Li", "Ziy1-Tan", "Zoran Pandovski", "[데이터플랫폼팀] 이호선", @@ -1490,6 +1511,7 @@ const char * auto_contributors[] { "jiyoungyoooo", "jktng", "jkuklis", + "joe09@foxmail.com", "joelynch", "johanngan", "johnnymatthews", @@ -1658,6 +1680,7 @@ const char * auto_contributors[] { "ongkong", "orantius", "p0ny", + "p1rattttt", "palasonicq", "palegre-tiny", "pawelsz-rb", @@ -1667,6 +1690,7 @@ const char * auto_contributors[] { "pedro.riera", "pengxiangcai", "peshkurov", + "pet74alex", "peter279k", "philip.han", "pingyu", @@ -1680,6 +1704,7 @@ const char * auto_contributors[] { "pyos", "pzhdfy", "qaziqarta", + "qiangxuhui", "qianlixiang", "qianmoQ", "qieqieplus", @@ -1793,6 +1818,7 @@ const char * auto_contributors[] { "unknown", "urgordeadbeef", "usurai", + "v01dxyz", "vahid-sohrabloo", "vdimir", "velavokr", @@ -1802,6 +1828,7 @@ const char * auto_contributors[] { "vic", "vicdashkov", "vicgao", + "vinay92-ch", "vinity", "vitac", "vitstn", @@ -1818,6 +1845,7 @@ const char * auto_contributors[] { "weeds085490", "whysage", "wineternity", + "woodlzm", "wuxiaobai24", "wxybear", "wzl", @@ -1877,6 +1905,7 @@ const char * auto_contributors[] { "zhenjial", "zhifeng", "zhongyuankai", + "zhou", "zhoubintao", "zhukai", "zimv", @@ -1891,6 +1920,7 @@ const char * auto_contributors[] { "zxealous", "zy-kkk", "zzsmdfj", + "zzyReal666", "Šimon Podlipský", "Александр", "Александр Нам", diff --git a/src/Storages/System/StorageSystemDashboards.cpp b/src/Storages/System/StorageSystemDashboards.cpp index 23d8fcfc481..9682fbc74a1 100644 --- a/src/Storages/System/StorageSystemDashboards.cpp +++ b/src/Storages/System/StorageSystemDashboards.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include namespace DB { diff --git a/src/Storages/System/StorageSystemDataSkippingIndices.cpp b/src/Storages/System/StorageSystemDataSkippingIndices.cpp index 2afc03d0e5e..093adc59cc6 100644 --- a/src/Storages/System/StorageSystemDataSkippingIndices.cpp +++ b/src/Storages/System/StorageSystemDataSkippingIndices.cpp @@ -219,7 +219,8 @@ private: void ReadFromSystemDataSkippingIndices::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = 
ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); } diff --git a/src/Storages/System/StorageSystemDetachedParts.cpp b/src/Storages/System/StorageSystemDetachedParts.cpp index 31d566ef8b6..f48a8c67971 100644 --- a/src/Storages/System/StorageSystemDetachedParts.cpp +++ b/src/Storages/System/StorageSystemDetachedParts.cpp @@ -313,7 +313,8 @@ protected: void ReadFromSystemDetachedParts::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + if (filter_actions_dag) { const auto * predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index b1ea2dd3f2b..175c0834bcb 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -263,7 +263,8 @@ ReadFromSystemPartsBase::ReadFromSystemPartsBase( void ReadFromSystemPartsBase::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + if (filter_actions_dag) { const auto * predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/System/StorageSystemReplicas.cpp b/src/Storages/System/StorageSystemReplicas.cpp index 10d5c353c43..3bd5fd290db 100644 --- a/src/Storages/System/StorageSystemReplicas.cpp +++ b/src/Storages/System/StorageSystemReplicas.cpp @@ -290,7 +290,8 @@ private: void ReadFromSystemReplicas::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); } diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index d428d6bd6d0..1f900ec623e 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -750,7 +750,8 @@ void StorageSystemTables::read( void ReadFromSystemTables::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); + const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); diff --git a/src/Storages/System/StorageSystemZooKeeper.cpp b/src/Storages/System/StorageSystemZooKeeper.cpp index 7afa1894a64..cb46cd19517 100644 --- a/src/Storages/System/StorageSystemZooKeeper.cpp +++ b/src/Storages/System/StorageSystemZooKeeper.cpp @@ -474,7 +474,8 @@ static Paths extractPath(const ActionsDAG::NodeRawConstPtrs & filter_nodes, Cont void ReadFromSystemZooKeeper::applyFilters(ActionDAGNodes added_filter_nodes) { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(added_filter_nodes); + paths = extractPath(added_filter_nodes.nodes, context, context->getSettingsRef().allow_unrestricted_reads_from_keeper); } diff --git a/src/Storages/Utils.cpp 
b/src/Storages/Utils.cpp new file mode 100644 index 00000000000..ff73888e19d --- /dev/null +++ b/src/Storages/Utils.cpp @@ -0,0 +1,30 @@ +#include +#include + + +namespace CurrentMetrics +{ + extern const Metric AttachedTable; + extern const Metric AttachedView; + extern const Metric AttachedDictionary; +} + + +namespace DB +{ + CurrentMetrics::Metric getAttachedCounterForStorage(const StoragePtr & storage) + { + if (storage->isView()) + { + return CurrentMetrics::AttachedView; + } + else if (storage->isDictionary()) + { + return CurrentMetrics::AttachedDictionary; + } + else + { + return CurrentMetrics::AttachedTable; + } + } +} diff --git a/src/Storages/Utils.h b/src/Storages/Utils.h new file mode 100644 index 00000000000..c86c2a4c341 --- /dev/null +++ b/src/Storages/Utils.h @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + + +namespace DB +{ + CurrentMetrics::Metric getAttachedCounterForStorage(const StoragePtr & storage); +} diff --git a/src/Storages/examples/CMakeLists.txt b/src/Storages/examples/CMakeLists.txt index cddfc9404d4..b4786b7313b 100644 --- a/src/Storages/examples/CMakeLists.txt +++ b/src/Storages/examples/CMakeLists.txt @@ -5,4 +5,4 @@ clickhouse_add_executable (merge_selector2 merge_selector2.cpp) target_link_libraries (merge_selector2 PRIVATE dbms) clickhouse_add_executable (get_current_inserts_in_replicated get_current_inserts_in_replicated.cpp) -target_link_libraries (get_current_inserts_in_replicated PRIVATE dbms clickhouse_common_config clickhouse_common_zookeeper string_utils) +target_link_libraries (get_current_inserts_in_replicated PRIVATE dbms clickhouse_common_config clickhouse_common_zookeeper) diff --git a/src/Storages/getStructureOfRemoteTable.cpp b/src/Storages/getStructureOfRemoteTable.cpp index 26e953c0578..6ea7bdc312d 100644 --- a/src/Storages/getStructureOfRemoteTable.cpp +++ b/src/Storages/getStructureOfRemoteTable.cpp @@ -210,7 +210,7 @@ ColumnsDescriptionByShardNum getExtendedObjectsOfRemoteTables( auto type_name = type_col[i].get(); auto storage_column = storage_columns.tryGetPhysical(name); - if (storage_column && storage_column->type->hasDynamicSubcolumns()) + if (storage_column && storage_column->type->hasDynamicSubcolumnsDeprecated()) res.add(ColumnDescription(std::move(name), DataTypeFactory::instance().get(type_name))); } } diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 08048564383..99555b06bbf 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -17,7 +17,7 @@ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union import docker_images_helper import upload_result_helper from build_check import get_release_or_pr -from ci_config import CI_CONFIG, Build, CILabels, CIStages, JobNames, StatusNames +from ci_config import CI_CONFIG, Build, CILabels, CIStages, JobNames from ci_utils import GHActions, is_hex, normalize_string from clickhouse_helper import ( CiLogsCredentials, @@ -34,20 +34,18 @@ from commit_status_helper import ( get_commit, post_commit_status, set_status_comment, - update_mergeable_check, - update_upstream_sync_status, ) from digest_helper import DockerDigester, JobDigester from env_helper import ( CI, GITHUB_JOB_API_URL, - GITHUB_REPOSITORY, GITHUB_RUN_URL, - GITHUB_UPSTREAM_REPOSITORY, REPO_COPY, REPORT_PATH, S3_BUILDS_BUCKET, TEMP_PATH, + GITHUB_RUN_ID, + GITHUB_REPOSITORY, ) from get_robot_token import get_best_robot_token from git_helper import GIT_PREFIX, Git @@ -56,7 +54,7 @@ from github_helper import GitHub from pr_info import PRInfo from report import ERROR, SUCCESS, BuildResult, JobReport from 
s3_helper import S3Helper -from synchronizer_utils import SYNC_BRANCH_PREFIX +from ci_metadata import CiMetadata from version_helper import get_version_from_repo # pylint: disable=too-many-lines @@ -71,12 +69,12 @@ class PendingState: class CiCache: """ CI cache is a bunch of records. Record is a file stored under special location on s3. - The file name has following format + The file name has a format: _[]--___.ci RECORD_TYPE: - SUCCESSFUL - for successfuly finished jobs + SUCCESSFUL - for successful jobs PENDING - for pending jobs ATTRIBUTES: @@ -508,7 +506,7 @@ class CiCache: self, job: str, batch: int, num_batches: int, release_branch: bool ) -> bool: """ - checks if a given job have already been done successfuly + checks if a given job has already been done successfully """ return self.exist( self.RecordType.SUCCESSFUL, job, batch, num_batches, release_branch ) @@ -749,7 +747,7 @@ class CiOptions: # list of specified jobs to run ci_jobs: Optional[List[str]] = None - # btaches to run for all multi-batch jobs + # batches to run for all multi-batch jobs job_batches: Optional[List[int]] = None do_not_test: bool = False @@ -891,9 +889,9 @@ class CiOptions: for job in job_with_parents: if job in jobs_to_do and job not in jobs_to_do_requested: jobs_to_do_requested.append(job) - assert ( - jobs_to_do_requested - ), f"Include tags are set but no job configured - Invalid tags, probably [{self.include_keywords}]" + print( + f"WARNING: Include tags are set but no job configured - Invalid tags, probably [{self.include_keywords}]" + ) if JobNames.STYLE_CHECK not in jobs_to_do_requested: # Style check must not be omitted jobs_to_do_requested.append(JobNames.STYLE_CHECK) @@ -903,7 +901,7 @@ class CiOptions: if self.ci_sets: for tag in self.ci_sets: label_config = CI_CONFIG.get_label_config(tag) - assert label_config, f"Unknonwn tag [{tag}]" + assert label_config, f"Unknown tag [{tag}]" print( f"NOTE: CI Set's tag: [{tag}], add jobs: [{label_config.run_jobs}]" ) @@ -953,7 +951,7 @@ class CiOptions: jobs_params[job] = { "batches": list(range(num_batches)), "num_batches": num_batches, - "run_if_ci_option_include_set": job_config.run_by_ci_option + "run_by_ci_option": job_config.run_by_ci_option and pr_info.is_pr, } @@ -968,10 +966,7 @@ class CiOptions: for job in jobs_to_do[:]: job_param = jobs_params[job] - if ( - job_param["run_if_ci_option_include_set"] - and job not in jobs_to_do_requested - ): + if job_param["run_by_ci_option"] and job not in jobs_to_do_requested: print( f"Erasing job '{job}' from list because it's not in included set, but will run only by include" ) @@ -996,7 +991,11 @@ def normalize_check_name(check_name: str) -> str: def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: - # FIXME: consider switching to sub_parser for configure, pre, run, post actions + parser.add_argument( + "--cancel-previous-run", + action="store_true", + help="Action that cancels the previously started PR workflow when the PR is added to the Merge Queue", + ) parser.add_argument( "--configure", action="store_true", @@ -1005,17 +1004,19 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: parser.add_argument( "--update-gh-statuses", action="store_true", - help="Action that recreate success GH statuses for jobs that finished successfully in past and will be skipped this time", + help="Action that recreates success GH statuses for jobs that finished successfully in the past and will be " + "skipped this time", ) parser.add_argument( "--pre", action="store_true", - help="Action that executes
prerequesetes for the job provided in --job-name", + help="Action that executes prerequisites for the job provided in --job-name", ) parser.add_argument( "--run", action="store_true", - help="Action that executes run action for specified --job-name. run_command must be configured for a given job name.", + help="Action that executes run action for specified --job-name. run_command must be configured for a given " + "job name.", ) parser.add_argument( "--post", @@ -1080,7 +1081,7 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "--skip-jobs", action="store_true", default=False, - help="skip fetching data about job runs, used in --configure action (for debugging and nigthly ci)", + help="skip fetching data about job runs, used in --configure action (for debugging and nightly ci)", ) parser.add_argument( "--force", @@ -1093,7 +1094,8 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "--rebuild-all-binaries", action="store_true", default=False, - help="[DEPRECATED. to be removed, once no wf use it] will create run config without skipping build jobs in any case, used in --configure action (for release branches)", + help="[DEPRECATED. to be removed, once no wf use it] will create run config without skipping build jobs in " + "any case, used in --configure action (for release branches)", ) parser.add_argument( "--commit-message", @@ -1298,7 +1300,7 @@ def _configure_docker_jobs(docker_digest_or_latest: bool) -> Dict: missing_amd64 = [] missing_aarch64 = [] if not docker_digest_or_latest: - # look for missing arm and amd images only among missing multiarch manifests @missing_multi_dict + # look for missing arm and amd images only among missing multi-arch manifests @missing_multi_dict # to avoid extra dockerhub api calls missing_amd64 = list( check_missing_images_on_dockerhub(missing_multi_dict, "amd64") @@ -1396,7 +1398,7 @@ def _configure_jobs( ): continue - # fill job randomization buckets (for jobs with configured @random_bucket property)) + # fill job randomization buckets (for jobs with configured @random_bucket property) if job_config.random_bucket: if not job_config.random_bucket in randomization_buckets: randomization_buckets[job_config.random_bucket] = set() @@ -1445,8 +1447,7 @@ def _configure_jobs( jobs_params[job] = { "batches": batches_to_do, "num_batches": num_batches, - "run_if_ci_option_include_set": job_config.run_by_ci_option - and pr_info.is_pr, + "run_by_ci_option": job_config.run_by_ci_option and pr_info.is_pr, } elif add_to_skip: # treat job as being skipped only if it's controlled by digest @@ -1490,8 +1491,8 @@ def _configure_jobs( def _generate_ci_stage_config(jobs_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: """ populates GH Actions' workflow with real jobs - "Builds_1": [{"job_name": NAME, "runner_type": RUNER_TYPE}] - "Tests_1": [{"job_name": NAME, "runner_type": RUNER_TYPE}] + "Builds_1": [{"job_name": NAME, "runner_type": RUNNER_TYPE}] + "Tests_1": [{"job_name": NAME, "runner_type": RUNNER_TYPE}] ...
""" result = {} # type: Dict[str, Any] @@ -1582,7 +1583,7 @@ def _fetch_commit_tokens(message: str, pr_info: PRInfo) -> List[str]: for match in matches if match in CILabels or match.startswith("job_") or match.startswith("batch_") ] - print(f"CI modifyers from commit message: [{res}]") + print(f"CI modifiers from commit message: [{res}]") res_2 = [] if pr_info.is_pr: matches = [match[-1] for match in re.findall(pattern, pr_info.body)] @@ -1593,7 +1594,7 @@ or match.startswith("job_") or match.startswith("batch_") ] - print(f"CI modifyers from PR body: [{res_2}]") + print(f"CI modifiers from PR body: [{res_2}]") return list(set(res + res_2)) @@ -1659,7 +1660,7 @@ def _upload_build_artifacts( report_url = ci_cache.upload_build_report(build_result) print(f"Report file has been uploaded to [{report_url}]") - # Upload head master binaries + # Upload master head's binaries static_bin_name = CI_CONFIG.build_config[build_name].static_binary_name if pr_info.is_master and static_bin_name: # Full binary with debug info: @@ -1907,6 +1908,15 @@ def _get_ext_check_name(check_name: str) -> str: return check_name_with_group +def _cancel_pr_wf(s3: S3Helper, pr_number: int) -> None: + run_id = CiMetadata(s3, pr_number).fetch_meta().run_id + if not run_id: + print(f"ERROR: FIX IT: Run id has not been found for PR [{pr_number}]!") + else: + print(f"Canceling PR workflow run_id: [{run_id}], pr: [{pr_number}]") + GitHub.cancel_wf(GITHUB_REPOSITORY, run_id, get_best_robot_token()) + + def main() -> int: logging.basicConfig(level=logging.INFO) exit_code = 0 @@ -1935,6 +1945,12 @@ def main() -> int: ### CONFIGURE action: start if args.configure: + if CI and pr_info.is_pr: + # store meta on s3 (now we need it only for PRs) + meta = CiMetadata(s3, pr_info.number) + meta.run_id = int(GITHUB_RUN_ID) + meta.push_meta() + ci_options = CiOptions.create_from_pr_message( args.commit_message or None, update_from_api=True ) @@ -2189,39 +2205,6 @@ def main() -> int: pr_info, dump_to_file=True, ) - if not pr_info.is_merge_queue: - # in the merge queue mergeable status must be set only in FinishCheck (last job in wf) - mergeable_status = update_mergeable_check( - commit, - pr_info, - job_report.check_name or _get_ext_check_name(args.job_name), - ) - - # Process upstream StatusNames.SYNC - if ( - pr_info.head_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") - and mergeable_status - and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY - ): - upstream_pr_number = int( - pr_info.head_ref.split("/pr/", maxsplit=1)[1] - ) - update_upstream_sync_status( - upstream_pr_number, pr_info.number, gh, mergeable_status - ) - prepared_events = prepare_tests_results_for_clickhouse( - pr_info, - [], - job_report.status, - 0, - job_report.start_time, - f"https://github.com/ClickHouse/ClickHouse/pull/{upstream_pr_number}", - StatusNames.SYNC, - ) - prepared_events[0]["test_context_raw"] = args.job_name - ch_helper.insert_events_into( - db="default", table="checks", events=prepared_events - ) print(f"Job report url: [{check_url}]") prepared_events = prepare_tests_results_for_clickhouse( @@ -2260,6 +2243,13 @@ assert indata, "Run config must be provided via --infile" _update_gh_statuses_action(indata=indata, s3=s3) + ### CANCEL PREVIOUS WORKFLOW RUN + elif args.cancel_previous_run: + assert ( + pr_info.is_merge_queue + ), "Currently it's supposed to be used in MQ wf to cancel running PR wf if any" + _cancel_pr_wf(s3, pr_info.merged_pr) + ### print results
_print_results(result, args.outfile, args.pretty) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index dcd2a5a4228..68fa6f1cf10 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -26,6 +26,7 @@ class CIStages(metaclass=WithIter): BUILDS_2 = "Builds_2" TESTS_1 = "Tests_1" TESTS_2 = "Tests_2" + TESTS_3 = "Tests_3" class Runners(metaclass=WithIter): @@ -51,9 +52,9 @@ class CILabels(metaclass=WithIter): CI_SET_ARM = "ci_set_arm" CI_SET_INTEGRATION = "ci_set_integration" CI_SET_OLD_ANALYZER = "ci_set_old_analyzer" - CI_SET_STATLESS = "ci_set_stateless" + CI_SET_STATELESS = "ci_set_stateless" CI_SET_STATEFUL = "ci_set_stateful" - CI_SET_STATLESS_ASAN = "ci_set_stateless_asan" + CI_SET_STATELESS_ASAN = "ci_set_stateless_asan" CI_SET_STATEFUL_ASAN = "ci_set_stateful_asan" libFuzzer = "libFuzzer" @@ -205,7 +206,7 @@ class DigestConfig: include_paths: List[Union[str, Path]] = field(default_factory=list) # file suffixes to exclude from digest exclude_files: List[str] = field(default_factory=list) - # directories to exlude from digest + # directories to exclude from digest exclude_dirs: List[Union[str, Path]] = field(default_factory=list) # docker names to include into digest docker: List[str] = field(default_factory=list) @@ -216,7 +217,7 @@ class DigestConfig: @dataclass class LabelConfig: """ - configures different CI scenarious per GH label + configures different CI scenarios per GH label """ run_jobs: Iterable[str] = frozenset() @@ -230,7 +231,7 @@ class JobConfig: # configures digest calculation for the job digest: DigestConfig = field(default_factory=DigestConfig) - # will be triggered for the job if omited in CI workflow yml + # will be triggered for the job if omitted in CI workflow yml run_command: str = "" # job timeout, seconds timeout: Optional[int] = None @@ -241,7 +242,7 @@ class JobConfig: # to run always regardless of the job digest or/and label run_always: bool = False # if the job needs to be run on the release branch, including master (e.g. building packages, docker server). - # NOTE: Subsequent runs on the same branch with the similar digest are still considered skippable. + # NOTE: Subsequent runs on the same branch with the similar digest are still considered skip-able. required_on_release_branch: bool = False # job is for pr workflow only pr_only: bool = False @@ -469,7 +470,7 @@ compatibility_test_common_params = { "digest": compatibility_check_digest, "run_command": "compatibility_check.py", } -statless_test_common_params = { +stateless_test_common_params = { "digest": stateless_check_digest, "run_command": 'functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT', "timeout": 10800, @@ -581,7 +582,6 @@ class CIConfig: elif job_name == JobNames.BUILD_CHECK_SPECIAL: stage_type = CIStages.TESTS_2 elif self.is_test_job(job_name): - stage_type = CIStages.TESTS_1 if job_name in CI_CONFIG.test_configs: required_build = CI_CONFIG.test_configs[job_name].required_build assert required_build @@ -593,6 +593,8 @@ class CIConfig: stage_type = CIStages.TESTS_2 else: stage_type = CIStages.TESTS_1 + if job_name not in REQUIRED_CHECKS: + stage_type = CIStages.TESTS_3 assert stage_type, f"BUG [{job_name}]" return stage_type @@ -663,7 +665,7 @@ class CIConfig: # crosscompile - no arm required pass else: - # switch to aarch64 runnner + # switch to aarch64 runner result += "-aarch64" return result @@ -710,7 +712,7 @@ class CIConfig: break assert ( res - ), f"Error: Experimantal feature... 
Invlid request or not supported job [{check_name}]" + ), f"Error: Experimental feature... Invalid request or not supported job [{check_name}]" return res def get_digest_config(self, check_name: str) -> DigestConfig: @@ -813,16 +815,16 @@ class CIConfig: f"The following names of the build report '{build_report_name}' " f"are missed in build_config: {missed_names}", ) - # And finally, all of tests' requirements must be in the builds + # And finally, all tests' requirements must be in the builds for test_name, test_config in self.test_configs.items(): if test_config.required_build not in self.build_config.keys(): logging.error( - "The requierment '%s' for '%s' is not found in builds", + "The requirement '%s' for '%s' is not found in builds", test_config, test_name, ) errors.append( - f"The requierment '{test_config}' for " + f"The requirement '{test_config}' for " f"'{test_name}' is not found in builds" ) @@ -863,7 +865,7 @@ CI_CONFIG = CIConfig( JobNames.INTEGRATION_TEST_ASAN_OLD_ANALYZER, ] ), - CILabels.CI_SET_STATLESS: LabelConfig( + CILabels.CI_SET_STATELESS: LabelConfig( run_jobs=[ JobNames.STYLE_CHECK, JobNames.FAST_TEST, @@ -871,7 +873,7 @@ CI_CONFIG = CIConfig( JobNames.STATELESS_TEST_RELEASE, ] ), - CILabels.CI_SET_STATLESS_ASAN: LabelConfig( + CILabels.CI_SET_STATELESS_ASAN: LabelConfig( run_jobs=[ JobNames.STYLE_CHECK, JobNames.FAST_TEST, @@ -1178,49 +1180,49 @@ CI_CONFIG = CIConfig( # End stateful tests for parallel replicas JobNames.STATELESS_TEST_ASAN: TestConfig( Build.PACKAGE_ASAN, - job_config=JobConfig(num_batches=4, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=4, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_TSAN: TestConfig( Build.PACKAGE_TSAN, - job_config=JobConfig(num_batches=5, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=5, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_MSAN: TestConfig( Build.PACKAGE_MSAN, - job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=6, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_UBSAN: TestConfig( Build.PACKAGE_UBSAN, - job_config=JobConfig(num_batches=2, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=2, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_DEBUG: TestConfig( Build.PACKAGE_DEBUG, - job_config=JobConfig(num_batches=5, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=5, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_RELEASE: TestConfig( - Build.PACKAGE_RELEASE, job_config=JobConfig(**statless_test_common_params) # type: ignore + Build.PACKAGE_RELEASE, job_config=JobConfig(**stateless_test_common_params) # type: ignore ), JobNames.STATELESS_TEST_RELEASE_COVERAGE: TestConfig( Build.PACKAGE_RELEASE_COVERAGE, - job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=6, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_AARCH64: TestConfig( - Build.PACKAGE_AARCH64, job_config=JobConfig(**statless_test_common_params) # type: ignore + Build.PACKAGE_AARCH64, job_config=JobConfig(**stateless_test_common_params) # type: ignore ), JobNames.STATELESS_TEST_OLD_ANALYZER_S3_REPLICATED_RELEASE: TestConfig( Build.PACKAGE_RELEASE, - job_config=JobConfig(num_batches=4, **statless_test_common_params), # type: ignore + 
job_config=JobConfig(num_batches=4, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_S3_DEBUG: TestConfig( Build.PACKAGE_DEBUG, - job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=6, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_AZURE_ASAN: TestConfig( Build.PACKAGE_ASAN, - job_config=JobConfig(num_batches=4, **statless_test_common_params, release_only=True, run_by_ci_option=True), # type: ignore + job_config=JobConfig(num_batches=4, **stateless_test_common_params, release_only=True, run_by_ci_option=True), # type: ignore ), JobNames.STATELESS_TEST_S3_TSAN: TestConfig( Build.PACKAGE_TSAN, - job_config=JobConfig(num_batches=5, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=5, **stateless_test_common_params), # type: ignore ), JobNames.STRESS_TEST_DEBUG: TestConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(**stress_test_common_params) # type: ignore @@ -1269,8 +1271,7 @@ CI_CONFIG = CIConfig( ), JobNames.INTEGRATION_TEST_ARM: TestConfig( Build.PACKAGE_AARCH64, - # add [run_by_label="test arm"] to not run in regular pr workflow by default - job_config=JobConfig(num_batches=6, **integration_test_common_params, run_by_label="test arm"), # type: ignore + job_config=JobConfig(num_batches=6, **integration_test_common_params), # type: ignore ), JobNames.INTEGRATION_TEST: TestConfig( Build.PACKAGE_RELEASE, @@ -1324,7 +1325,7 @@ CI_CONFIG = CIConfig( JobNames.STATELESS_TEST_FLAKY_ASAN: TestConfig( # replace to non-default Build.PACKAGE_ASAN, - job_config=JobConfig(pr_only=True, **{**statless_test_common_params, "timeout": 3600}), # type: ignore + job_config=JobConfig(pr_only=True, **{**stateless_test_common_params, "timeout": 3600}), # type: ignore ), JobNames.JEPSEN_KEEPER: TestConfig( Build.BINARY_RELEASE, @@ -1484,7 +1485,7 @@ CHECK_DESCRIPTIONS = [ "Checks if new added or modified tests are flaky by running them repeatedly, " "in parallel, with more randomization. Functional tests are run 100 times " "with address sanitizer, and additional randomization of thread scheduling. " - "Integrational tests are run up to 10 times. If at least once a new test has " + "Integration tests are run up to 10 times. If at least once a new test has " "failed, or was too long, this check will be red. We don't allow flaky tests, " 'read the doc', @@ -1574,7 +1575,7 @@ CHECK_DESCRIPTIONS = [ lambda x: x.startswith("ClickBench"), ), CheckDescription( - "Falback for unknown", + "Fallback for unknown", "There's no description for the check yet, please add it to " "tests/ci/ci_config.py:CHECK_DESCRIPTIONS", lambda x: True, diff --git a/tests/ci/ci_metadata.py b/tests/ci/ci_metadata.py new file mode 100644 index 00000000000..82d44cf1adc --- /dev/null +++ b/tests/ci/ci_metadata.py @@ -0,0 +1,116 @@ +from pathlib import Path +from typing import Optional + +from env_helper import ( + S3_BUILDS_BUCKET, + TEMP_PATH, +) +from s3_helper import S3Helper +from ci_utils import GHActions + + +# pylint: disable=too-many-lines + + +class CiMetadata: + """ + CI Metadata class owns data like workflow run_id for a given pr, etc. 
+ Goal is to have everything we need to manage workflows on S3 and rely on GH api as little as possible + """ + + _S3_PREFIX = "CI_meta_v1" + _LOCAL_PATH = Path(TEMP_PATH) / "ci_meta" + _FILE_SUFFIX = ".cimd" + _FILENAME_RUN_ID = "run_id" + _FILE_SUFFIX + + def __init__( + self, + s3: S3Helper, + pr_number: Optional[int] = None, + sha: Optional[str] = None, + git_ref: Optional[str] = None, + ): + assert pr_number or (sha and git_ref) + + self.sha = sha + self.pr_number = pr_number + self.git_ref = git_ref + self.s3 = s3 + self.run_id = 0 + + if self.pr_number: + self.s3_path = f"{self._S3_PREFIX}/PRs/{self.pr_number}/" + else: + self.s3_path = f"{self._S3_PREFIX}/{self.git_ref}/{self.sha}/" + + self._updated = False + + if not self._LOCAL_PATH.exists(): + self._LOCAL_PATH.mkdir(parents=True, exist_ok=True) + + def fetch_meta(self): + """ + Fetches meta from s3 + """ + + # clean up + for file in self._LOCAL_PATH.glob("*" + self._FILE_SUFFIX): + file.unlink() + + _ = self.s3.download_files( + bucket=S3_BUILDS_BUCKET, + s3_path=self.s3_path, + file_suffix=self._FILE_SUFFIX, + local_directory=self._LOCAL_PATH, + ) + + meta_files = Path(self._LOCAL_PATH).rglob("*" + self._FILE_SUFFIX) + for file_name in meta_files: + path_in_str = str(file_name) + with open(path_in_str, "r", encoding="utf-8") as f: + # Read all lines in the file + lines = f.readlines() + assert len(lines) == 1 + if file_name.name == self._FILENAME_RUN_ID: + self.run_id = int(lines[0]) + + self._updated = True + return self + + def push_meta( + self, + ) -> None: + """ + Uploads meta on s3 + """ + assert self.run_id + GHActions.print_in_group( + f"Storing workflow metadata: PR [{self.pr_number}]", + [f"run_id: {self.run_id}"], + ) + + local_file = self._LOCAL_PATH / self._FILENAME_RUN_ID + with open(local_file, "w", encoding="utf-8") as file: + file.write(f"{self.run_id}\n") + + _ = self.s3.upload_file( + bucket=S3_BUILDS_BUCKET, + file_path=local_file, + s3_path=self.s3_path + local_file.name, + ) + + +if __name__ == "__main__": + # TEST: + s3 = S3Helper() + a = CiMetadata(s3, 12345, "deadbeaf", "test_branch") + a.run_id = 111 + a.push_meta() + b = CiMetadata(s3, 12345, "deadbeaf", "test_branch") + assert b.fetch_meta().run_id == a.run_id + + a = CiMetadata(s3, 0, "deadbeaf", "test_branch") + a.run_id = 112 + a.push_meta() + b = CiMetadata(s3, 0, "deadbeaf", "test_branch") + assert b.fetch_meta().run_id == a.run_id diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index fc939a08e11..e1c47353743 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -447,9 +447,7 @@ def set_mergeable_check( ) -def update_mergeable_check( - commit: Commit, pr_info: PRInfo, check_name: str -) -> Optional[CommitStatus]: +def update_mergeable_check(commit: Commit, pr_info: PRInfo, check_name: str) -> None: "check if the check_name in REQUIRED_CHECKS and then trigger update" not_run = ( pr_info.labels.intersection({Labels.SKIP_MERGEABLE_CHECK, Labels.RELEASE}) @@ -460,17 +458,21 @@ def update_mergeable_check( if not_run: # Let's avoid unnecessary work - return None + return logging.info("Update Mergeable Check by %s", check_name) statuses = get_commit_filtered_statuses(commit) - return trigger_mergeable_check(commit, statuses) + trigger_mergeable_check(commit, statuses) def trigger_mergeable_check( - commit: Commit, statuses: CommitStatuses, hide_url: bool = False -) -> CommitStatus: + commit: Commit, + statuses: CommitStatuses, + hide_url: bool = False, + set_if_green: bool = False, + 
workflow_failed: bool = False, +) -> StatusType: """calculate and update StatusNames.MERGEABLE""" required_checks = [status for status in statuses if is_required(status.context)] @@ -498,19 +500,27 @@ def trigger_mergeable_check( if fail: description = "failed: " + ", ".join(fail) state = FAILURE + elif workflow_failed: + description = "check workflow failures" + state = FAILURE description = format_description(description) - if mergeable_status is None or mergeable_status.description != description: - return set_mergeable_check(commit, description, state, hide_url) + if not set_if_green and state == SUCCESS: + # do not set green Mergeable Check status + pass + else: + if mergeable_status is None or mergeable_status.description != description: + set_mergeable_check(commit, description, state, hide_url) - return mergeable_status + return state def update_upstream_sync_status( upstream_pr_number: int, sync_pr_number: int, gh: Github, - mergeable_status: CommitStatus, + state: StatusType, + can_set_green_mergeable_status: bool = False, ) -> None: upstream_repo = gh.get_repo(GITHUB_UPSTREAM_REPOSITORY) upstream_pr = upstream_repo.get_pull(upstream_pr_number) @@ -518,46 +528,41 @@ def update_upstream_sync_status( sync_pr = sync_repo.get_pull(sync_pr_number) # Find the commit that is in both repos, upstream and cloud sync_commits = sync_pr.get_commits().reversed - upstream_commits = upstream_pr.get_commits() + upstream_commits = upstream_pr.get_commits().reversed # Github objects are compared by _url attribute. We can't compare them directly and # should compare commits by SHA1 - upstream_shas = [uc.sha for uc in upstream_commits] + upstream_shas = [c.sha for c in upstream_commits] logging.info("Commits in upstream PR:\n %s", ", ".join(upstream_shas)) - sync_shas = [uc.sha for uc in upstream_commits] + sync_shas = [c.sha for c in sync_commits] logging.info("Commits in sync PR:\n %s", ", ".join(reversed(sync_shas))) - found = False - for commit in sync_commits: - try: - idx = upstream_shas.index(commit.sha) - found = True - upstream_commit = upstream_commits[idx] + + # find latest synced commit + last_synced_upstream_commit = None + for commit in upstream_commits: + if commit.sha in sync_shas: + last_synced_upstream_commit = commit break - except ValueError: - continue - if not found: - logging.info( - "There's no same commits in upstream and sync PRs, probably force-push" - ) - return + assert last_synced_upstream_commit - sync_status = get_status(mergeable_status.state) + sync_status = get_status(state) logging.info( "Using commit %s to post the %s status `%s`: [%s]", - upstream_commit.sha, + last_synced_upstream_commit.sha, sync_status, StatusNames.SYNC, - mergeable_status.description, + "", ) post_commit_status( - upstream_commit, + last_synced_upstream_commit, sync_status, "", # let's won't expose any urls from cloud - mergeable_status.description, + "", StatusNames.SYNC, ) trigger_mergeable_check( - upstream_commit, - get_commit_filtered_statuses(upstream_commit), + last_synced_upstream_commit, + get_commit_filtered_statuses(last_synced_upstream_commit), True, + set_if_green=can_set_green_mergeable_status, ) diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index a66ebbeadf4..1a7000f5353 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -11,10 +11,13 @@ from commit_status_helper import ( post_commit_status, set_mergeable_check, trigger_mergeable_check, + update_upstream_sync_status, ) from get_robot_token import get_best_robot_token from pr_info import 
PRInfo from report import PENDING, SUCCESS +from synchronizer_utils import SYNC_BRANCH_PREFIX +from env_helper import GITHUB_REPOSITORY, GITHUB_UPSTREAM_REPOSITORY def main(): @@ -40,7 +43,21 @@ def main(): set_mergeable_check(commit, "workflow passed", "success") else: statuses = get_commit_filtered_statuses(commit) - trigger_mergeable_check(commit, statuses) + state = trigger_mergeable_check(commit, statuses, set_if_green=True) + + # Process upstream StatusNames.SYNC + if ( + pr_info.head_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") + and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY + ): + upstream_pr_number = int(pr_info.head_ref.split("/pr/", maxsplit=1)[1]) + update_upstream_sync_status( + upstream_pr_number, + pr_info.number, + gh, + state, + can_set_green_mergeable_status=True, + ) statuses = [s for s in statuses if s.context == StatusNames.CI] if not statuses: diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index ae1eaf4c06a..eb0f6c24527 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -9,6 +9,7 @@ from time import sleep from typing import List, Optional, Tuple, Union import github +import requests # explicit reimport # pylint: disable=useless-import-alias @@ -260,3 +261,17 @@ class GitHub(github.Github): def retries(self, value: int) -> None: assert isinstance(value, int) self._retries = value + + # static methods not using pygithub + @staticmethod + def cancel_wf(repo, run_id, token, strict=False): + headers = {"Authorization": f"token {token}"} + url = f"https://api.github.com/repos/{repo}/actions/runs/{run_id}/cancel" + try: + response = requests.post(url, headers=headers, timeout=10) + response.raise_for_status() + print(f"NOTE: Workflow [{run_id}] has been cancelled") + except Exception as ex: + print("ERROR: Got exception executing wf cancel request", ex) + if strict: + raise ex diff --git a/tests/ci/merge_pr.py b/tests/ci/merge_pr.py index 450ece62d4b..500de4eb718 100644 --- a/tests/ci/merge_pr.py +++ b/tests/ci/merge_pr.py @@ -13,7 +13,11 @@ from github.PaginatedList import PaginatedList from github.PullRequestReview import PullRequestReview from github.WorkflowRun import WorkflowRun -from commit_status_helper import get_commit_filtered_statuses +from commit_status_helper import ( + get_commit_filtered_statuses, + get_commit, + trigger_mergeable_check, +) from get_robot_token import get_best_robot_token from github_helper import GitHub, NamedUser, PullRequest, Repository from pr_info import PRInfo @@ -173,6 +177,17 @@ def parse_args() -> argparse.Namespace: action="store_true", help="if set, the script won't merge the PR, just check the conditions", ) + parser.add_argument( + "--set-ci-status", + action="store_true", + help="if set, only update/set Mergeable Check status", + ) + parser.add_argument( + "--wf-status", + type=str, + default="", + help="overall workflow status [success|failure]. 
used with --set-ci-status only", + ) parser.add_argument( "--check-approved", action="store_true", @@ -226,6 +241,21 @@ def main(): token = args.token or get_best_robot_token() gh = GitHub(token) repo = gh.get_repo(args.repo) + + if args.set_ci_status: + assert args.wf_status in ("failure", "success") + # set mergeable check status and exit + commit = get_commit(gh, args.pr_info.sha) + statuses = get_commit_filtered_statuses(commit) + trigger_mergeable_check( + commit, + statuses, + hide_url=False, + set_if_green=True, + workflow_failed=(args.wf_status != "success"), + ) + return + # An ugly and not nice fix to patch the wrong organization URL, # see https://github.com/PyGithub/PyGithub/issues/2395#issuecomment-1378629710 # pylint: disable=protected-access diff --git a/tests/ci/test_ci_options.py b/tests/ci/test_ci_options.py index 0f10f7d4f85..c07c094d439 100644 --- a/tests/ci/test_ci_options.py +++ b/tests/ci/test_ci_options.py @@ -161,7 +161,7 @@ class TestCIOptions(unittest.TestCase): "Stateless tests (azure, asan)": { "batches": list(range(3)), "num_batches": 3, - "run_if_ci_option_include_set": True, + "run_by_ci_option": True, } } jobs_to_do, jobs_to_skip, job_params = ci_options.apply( @@ -226,10 +226,10 @@ class TestCIOptions(unittest.TestCase): job_params[job] = { "batches": list(range(3)), "num_batches": 3, - "run_if_ci_option_include_set": "azure" in job, + "run_by_ci_option": "azure" in job, } else: - job_params[job] = {"run_if_ci_option_include_set": False} + job_params[job] = {"run_by_ci_option": False} jobs_to_do, jobs_to_skip, job_params = ci_options.apply( jobs_to_do, jobs_to_skip, job_params, PRInfo() diff --git a/tests/config/config.d/max_num_to_warn.xml b/tests/config/config.d/max_num_to_warn.xml index 776c270823d..1f55e6fd674 100644 --- a/tests/config/config.d/max_num_to_warn.xml +++ b/tests/config/config.d/max_num_to_warn.xml @@ -1,5 +1,7 @@ 5 + 5 + 5 2 10 diff --git a/tests/fuzz/dictionaries/datatypes.dict b/tests/fuzz/dictionaries/datatypes.dict index 232e89db0c0..a01a94fd3e3 100644 --- a/tests/fuzz/dictionaries/datatypes.dict +++ b/tests/fuzz/dictionaries/datatypes.dict @@ -132,3 +132,4 @@ "YEAR" "bool" "boolean" +"Dynamic" diff --git a/tests/integration/test_azure_blob_storage_native_copy/test.py b/tests/integration/test_azure_blob_storage_native_copy/test.py index 4f543e4c8b2..77d400240b1 100644 --- a/tests/integration/test_azure_blob_storage_native_copy/test.py +++ b/tests/integration/test_azure_blob_storage_native_copy/test.py @@ -110,6 +110,11 @@ def cluster(): main_configs=[path], with_azurite=True, ) + cluster.add_instance( + "node3", + main_configs=[path], + with_azurite=True, + ) cluster.start() yield cluster @@ -216,3 +221,37 @@ def test_backup_restore_on_merge_tree_different_container(cluster): azure_query(node2, f"DROP TABLE test_simple_merge_tree_different_bucket") azure_query(node2, f"DROP TABLE test_simple_merge_tree_different_bucket_restored") + + +def test_backup_restore_on_merge_tree_native_copy_async(cluster): + node3 = cluster.instances["node3"] + azure_query( + node3, + f"CREATE TABLE test_simple_merge_tree_async(key UInt64, data String) Engine = MergeTree() ORDER BY tuple() SETTINGS storage_policy='policy_azure_cache'", + ) + azure_query(node3, f"INSERT INTO test_simple_merge_tree_async VALUES (1, 'a')") + + backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_merge_tree_async_backup')" + print("BACKUP DEST", backup_destination) + azure_query( + node3, + f"BACKUP TABLE 
test_simple_merge_tree_async TO {backup_destination}", + settings={"azure_max_single_part_copy_size": 0}, + ) + + assert node3.contains_in_log("using native copy") + + azure_query( + node3, + f"RESTORE TABLE test_simple_merge_tree_async AS test_simple_merge_tree_async_restored FROM {backup_destination};", + settings={"azure_max_single_part_copy_size": 0}, + ) + assert ( + azure_query(node3, f"SELECT * from test_simple_merge_tree_async_restored") + == "1\ta\n" + ) + + assert node3.contains_in_log("using native copy") + + azure_query(node3, f"DROP TABLE test_simple_merge_tree_async") + azure_query(node3, f"DROP TABLE test_simple_merge_tree_async_restored") diff --git a/tests/integration/test_backup_restore_azure_blob_storage/test.py b/tests/integration/test_backup_restore_azure_blob_storage/test.py index 1a1458cb68e..78b186e3227 100644 --- a/tests/integration/test_backup_restore_azure_blob_storage/test.py +++ b/tests/integration/test_backup_restore_azure_blob_storage/test.py @@ -281,7 +281,10 @@ def test_backup_restore_on_merge_tree(cluster): node = cluster.instances["node"] azure_query( node, - f"CREATE TABLE test_simple_merge_tree(key UInt64, data String) Engine = MergeTree() ORDER BY tuple() SETTINGS storage_policy='blob_storage_policy'", + f""" + DROP TABLE IF EXISTS test_simple_merge_tree; + CREATE TABLE test_simple_merge_tree(key UInt64, data String) Engine = MergeTree() ORDER BY tuple() SETTINGS storage_policy='blob_storage_policy' + """, ) azure_query(node, f"INSERT INTO test_simple_merge_tree VALUES (1, 'a')") @@ -299,3 +302,85 @@ def test_backup_restore_on_merge_tree(cluster): ) azure_query(node, f"DROP TABLE test_simple_merge_tree") azure_query(node, f"DROP TABLE test_simple_merge_tree_restored") + + +def test_backup_restore_correct_block_ids(cluster): + node = cluster.instances["node"] + azure_query( + node, + f""" + DROP TABLE IF EXISTS test_simple_merge_tree; + CREATE TABLE test_simple_merge_tree(key UInt64, data String) + Engine = MergeTree() + ORDER BY tuple() + SETTINGS storage_policy='blob_storage_policy'""", + ) + data_query = "SELECT number, repeat('a', 100) FROM numbers(1000)" + azure_query( + node, + f"INSERT INTO test_simple_merge_tree {data_query}", + ) + + for min_upload_size, max_upload_size, max_blocks, expected_block_size in [ + (42, 100, 1000, 42), + (42, 52, 86, 52), + ]: + data_path = f"test_backup_correct_block_ids_{max_blocks}" + + backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', '{data_path}')" + azure_query( + node, + f""" + SET azure_min_upload_part_size = {min_upload_size}; + SET azure_max_upload_part_size = {max_upload_size}; + SET azure_max_blocks_in_multipart_upload = {max_blocks}; + BACKUP TABLE test_simple_merge_tree TO {backup_destination} SETTINGS allow_azure_native_copy = 0; + """, + ) + + port = cluster.azurite_port + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + container_name = "cont" + blob_service_client = BlobServiceClient.from_connection_string( + connection_string + ) + container_client = blob_service_client.get_container_client(container_name) + blobs = container_client.list_blobs() + + data_blob = ( + f"{data_path}/data/default/test_simple_merge_tree/all_1_1_0/data.bin" + ) + found = False + for blob in blobs: + if data_blob == blob.get("name"): + found = True + break + 
assert found + + blob_client = blob_service_client.get_blob_client( + blob=data_blob, container=container_name + ) + + blocks_num = len(blob_client.get_block_list()[0]) + assert blocks_num > 50 + + count = 0 + for block in blob_client.get_block_list()[0]: + count += 1 + if count < blocks_num: + assert block.get("size") == expected_block_size + else: + assert block.get("size") < expected_block_size + + azure_query( + node, + f"RESTORE TABLE test_simple_merge_tree AS test_simple_merge_tree_restored_{max_blocks} FROM {backup_destination};", + ) + assert azure_query( + node, + f"SELECT * from test_simple_merge_tree_restored_{max_blocks} ORDER BY key", + ) == node.query(data_query) diff --git a/tests/integration/test_distributed_inter_server_secret/configs/users.d/new_user.xml b/tests/integration/test_distributed_inter_server_secret/configs/users.d/new_user.xml new file mode 100644 index 00000000000..a747d61a0dd --- /dev/null +++ b/tests/integration/test_distributed_inter_server_secret/configs/users.d/new_user.xml @@ -0,0 +1,12 @@ + + + + + + ::/0 + + default + default + + + diff --git a/tests/integration/test_distributed_inter_server_secret/test.py b/tests/integration/test_distributed_inter_server_secret/test.py index 10dbb23d961..50d7be4d11e 100644 --- a/tests/integration/test_distributed_inter_server_secret/test.py +++ b/tests/integration/test_distributed_inter_server_secret/test.py @@ -12,12 +12,16 @@ from helpers.cluster import ClickHouseCluster, CLICKHOUSE_CI_MIN_TESTED_VERSION cluster = ClickHouseCluster(__file__) -def make_instance(name, cfg, *args, **kwargs): +def make_instance(name, *args, **kwargs): + main_configs = kwargs.pop("main_configs", []) + main_configs.append("configs/remote_servers.xml") + user_configs = kwargs.pop("user_configs", []) + user_configs.append("configs/users.xml") return cluster.add_instance( name, with_zookeeper=True, - main_configs=["configs/remote_servers.xml", cfg], - user_configs=["configs/users.xml"], + main_configs=main_configs, + user_configs=user_configs, *args, **kwargs, ) @@ -27,11 +31,16 @@ def make_instance(name, cfg, *args, **kwargs): assert CLICKHOUSE_CI_MIN_TESTED_VERSION < "23.3" # _n1/_n2 contains cluster with different -- should fail -n1 = make_instance("n1", "configs/remote_servers_n1.xml") -n2 = make_instance("n2", "configs/remote_servers_n2.xml") +# only n1 contains new_user +n1 = make_instance( + "n1", + main_configs=["configs/remote_servers_n1.xml"], + user_configs=["configs/users.d/new_user.xml"], +) +n2 = make_instance("n2", main_configs=["configs/remote_servers_n2.xml"]) backward = make_instance( "backward", - "configs/remote_servers_backward.xml", + main_configs=["configs/remote_servers_backward.xml"], image="clickhouse/clickhouse-server", # version without DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET_V2 tag=CLICKHOUSE_CI_MIN_TESTED_VERSION, @@ -100,6 +109,12 @@ def bootstrap(): ) """ ) + n.query( + """ + CREATE TABLE dist_over_dist_secure AS data + Engine=Distributed(secure, currentDatabase(), dist_secure, key) + """ + ) @pytest.fixture(scope="module", autouse=True) @@ -432,3 +447,20 @@ def test_user_secure_cluster_from_backward(user, password): assert n1.contains_in_log( "Using deprecated interserver protocol because the client is too old. Consider upgrading all nodes in cluster." 
) + + +def test_secure_cluster_distributed_over_distributed_different_users(): + # This works because we will have initial_user='default' + n1.query( + "SELECT * FROM remote('n1', currentDatabase(), dist_secure)", user="new_user" + ) + # While this is broken because now initial_user='new_user', and n2 does not have it + with pytest.raises(QueryRuntimeException): + n2.query( + "SELECT * FROM remote('n1', currentDatabase(), dist_secure, 'new_user')" + ) + # And this is still a problem, let's assume that this is OK, since we are + # expecting that in case of dist-over-dist the clusters are the same (users + # and stuff). + with pytest.raises(QueryRuntimeException): + n1.query("SELECT * FROM dist_over_dist_secure", user="new_user") diff --git a/tests/queries/0_stateless/00694_max_block_size_zero.reference b/tests/integration/test_lazy_database/__init__.py similarity index 100% rename from tests/queries/0_stateless/00694_max_block_size_zero.reference rename to tests/integration/test_lazy_database/__init__.py diff --git a/tests/integration/test_log_family_s3/configs/minio.xml b/tests/integration/test_lazy_database/configs/storage_policy.xml similarity index 100% rename from tests/integration/test_log_family_s3/configs/minio.xml rename to tests/integration/test_lazy_database/configs/storage_policy.xml diff --git a/tests/integration/test_lazy_database/test.py b/tests/integration/test_lazy_database/test.py new file mode 100644 index 00000000000..6890aa87374 --- /dev/null +++ b/tests/integration/test_lazy_database/test.py @@ -0,0 +1,88 @@ +import logging +import time +import pytest +import os +from helpers.cluster import ClickHouseCluster + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=["configs/storage_policy.xml"], + with_minio=True, + ) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def assert_objects_count(cluster, objects_count, path="data/"): + minio = cluster.minio_client + s3_objects = list(minio.list_objects(cluster.minio_bucket, path, recursive=True)) + if objects_count != len(s3_objects): + for s3_object in s3_objects: + object_meta = minio.stat_object(cluster.minio_bucket, s3_object.object_name) + logging.info("Existing S3 object: %s", str(object_meta)) + assert objects_count == len(s3_objects) + + +def list_of_files_on_ch_disk(node, disk, path): + disk_path = node.query( + f"SELECT path FROM system.disks WHERE name='{disk}'" + ).splitlines()[0] + return node.exec_in_container( + ["bash", "-c", f"ls {os.path.join(disk_path, path)}"], user="root" + ) + + +@pytest.mark.parametrize( + "engine", + [ + pytest.param("Log"), + ], +) +@pytest.mark.parametrize( + "disk,check_s3", + [ + pytest.param("default", False), + pytest.param("s3", True), + ], +) +@pytest.mark.parametrize( + "delay", + [ + pytest.param(0), + pytest.param(4), + ], +) +def test_drop_table(cluster, engine, disk, check_s3, delay): + node = cluster.instances["node"] + + node.query("DROP DATABASE IF EXISTS lazy") + node.query("CREATE DATABASE lazy ENGINE=Lazy(2)") + node.query( + "CREATE TABLE lazy.table (id UInt64) ENGINE={} SETTINGS disk = '{}'".format( + engine, + disk, + ) + ) + + node.query("INSERT INTO lazy.table SELECT number FROM numbers(10)") + assert node.query("SELECT count(*) FROM lazy.table") == "10\n" + if delay: + time.sleep(delay) + node.query("DROP TABLE lazy.table SYNC") + + if check_s3: + # There mustn't be any
+        assert_objects_count(cluster, 0)
+
+    # Local data must be removed
+    assert list_of_files_on_ch_disk(node, disk, "data/lazy/") == ""
diff --git a/tests/integration/test_log_family_s3/configs/storage_configuration.xml b/tests/integration/test_log_family_s3/configs/storage_configuration.xml
new file mode 100644
index 00000000000..d479a59b197
--- /dev/null
+++ b/tests/integration/test_log_family_s3/configs/storage_configuration.xml
@@ -0,0 +1,34 @@
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>http://minio1:9001/root/data/</endpoint>
+                <access_key_id>minio</access_key_id>
+                <secret_access_key>minio123</secret_access_key>
+            </s3>
+            <s3_no_retries>
+                <type>s3</type>
+                <endpoint>http://minio1:9001/root/data/</endpoint>
+                <access_key_id>minio</access_key_id>
+                <secret_access_key>minio123</secret_access_key>
+                <skip_access_check>true</skip_access_check>
+                <!-- Disable retries so an injected error surfaces on the first attempt -->
+                <s3_retry_attempts>1</s3_retry_attempts>
+                <s3_use_adaptive_timeouts>0</s3_use_adaptive_timeouts>
+                <s3_max_single_read_retries>1</s3_max_single_read_retries>
+                <connect_timeout_ms>20000</connect_timeout_ms>
+            </s3_no_retries>
+        </disks>
+        <policies>
+            <s3_no_retries>
+                <volumes>
+                    <main>
+                        <disk>s3_no_retries</disk>
+                    </main>
+                </volumes>
+            </s3_no_retries>
+        </policies>
+    </storage_configuration>
+</clickhouse>
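The `s3_no_retries` policy above exists so that the failpoint used by the new StripeLog test in the next hunk fires on the very first write instead of being masked by client-side retries. A minimal sketch of that flow, not part of the patch: the helper name and the table name are hypothetical, and it assumes the `cluster` fixture defined in test_log_family_s3/test.py:

# Sketch only. `cluster` is the pytest fixture from
# tests/integration/test_log_family_s3/test.py; table name is arbitrary.
def sketch_failed_insert_is_rolled_back(cluster):
    node = cluster.instances["node"]
    node.query(
        "CREATE TABLE t (a int) ENGINE = StripeLog() "
        "SETTINGS storage_policy='s3_no_retries'"
    )
    # Force the next write to fail after the data file has grown
    # but before the sizes file is updated.
    node.query("SYSTEM ENABLE FAILPOINT stripe_log_sink_write_fallpoint")
    node.query("INSERT INTO t VALUES (1)", ignore_error=True)  # fails once, no retry
    node.query("SYSTEM DISABLE FAILPOINT stripe_log_sink_write_fallpoint")
    # FileChecker::repair() truncates the data file back, so the table is empty.
    assert node.query("SELECT count() FROM t") == "0\n"
    node.query("DROP TABLE t SYNC")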
diff --git a/tests/integration/test_log_family_s3/test.py b/tests/integration/test_log_family_s3/test.py
index bed379d098b..ed84bdf48e6 100644
--- a/tests/integration/test_log_family_s3/test.py
+++ b/tests/integration/test_log_family_s3/test.py
@@ -11,7 +11,7 @@ def cluster():
         cluster = ClickHouseCluster(__file__)
         cluster.add_instance(
             "node",
-            main_configs=["configs/minio.xml", "configs/ssl.xml"],
+            main_configs=["configs/storage_configuration.xml", "configs/ssl.xml"],
             with_minio=True,
         )
         logging.info("Starting cluster...")
@@ -84,3 +84,39 @@ def test_log_family_s3(cluster, log_engine, files_overhead, files_overhead_per_i
         assert_objects_count(cluster, 0)
     finally:
         node.query("DROP TABLE s3_test")
+
+
+# Imitate the case when an error (for example S3::TooManyRequests) occurs
+# while inserting into the table.
+# In that case we may have updated the data file, but not the sizes file.
+# So, due to the exception, we should truncate the data file to undo the
+# insert query. See FileChecker::repair().
+def test_stripe_log_truncate(cluster):
+    node = cluster.instances["node"]
+
+    node.query(
+        """
+        CREATE TABLE stripe_table (
+            a int
+        ) ENGINE = StripeLog()
+        SETTINGS storage_policy='s3_no_retries'
+        """
+    )
+
+    node.query("SYSTEM ENABLE FAILPOINT stripe_log_sink_write_fallpoint")
+    node.query(
+        """
+        INSERT INTO stripe_table SELECT number FROM numbers(10)
+        """,
+        ignore_error=True,
+    )
+    node.query("SYSTEM DISABLE FAILPOINT stripe_log_sink_write_fallpoint")
+    assert node.query("SELECT count(*) FROM stripe_table") == "0\n"
+    node.query("INSERT INTO stripe_table SELECT number FROM numbers(10)")
+    assert node.query("SELECT count(*) FROM stripe_table") == "10\n"
+
+    # Make sure that everything is okay with the table after DETACH/ATTACH.
+    node.query("DETACH TABLE stripe_table")
+    node.query("ATTACH TABLE stripe_table")
+
+    assert node.query("DROP TABLE stripe_table") == ""
diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py
index bb72574c6e5..3c43918d8c0 100644
--- a/tests/integration/test_storage_hdfs/test.py
+++ b/tests/integration/test_storage_hdfs/test.py
@@ -895,7 +895,7 @@ def test_hdfsCluster_unset_skip_unavailable_shards(started_cluster):
 
     assert (
         node1.query(
-            "select * from hdfsCluster('cluster_non_existent_port', 'hdfs://hdfs1:9000/skip_unavailable_shards', 'TSV', 'id UInt64, text String, number Float64')"
+            "select * from hdfsCluster('cluster_non_existent_port', 'hdfs://hdfs1:9000/unskip_unavailable_shards', 'TSV', 'id UInt64, text String, number Float64')"
         )
         == data
     )
diff --git a/tests/integration/test_storage_kerberized_kafka/test.py b/tests/integration/test_storage_kerberized_kafka/test.py
index 451e1ab2ccf..24d10d7ff83 100644
--- a/tests/integration/test_storage_kerberized_kafka/test.py
+++ b/tests/integration/test_storage_kerberized_kafka/test.py
@@ -5,7 +5,7 @@ import time
 import pytest
 import logging
 
-from helpers.cluster import ClickHouseCluster
+from helpers.cluster import ClickHouseCluster, is_arm
 from helpers.test_tools import TSV
 from helpers.client import QueryRuntimeException
 
@@ -18,6 +18,10 @@ from kafka.protocol.admin import DescribeGroupsResponse_v1, DescribeGroupsReques
 from kafka.protocol.group import MemberAssignment
 import socket
 
+if is_arm():
+    # skip due to no arm support for clickhouse/kerberos-kdc docker image
+    pytestmark = pytest.mark.skip
+
 cluster = ClickHouseCluster(__file__)
 instance = cluster.add_instance(
     "instance",
diff --git a/tests/performance/function_tokens.xml b/tests/performance/function_tokens.xml
index 63b72f83df3..1ff56323d62 100644
--- a/tests/performance/function_tokens.xml
+++ b/tests/performance/function_tokens.xml
@@ -1,3 +1,5 @@
 <test>
     <query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByChar(' ', materialize(s)) as w from numbers(1000000)</query>
+    <query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp(' ', materialize(s)) as w from numbers(1000000)</query>
+    <query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp('\s+', materialize(s)) as w from numbers(100000)</query>
 </test>
diff --git a/tests/queries/0_stateless/00694_max_block_size_zero.sql b/tests/queries/0_stateless/00694_max_block_size_zero.sql
deleted file mode 100644
index ba5b513bb5d..00000000000
--- a/tests/queries/0_stateless/00694_max_block_size_zero.sql
+++ /dev/null
@@ -1,4 +0,0 @@
-SET send_logs_level = 'fatal';
-
-SET max_block_size = 0;
-SELECT number FROM system.numbers; -- { serverError 12 }
diff --git a/tests/queries/0_stateless/01866_split_by_regexp.reference b/tests/queries/0_stateless/01866_split_by_regexp.reference
index a3ae2f35a5f..552d4d1f96a 100644
--- a/tests/queries/0_stateless/01866_split_by_regexp.reference
+++ b/tests/queries/0_stateless/01866_split_by_regexp.reference
@@ -5,3 +5,16 @@
 ['gbye','bug']
 ['']
 []
+Test fallback of splitByRegexp to splitByChar if regexp is trivial
+['a','b','c']
+['a','b','c']
+['','','','','','']
+['a^b^c']
+['a$b$c']
+['a)b)c']
+['a','b','c']
+['a','b','c']
+['a','b','c']
+['a|b|c']
+['a\\b\\c']
+AST Fuzzer failure
diff --git a/tests/queries/0_stateless/01866_split_by_regexp.sql b/tests/queries/0_stateless/01866_split_by_regexp.sql
index e472fb68d94..bc25d3e1093 100644
--- a/tests/queries/0_stateless/01866_split_by_regexp.sql
+++ b/tests/queries/0_stateless/01866_split_by_regexp.sql
@@ -3,3 +3,23 @@ select splitByRegexp('', 'abcde');
 select splitByRegexp('<[^<>]*>', x) from (select arrayJoin(['<p>hello</p><p>world</p>', 'gbye<split>bug']) x);
 select splitByRegexp('ab', '');
 select splitByRegexp('', '');
+
+SELECT 'Test fallback of splitByRegexp to splitByChar if regexp is trivial';
+select splitByRegexp(' ', 'a b c');
+select splitByRegexp('-', 'a-b-c');
+select splitByRegexp('.', 'a.b.c');
+select splitByRegexp('^', 'a^b^c');
+select splitByRegexp('$', 'a$b$c');
+select splitByRegexp('+', 'a+b+c'); -- { serverError CANNOT_COMPILE_REGEXP }
+select splitByRegexp('?', 'a?b?c'); -- { serverError CANNOT_COMPILE_REGEXP }
+select splitByRegexp('(', 'a(b(c'); -- { serverError CANNOT_COMPILE_REGEXP }
+select splitByRegexp(')', 'a)b)c');
+select splitByRegexp('[', 'a[b[c'); -- { serverError CANNOT_COMPILE_REGEXP }
+select splitByRegexp(']', 'a]b]c');
+select splitByRegexp('{', 'a{b{c');
+select splitByRegexp('}', 'a}b}c');
+select splitByRegexp('|', 'a|b|c');
+select splitByRegexp('\\', 'a\\b\\c');
+
+SELECT 'AST Fuzzer failure';
+SELECT splitByRegexp(materialize(1), NULL, 3) -- { serverError ILLEGAL_COLUMN }
diff --git a/tests/queries/0_stateless/02115_rewrite_local_join_right_distribute_table.sql b/tests/queries/0_stateless/02115_rewrite_local_join_right_distribute_table.sql
index 2ab324df787..d5ab82ba064 100644
--- a/tests/queries/0_stateless/02115_rewrite_local_join_right_distribute_table.sql
+++ b/tests/queries/0_stateless/02115_rewrite_local_join_right_distribute_table.sql
@@ -23,10 +23,6 @@ select t1.* from t1_all t1 join t2_all t2 on t1.a = t2.a ORDER BY t1.a;
 
 SELECT '-';
 
--- make sure data is fully written when reading from distributed
-optimize table t1_local final;
-optimize table t2_local final;
-
 set distributed_product_mode = 'global';
 select * from t1_all t1 where t1.a in (select t2.a from t2_all t2);
 explain syntax select t1.* from t1_all t1 join t2_all t2 on t1.a = t2.a;
diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql
index 245b2cc97e3..b2a04788bbb 100644
--- a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql
+++ b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql
@@ -61,6 +61,11 @@ CREATE TABLE github_events
 )
 ENGINE = MergeTree
 ORDER BY (event_type, repo_name, created_at);
 
-with top_repos as ( select repo_name from github_events where event_type = 'WatchEvent' and toDate(created_at) = today() - 1 group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toMonday(created_at) = toMonday(today() - interval 1 week) group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toYear(created_at) = toYear(today()) - 1 group by repo_name order by count() desc limit 100 ), last_day as ( select repo_name, count() as count_last_day, rowNumberInAllBlocks() + 1 as position_last_day from github_events where repo_name in (select repo_name from top_repos) and toDate(created_at) = today() - 1 group by repo_name order by count_last_day desc ), last_week as ( select repo_name, count() as count_last_week, rowNumberInAllBlocks() + 1 as position_last_week from github_events where repo_name in (select repo_name from top_repos) and toMonday(created_at) = toMonday(today()) - interval 1 week group by repo_name order by count_last_week desc ), last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc ) select d.repo_name, columns(count) from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name;
+with
+    top_repos as ( select repo_name from github_events where event_type = 'WatchEvent' and toDate(created_at) = today() - 1 group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toMonday(created_at) = toMonday(today() - interval 1 week) group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toYear(created_at) = toYear(today()) - 1 group by repo_name order by count() desc limit 100 ),
+    last_day as ( select repo_name, count() as count_last_day, rowNumberInAllBlocks() + 1 as position_last_day from github_events where repo_name in (select repo_name from top_repos) and toDate(created_at) = today() - 1 group by repo_name order by count_last_day desc ),
+    last_week as ( select repo_name, count() as count_last_week, rowNumberInAllBlocks() + 1 as position_last_week from github_events where repo_name in (select repo_name from top_repos) and toMonday(created_at) = toMonday(today()) - interval 1 week group by repo_name order by count_last_week desc ),
+    last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc )
+select d.repo_name, columns('count') from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name;
 
 DROP TABLE github_events;
diff --git a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.reference b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.reference
index ae4fafae829..b06fee4af06 100644
--- a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.reference
+++ b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.reference
@@ -1,2 +1,2 @@
-data_02340 1_2_2_0 6
-data_02340_rep 1_0_0_0 6
+data_02340 1_2_2_0 1
+data_02340_rep 1_0_0_0 1
diff --git a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh
index 208a9038681..caa600298ce 100755
--- a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh
+++ b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh
@@ -9,40 +9,58 @@ function check_refcnt_for_table()
 {
     local table=$1 && shift
 
-    $CLICKHOUSE_CLIENT -q "system stop merges $table"
+    $CLICKHOUSE_CLIENT -nm -q "
+        system stop merges $table;
+        -- cleanup thread may hold the parts lock
+        system stop cleanup $table;
+        -- queue may hold the parts lock for a while as well
+        system stop pulling replication log $table;
+    "
 
    $CLICKHOUSE_CLIENT --insert_keeper_fault_injection_probability=0 -q "insert into $table select number, number%4 from numbers(200)"
from numbers(200)" local query_id query_id="$table-$(random_str 10)" - SETTINGS="--format Null --max_threads 1 --max_block_size 1 --merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability 0.0" + local log_file + log_file=$(mktemp "$CUR_DIR/clickhouse-tests.XXXXXX.log") + local args=( + --format Null + --max_threads 1 + --max_block_size 1 + --merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability 0.0 + --query_id "$query_id" + --send_logs_level "test" + --server_logs_file "$log_file" + ) # Notes: - # - query may sleep 1*(200/4)=50 seconds maximum, it is enough to check system.parts + # - query may sleep 0.1*(200/4)=5 seconds maximum, it is enough to check system.parts # - "part = 1" condition should prune all parts except first # - max_block_size=1 with index_granularity=1 will allow to cancel the query earlier - $CLICKHOUSE_CLIENT $SETTINGS --query_id "$query_id" -q "select sleepEachRow(1) from $table where part = 1" & + $CLICKHOUSE_CLIENT "${args[@]}" -q "select sleepEachRow(0.1) from $table where part = 1" & PID=$! - # wait for query to be started - while [ "$($CLICKHOUSE_CLIENT -q "select count() from system.processes where query_id = '$query_id'")" -ne 1 ]; do - sleep 0.1 - done - # When the query only starts it execution it holds reference for each part, # however when it starts reading, partition pruning takes place, # and it should hold only parts that are required for SELECT # - # But to reach partition prune the function sleepEachRow() will be executed twice, - # so 2 seconds for sleepEachRow() and 3 seconds just to ensure that it enters the reading stage. - sleep $((2+3)) + # So let's wait while the reading will be started. + while ! grep -F -q -e "Exception" -e "MergeTreeRangeReader" "$log_file"; do + sleep 0.1 + done - # NOTE: parts that are used in query will have refcount increased for each range - $CLICKHOUSE_CLIENT -q "select table, name, refcount from system.parts where database = '$CLICKHOUSE_DATABASE' and table = '$table' and refcount > 1" + # NOTE: parts that are used in query will be holded in multiple places, and + # this is where magic 6 came from. Also there could be some other + # background threads (i.e. asynchronous metrics) that uses the part, so we + # simply filter parts not by "refcount > 1" but with some delta - "3", to + # avoid flakiness. + $CLICKHOUSE_CLIENT -q "select table, name, refcount>=6 from system.parts where database = '$CLICKHOUSE_DATABASE' and table = '$table' and refcount >= 3" # Kill the query gracefully. 
kill -INT $PID wait $PID + grep -F Exception "$log_file" | grep -v -F QUERY_WAS_CANCELLED + rm -f "${log_file:?}" } # NOTE: index_granularity=1 to cancel ASAP @@ -52,11 +70,13 @@ $CLICKHOUSE_CLIENT -nmq " create table data_02340 (key Int, part Int) engine=MergeTree() partition by part order by key settings index_granularity=1; " || exit 1 check_refcnt_for_table data_02340 +$CLICKHOUSE_CLIENT -q "drop table data_02340 sync" $CLICKHOUSE_CLIENT -nmq " drop table if exists data_02340_rep sync; create table data_02340_rep (key Int, part Int) engine=ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX', '1') partition by part order by key settings index_granularity=1; " || exit 1 check_refcnt_for_table data_02340_rep +$CLICKHOUSE_CLIENT -q "drop table data_02340_rep sync" exit 0 diff --git a/tests/queries/0_stateless/02420_final_setting_analyzer.reference b/tests/queries/0_stateless/02420_final_setting_analyzer.reference index dd9fed65f13..780a6e5de68 100644 --- a/tests/queries/0_stateless/02420_final_setting_analyzer.reference +++ b/tests/queries/0_stateless/02420_final_setting_analyzer.reference @@ -132,3 +132,7 @@ SELECT * FROM merge_table ORDER BY id, val; 2 a 2 b 3 c +select sum(number) from numbers(10) settings final=1; +45 +select sum(number) from remote('127.0.0.{1,2}', numbers(10)) settings final=1; +90 diff --git a/tests/queries/0_stateless/02420_final_setting_analyzer.sql b/tests/queries/0_stateless/02420_final_setting_analyzer.sql index 14c832cfaf5..cbdec017602 100644 --- a/tests/queries/0_stateless/02420_final_setting_analyzer.sql +++ b/tests/queries/0_stateless/02420_final_setting_analyzer.sql @@ -102,3 +102,6 @@ insert into table_to_merge_c values (3,'c'); -- expected output: -- 1 c, 2 a, 2 b, 3 c SELECT * FROM merge_table ORDER BY id, val; + +select sum(number) from numbers(10) settings final=1; +select sum(number) from remote('127.0.0.{1,2}', numbers(10)) settings final=1; diff --git a/tests/queries/0_stateless/02477_single_value_data_string_regression.sql b/tests/queries/0_stateless/02477_single_value_data_string_regression.sql index 0f11a06f3fc..8499786f47a 100644 --- a/tests/queries/0_stateless/02477_single_value_data_string_regression.sql +++ b/tests/queries/0_stateless/02477_single_value_data_string_regression.sql @@ -103,11 +103,11 @@ SELECT '2^30-1', maxMerge(x) from (select CAST(unhex('ffffff3f') || randomString SELECT '1M without 0', length(maxMerge(x)) from (select CAST(unhex('00001000') || randomString(0x00100000 - 1) || 'x', 'AggregateFunction(max, String)') as x); SELECT '1M with 0', length(maxMerge(x)) from (select CAST(unhex('00001000') || randomString(0x00100000 - 1) || '\0', 'AggregateFunction(max, String)') as x); -SELECT 'fuzz1', finalizeAggregation(CAST(unhex('3000000\0303132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353600010000000000000000'), 'AggregateFunction(argMax, String, UInt64)')); -- { serverError CORRUPTED_DATA } +SELECT 'fuzz1', finalizeAggregation(CAST(unhex('3000000\0303132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353600010000000000000000'), 'AggregateFunction(argMax, String, UInt64)')); -- { serverError INCORRECT_DATA } SELECT 'fuzz2', finalizeAggregation(CAST(unhex('04000000' || '30313233' || '01' || 'ffffffffffffffff'), 'AggregateFunction(argMax, String, UInt64)')) as x, length(x); -SELECT 'fuzz3', finalizeAggregation(CAST(unhex('04000000' || '30313233' || '00' || 'ffffffffffffffff'), 'AggregateFunction(argMax, String, UInt64)')) as x, length(x); 
-- { serverError CORRUPTED_DATA } -SELECT 'fuzz4', finalizeAggregation(CAST(unhex('04000000' || '30313233' || '00'), 'AggregateFunction(argMax, String, UInt64)')) as x, length(x); -- { serverError CORRUPTED_DATA } -SELECT 'fuzz5', finalizeAggregation(CAST(unhex('0100000000000000000FFFFFFFF0'), 'AggregateFunction(argMax, UInt64, String)')); -- { serverError CORRUPTED_DATA } +SELECT 'fuzz3', finalizeAggregation(CAST(unhex('04000000' || '30313233' || '00' || 'ffffffffffffffff'), 'AggregateFunction(argMax, String, UInt64)')) as x, length(x); -- { serverError INCORRECT_DATA } +SELECT 'fuzz4', finalizeAggregation(CAST(unhex('04000000' || '30313233' || '00'), 'AggregateFunction(argMax, String, UInt64)')) as x, length(x); -- { serverError INCORRECT_DATA } +SELECT 'fuzz5', finalizeAggregation(CAST(unhex('0100000000000000000FFFFFFFF0'), 'AggregateFunction(argMax, UInt64, String)')); -- { serverError INCORRECT_DATA } drop table if exists aggr; diff --git a/tests/queries/0_stateless/02494_query_cache_use_database.reference b/tests/queries/0_stateless/02494_query_cache_use_database.reference new file mode 100644 index 00000000000..1191247b6d9 --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_use_database.reference @@ -0,0 +1,2 @@ +1 +2 diff --git a/tests/queries/0_stateless/02494_query_cache_use_database.sql b/tests/queries/0_stateless/02494_query_cache_use_database.sql new file mode 100644 index 00000000000..df560f82ebb --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_use_database.sql @@ -0,0 +1,30 @@ +-- Tags: no-parallel, no-fasttest +-- Tag no-fasttest: Depends on OpenSSL +-- Tag no-parallel: Messes with internal cache + +-- Test for issue #64136 + +SYSTEM DROP QUERY CACHE; + +DROP DATABASE IF EXISTS db1; +DROP DATABASE IF EXISTS db2; + +CREATE DATABASE db1; +CREATE DATABASE db2; + +CREATE TABLE db1.tab(a UInt64, PRIMARY KEY a); +CREATE TABLE db2.tab(a UInt64, PRIMARY KEY a); + +INSERT INTO db1.tab values(1); +INSERT INTO db2.tab values(2); + +USE db1; +SELECT * FROM tab SETTINGS use_query_cache=1; + +USE db2; +SELECT * FROM tab SETTINGS use_query_cache=1; + +DROP DATABASE db1; +DROP DATABASE db2; + +SYSTEM DROP QUERY CACHE; diff --git a/tests/queries/0_stateless/02534_keyed_siphash.reference b/tests/queries/0_stateless/02534_keyed_siphash.reference index e3fae07333a..3f478218ff1 100644 --- a/tests/queries/0_stateless/02534_keyed_siphash.reference +++ b/tests/queries/0_stateless/02534_keyed_siphash.reference @@ -236,3 +236,6 @@ Check asan bug 0 Check bug found fuzzing 9042C6691B1A75F0EA3314B6F55728BB +Check bug 2 found fuzzing +608E1FF030C9E206185B112C2A25F1A7 +ABB65AE97711A2E053E324ED88B1D08B diff --git a/tests/queries/0_stateless/02534_keyed_siphash.sql b/tests/queries/0_stateless/02534_keyed_siphash.sql index 112ae15bf46..fb707109c83 100644 --- a/tests/queries/0_stateless/02534_keyed_siphash.sql +++ b/tests/queries/0_stateless/02534_keyed_siphash.sql @@ -338,3 +338,10 @@ SELECT sipHash128((toUInt64(9223372036854775806), 1)) = sipHash128(1) GROUP BY s SELECT 'Check bug found fuzzing'; SELECT [(255, 1048575)], sipHash128ReferenceKeyed((toUInt64(2147483646), toUInt64(9223372036854775807)), ([(NULL, 100), (NULL, NULL), (1024, 10)], toUInt64(2), toUInt64(1024)), ''), hex(sipHash128ReferenceKeyed((-9223372036854775807, 1.), '-1', NULL)), ('', toUInt64(65535), [(9223372036854775807, 9223372036854775806)], toUInt64(65536)), arrayJoin((NULL, 65537, 255), [(NULL, NULL)]) GROUP BY tupleElement((NULL, NULL, NULL, -1), toUInt64(2), 2) = NULL; -- { serverError NOT_IMPLEMENTED } 
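+-- (The keyed variants take the key as a tuple of two UInt64 values. In the
+-- "bug 2" case below the key column is non-constant (materialize'd), so each
+-- row has to be hashed with the key taken from that row, not from row 0.)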
SELECT hex(sipHash128ReferenceKeyed((0::UInt64, 0::UInt64), ([1, 1]))); + +SELECT 'Check bug 2 found fuzzing'; +DROP TABLE IF EXISTS sipHashKeyed_keys; +CREATE TABLE sipHashKeyed_keys (`a` Map(String, String)) ENGINE = Memory; +INSERT INTO sipHashKeyed_keys FORMAT VALUES ({'a':'b', 'c':'d'}), ({'e':'f', 'g':'h'}); +SELECT hex(sipHash128ReferenceKeyed((0::UInt64, materialize(0::UInt64)), a)) FROM sipHashKeyed_keys ORDER BY a; +DROP TABLE sipHashKeyed_keys; diff --git a/tests/queries/0_stateless/02841_parquet_filter_pushdown.sql b/tests/queries/0_stateless/02841_parquet_filter_pushdown.sql index 8521ada04d5..950485d53f0 100644 --- a/tests/queries/0_stateless/02841_parquet_filter_pushdown.sql +++ b/tests/queries/0_stateless/02841_parquet_filter_pushdown.sql @@ -8,10 +8,6 @@ set optimize_or_like_chain = 0; set max_block_size = 100000; set max_insert_threads = 1; --- Analyzer breaks the queries with IN and some queries with BETWEEN. --- TODO: Figure out why. -set allow_experimental_analyzer=0; - -- Try all the types. insert into function file('02841.parquet') -- Use negative numbers to test sign extension for signed types and lack of sign extension for diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference index 9ba927fa201..0589fdeef04 100644 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference @@ -24,6 +24,9 @@ OK 2 OK OK +OK +100 +100 ===== TestGrants ===== OK OK diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh index 9c9df120298..f32aee44bee 100755 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh @@ -159,6 +159,45 @@ ${CLICKHOUSE_CLIENT} --query "REVOKE SELECT ON $db.test_table FROM $user1" (( $(${CLICKHOUSE_CLIENT} --user $user2 --query "SELECT * FROM $db.test_mv_4" 2>&1 | grep -c "Not enough privileges") >= 1 )) && echo "OK" || echo "UNEXPECTED" (( $(${CLICKHOUSE_CLIENT} --query "INSERT INTO $db.test_table VALUES ('foo'), ('bar');" 2>&1 | grep -c "Not enough privileges") >= 1 )) && echo "OK" || echo "UNEXPECTED" +${CLICKHOUSE_CLIENT} --multiquery <&1 | grep -c "Not enough privileges") >= 1 )) && echo "OK" || echo "UNEXPECTED" +${CLICKHOUSE_CLIENT} --query "GRANT INSERT ON $db.source TO $user2" +${CLICKHOUSE_CLIENT} --user $user2 --query "INSERT INTO source SELECT * FROM generateRandom() LIMIT 100" + +${CLICKHOUSE_CLIENT} --query "SELECT count() FROM destination1" +${CLICKHOUSE_CLIENT} --query "SELECT count() FROM destination2" echo "===== TestGrants =====" ${CLICKHOUSE_CLIENT} --query "GRANT CREATE ON *.* TO $user1" @@ -192,7 +231,6 @@ ${CLICKHOUSE_CLIENT} --user $user1 --query " ${CLICKHOUSE_CLIENT} --query "GRANT SET DEFINER ON $user2 TO $user1" - echo "===== TestRowPolicy =====" ${CLICKHOUSE_CLIENT} --multiquery < + +Akiba_Hebrew_Academy 2017-08-01 241 +Aegithina_tiphia 2018-02-01 34 +1971-72_Utah_Stars_season 2016-10-01 1 + +<-- Read DOS endings with setting input_format_tsv_crlf_end_of_line=1 --> + +Akiba_Hebrew_Academy 2017-08-01 241 +Aegithina_tiphia 2018-02-01 34 +1971-72_Utah_Stars_season 2016-10-01 1 diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh 
new file mode 100755
index 00000000000..14f28f1ba4a
--- /dev/null
+++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+# Data preparation step
+USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
+UNIX_ENDINGS="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_without_crlf.tsv"
+DOS_ENDINGS="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv"
+DATA_FILE_UNIX_ENDINGS="${USER_FILES_PATH:?}/${UNIX_ENDINGS}"
+DATA_FILE_DOS_ENDINGS="${USER_FILES_PATH:?}/${DOS_ENDINGS}"
+
+touch $DATA_FILE_UNIX_ENDINGS
+touch $DATA_FILE_DOS_ENDINGS
+
+echo -ne "Akiba_Hebrew_Academy\t2017-08-01\t241\nAegithina_tiphia\t2018-02-01\t34\n1971-72_Utah_Stars_season\t2016-10-01\t1\n" > $DATA_FILE_UNIX_ENDINGS
+echo -ne "Akiba_Hebrew_Academy\t2017-08-01\t241\r\nAegithina_tiphia\t2018-02-01\t34\r\n1971-72_Utah_Stars_season\t2016-10-01\t1\r\n" > $DATA_FILE_DOS_ENDINGS
+
+echo -e "<-- Read UNIX endings -->\n"
+$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${UNIX_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');"
+$CLICKHOUSE_CLIENT --multiquery --query "SELECT * FROM file(${DOS_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32'); --{serverError 117}"
+
+echo -e "\n<-- Read DOS endings with setting input_format_tsv_crlf_end_of_line=1 -->\n"
+$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${DOS_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32') SETTINGS input_format_tsv_crlf_end_of_line = 1;"
+
+# Test teardown
+rm $DATA_FILE_UNIX_ENDINGS
+rm $DATA_FILE_DOS_ENDINGS
diff --git a/tests/queries/0_stateless/03020_order_by_SimpleAggregateFunction.sql b/tests/queries/0_stateless/03020_order_by_SimpleAggregateFunction.sql
index f1727cb9e5c..fee42d1abc6 100644
--- a/tests/queries/0_stateless/03020_order_by_SimpleAggregateFunction.sql
+++ b/tests/queries/0_stateless/03020_order_by_SimpleAggregateFunction.sql
@@ -1,6 +1,6 @@
 set allow_suspicious_primary_key = 0;
 
-DROP TABLE IF EXISTS data;
+drop table if exists data;
 create table data (key Int, value AggregateFunction(sum, UInt64)) engine=AggregatingMergeTree() order by (key, value); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY }
 create table data (key Int, value SimpleAggregateFunction(sum, UInt64)) engine=AggregatingMergeTree() order by (key, value); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY }
@@ -12,7 +12,22 @@ create table data (key Int, value AggregateFunction(sum, UInt64)) engine=Aggrega
 create table data (key Int, value SimpleAggregateFunction(sum, UInt64)) engine=AggregatingMergeTree() primary key value order by (value, key); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY }
 
 set allow_suspicious_primary_key = 1;
-
 create table data (key Int, value SimpleAggregateFunction(sum, UInt64)) engine=AggregatingMergeTree() primary key value order by (value, key);
-DROP TABLE data;
+
+-- ATTACH should work regardless of allow_suspicious_primary_key
+set allow_suspicious_primary_key = 0;
+detach table data;
+attach table data;
+drop table data;
+
+-- ALTER AggregatingMergeTree
+create table data (key Int) engine=AggregatingMergeTree() order by (key);
+alter table data add column value SimpleAggregateFunction(sum, UInt64), modify order by (key, value); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY }
+alter table data
add column value SimpleAggregateFunction(sum, UInt64), modify order by (key, value) settings allow_suspicious_primary_key=1; +drop table data; + +-- ALTER ReplicatedAggregatingMergeTree +create table data_rep (key Int) engine=ReplicatedAggregatingMergeTree('/tables/{database}', 'r1') order by (key); +alter table data_rep add column value SimpleAggregateFunction(sum, UInt64), modify order by (key, value); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY } +alter table data_rep add column value SimpleAggregateFunction(sum, UInt64), modify order by (key, value) settings allow_suspicious_primary_key=1; +drop table data_rep; diff --git a/tests/queries/0_stateless/03033_dynamic_text_serialization.reference b/tests/queries/0_stateless/03033_dynamic_text_serialization.reference new file mode 100644 index 00000000000..d965245266c --- /dev/null +++ b/tests/queries/0_stateless/03033_dynamic_text_serialization.reference @@ -0,0 +1,55 @@ +JSON +{"d":"42","dynamicType(d)":"Int64"} +{"d":42.42,"dynamicType(d)":"Float64"} +{"d":"str","dynamicType(d)":"String"} +{"d":["1","2","3"],"dynamicType(d)":"Array(Int64)"} +{"d":"2020-01-01","dynamicType(d)":"Date"} +{"d":"2020-01-01 10:00:00.000000000","dynamicType(d)":"DateTime64(9)"} +{"d":{"a":"42","b":"str"},"dynamicType(d)":"Tuple(a Int64, b String)"} +{"d":{"a":"43"},"dynamicType(d)":"Tuple(a Int64)"} +{"d":{"a":"44","c":["1","2","3"]},"dynamicType(d)":"Tuple(a Int64, c Array(Int64))"} +{"d":["1","str",["1","2","3"]],"dynamicType(d)":"Tuple(Int64, String, Array(Int64))"} +{"d":null,"dynamicType(d)":"None"} +{"d":true,"dynamicType(d)":"Bool"} +{"d":"42","dynamicType(d)":"Int64"} +{"d":"42.42","dynamicType(d)":"String"} +{"d":"str","dynamicType(d)":"String"} +{"d":null,"dynamicType(d)":"None"} +{"d":"1","dynamicType(d)":"Int64"} +CSV +42,"Int64" +42.42,"Float64" +"str","String" +"[1,2,3]","Array(Int64)" +"2020-01-01","Date" +"2020-01-01 10:00:00.000000000","DateTime64(9)" +"[1, 'str', [1, 2, 3]]","String" +\N,"None" +true,"Bool" +TSV +42 Int64 +42.42 Float64 +str String +[1,2,3] Array(Int64) +2020-01-01 Date +2020-01-01 10:00:00.000000000 DateTime64(9) +[1, \'str\', [1, 2, 3]] String +\N None +true Bool +Values +(42,'Int64'),(42.42,'Float64'),('str','String'),([1,2,3],'Array(Int64)'),('2020-01-01','Date'),('2020-01-01 10:00:00.000000000','DateTime64(9)'),(NULL,'None'),(true,'Bool') +Cast using parsing +42 Int64 +42.42 Float64 +[1,2,3] Array(Int64) +2020-01-01 Date +2020-01-01 10:00:00.000000000 DateTime64(9) +\N None +true Bool +42 Int64 +42.42 Float64 +[1, 2, 3] String +2020-01-01 String +2020-01-01 10:00:00 String +\N None +true String diff --git a/tests/queries/0_stateless/03033_dynamic_text_serialization.sql b/tests/queries/0_stateless/03033_dynamic_text_serialization.sql new file mode 100644 index 00000000000..d12d110fe28 --- /dev/null +++ b/tests/queries/0_stateless/03033_dynamic_text_serialization.sql @@ -0,0 +1,74 @@ +set allow_experimental_dynamic_type = 1; + +select 'JSON'; +select d, dynamicType(d) from format(JSONEachRow, 'd Dynamic', $$ +{"d" : 42} +{"d" : 42.42} +{"d" : "str"} +{"d" : [1, 2, 3]} +{"d" : "2020-01-01"} +{"d" : "2020-01-01 10:00:00"} +{"d" : {"a" : 42, "b" : "str"}} +{"d" : {"a" : 43}} +{"d" : {"a" : 44, "c" : [1, 2, 3]}} +{"d" : [1, "str", [1, 2, 3]]} +{"d" : null} +{"d" : true} +$$) format JSONEachRow; + +select d, dynamicType(d) from format(JSONEachRow, 'd Dynamic(max_types=2)', $$ +{"d" : 42} +{"d" : 42.42} +{"d" : "str"} +{"d" : null} +{"d" : true} +$$) format JSONEachRow; + +select 'CSV'; +select d, dynamicType(d) from 
format(CSV, 'd Dynamic', +$$42 +42.42 +"str" +"[1, 2, 3]" +"2020-01-01" +"2020-01-01 10:00:00" +"[1, 'str', [1, 2, 3]]" +\N +true +$$) format CSV; + +select 'TSV'; +select d, dynamicType(d) from format(TSV, 'd Dynamic', +$$42 +42.42 +str +[1, 2, 3] +2020-01-01 +2020-01-01 10:00:00 +[1, 'str', [1, 2, 3]] +\N +true +$$) format TSV; + +select 'Values'; +select d, dynamicType(d) from format(Values, 'd Dynamic', $$ +(42) +(42.42) +('str') +([1, 2, 3]) +('2020-01-01') +('2020-01-01 10:00:00') +(NULL) +(true) +$$) format Values; +select ''; + +select 'Cast using parsing'; +drop table if exists test; +create table test (s String) engine=Memory; +insert into test values ('42'), ('42.42'), ('[1, 2, 3]'), ('2020-01-01'), ('2020-01-01 10:00:00'), ('NULL'), ('true'); +set cast_string_to_dynamic_use_inference=1; +select s::Dynamic as d, dynamicType(d) from test; +select s::Dynamic(max_types=3) as d, dynamicType(d) from test; +drop table test; + diff --git a/tests/queries/0_stateless/03033_final_undefined_last_mark.reference b/tests/queries/0_stateless/03033_final_undefined_last_mark.reference new file mode 100644 index 00000000000..a30b755709b --- /dev/null +++ b/tests/queries/0_stateless/03033_final_undefined_last_mark.reference @@ -0,0 +1,2 @@ +Disabled 11338881281426660955 14765404159170880511 +Enabled 11338881281426660955 14765404159170880511 diff --git a/tests/queries/0_stateless/03033_final_undefined_last_mark.sql b/tests/queries/0_stateless/03033_final_undefined_last_mark.sql new file mode 100644 index 00000000000..25a30a365a5 --- /dev/null +++ b/tests/queries/0_stateless/03033_final_undefined_last_mark.sql @@ -0,0 +1,23 @@ +-- Tags: no-random-settings, no-random-merge-tree-settings + +DROP TABLE IF EXISTS account_test; + +CREATE TABLE account_test +( + `id` UInt64, + `row_ver` UInt64, +) +ENGINE = ReplacingMergeTree(row_ver) +ORDER BY id +SETTINGS index_granularity = 16, index_granularity_bytes = 0, + min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, + min_rows_for_compact_part = 0, min_bytes_for_compact_part = 0; + +SYSTEM STOP MERGES account_test; + +INSERT INTO account_test VALUES (11338881281426660955,717769962224129342),(12484100559155738267,7950971667203174918),(7603729260199571867,3255798127676911942),(7023543111808724827,911615979861855126),(10293135086416484571,3264379259750736572),(15561193439904316763,8419819469587131454),(17632407413882870235,7252071832370181502),(17009726455991851227,7525297506591593939),(12392078953873778779,8473049173389293961),(15283366022689446555,11692491360262171467),(9087459014730986523,2783662960221838603),(293823584550906267,4847630088179732782),(15693186194430465755,8163804880526285623),(7353080168325584795,17315892478487497859),(5980311238303466523,6943353798059390089),(14242621660019578011,8684624667957352769),(8241843507567433563,15731952080102886438); +INSERT INTO account_test VALUES (11338881281426660955, 14765404159170880511); + +SELECT 'Disabled', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS split_parts_ranges_into_intersecting_and_non_intersecting_final = 0; +SELECT 'Enabled', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS split_parts_ranges_into_intersecting_and_non_intersecting_final = 1; + diff --git a/tests/queries/0_stateless/03034_dynamic_conversions.reference b/tests/queries/0_stateless/03034_dynamic_conversions.reference new file mode 100644 index 00000000000..45f94f7ecc4 --- /dev/null +++ b/tests/queries/0_stateless/03034_dynamic_conversions.reference @@ -0,0 +1,88 @@ +0 UInt64 +1 UInt64 +2 
UInt64 +0 String +1 String +2 String +0 +1 +2 +0 +1 +2 +1970-01-01 +1970-01-02 +1970-01-03 +0 UInt64 +1 UInt64 +2 UInt64 +0 UInt64 +\N None +2 UInt64 +0 UInt64 +str_1 String +[0,1] Array(UInt64) +\N None +4 UInt64 +str_5 String +0 String +str_1 String +[0,1] String +\N None +4 String +str_5 String +0 UInt64 +str_1 String +[0,1] String +\N None +4 UInt64 +str_5 String +0 UInt64 +str_1 String +[0,1] Array(UInt64) +\N None +4 UInt64 +str_5 String +0 +1 +2 +0 +1 +2 +0 UInt64 +str_1 String +[0,1] String +\N None +4 UInt64 +str_5 String +0 UInt64 +1970-01-02 Date +[0,1] String +\N None +4 UInt64 +1970-01-06 Date +0 +42 +42.42 +1 +0 +\N +42 +42.42 +1 +0 + +42 +42.42 +true +e10 +\N +42 +42.42 +true +e10 +\N +42 +\N +1 +\N diff --git a/tests/queries/0_stateless/03034_dynamic_conversions.sql b/tests/queries/0_stateless/03034_dynamic_conversions.sql new file mode 100644 index 00000000000..ed75fbf2377 --- /dev/null +++ b/tests/queries/0_stateless/03034_dynamic_conversions.sql @@ -0,0 +1,34 @@ +set allow_experimental_dynamic_type=1; +set allow_experimental_variant_type=1; +set use_variant_as_common_type=1; + +select number::Dynamic as d, dynamicType(d) from numbers(3); +select number::Dynamic(max_types=1) as d, dynamicType(d) from numbers(3); +select number::Dynamic::UInt64 as v from numbers(3); +select number::Dynamic::String as v from numbers(3); +select number::Dynamic::Date as v from numbers(3); +select number::Dynamic::Array(UInt64) as v from numbers(3); -- {serverError TYPE_MISMATCH} +select number::Dynamic::Variant(UInt64, String) as v, variantType(v) from numbers(3); +select (number % 2 ? NULL : number)::Dynamic as d, dynamicType(d) from numbers(3); + +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic as d, dynamicType(d) from numbers(6); +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=1) as d, dynamicType(d) from numbers(6); +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=2) as d, dynamicType(d) from numbers(6); +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=3) as d, dynamicType(d) from numbers(6); + +select number::Dynamic(max_types=2)::Dynamic(max_types=3) as d from numbers(3); +select number::Dynamic(max_types=2)::Dynamic(max_types=1) as d from numbers(3); +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=3)::Dynamic(max_types=2) as d, dynamicType(d) from numbers(6); +select multiIf(number % 4 == 0, number, number % 4 == 1, toDate(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=4)::Dynamic(max_types=3) as d, dynamicType(d) from numbers(6); + + +create table test (d Dynamic) engine = Memory; +insert into test values (NULL), (42), ('42.42'), (true), ('e10'); +select d::Float64 from test; +select d::Nullable(Float64) from test; +select d::String from test; +select d::Nullable(String) from test; +select d::UInt64 from test; -- {serverError CANNOT_PARSE_TEXT} +select d::Nullable(UInt64) from test; +select d::Date from test; -- {serverError CANNOT_PARSE_DATE} + diff --git a/tests/queries/0_stateless/03035_dynamic_sorting.reference b/tests/queries/0_stateless/03035_dynamic_sorting.reference new file mode 100644 index 
00000000000..9b8df11c7a9 --- /dev/null +++ b/tests/queries/0_stateless/03035_dynamic_sorting.reference @@ -0,0 +1,299 @@ +order by d1 nulls first +\N None +\N None +\N None +\N None +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,4] Array(Int64) +42 Int64 +42 Int64 +42 Int64 +42 Int64 +42 Int64 +43 Int64 +abc String +abc String +abc String +abc String +abc String +abd String +order by d1 nulls last +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,4] Array(Int64) +42 Int64 +42 Int64 +42 Int64 +42 Int64 +42 Int64 +43 Int64 +abc String +abc String +abc String +abc String +abc String +abd String +\N None +\N None +\N None +\N None +order by d2 nulls first +\N None +\N None +\N None +\N None +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,4] Array(Int64) +42 Int64 +42 Int64 +42 Int64 +42 Int64 +42 Int64 +43 Int64 +abc String +abc String +abc String +abc String +abc String +abd String +order by d2 nulls last +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,4] Array(Int64) +42 Int64 +42 Int64 +42 Int64 +42 Int64 +42 Int64 +43 Int64 +abc String +abc String +abc String +abc String +abc String +abd String +\N None +\N None +\N None +\N None +order by d1, d2 nulls first +[1,2,3] \N Array(Int64) None +[1,2,3] [1,2,3] Array(Int64) Array(Int64) +[1,2,3] [1,2,4] Array(Int64) Array(Int64) +[1,2,3] 42 Array(Int64) Int64 +[1,2,3] abc Array(Int64) String +[1,2,4] [1,2,3] Array(Int64) Array(Int64) +42 \N Int64 None +42 [1,2,3] Int64 Array(Int64) +42 42 Int64 Int64 +42 43 Int64 Int64 +42 abc Int64 String +43 42 Int64 Int64 +abc \N String None +abc [1,2,3] String Array(Int64) +abc 42 String Int64 +abc abc String String +abc abd String String +abd abc String String +\N \N None None +\N [1,2,3] None Array(Int64) +\N 42 None Int64 +\N abc None String +order by d1, d2 nulls last +[1,2,3] [1,2,3] Array(Int64) Array(Int64) +[1,2,3] [1,2,4] Array(Int64) Array(Int64) +[1,2,3] 42 Array(Int64) Int64 +[1,2,3] abc Array(Int64) String +[1,2,3] \N Array(Int64) None +[1,2,4] [1,2,3] Array(Int64) Array(Int64) +42 [1,2,3] Int64 Array(Int64) +42 42 Int64 Int64 +42 43 Int64 Int64 +42 abc Int64 String +42 \N Int64 None +43 42 Int64 Int64 +abc [1,2,3] String Array(Int64) +abc 42 String Int64 +abc abc String String +abc abd String String +abc \N String None +abd abc String String +\N [1,2,3] None Array(Int64) +\N 42 None Int64 +\N abc None String +\N \N None None +order by d2, d1 nulls first +\N [1,2,3] None Array(Int64) +[1,2,3] [1,2,3] Array(Int64) Array(Int64) +[1,2,4] [1,2,3] Array(Int64) Array(Int64) +42 [1,2,3] Int64 Array(Int64) +abc [1,2,3] String Array(Int64) +[1,2,3] [1,2,4] Array(Int64) Array(Int64) +\N 42 None Int64 +[1,2,3] 42 Array(Int64) Int64 +42 42 Int64 Int64 +43 42 Int64 Int64 +abc 42 String Int64 +42 43 Int64 Int64 +\N abc None String +[1,2,3] abc Array(Int64) String +42 abc Int64 String +abc abc String String +abd abc String String +abc abd String String +\N \N None None +[1,2,3] \N Array(Int64) None +42 \N Int64 None +abc \N String None +order by d2, d1 nulls last +[1,2,3] [1,2,3] Array(Int64) Array(Int64) +[1,2,4] [1,2,3] Array(Int64) Array(Int64) +42 [1,2,3] Int64 Array(Int64) +abc [1,2,3] String Array(Int64) +\N [1,2,3] None Array(Int64) +[1,2,3] [1,2,4] Array(Int64) Array(Int64) +[1,2,3] 42 Array(Int64) Int64 +42 42 Int64 Int64 +43 42 Int64 
Int64 +abc 42 String Int64 +\N 42 None Int64 +42 43 Int64 Int64 +[1,2,3] abc Array(Int64) String +42 abc Int64 String +abc abc String String +abd abc String String +\N abc None String +abc abd String String +[1,2,3] \N Array(Int64) None +42 \N Int64 None +abc \N String None +\N \N None None +d1 = d2 +[1,2,3] [1,2,3] 1 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 0 Array(Int64) Array(Int64) +[1,2,3] 42 0 Array(Int64) Int64 +[1,2,3] abc 0 Array(Int64) String +[1,2,3] \N 0 Array(Int64) None +[1,2,4] [1,2,3] 0 Array(Int64) Array(Int64) +42 [1,2,3] 0 Int64 Array(Int64) +42 42 1 Int64 Int64 +42 43 0 Int64 Int64 +42 abc 0 Int64 String +42 \N 0 Int64 None +43 42 0 Int64 Int64 +abc [1,2,3] 0 String Array(Int64) +abc 42 0 String Int64 +abc abc 1 String String +abc abd 0 String String +abc \N 0 String None +abd abc 0 String String +\N [1,2,3] 0 None Array(Int64) +\N 42 0 None Int64 +\N abc 0 None String +\N \N 1 None None +d1 < d2 +[1,2,3] [1,2,3] 0 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 1 Array(Int64) Array(Int64) +[1,2,3] 42 1 Array(Int64) Int64 +[1,2,3] abc 1 Array(Int64) String +[1,2,3] \N 1 Array(Int64) None +[1,2,4] [1,2,3] 0 Array(Int64) Array(Int64) +42 [1,2,3] 0 Int64 Array(Int64) +42 42 0 Int64 Int64 +42 43 1 Int64 Int64 +42 abc 1 Int64 String +42 \N 1 Int64 None +43 42 0 Int64 Int64 +abc [1,2,3] 0 String Array(Int64) +abc 42 0 String Int64 +abc abc 0 String String +abc abd 1 String String +abc \N 1 String None +abd abc 0 String String +\N [1,2,3] 0 None Array(Int64) +\N 42 0 None Int64 +\N abc 0 None String +\N \N 0 None None +d1 <= d2 +[1,2,3] [1,2,3] 1 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 1 Array(Int64) Array(Int64) +[1,2,3] 42 1 Array(Int64) Int64 +[1,2,3] abc 1 Array(Int64) String +[1,2,3] \N 1 Array(Int64) None +[1,2,4] [1,2,3] 0 Array(Int64) Array(Int64) +42 [1,2,3] 0 Int64 Array(Int64) +42 42 1 Int64 Int64 +42 43 1 Int64 Int64 +42 abc 1 Int64 String +42 \N 1 Int64 None +43 42 0 Int64 Int64 +abc [1,2,3] 0 String Array(Int64) +abc 42 0 String Int64 +abc abc 1 String String +abc abd 1 String String +abc \N 1 String None +abd abc 0 String String +\N [1,2,3] 0 None Array(Int64) +\N 42 0 None Int64 +\N abc 0 None String +\N \N 1 None None +d1 > d2 +[1,2,3] [1,2,3] 0 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 0 Array(Int64) Array(Int64) +[1,2,3] 42 0 Array(Int64) Int64 +[1,2,3] abc 0 Array(Int64) String +[1,2,3] \N 0 Array(Int64) None +[1,2,4] [1,2,3] 1 Array(Int64) Array(Int64) +42 [1,2,3] 1 Int64 Array(Int64) +42 42 0 Int64 Int64 +42 43 0 Int64 Int64 +42 abc 0 Int64 String +42 \N 0 Int64 None +43 42 1 Int64 Int64 +abc [1,2,3] 1 String Array(Int64) +abc 42 1 String Int64 +abc abc 0 String String +abc abd 0 String String +abc \N 0 String None +abd abc 1 String String +\N [1,2,3] 1 None Array(Int64) +\N 42 1 None Int64 +\N abc 1 None String +\N \N 0 None None +d1 >= d2 +[1,2,3] [1,2,3] 1 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 1 Array(Int64) Array(Int64) +[1,2,3] 42 1 Array(Int64) Int64 +[1,2,3] abc 1 Array(Int64) String +[1,2,3] \N 1 Array(Int64) None +[1,2,4] [1,2,3] 1 Array(Int64) Array(Int64) +42 [1,2,3] 1 Int64 Array(Int64) +42 42 1 Int64 Int64 +42 43 1 Int64 Int64 +42 abc 1 Int64 String +42 \N 1 Int64 None +43 42 1 Int64 Int64 +abc [1,2,3] 1 String Array(Int64) +abc 42 1 String Int64 +abc abc 1 String String +abc abd 1 String String +abc \N 1 String None +abd abc 1 String String +\N [1,2,3] 1 None Array(Int64) +\N 42 1 None Int64 +\N abc 1 None String +\N \N 1 None None diff --git a/tests/queries/0_stateless/03035_dynamic_sorting.sql 
b/tests/queries/0_stateless/03035_dynamic_sorting.sql new file mode 100644 index 00000000000..0487fafc955 --- /dev/null +++ b/tests/queries/0_stateless/03035_dynamic_sorting.sql @@ -0,0 +1,80 @@ +set allow_experimental_dynamic_type = 1; + +drop table if exists test; +create table test (d1 Dynamic, d2 Dynamic) engine=Memory; + +insert into test values (42, 42); +insert into test values (42, 43); +insert into test values (43, 42); + +insert into test values ('abc', 'abc'); +insert into test values ('abc', 'abd'); +insert into test values ('abd', 'abc'); + +insert into test values ([1,2,3], [1,2,3]); +insert into test values ([1,2,3], [1,2,4]); +insert into test values ([1,2,4], [1,2,3]); + +insert into test values (NULL, NULL); + +insert into test values (42, 'abc'); +insert into test values ('abc', 42); + +insert into test values (42, [1,2,3]); +insert into test values ([1,2,3], 42); + +insert into test values (42, NULL); +insert into test values (NULL, 42); + +insert into test values ('abc', [1,2,3]); +insert into test values ([1,2,3], 'abc'); + +insert into test values ('abc', NULL); +insert into test values (NULL, 'abc'); + +insert into test values ([1,2,3], NULL); +insert into test values (NULL, [1,2,3]); + + +select 'order by d1 nulls first'; +select d1, dynamicType(d1) from test order by d1 nulls first; + +select 'order by d1 nulls last'; +select d1, dynamicType(d1) from test order by d1 nulls last; + +select 'order by d2 nulls first'; +select d2, dynamicType(d2) from test order by d2 nulls first; + +select 'order by d2 nulls last'; +select d2, dynamicType(d2) from test order by d2 nulls last; + + +select 'order by d1, d2 nulls first'; +select d1, d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2 nulls first; + +select 'order by d1, d2 nulls last'; +select d1, d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2 nulls last; + +select 'order by d2, d1 nulls first'; +select d1, d2, dynamicType(d1), dynamicType(d2) from test order by d2, d1 nulls first; + +select 'order by d2, d1 nulls last'; +select d1, d2, dynamicType(d1), dynamicType(d2) from test order by d2, d1 nulls last; + +select 'd1 = d2'; +select d1, d2, d1 = d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +select 'd1 < d2'; +select d1, d2, d1 < d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +select 'd1 <= d2'; +select d1, d2, d1 <= d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +select 'd1 > d2'; +select d1, d2, d1 > d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +select 'd1 >= d2'; +select d1, d2, d2 >= d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +drop table test; + diff --git a/tests/queries/0_stateless/03036_dynamic_read_subcolumns.reference b/tests/queries/0_stateless/03036_dynamic_read_subcolumns.reference new file mode 100644 index 00000000000..36984bc8b9b --- /dev/null +++ b/tests/queries/0_stateless/03036_dynamic_read_subcolumns.reference @@ -0,0 +1,57 @@ +Memory +test +Array(Array(Dynamic)) +Array(Variant(String, UInt64)) +None +String +UInt64 +200000 +200000 +200000 +200000 +0 +0 +200000 +200000 +100000 +100000 +200000 +0 +MergeTree compact +test +Array(Array(Dynamic)) +Array(Variant(String, UInt64)) +None +String +UInt64 +200000 +200000 +200000 +200000 +0 +0 +200000 +200000 +100000 +100000 +200000 +0 +MergeTree wide +test +Array(Array(Dynamic)) +Array(Variant(String, UInt64)) +None +String +UInt64 +200000 +200000 +200000 +200000 +0 +0 +200000 +200000 +100000 +100000 +200000 +0 diff --git 
a/tests/queries/0_stateless/03036_dynamic_read_subcolumns.sh b/tests/queries/0_stateless/03036_dynamic_read_subcolumns.sh new file mode 100755 index 00000000000..65517061b99 --- /dev/null +++ b/tests/queries/0_stateless/03036_dynamic_read_subcolumns.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(100000, 100000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1)) from numbers(200000, 100000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, NULL from numbers(300000, 100000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, multiIf(number % 4 == 3, 'str_' || toString(number), number % 4 == 2, NULL, number % 4 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1))) from numbers(400000, 400000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, [range((number % 10 + 1)::UInt64)]::Array(Array(Dynamic)) from numbers(100000, 100000) settings min_insert_block_size_rows=50000" + + $CH_CLIENT -q "select distinct dynamicType(d) as type from test order by type" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'UInt64'" + $CH_CLIENT -q "select count() from test where d.UInt64 is not NULL" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'String'" + $CH_CLIENT -q "select count() from test where d.String is not NULL" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'Date'" + $CH_CLIENT -q "select count() from test where d.Date is not NULL" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'Array(Variant(String, UInt64))'" + $CH_CLIENT -q "select count() from test where not empty(d.\`Array(Variant(String, UInt64))\`)" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'Array(Array(Dynamic))'" + $CH_CLIENT -q "select count() from test where not empty(d.\`Array(Array(Dynamic))\`)" + $CH_CLIENT -q "select count() from test where d is NULL" + $CH_CLIENT -q "select count() from test where not empty(d.\`Tuple(a Array(Dynamic))\`.a.String)" + + $CH_CLIENT -q "select d, d.UInt64, d.String, d.\`Array(Variant(String, UInt64))\` from test format Null" + $CH_CLIENT -q "select d.UInt64, d.String, d.\`Array(Variant(String, UInt64))\` from test format Null" + $CH_CLIENT -q "select d.Int8, d.Date, d.\`Array(String)\` from test format Null" + $CH_CLIENT -q "select d, d.UInt64, d.Date, d.\`Array(Variant(String, UInt64))\`, d.\`Array(Variant(String, UInt64))\`.size0, d.\`Array(Variant(String, UInt64))\`.UInt64 from test format Null" + $CH_CLIENT -q "select d.UInt64, d.Date, d.\`Array(Variant(String, UInt64))\`, d.\`Array(Variant(String, UInt64))\`.size0, d.\`Array(Variant(String, UInt64))\`.UInt64, 
d.\`Array(Variant(String, UInt64))\`.String from test format Null" + $CH_CLIENT -q "select d, d.\`Tuple(a UInt64, b String)\`.a, d.\`Array(Dynamic)\`.\`Variant(String, UInt64)\`.UInt64, d.\`Array(Variant(String, UInt64))\`.UInt64 from test format Null" + $CH_CLIENT -q "select d.\`Array(Dynamic)\`.\`Variant(String, UInt64)\`.UInt64, d.\`Array(Dynamic)\`.size0, d.\`Array(Variant(String, UInt64))\`.UInt64 from test format Null" + $CH_CLIENT -q "select d.\`Array(Array(Dynamic))\`.size1, d.\`Array(Array(Dynamic))\`.UInt64, d.\`Array(Array(Dynamic))\`.\`Map(String, Tuple(a UInt64))\`.values.a from test format Null" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=Memory" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.reference b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.reference new file mode 100644 index 00000000000..59297e46330 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.reference @@ -0,0 +1,60 @@ +MergeTree compact +test +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String +MergeTree wide +test +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh new file mode 100755 index 00000000000..7c1ac41cfdc --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --merge_max_block_size 8192 --merge_max_block_size_bytes=10485760 --index_granularity 8192" + +function test() +{ + echo "test" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(80000)" + $CH_CLIENT -q "insert into test select number, range(number % 10 + 1) from numbers(70000)" + $CH_CLIENT -q "insert into test select number, toDate(number) from numbers(60000)" + $CH_CLIENT -q "insert into test select number, toDateTime(number) from numbers(50000)" + $CH_CLIENT -q "insert into test select number, NULL from numbers(100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, map(number, number) from numbers(200000)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, tuple(number, number) from numbers(10000)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_columns_to_activate=10;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_columns_to_activate=10;" +test +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.reference b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.reference new file mode 100644 index 00000000000..59297e46330 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.reference @@ -0,0 +1,60 @@ +MergeTree compact +test +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String +MergeTree wide +test +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 
String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh new file mode 100755 index 00000000000..927ceac72b5 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --merge_max_block_size 8192 --merge_max_block_size_bytes=10485760 --index_granularity 8192" +function test() +{ + echo "test" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(80000)" + $CH_CLIENT -q "insert into test select number, range(number % 10 + 1) from numbers(70000)" + $CH_CLIENT -q "insert into test select number, toDate(number) from numbers(60000)" + $CH_CLIENT -q "insert into test select number, toDateTime(number) from numbers(50000)" + $CH_CLIENT -q "insert into test select number, NULL from numbers(100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, map(number, number) from numbers(200000)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, tuple(number, number) from numbers(10000)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, 
vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03037_dynamic_merges_2.reference b/tests/queries/0_stateless/03037_dynamic_merges_2.reference new file mode 100644 index 00000000000..420b8185b16 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_2.reference @@ -0,0 +1,20 @@ +MergeTree compact + horizontal merge +test +1000000 Array(UInt16) +1000000 String +1000000 UInt64 +MergeTree wide + horizontal merge +test +1000000 Array(UInt16) +1000000 String +1000000 UInt64 +MergeTree compact + vertical merge +test +1000000 Array(UInt16) +1000000 String +1000000 UInt64 +MergeTree wide + vertical merge +test +1000000 Array(UInt16) +1000000 String +1000000 UInt64 diff --git a/tests/queries/0_stateless/03037_dynamic_merges_2.sh b/tests/queries/0_stateless/03037_dynamic_merges_2.sh new file mode 100755 index 00000000000..40adbdd4262 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_2.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "test" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(1000000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(1000000, 1000000)" + $CH_CLIENT -q "insert into test select number, range(number % 10 + 1) from numbers(2000000, 1000000)" + + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03038_nested_dynamic_merges.reference b/tests/queries/0_stateless/03038_nested_dynamic_merges.reference new file mode 100644 index 00000000000..65034647775 --- /dev/null +++ b/tests/queries/0_stateless/03038_nested_dynamic_merges.reference @@ -0,0 +1,92 @@ +MergeTree compact + horizontal merge +test +16667 Tuple(a Dynamic(max_types=3)):Date +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a 
Dynamic(max_types=3)):String +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 UInt64:None +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +16667 Tuple(a Dynamic(max_types=3)):DateTime +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +133333 Tuple(a Dynamic(max_types=3)):None +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +116667 Tuple(a Dynamic(max_types=3)):String +133333 Tuple(a Dynamic(max_types=3)):None +MergeTree wide + horizontal merge +test +16667 Tuple(a Dynamic(max_types=3)):Date +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):String +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 UInt64:None +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +16667 Tuple(a Dynamic(max_types=3)):DateTime +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +133333 Tuple(a Dynamic(max_types=3)):None +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +116667 Tuple(a Dynamic(max_types=3)):String +133333 Tuple(a Dynamic(max_types=3)):None +MergeTree compact + vertical merge +test +16667 Tuple(a Dynamic(max_types=3)):Date +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):String +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 UInt64:None +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +16667 Tuple(a Dynamic(max_types=3)):DateTime +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +133333 Tuple(a Dynamic(max_types=3)):None +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +116667 Tuple(a Dynamic(max_types=3)):String +133333 Tuple(a Dynamic(max_types=3)):None +MergeTree wide + vertical merge +test +16667 Tuple(a Dynamic(max_types=3)):Date +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):String +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 UInt64:None +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +16667 Tuple(a Dynamic(max_types=3)):DateTime +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +133333 Tuple(a Dynamic(max_types=3)):None +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +116667 Tuple(a Dynamic(max_types=3)):String +133333 Tuple(a Dynamic(max_types=3)):None diff --git a/tests/queries/0_stateless/03038_nested_dynamic_merges.sh 
b/tests/queries/0_stateless/03038_nested_dynamic_merges.sh new file mode 100755 index 00000000000..b82ddb3813e --- /dev/null +++ b/tests/queries/0_stateless/03038_nested_dynamic_merges.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "test" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, tuple(if(number % 2 == 0, number, 'str_' || toString(number)))::Tuple(a Dynamic(max_types=3)) from numbers(100000)" + $CH_CLIENT -q "insert into test select number, tuple(if(number % 3 == 0, toDate(number), range(number % 10)))::Tuple(a Dynamic(max_types=3)) from numbers(50000)" + + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count(), type" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count(), type" + + $CH_CLIENT -q "insert into test select number, tuple(if(number % 3 == 0, toDateTime(number), NULL))::Tuple(a Dynamic(max_types=3)) from numbers(50000)" + $CH_CLIENT -q "insert into test select number, tuple(if(number % 2 == 0, tuple(number), NULL))::Tuple(a Dynamic(max_types=3)) from numbers(200000)" + + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count(), type" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count(), type" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference 
b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference new file mode 100644 index 00000000000..6c69b81c183 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference @@ -0,0 +1,88 @@ +MergeTree compact + horizontal merge +ReplacingMergeTree +100000 String +100000 UInt64 +50000 UInt64 +100000 String +SummingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +AggregatingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree wide + horizontal merge +ReplacingMergeTree +100000 String +100000 UInt64 +50000 UInt64 +100000 String +SummingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +AggregatingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree compact + vertical merge +ReplacingMergeTree +100000 String +100000 UInt64 +50000 UInt64 +100000 String +SummingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +AggregatingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree wide + vertical merge +ReplacingMergeTree +100000 String +100000 UInt64 +50000 UInt64 +100000 String +SummingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +AggregatingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh new file mode 100755 index 00000000000..9cfd2294c8d --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --optimize_aggregation_in_order 0 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" + + +function test() +{ + echo "ReplacingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=ReplacingMergeTree order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "drop table test" + + echo "SummingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, sum UInt64, d Dynamic) engine=SummingMergeTree(sum) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select count(), sum from test group by sum order by sum, count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select count(), sum from test group by sum order by sum, count()" + $CH_CLIENT -q "drop table test" + + echo "AggregatingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, sum AggregateFunction(sum, UInt64), d Dynamic) engine=AggregatingMergeTree() order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), number from numbers(100000) group by number" + $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), 'str_' || toString(number) from numbers(50000, 100000) group by number" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum order by sum, count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum order by sum, count()" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=10000000000, vertical_merge_algorithm_min_columns_to_activate=100000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1,vertical_merge_algorithm_min_rows_to_activate=1000000000, vertical_merge_algorithm_min_columns_to_activate=1000000000000" + +echo "MergeTree compact + vertical merge" +test 
"min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference new file mode 100644 index 00000000000..af6c7d8d567 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference @@ -0,0 +1,44 @@ +MergeTree compact + horizontal merge +CollapsingMergeTree +100000 String +100000 UInt64 +50000 String +50000 UInt64 +VersionedCollapsingMergeTree +100000 String +100000 UInt64 +75000 String +75000 UInt64 +MergeTree wide + horizontal merge +CollapsingMergeTree +100000 String +100000 UInt64 +50000 String +50000 UInt64 +VersionedCollapsingMergeTree +100000 String +100000 UInt64 +75000 String +75000 UInt64 +MergeTree compact + vertical merge +CollapsingMergeTree +100000 String +100000 UInt64 +50000 String +50000 UInt64 +VersionedCollapsingMergeTree +100000 String +100000 UInt64 +75000 String +75000 UInt64 +MergeTree wide + vertical merge +CollapsingMergeTree +100000 String +100000 UInt64 +50000 String +50000 UInt64 +VersionedCollapsingMergeTree +100000 String +100000 UInt64 +75000 String +75000 UInt64 diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh new file mode 100755 index 00000000000..02362012960 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" + + +function test() +{ + echo "CollapsingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, sign Int8, d Dynamic) engine=CollapsingMergeTree(sign) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, -1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "drop table test" + + echo "VersionedCollapsingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, sign Int8, version UInt8, d Dynamic) engine=VersionedCollapsingMergeTree(sign, version) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, -1, number >= 75000 ? 
2 : 1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1" + +echo "MergeTree compact + vertical merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters_1.reference b/tests/queries/0_stateless/03040_dynamic_type_alters_1.reference new file mode 100644 index 00000000000..ca98ec0963c --- /dev/null +++ b/tests/queries/0_stateless/03040_dynamic_type_alters_1.reference @@ -0,0 +1,526 @@ +Memory +initial insert +alter add column 1 +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter modify column 1 +7 None +8 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert after alter modify column 1 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +alter modify column 2 +4 UInt64 +7 String +8 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +insert after alter modify column 2 +1 Date +5 UInt64 +8 String +9 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N 
\N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +19 19 \N \N \N \N \N +20 20 20 \N 20 \N \N +21 21 str_21 str_21 \N \N \N +22 22 1970-01-23 \N \N 1970-01-23 \N +alter modify column 3 +1 Date +5 UInt64 +8 String +9 None +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N \N 3 \N \N +4 4 4 \N \N \N 4 \N \N +5 5 5 \N \N \N 5 \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N \N 12 \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +insert after alter modify column 3 +1 Date +5 UInt64 +8 String +12 None +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N \N 3 \N \N +4 4 4 \N \N \N 4 \N \N +5 5 5 \N \N \N 5 \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N \N 12 \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +23 \N \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N \N +25 str_25 \N str_25 \N \N \N \N \N +MergeTree compact +initial insert +alter add column 1 +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter modify column 1 +7 None +8 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert after alter modify column 1 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +alter modify column 2 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N 
\N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +insert after alter modify column 2 +1 Date +1 UInt64 +9 None +12 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +19 19 \N \N \N \N \N +20 20 20 \N 20 \N \N +21 21 str_21 str_21 \N \N \N +22 22 1970-01-23 \N \N 1970-01-23 \N +alter modify column 3 +1 Date +1 UInt64 +9 None +12 String +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N 3 \N \N \N +4 4 4 \N \N 4 \N \N \N +5 5 5 \N \N 5 \N \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N 12 \N \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +insert after alter modify column 3 +1 Date +1 UInt64 +12 None +12 String +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N 3 \N \N \N +4 4 4 \N \N 4 \N \N \N +5 5 5 \N \N 5 \N \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N 12 \N \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +23 \N \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N \N +25 str_25 \N str_25 \N \N \N \N \N +MergeTree wide +initial insert +alter add column 1 +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter modify column 1 +7 None +8 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert after alter modify column 1 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N 
+14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +alter modify column 2 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +insert after alter modify column 2 +1 Date +1 UInt64 +9 None +12 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +19 19 \N \N \N \N \N +20 20 20 \N 20 \N \N +21 21 str_21 str_21 \N \N \N +22 22 1970-01-23 \N \N 1970-01-23 \N +alter modify column 3 +1 Date +1 UInt64 +9 None +12 String +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N 3 \N \N \N +4 4 4 \N \N 4 \N \N \N +5 5 5 \N \N 5 \N \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N 12 \N \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +insert after alter modify column 3 +1 Date +1 UInt64 +12 None +12 String +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N 3 \N \N \N +4 4 4 \N \N 4 \N \N \N +5 5 5 \N \N 5 \N \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N 12 \N \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +23 \N \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N \N +25 str_25 \N str_25 \N \N \N \N \N diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh b/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh new file mode 100755 index 00000000000..7a73be20a4d --- /dev/null +++ b/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_analyzer=1" + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column 1" + $CH_CLIENT -q "alter table test add column d Dynamic(max_types=3) settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter add column 1" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "alter modify column 1" + $CH_CLIENT -q "alter table test modify column d Dynamic(max_types=1) settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter modify column 1" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, toDate(number), NULL) from numbers(15, 4)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "alter modify column 2" + $CH_CLIENT -q "alter table test modify column d Dynamic(max_types=3) settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter modify column 2" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, toDate(number), NULL) from numbers(19, 4)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "alter modify column 3" + $CH_CLIENT -q "alter table test modify column y Dynamic settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, y.\`Tuple(a UInt64)\`.a, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter modify column 3" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL), 
NULL from numbers(23, 3)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, y.\`Tuple(a UInt64)\`.a, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=Memory" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters_2.reference b/tests/queries/0_stateless/03040_dynamic_type_alters_2.reference new file mode 100644 index 00000000000..18a181464e9 --- /dev/null +++ b/tests/queries/0_stateless/03040_dynamic_type_alters_2.reference @@ -0,0 +1,182 @@ +MergeTree compact +initial insert +alter add column +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter rename column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert nested dynamic +3 Array(Dynamic) +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N [] [] [] +1 1 \N \N \N \N \N [] [] [] +2 2 \N \N \N \N \N [] [] [] +3 3 3 \N 3 \N \N [] [] [] +4 4 4 \N 4 \N \N [] [] [] +5 5 5 \N 5 \N \N [] [] [] +6 6 str_6 str_6 \N \N \N [] [] [] +7 7 str_7 str_7 \N \N \N [] [] [] +8 8 str_8 str_8 \N \N \N [] [] [] +9 9 \N \N \N \N \N [] [] [] +10 10 \N \N \N \N \N [] [] [] +11 11 \N \N \N \N \N [] [] [] +12 12 12 \N 12 \N \N [] [] [] +13 13 str_13 str_13 \N \N \N [] [] [] +14 14 \N \N \N \N \N [] [] [] +15 15 [15] \N \N \N \N [15] [NULL] [NULL] +16 16 ['str_16'] \N \N \N \N [NULL] ['str_16'] [NULL] +17 17 [17] \N \N \N \N [17] [NULL] [NULL] +alter rename column 2 +3 Array(Dynamic) +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N [] [] [] +1 1 \N \N \N \N \N [] [] [] +2 2 \N \N \N \N \N [] [] [] +3 3 3 \N 3 \N \N [] [] [] +4 4 4 \N 4 \N \N [] [] [] +5 5 5 \N 5 \N \N [] [] [] +6 6 str_6 str_6 \N \N \N [] [] [] +7 7 str_7 str_7 \N \N \N [] [] [] +8 8 str_8 str_8 \N \N \N [] [] [] +9 9 \N \N \N \N \N [] [] [] +10 10 \N \N \N \N \N [] [] [] +11 11 \N \N \N \N \N [] [] [] +12 12 12 \N 12 \N \N [] [] [] +13 13 str_13 str_13 \N \N \N [] [] [] +14 14 \N \N \N \N \N [] [] [] +15 15 [15] \N \N \N \N [15] [NULL] [NULL] +16 16 ['str_16'] \N \N \N \N [NULL] ['str_16'] [NULL] +17 17 [17] \N \N \N \N [17] [NULL] [NULL] +MergeTree wide +initial insert +alter add column +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N 
+insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter rename column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert nested dynamic +3 Array(Dynamic) +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N [] [] [] +1 1 \N \N \N \N \N [] [] [] +2 2 \N \N \N \N \N [] [] [] +3 3 3 \N 3 \N \N [] [] [] +4 4 4 \N 4 \N \N [] [] [] +5 5 5 \N 5 \N \N [] [] [] +6 6 str_6 str_6 \N \N \N [] [] [] +7 7 str_7 str_7 \N \N \N [] [] [] +8 8 str_8 str_8 \N \N \N [] [] [] +9 9 \N \N \N \N \N [] [] [] +10 10 \N \N \N \N \N [] [] [] +11 11 \N \N \N \N \N [] [] [] +12 12 12 \N 12 \N \N [] [] [] +13 13 str_13 str_13 \N \N \N [] [] [] +14 14 \N \N \N \N \N [] [] [] +15 15 [15] \N \N \N \N [15] [NULL] [NULL] +16 16 ['str_16'] \N \N \N \N [NULL] ['str_16'] [NULL] +17 17 [17] \N \N \N \N [17] [NULL] [NULL] +alter rename column 2 +3 Array(Dynamic) +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N [] [] [] +1 1 \N \N \N \N \N [] [] [] +2 2 \N \N \N \N \N [] [] [] +3 3 3 \N 3 \N \N [] [] [] +4 4 4 \N 4 \N \N [] [] [] +5 5 5 \N 5 \N \N [] [] [] +6 6 str_6 str_6 \N \N \N [] [] [] +7 7 str_7 str_7 \N \N \N [] [] [] +8 8 str_8 str_8 \N \N \N [] [] [] +9 9 \N \N \N \N \N [] [] [] +10 10 \N \N \N \N \N [] [] [] +11 11 \N \N \N \N \N [] [] [] +12 12 12 \N 12 \N \N [] [] [] +13 13 str_13 str_13 \N \N \N [] [] [] +14 14 \N \N \N \N \N [] [] [] +15 15 [15] \N \N \N \N [15] [NULL] [NULL] +16 16 ['str_16'] \N \N \N \N [NULL] ['str_16'] [NULL] +17 17 [17] \N \N \N \N [17] [NULL] [NULL] diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters_2.sh b/tests/queries/0_stateless/03040_dynamic_type_alters_2.sh new file mode 100755 index 00000000000..6491e64372f --- /dev/null +++ b/tests/queries/0_stateless/03040_dynamic_type_alters_2.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1" + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column" + $CH_CLIENT -q "alter table test add column d Dynamic settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter add column 1" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "alter rename column 1" + $CH_CLIENT -q "alter table test rename column d to d1 settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d1) from test group by dynamicType(d1) order by count(), dynamicType(d1)" + $CH_CLIENT -q "select x, y, d1, d1.String, d1.UInt64, d1.Date, d1.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert nested dynamic" + $CH_CLIENT -q "insert into test select number, number, [number % 2 ? number : 'str_' || toString(number)]::Array(Dynamic) from numbers(15, 3)" + $CH_CLIENT -q "select count(), dynamicType(d1) from test group by dynamicType(d1) order by count(), dynamicType(d1)" + $CH_CLIENT -q "select x, y, d1, d1.String, d1.UInt64, d1.Date, d1.\`Tuple(a UInt64)\`.a, d1.\`Array(Dynamic)\`.UInt64, d1.\`Array(Dynamic)\`.String, d1.\`Array(Dynamic)\`.Date from test order by x" + + echo "alter rename column 2" + $CH_CLIENT -q "alter table test rename column d1 to d2 settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d2) from test group by dynamicType(d2) order by count(), dynamicType(d2)" + $CH_CLIENT -q "select x, y, d2, d2.String, d2.UInt64, d2.Date, d2.\`Tuple(a UInt64)\`.a, d2.\`Array(Dynamic)\`.UInt64, d2.\`Array(Dynamic)\`.String, d2.\`Array(Dynamic)\`.Date, from test order by x" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/03041_dynamic_type_check_table.reference b/tests/queries/0_stateless/03041_dynamic_type_check_table.reference new file mode 100644 index 00000000000..b1ea186a917 --- /dev/null +++ b/tests/queries/0_stateless/03041_dynamic_type_check_table.reference @@ -0,0 +1,56 @@ +MergeTree compact +initial insert +alter add column +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column +4 String +4 UInt64 +7 
None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +check table +1 +MergeTree wide +initial insert +alter add column +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +check table +1 diff --git a/tests/queries/0_stateless/03041_dynamic_type_check_table.sh b/tests/queries/0_stateless/03041_dynamic_type_check_table.sh new file mode 100755 index 00000000000..3d802485be3 --- /dev/null +++ b/tests/queries/0_stateless/03041_dynamic_type_check_table.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1" + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column" + $CH_CLIENT -q "alter table test add column d Dynamic(max_types=3) settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter add column" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "check table" + $CH_CLIENT -q "check table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/03144_compress_stdout.reference b/tests/queries/0_stateless/03144_compress_stdout.reference new file mode 100644 index 00000000000..6f51dfc24e1 --- /dev/null +++ b/tests/queries/0_stateless/03144_compress_stdout.reference @@ -0,0 +1,2 @@ +Hello, World! From client. +Hello, World! From local. 
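The two "Hello, World!" reference lines above are the expected plain-text output of 03144_compress_stdout.sh below: the script redirects query output into files named *.gz, then gunzips and cats them, so it passes only if clickhouse-client and clickhouse-local compress stdout when it is redirected to a .gz file. A minimal sketch of the behavior being tested, assuming a reachable server and a hypothetical output path (the mechanism by which the client detects the redirect target is inferred from the test, not from server sources):

    # The client is expected to infer gzip compression from the .gz extension
    # of the file its stdout is redirected to.
    clickhouse-client --query "SELECT 'Hello'" > /tmp/hello.gz
    gunzip /tmp/hello.gz   # produces /tmp/hello
    cat /tmp/hello         # prints: Hello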
diff --git a/tests/queries/0_stateless/03144_compress_stdout.sh b/tests/queries/0_stateless/03144_compress_stdout.sh new file mode 100755 index 00000000000..569754303a7 --- /dev/null +++ b/tests/queries/0_stateless/03144_compress_stdout.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -e + +[ -e "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_client.gz ] && rm "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_client.gz + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM (SELECT 'Hello, World! From client.')" > "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_client.gz +gunzip "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_client.gz +cat "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_client + +rm -f "${CLICKHOUSE_TMP}/test_compression_of_output_file_from_client" + +[ -e "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_local.gz ] && rm "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_local.gz + +${CLICKHOUSE_LOCAL} --query "SELECT * FROM (SELECT 'Hello, World! From local.')" > "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_local.gz +gunzip "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_local.gz +cat "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_local + +rm -f "${CLICKHOUSE_TMP}/test_compression_of_output_file_from_local" diff --git a/tests/queries/0_stateless/03149_numbers_max_block_size_zero.reference b/tests/queries/0_stateless/03149_numbers_max_block_size_zero.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/03149_numbers_max_block_size_zero.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/03149_numbers_max_block_size_zero.sh b/tests/queries/0_stateless/03149_numbers_max_block_size_zero.sh new file mode 100755 index 00000000000..6f70a0d2536 --- /dev/null +++ b/tests/queries/0_stateless/03149_numbers_max_block_size_zero.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "SELECT count(*) FROM numbers(10) AS a, numbers(11) AS b, numbers(12) AS c SETTINGS max_block_size = 0" 2>&1 | grep -q "Sanity check: 'max_block_size' cannot be 0. 
Set to default value" && echo "OK" || echo "FAIL" diff --git a/tests/queries/0_stateless/03150_dynamic_type_mv_insert.reference b/tests/queries/0_stateless/03150_dynamic_type_mv_insert.reference new file mode 100644 index 00000000000..0b76d30953e --- /dev/null +++ b/tests/queries/0_stateless/03150_dynamic_type_mv_insert.reference @@ -0,0 +1,35 @@ +1 2024-01-01 Date +2 1704056400 Decimal(18, 3) +3 1 String +4 2 String + +1 2024-01-01 Date +1 2024-01-01 Date +2 1704056400 Decimal(18, 3) +2 1704056400 Decimal(18, 3) +3 1 String +3 1 String +4 2 String +4 2 String + +1 2024-01-01 String +1 2024-01-01 String +2 1704056400 String +2 1704056400 String +3 1 String +3 1 String +4 2 String +4 2 String + +1 2024-01-01 Date +1 2024-01-01 String +1 2024-01-01 String +2 1704056400 Decimal(18, 3) +2 1704056400 String +2 1704056400 String +3 1 String +3 1 String +3 1 String +4 2 String +4 2 String +4 2 String diff --git a/tests/queries/0_stateless/03150_dynamic_type_mv_insert.sql b/tests/queries/0_stateless/03150_dynamic_type_mv_insert.sql new file mode 100644 index 00000000000..ad5ea9512c6 --- /dev/null +++ b/tests/queries/0_stateless/03150_dynamic_type_mv_insert.sql @@ -0,0 +1,35 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE null_table +( + n1 UInt8, + n2 Dynamic(max_types=3) +) +ENGINE = Null; + +CREATE TABLE to_table +( + n1 UInt8, + n2 Dynamic(max_types=4) +) +ENGINE = MergeTree ORDER BY n1; + +-- Create the TO target before the materialized view so the view never refers to a missing table. +CREATE MATERIALIZED VIEW dummy_rmv TO to_table +AS SELECT * FROM null_table; + +INSERT INTO null_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +INSERT INTO null_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=1); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=10); +INSERT INTO null_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; diff --git a/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.reference b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.reference new file mode 100644 index 00000000000..d96fbf658d8 --- /dev/null +++ b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.reference @@ -0,0 +1,26 @@ +1 2024-01-01 Date +2 1704056400 String +3 1 String +4 2 String + +1 2024-01-01 Date +1 2024-01-01 Date +2 1704056400 Decimal(18, 3) +2 1704056400 String +3 1 Float32 +3 1 String +4 2 Float64 +4 2 String + +1 2024-01-01 String +1 2024-01-01 String +1 2024-01-01 String +2 1704056400 String +2 1704056400 String +2 1704056400 String +3 1 String +3 1 String +3 1 String +4 2 String +4 2 String +4 2 String diff --git a/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql new file mode 100644 index 00000000000..632f3504fdb --- /dev/null +++ b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql @@ -0,0 +1,26 @@ +SET allow_experimental_dynamic_type=1; +set min_compress_block_size = 585572, max_compress_block_size = 373374, max_block_size = 60768, max_joined_block_size_rows = 18966, max_insert_threads = 
5, max_threads = 50, max_read_buffer_size = 708232, connect_timeout_with_failover_ms = 2000, connect_timeout_with_failover_secure_ms = 3000, idle_connection_timeout = 36000, use_uncompressed_cache = true, stream_like_engine_allow_direct_select = true, replication_wait_for_inactive_replica_timeout = 30, compile_aggregate_expressions = false, min_count_to_compile_aggregate_expression = 0, compile_sort_description = false, group_by_two_level_threshold = 1000000, group_by_two_level_threshold_bytes = 12610083, enable_memory_bound_merging_of_aggregation_results = false, min_chunk_bytes_for_parallel_parsing = 18769830, merge_tree_coarse_index_granularity = 12, min_bytes_to_use_direct_io = 10737418240, min_bytes_to_use_mmap_io = 10737418240, log_queries = true, insert_quorum_timeout = 60000, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability = 0.05000000074505806, http_response_buffer_size = 294986, fsync_metadata = true, http_send_timeout = 60., http_receive_timeout = 60., opentelemetry_start_trace_probability = 0.10000000149011612, max_bytes_before_external_group_by = 1, max_bytes_before_external_sort = 10737418240, max_bytes_before_remerge_sort = 1326536545, max_untracked_memory = 1048576, memory_profiler_step = 1048576, log_comment = '03151_dynamic_type_scale_max_types.sql', send_logs_level = 'fatal', prefer_localhost_replica = false, optimize_read_in_order = false, optimize_aggregation_in_order = true, aggregation_in_order_max_block_bytes = 27069500, read_in_order_two_level_merge_threshold = 75, allow_introspection_functions = true, database_atomic_wait_for_drop_and_detach_synchronously = true, remote_filesystem_read_method = 'read', local_filesystem_read_prefetch = true, remote_filesystem_read_prefetch = false, merge_tree_compact_parts_min_granules_to_multibuffer_read = 119, async_insert_busy_timeout_max_ms = 5000, read_from_filesystem_cache_if_exists_otherwise_bypass_cache = true, filesystem_cache_segments_batch_size = 10, use_page_cache_for_disks_without_file_cache = true, page_cache_inject_eviction = true, allow_prefetched_read_pool_for_remote_filesystem = false, filesystem_prefetch_step_marks = 50, filesystem_prefetch_min_bytes_for_single_read_task = 16777216, filesystem_prefetch_max_memory_usage = 134217728, filesystem_prefetches_limit = 10, optimize_sorting_by_input_stream_properties = false, allow_experimental_dynamic_type = true, session_timezone = 'Africa/Khartoum', prefer_warmed_unmerged_parts_seconds = 2; + +drop table if exists to_table; + +CREATE TABLE to_table +( + n1 UInt8, + n2 Dynamic(max_types=2) +) +ENGINE = MergeTree ORDER BY n1; + +INSERT INTO to_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=5); +INSERT INTO to_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=1); +INSERT INTO to_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=500); -- { serverError UNEXPECTED_AST_STRUCTURE } diff --git 
a/tests/queries/0_stateless/03152_analyzer_columns_list.reference b/tests/queries/0_stateless/03152_analyzer_columns_list.reference new file mode 100644 index 00000000000..4e9025b5baf --- /dev/null +++ b/tests/queries/0_stateless/03152_analyzer_columns_list.reference @@ -0,0 +1 @@ +4 3 diff --git a/tests/queries/0_stateless/03152_analyzer_columns_list.sql b/tests/queries/0_stateless/03152_analyzer_columns_list.sql new file mode 100644 index 00000000000..baed3a4ff68 --- /dev/null +++ b/tests/queries/0_stateless/03152_analyzer_columns_list.sql @@ -0,0 +1,13 @@ +CREATE TABLE test +( + foo String, + bar String, +) +ENGINE = MergeTree() +ORDER BY (foo, bar); + +INSERT INTO test VALUES ('foo', 'bar1'); + +SELECT COLUMNS(bar, foo) APPLY (length) FROM test; + +SELECT COLUMNS(bar, foo, xyz) APPLY (length) FROM test; -- { serverError UNKNOWN_IDENTIFIER } diff --git a/tests/queries/0_stateless/03152_dynamic_type_simple.reference b/tests/queries/0_stateless/03152_dynamic_type_simple.reference new file mode 100644 index 00000000000..5f243209ff3 --- /dev/null +++ b/tests/queries/0_stateless/03152_dynamic_type_simple.reference @@ -0,0 +1,25 @@ +string1 String +42 Int64 +3.14 Float64 +[1,2] Array(Int64) +2021-01-01 Date +string2 String + +\N None 42 Int64 +42 Int64 string String +string String [1, 2] String +[1,2] Array(Int64) \N None + ┌─d────────────────────────┬─dynamicType(d)─┬─d.Int64─┬─d.String─┬─────d.Date─┬─d.Float64─┬──────────d.DateTime─┬─d.Array(Int64)─┬─d.Array(String)──────────┐ + 1. │ 42 │ Int64 │ 42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ + 2. │ string1 │ String │ ᴺᵁᴸᴸ │ string1 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ + 3. │ 2021-01-01 │ Date │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2021-01-01 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ + 4. │ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ [] │ + 5. │ 3.14 │ Float64 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 3.14 │ ᴺᵁᴸᴸ │ [] │ [] │ + 6. │ string2 │ String │ ᴺᵁᴸᴸ │ string2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ + 7. │ 2021-01-01 12:00:00 │ DateTime │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2021-01-01 12:00:00 │ [] │ [] │ + 8. │ ['array','of','strings'] │ Array(String) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ['array','of','strings'] │ + 9. │ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ +10. 
│ 42.42 │ Float64 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 42.42 │ ᴺᵁᴸᴸ │ [] │ [] │ + └──────────────────────────┴────────────────┴─────────┴──────────┴────────────┴───────────┴─────────────────────┴────────────────┴──────────────────────────┘ + +49995000 diff --git a/tests/queries/0_stateless/03152_dynamic_type_simple.sql b/tests/queries/0_stateless/03152_dynamic_type_simple.sql new file mode 100644 index 00000000000..fd5328faf15 --- /dev/null +++ b/tests/queries/0_stateless/03152_dynamic_type_simple.sql @@ -0,0 +1,29 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE test_max_types (d Dynamic(max_types=5)) ENGINE = Memory; +INSERT INTO test_max_types VALUES ('string1'), (42), (3.14), ([1, 2]), (toDate('2021-01-01')), ('string2'); +SELECT d, dynamicType(d) FROM test_max_types; + +SELECT ''; +CREATE TABLE test_nested_dynamic (d1 Dynamic, d2 Dynamic(max_types=2)) ENGINE = Memory; +INSERT INTO test_nested_dynamic VALUES (NULL, 42), (42, 'string'), ('string', [1, 2]), ([1, 2], NULL); +SELECT d1, dynamicType(d1), d2, dynamicType(d2) FROM test_nested_dynamic; + +CREATE TABLE test_rapid_schema (d Dynamic) ENGINE = Memory; +INSERT INTO test_rapid_schema VALUES (42), ('string1'), (toDate('2021-01-01')), ([1, 2, 3]), (3.14), ('string2'), (toDateTime('2021-01-01 12:00:00')), (['array', 'of', 'strings']), (NULL), (toFloat64(42.42)); + +SELECT d, dynamicType(d), d.Int64, d.String, d.Date, d.Float64, d.DateTime, d.`Array(Int64)`, d.`Array(String)` +FROM test_rapid_schema FORMAT PrettyCompactMonoBlock; + + +SELECT ''; +SELECT finalizeAggregation(CAST(dynamic_state, 'AggregateFunction(sum, UInt64)')) +FROM +( + SELECT CAST(state, 'Dynamic') AS dynamic_state + FROM + ( + SELECT sumState(number) AS state + FROM numbers(10000) + ) +); diff --git a/tests/queries/0_stateless/03153_dynamic_type_empty.reference b/tests/queries/0_stateless/03153_dynamic_type_empty.reference new file mode 100644 index 00000000000..f7c047dcd19 --- /dev/null +++ b/tests/queries/0_stateless/03153_dynamic_type_empty.reference @@ -0,0 +1,15 @@ +[] String +[1] Array(Int64) +[] Array(Int64) +['1'] Array(String) +[] Array(Int64) +() String +(1) Tuple(Int64) +(0) Tuple(Int64) +('1') Tuple(String) +(0) Tuple(Int64) +{} String +{1:2} Map(Int64, Int64) +{} Map(Int64, Int64) +{'1':'2'} Map(String, String) +{} Map(Int64, Int64) diff --git a/tests/queries/0_stateless/03153_dynamic_type_empty.sql b/tests/queries/0_stateless/03153_dynamic_type_empty.sql new file mode 100644 index 00000000000..8e942fe6f6e --- /dev/null +++ b/tests/queries/0_stateless/03153_dynamic_type_empty.sql @@ -0,0 +1,5 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE test_null_empty (d Dynamic) ENGINE = Memory; +INSERT INTO test_null_empty VALUES ([]), ([1]), ([]), (['1']), ([]), (()),((1)), (()), (('1')), (()), ({}), ({1:2}), ({}), ({'1':'2'}), ({}); +SELECT d, dynamicType(d) FROM test_null_empty; diff --git a/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.reference b/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.reference new file mode 100644 index 00000000000..e1c7b69b136 --- /dev/null +++ b/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.reference @@ -0,0 +1,7 @@ +Array(UInt64) 12000 10000 +Date 12000 10001 +Float64 12000 10000 +Int64 10000 10000 +Map(UInt64, String) 10000 10000 +String 10000 10000 +UInt64 4000 4000 diff --git a/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.sh b/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.sh new file mode 100755 index 00000000000..d7709b722c9 --- 
/dev/null +++ b/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "CREATE TABLE test_cc (d Dynamic) ENGINE = Memory" + + +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT number::Int64 AS d FROM numbers(10000) SETTINGS max_threads=1,max_insert_threads=1" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT toString(number) AS d FROM numbers(10000) SETTINGS max_threads=2,max_insert_threads=2" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT toDate(number % 10000) AS d FROM numbers(10000) SETTINGS max_threads=3,max_insert_threads=3" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT [number, number + 1] AS d FROM numbers(10000) SETTINGS max_threads=4,max_insert_threads=4" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT toFloat64(number) AS d FROM numbers(10000) SETTINGS max_threads=5,max_insert_threads=5" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT map(number, toString(number)) AS d FROM numbers(10000) SETTINGS max_threads=6,max_insert_threads=6" & + +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --use_variant_as_common_type=1 --allow_experimental_variant_type=1 -q "INSERT INTO test_cc SELECT CAST(multiIf(number % 5 = 0, toString(number), number % 5 = 1, number, number % 5 = 2, toFloat64(number), number % 5 = 3, toDate('2020-01-01'), [number, number + 1]), 'Dynamic') FROM numbers(10000) SETTINGS max_threads=6,max_insert_threads=6" & + +wait + +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "SELECT dynamicType(d) t, count(), uniqExact(d) FROM test_cc GROUP BY t ORDER BY t" diff --git a/tests/queries/0_stateless/03157_dynamic_type_json.reference b/tests/queries/0_stateless/03157_dynamic_type_json.reference new file mode 100644 index 00000000000..38bca12bb95 --- /dev/null +++ b/tests/queries/0_stateless/03157_dynamic_type_json.reference @@ -0,0 +1,5 @@ +1 (((((((((('deep_value')))))))))) +2 (((((((((('deep_array_value')))))))))) + +(((((((((('deep_value')))))))))) Tuple(level1 Tuple(level2 Tuple(level3 Tuple(level4 Tuple(level5 Tuple(level6 Tuple(level7 Tuple(level8 Tuple(level9 Tuple(level10 String)))))))))) +(((((((((('deep_array_value')))))))))) Tuple(level1 Tuple(level2 Tuple(level3 Tuple(level4 Tuple(level5 Tuple(level6 Tuple(level7 Tuple(level8 Tuple(level9 Tuple(level10 String)))))))))) diff --git a/tests/queries/0_stateless/03157_dynamic_type_json.sql b/tests/queries/0_stateless/03157_dynamic_type_json.sql new file mode 100644 index 00000000000..cb1a5987104 --- /dev/null +++ b/tests/queries/0_stateless/03157_dynamic_type_json.sql @@ -0,0 +1,13 @@ +SET allow_experimental_dynamic_type=1; +SET allow_experimental_object_type=1; +SET allow_experimental_variant_type=1; + +CREATE TABLE test_deep_nested_json (i UInt16, d JSON) ENGINE = Memory; + +INSERT INTO test_deep_nested_json VALUES (1, '{"level1": {"level2": {"level3": {"level4": {"level5": {"level6": {"level7": {"level8": {"level9": {"level10": "deep_value"}}}}}}}}}}'); +INSERT INTO test_deep_nested_json VALUES (2, '{"level1": {"level2": {"level3": {"level4": {"level5": {"level6": {"level7": {"level8": {"level9": {"level10": 
"deep_array_value"}}}}}}}}}}'); + +SELECT * FROM test_deep_nested_json ORDER BY i; + +SELECT ''; +SELECT d::Dynamic d1, dynamicType(d1) FROM test_deep_nested_json ORDER BY i; diff --git a/tests/queries/0_stateless/03158_dynamic_type_from_variant.reference b/tests/queries/0_stateless/03158_dynamic_type_from_variant.reference new file mode 100644 index 00000000000..2ede006cedc --- /dev/null +++ b/tests/queries/0_stateless/03158_dynamic_type_from_variant.reference @@ -0,0 +1,17 @@ +false Variant(Bool, DateTime64(3), IPv6, String, UInt32) +false Variant(Bool, DateTime64(3), IPv6, String, UInt32) +true Variant(Bool, DateTime64(3), IPv6, String, UInt32) +2001-01-01 01:01:01.111 Variant(Bool, DateTime64(3), IPv6, String, UInt32) +s Variant(Bool, DateTime64(3), IPv6, String, UInt32) +0 Variant(Bool, DateTime64(3), IPv6, String, UInt32) +1 Variant(Bool, DateTime64(3), IPv6, String, UInt32) +\N Variant(Bool, DateTime64(3), IPv6, String, UInt32) + +false Bool +false Bool +true Bool +2001-01-01 01:01:01.111 DateTime64(3) +s String +0 UInt32 +1 UInt32 +\N None diff --git a/tests/queries/0_stateless/03158_dynamic_type_from_variant.sql b/tests/queries/0_stateless/03158_dynamic_type_from_variant.sql new file mode 100644 index 00000000000..20a9e17a148 --- /dev/null +++ b/tests/queries/0_stateless/03158_dynamic_type_from_variant.sql @@ -0,0 +1,15 @@ +SET allow_experimental_dynamic_type=1; +SET allow_experimental_object_type=1; +SET allow_experimental_variant_type=1; + +CREATE TABLE test_variable (v Variant(String, UInt32, IPv6, Bool, DateTime64)) ENGINE = Memory; +CREATE TABLE test_dynamic (d Dynamic) ENGINE = Memory; + +INSERT INTO test_variable VALUES (1), ('s'), (0), ('0'), ('true'), ('false'), ('2001-01-01 01:01:01.111'), (NULL); + +SELECT v, toTypeName(v) FROM test_variable ORDER BY v; + +INSERT INTO test_dynamic SELECT * FROM test_variable; + +SELECT ''; +SELECT d, dynamicType(d) FROM test_dynamic ORDER BY d; diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.reference b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference new file mode 100644 index 00000000000..72c5b90dbba --- /dev/null +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference @@ -0,0 +1,292 @@ +Array(Dynamic) [] +Array(Array(Dynamic)) [[]] +Array(Array(Array(Dynamic))) [[[]]] +Bool false +Bool true +Date 2022-01-01 +Date32 2022-01-01 +DateTime 2022-01-01 01:01:01 +DateTime64(3) 2022-01-01 01:01:01.011 +Decimal(9, 1) -99999999.9 +Decimal(18, 2) -999999999.99 +Decimal(38, 3) -999999999.999 +Decimal(76, 4) -999999999.9999 +Float32 -inf +Float32 -inf +Float32 -inf +Float32 -3.4028233e38 +Float32 -1.1754942e-38 +Float32 -1e-45 +Float32 1e-45 +Float32 1.1754942e-38 +Float32 3.4028233e38 +Float32 inf +Float32 inf +Float32 inf +Float32 nan +Float32 nan +Float32 nan +Float64 -inf +Float64 -inf +Float64 -inf +Float64 -1.7976931348623157e308 +Float64 -3.40282347e38 +Float64 -1.1754943499999998e-38 +Float64 -1.3999999999999999e-45 +Float64 -2.2250738585072014e-308 +Float64 2.2250738585072014e-308 +Float64 1.3999999999999999e-45 +Float64 1.1754943499999998e-38 +Float64 3.40282347e38 +Float64 1.7976931348623157e308 +Float64 inf +Float64 inf +Float64 inf +Float64 nan +Float64 nan +Float64 nan +FixedString(1) 1 +FixedString(2) 1\0 +FixedString(10) 1\0\0\0\0\0\0\0\0\0 +IPv4 192.168.0.1 +IPv6 ::1 +Int8 -128 +Int8 -128 +Int8 -127 +Int8 -127 +Int8 -1 +Int8 -1 +Int8 0 +Int8 0 +Int8 1 +Int8 1 +Int8 126 +Int8 126 +Int8 127 +Int8 127 +Int16 -32768 +Int16 -32767 +Int16 -1 +Int16 0 +Int16 1 +Int16 32766 +Int16 32767 
+Int32 -2147483648 +Int32 -2147483647 +Int32 -1 +Int32 0 +Int32 1 +Int32 2147483646 +Int32 2147483647 +Int64 -9223372036854775808 +Int64 -9223372036854775807 +Int64 -1 +Int64 0 +Int64 1 +Int64 9223372036854775806 +Int64 9223372036854775807 +Int128 -170141183460469231731687303715884105728 +Int128 -170141183460469231731687303715884105727 +Int128 -1 +Int128 0 +Int128 1 +Int128 170141183460469231731687303715884105726 +Int128 170141183460469231731687303715884105727 +Int256 -57896044618658097711785492504343953926634992332820282019728792003956564819968 +Int256 -57896044618658097711785492504343953926634992332820282019728792003956564819967 +Int256 -1 +Int256 0 +Int256 1 +Int256 57896044618658097711785492504343953926634992332820282019728792003956564819966 +Int256 57896044618658097711785492504343953926634992332820282019728792003956564819967 +IntervalDay 1 +IntervalYear 3 +IntervalMonth 2 +LowCardinality(String) 1 +LowCardinality(String) 1 +LowCardinality(UInt16) 0 +MultiPolygon [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] +Map(Dynamic, Dynamic) {'11':'v1','22':'1'} +Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] +Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] +Point (1.23,4.5600000000000005) +Ring [(1.23,4.5600000000000005),(2.34,5.67)] +String string +SimpleAggregateFunction(anyLast, Array(Int16)) [1,2] +Tuple(Dynamic) ('') +Tuple(Tuple(Dynamic)) (('')) +Tuple(Tuple(Tuple(Dynamic))) (((''))) +UUID 00000000-0000-0000-0000-000000000000 +UUID dededdb6-7835-4ce4-8d11-b5de6f2820e9 +UInt8 0 +UInt8 1 +UInt8 254 +UInt8 255 +UInt16 0 +UInt16 1 +UInt16 65534 +UInt16 65535 +UInt32 0 +UInt32 1 +UInt32 4294967294 +UInt32 4294967295 +UInt64 0 +UInt64 1 +UInt64 18446744073709551614 +UInt64 18446744073709551615 +UInt128 0 +UInt128 1 +UInt128 340282366920938463463374607431768211454 +UInt128 340282366920938463463374607431768211455 +UInt256 0 +UInt256 1 +UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639934 +UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639935 + +Array(Dynamic) [] +Array(Array(Dynamic)) [[]] +Array(Array(Array(Dynamic))) [[[]]] +Bool false +Bool true +Date 2022-01-01 +Date32 2022-01-01 +DateTime 2022-01-01 01:01:01 +DateTime64(3) 2022-01-01 01:01:01.011 +Decimal(9, 1) -99999999.9 +Decimal(18, 2) -999999999.99 +Decimal(38, 3) -999999999.999 +Decimal(76, 4) -999999999.9999 +Float32 -inf +Float32 -inf +Float32 -inf +Float32 -3.4028233e38 +Float32 -1.1754942e-38 +Float32 -1e-45 +Float32 1e-45 +Float32 1.1754942e-38 +Float32 3.4028233e38 +Float32 inf +Float32 inf +Float32 inf +Float32 nan +Float32 nan +Float32 nan +Float64 -inf +Float64 -inf +Float64 -inf +Float64 -1.7976931348623157e308 +Float64 -3.40282347e38 +Float64 -1.1754943499999998e-38 +Float64 -1.3999999999999999e-45 +Float64 -2.2250738585072014e-308 +Float64 2.2250738585072014e-308 +Float64 1.3999999999999999e-45 +Float64 1.1754943499999998e-38 +Float64 3.40282347e38 +Float64 1.7976931348623157e308 +Float64 inf +Float64 inf +Float64 inf +Float64 nan +Float64 nan +Float64 nan +FixedString(1) 1 +FixedString(2) 1\0 +FixedString(10) 1\0\0\0\0\0\0\0\0\0 +IPv4 192.168.0.1 +IPv6 ::1 +Int8 -128 +Int8 -128 +Int8 -127 +Int8 -127 +Int8 -1 +Int8 -1 +Int8 0 +Int8 0 +Int8 1 +Int8 1 +Int8 126 +Int8 126 +Int8 127 +Int8 127 +Int16 -32768 +Int16 -32767 +Int16 -1 +Int16 0 +Int16 1 +Int16 32766 +Int16 32767 +Int32 -2147483648 
+Int32 -2147483647 +Int32 -1 +Int32 0 +Int32 1 +Int32 2147483646 +Int32 2147483647 +Int64 -9223372036854775808 +Int64 -9223372036854775807 +Int64 -1 +Int64 0 +Int64 1 +Int64 9223372036854775806 +Int64 9223372036854775807 +Int128 -170141183460469231731687303715884105728 +Int128 -170141183460469231731687303715884105727 +Int128 -1 +Int128 0 +Int128 1 +Int128 170141183460469231731687303715884105726 +Int128 170141183460469231731687303715884105727 +Int256 -57896044618658097711785492504343953926634992332820282019728792003956564819968 +Int256 -57896044618658097711785492504343953926634992332820282019728792003956564819967 +Int256 -1 +Int256 0 +Int256 1 +Int256 57896044618658097711785492504343953926634992332820282019728792003956564819966 +Int256 57896044618658097711785492504343953926634992332820282019728792003956564819967 +IntervalDay 1 +IntervalYear 3 +IntervalMonth 2 +LowCardinality(String) 1 +LowCardinality(String) 1 +LowCardinality(UInt16) 0 +MultiPolygon [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] +Map(Dynamic, Dynamic) {'11':'v1','22':'1'} +Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] +Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] +Point (1.23,4.5600000000000005) +Ring [(1.23,4.5600000000000005),(2.34,5.67)] +String string +SimpleAggregateFunction(anyLast, Array(Int16)) [1,2] +Tuple(Dynamic) ('') +Tuple(Tuple(Dynamic)) (('')) +Tuple(Tuple(Tuple(Dynamic))) (((''))) +UUID 00000000-0000-0000-0000-000000000000 +UUID dededdb6-7835-4ce4-8d11-b5de6f2820e9 +UInt8 0 +UInt8 1 +UInt8 254 +UInt8 255 +UInt16 0 +UInt16 1 +UInt16 65534 +UInt16 65535 +UInt32 0 +UInt32 1 +UInt32 4294967294 +UInt32 4294967295 +UInt64 0 +UInt64 1 +UInt64 18446744073709551614 +UInt64 18446744073709551615 +UInt128 0 +UInt128 1 +UInt128 340282366920938463463374607431768211454 +UInt128 340282366920938463463374607431768211455 +UInt256 0 +UInt256 1 +UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639934 +UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639935 + +48 +48 diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.sql b/tests/queries/0_stateless/03159_dynamic_type_all_types.sql new file mode 100644 index 00000000000..d302205ca23 --- /dev/null +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.sql @@ -0,0 +1,95 @@ +-- Tags: no-random-settings + +SET allow_experimental_dynamic_type=1; +SET allow_experimental_object_type=1; +SET allow_experimental_variant_type=1; +SET allow_suspicious_low_cardinality_types=1; + + +CREATE TABLE t (d Dynamic(max_types=255)) ENGINE = Memory; +-- Integer types: signed and unsigned integers (UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256) +INSERT INTO t VALUES (-128::Int8), (-127::Int8), (-1::Int8), (0::Int8), (1::Int8), (126::Int8), (127::Int8); +INSERT INTO t VALUES (-128::Int8), (-127::Int8), (-1::Int8), (0::Int8), (1::Int8), (126::Int8), (127::Int8); +INSERT INTO t VALUES (-32768::Int16), (-32767::Int16), (-1::Int16), (0::Int16), (1::Int16), (32766::Int16), (32767::Int16); +INSERT INTO t VALUES (-2147483648::Int32), (-2147483647::Int32), (-1::Int32), (0::Int32), (1::Int32), (2147483646::Int32), (2147483647::Int32); +INSERT INTO t VALUES (-9223372036854775808::Int64), (-9223372036854775807::Int64), (-1::Int64), (0::Int64), (1::Int64), (9223372036854775806::Int64), (9223372036854775807::Int64); 
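+-- The signed-integer INSERTs cover min, min+1, -1, 0, 1, max-1 and max for each width; the unsigned ones below cover 0, 1, max-1 and max.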
+INSERT INTO t VALUES (-170141183460469231731687303715884105728::Int128), (-170141183460469231731687303715884105727::Int128), (-1::Int128), (0::Int128), (1::Int128), (170141183460469231731687303715884105726::Int128), (170141183460469231731687303715884105727::Int128); +INSERT INTO t VALUES (-57896044618658097711785492504343953926634992332820282019728792003956564819968::Int256), (-57896044618658097711785492504343953926634992332820282019728792003956564819967::Int256), (-1::Int256), (0::Int256), (1::Int256), (57896044618658097711785492504343953926634992332820282019728792003956564819966::Int256), (57896044618658097711785492504343953926634992332820282019728792003956564819967::Int256); + +INSERT INTO t VALUES (0::UInt8), (1::UInt8), (254::UInt8), (255::UInt8); +INSERT INTO t VALUES (0::UInt16), (1::UInt16), (65534::UInt16), (65535::UInt16); +INSERT INTO t VALUES (0::UInt32), (1::UInt32), (4294967294::UInt32), (4294967295::UInt32); +INSERT INTO t VALUES (0::UInt64), (1::UInt64), (18446744073709551614::UInt64), (18446744073709551615::UInt64); +INSERT INTO t VALUES (0::UInt128), (1::UInt128), (340282366920938463463374607431768211454::UInt128), (340282366920938463463374607431768211455::UInt128); +INSERT INTO t VALUES (0::UInt256), (1::UInt256), (115792089237316195423570985008687907853269984665640564039457584007913129639934::UInt256), (115792089237316195423570985008687907853269984665640564039457584007913129639935::UInt256); + +-- Floating-point numbers: floats(Float32 and Float64) and Decimal values +INSERT INTO t VALUES (1.17549435e-38::Float32), (3.40282347e+38::Float32), (-3.40282347e+38::Float32), (-1.17549435e-38::Float32), (1.4e-45::Float32), (-1.4e-45::Float32); +INSERT INTO t VALUES (inf::Float32), (-inf::Float32), (nan::Float32); +INSERT INTO t VALUES (inf::FLOAT(12)), (-inf::FLOAT(12)), (nan::FLOAT(12)); +INSERT INTO t VALUES (inf::FLOAT(15,22)), (-inf::FLOAT(15,22)), (nan::FLOAT(15,22)); + +INSERT INTO t VALUES (1.17549435e-38::Float64), (3.40282347e+38::Float64), (-3.40282347e+38::Float64), (-1.17549435e-38::Float64), (1.4e-45::Float64), (-1.4e-45::Float64); +INSERT INTO t VALUES (2.2250738585072014e-308::Float64), (1.7976931348623157e+308::Float64), (-1.7976931348623157e+308::Float64), (-2.2250738585072014e-308::Float64); +INSERT INTO t VALUES (inf::Float64), (-inf::Float64), (nan::Float64); +INSERT INTO t VALUES (inf::DOUBLE(12)), (-inf::DOUBLE(12)), (nan::DOUBLE(12)); +INSERT INTO t VALUES (inf::DOUBLE(15,22)), (-inf::DOUBLE(15,22)), (nan::DOUBLE(15,22)); + +INSERT INTO t VALUES (-99999999.9::Decimal32(1)); +INSERT INTO t VALUES (-999999999.99::Decimal64(2)); +INSERT INTO t VALUES (-999999999.999::Decimal128(3)); +INSERT INTO t VALUES (-999999999.9999::Decimal256(4)); + +-- Strings: String and FixedString +INSERT INTO t VALUES ('string'::String), ('1'::FixedString(1)), ('1'::FixedString(2)), ('1'::FixedString(10)); --(''::String), + +-- Boolean +INSERT INTO t VALUES ('1'::Bool), (0::Bool); + +-- Dates: use Date and Date32 for days, and DateTime and DateTime64 for instances in time +INSERT INTO t VALUES ('2022-01-01'::Date), ('2022-01-01'::Date32), ('2022-01-01 01:01:01'::DateTime), ('2022-01-01 01:01:01.011'::DateTime64); + +-- UUID +INSERT INTO t VALUES ('dededdb6-7835-4ce4-8d11-b5de6f2820e9'::UUID); +INSERT INTO t VALUES ('00000000-0000-0000-0000-000000000000'::UUID); + +-- LowCardinality +INSERT INTO t VALUES ('1'::LowCardinality(String)), ('1'::LowCardinality(String)), (0::LowCardinality(UInt16)); + +-- Arrays +INSERT INTO t VALUES ([]::Array(Dynamic)), 
([[]]::Array(Array(Dynamic))), ([[[]]]::Array(Array(Array(Dynamic)))); + +-- Tuple +INSERT INTO t VALUES (()::Tuple(Dynamic)), ((())::Tuple(Tuple(Dynamic))), (((()))::Tuple(Tuple(Tuple(Dynamic)))); + +-- Map +INSERT INTO t VALUES (map(11::Dynamic, 'v1'::Dynamic, '22'::Dynamic, 1::Dynamic)); + +-- SimpleAggregateFunction +INSERT INTO t VALUES ([1,2]::SimpleAggregateFunction(anyLast, Array(Int16))); + +-- IPs +INSERT INTO t VALUES (toIPv4('192.168.0.1')), (toIPv6('::1')); + +-- Geo +INSERT INTO t VALUES ((1.23, 4.56)::Point), (([(1.23, 4.56)::Point, (2.34, 5.67)::Point])::Ring); +INSERT INTO t VALUES ([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]]::MultiPolygon); + +-- Interval +INSERT INTO t VALUES (interval '1' day), (interval '2' month), (interval '3' year); + +-- Nested +INSERT INTO t VALUES ([(1, 'aa'), (2, 'bb')]::Nested(x UInt32, y String)); +INSERT INTO t VALUES ([(1, (2, ['aa', 'bb']), [(3, 'cc'), (4, 'dd')]), (5, (6, ['ee', 'ff']), [(7, 'gg'), (8, 'hh')])]::Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String))); + +SELECT dynamicType(d), d FROM t ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d; + +CREATE TABLE t2 (d Dynamic(max_types=255)) ENGINE = Memory; +INSERT INTO t2 SELECT * FROM t; + +SELECT ''; +SELECT dynamicType(d), d FROM t2 ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d; + +SELECT ''; +SELECT uniqExact(dynamicType(d)) t_ FROM t; +SELECT uniqExact(dynamicType(d)) t_ FROM t2; diff --git a/tests/queries/0_stateless/03160_dynamic_type_agg.reference b/tests/queries/0_stateless/03160_dynamic_type_agg.reference new file mode 100644 index 00000000000..54f6e428839 --- /dev/null +++ b/tests/queries/0_stateless/03160_dynamic_type_agg.reference @@ -0,0 +1 @@ +4950 4950 diff --git a/tests/queries/0_stateless/03160_dynamic_type_agg.sql b/tests/queries/0_stateless/03160_dynamic_type_agg.sql new file mode 100644 index 00000000000..f99232031a8 --- /dev/null +++ b/tests/queries/0_stateless/03160_dynamic_type_agg.sql @@ -0,0 +1,10 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE t (d Dynamic) ENGINE = Memory; + +INSERT INTO t SELECT sumState(number) AS d FROM numbers(100); + +SELECT finalizeAggregation(d.`AggregateFunction(sum, UInt64)`), + sumMerge(d.`AggregateFunction(sum, UInt64)`) +FROM t GROUP BY d.`AggregateFunction(sum, UInt64)`; + diff --git a/tests/queries/0_stateless/03161_create_table_as_mv.reference b/tests/queries/0_stateless/03161_create_table_as_mv.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03161_create_table_as_mv.sql b/tests/queries/0_stateless/03161_create_table_as_mv.sql new file mode 100644 index 00000000000..e80659ac923 --- /dev/null +++ b/tests/queries/0_stateless/03161_create_table_as_mv.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS base_table; +DROP TABLE IF EXISTS target_table; +DROP TABLE IF EXISTS mv_from_base_to_target; +DROP TABLE IF EXISTS mv_with_storage; +DROP TABLE IF EXISTS other_table_1; +DROP TABLE IF EXISTS other_table_2; + +CREATE TABLE base_table (date DateTime, id String, cost Float64) ENGINE = MergeTree() ORDER BY date; +CREATE TABLE target_table (id String, total AggregateFunction(sum, Float64)) ENGINE = MergeTree() ORDER BY id; +CREATE MATERIALIZED VIEW mv_from_base_to_target TO target_table AS SELECT id, sumState(cost) FROM base_table GROUP BY id; +CREATE MATERIALIZED VIEW mv_with_storage ENGINE=MergeTree() ORDER BY id AS SELECT id, 
sumState(cost) FROM base_table GROUP BY id; + +CREATE TABLE other_table_1 AS mv_with_storage; +CREATE TABLE other_table_2 AS mv_from_base_to_target; -- { serverError INCORRECT_QUERY } diff --git a/tests/queries/0_stateless/03162_dynamic_type_nested.reference b/tests/queries/0_stateless/03162_dynamic_type_nested.reference new file mode 100644 index 00000000000..8d5bcb5f85a --- /dev/null +++ b/tests/queries/0_stateless/03162_dynamic_type_nested.reference @@ -0,0 +1,4 @@ + ┌─dynamicType(d)──────────────┬─d─────────────────────────────────────────┬─d.Nested(x UInt32, y Dynamic).x─┬─d.Nested(x UInt32, y Dynamic).y───┬─dynamicType(arrayElement(d.Nested(x UInt32, y Dynamic).y, 1))─┬─d.Nested(x UInt32, y Dynamic).y.String─┬─d.Nested(x UInt32, y Dynamic).y.Tuple(Int64, Array(String))─┐ +1. │ Nested(x UInt32, y Dynamic) │ [(1,'aa'),(2,'bb')] │ [1,2] │ ['aa','bb'] │ String │ ['aa','bb'] │ [(0,[]),(0,[])] │ +2. │ Nested(x UInt32, y Dynamic) │ [(1,(2,['aa','bb'])),(5,(6,['ee','ff']))] │ [1,5] │ [(2,['aa','bb']),(6,['ee','ff'])] │ Tuple(Int64, Array(String)) │ [NULL,NULL] │ [(2,['aa','bb']),(6,['ee','ff'])] │ + └─────────────────────────────┴───────────────────────────────────────────┴─────────────────────────────────┴───────────────────────────────────┴───────────────────────────────────────────────────────────────┴────────────────────────────────────────┴─────────────────────────────────────────────────────────────┘ diff --git a/tests/queries/0_stateless/03162_dynamic_type_nested.sql b/tests/queries/0_stateless/03162_dynamic_type_nested.sql new file mode 100644 index 00000000000..94007459a9e --- /dev/null +++ b/tests/queries/0_stateless/03162_dynamic_type_nested.sql @@ -0,0 +1,16 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE t (d Dynamic) ENGINE = Memory; + +INSERT INTO t VALUES ([(1, 'aa'), (2, 'bb')]::Nested(x UInt32, y Dynamic)) ; +INSERT INTO t VALUES ([(1, (2, ['aa', 'bb'])), (5, (6, ['ee', 'ff']))]::Nested(x UInt32, y Dynamic)); + +SELECT dynamicType(d), + d, + d.`Nested(x UInt32, y Dynamic)`.x, + d.`Nested(x UInt32, y Dynamic)`.y, + dynamicType(d.`Nested(x UInt32, y Dynamic)`.y[1]), + d.`Nested(x UInt32, y Dynamic)`.y.`String`, + d.`Nested(x UInt32, y Dynamic)`.y.`Tuple(Int64, Array(String))` +FROM t ORDER BY d +FORMAT PrettyCompactMonoBlock; diff --git a/tests/queries/0_stateless/03163_dynamic_as_supertype.reference b/tests/queries/0_stateless/03163_dynamic_as_supertype.reference new file mode 100644 index 00000000000..33e3a15c7fb --- /dev/null +++ b/tests/queries/0_stateless/03163_dynamic_as_supertype.reference @@ -0,0 +1,10 @@ +str_0 Dynamic(max_types=3) String +1 Dynamic(max_types=3) UInt64 +str_2 Dynamic(max_types=3) String +3 Dynamic(max_types=3) UInt64 +[1,2,3] Array(Int64) +2020-01-01 Date +str_1 String +str_2 String +42 UInt64 +43 UInt64 diff --git a/tests/queries/0_stateless/03163_dynamic_as_supertype.sql b/tests/queries/0_stateless/03163_dynamic_as_supertype.sql new file mode 100644 index 00000000000..baba637eea4 --- /dev/null +++ b/tests/queries/0_stateless/03163_dynamic_as_supertype.sql @@ -0,0 +1,8 @@ +SET allow_experimental_dynamic_type=1; +SELECT if(number % 2, number::Dynamic(max_types=3), ('str_' || toString(number))::Dynamic(max_types=2)) AS d, toTypeName(d), dynamicType(d) FROM numbers(4); +CREATE TABLE dynamic_test_1 (d Dynamic(max_types=3)) ENGINE = Memory; +INSERT INTO dynamic_test_1 VALUES ('str_1'), (42::UInt64); +CREATE TABLE dynamic_test_2 (d Dynamic(max_types=5)) ENGINE = Memory; +INSERT INTO dynamic_test_2 VALUES ('str_2'), (43::UInt64), 
('2020-01-01'::Date), ([1, 2, 3]); +SELECT * FROM (SELECT d, dynamicType(d) FROM dynamic_test_1 UNION ALL SELECT d, dynamicType(d) FROM dynamic_test_2) order by d; + diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 2a9aa259fdd..1c601bc200a 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -53,6 +53,8 @@ AutoFDO AutoML Autocompletion AvroConfluent +analysisOfVariance +ANOVA BIGINT BIGSERIAL BORO @@ -763,6 +765,7 @@ Promtail Protobuf ProtobufSingle ProxySQL +proportionsZTest Punycode PyArrow PyCharm @@ -2761,6 +2764,7 @@ unixODBC unixodbc unoptimized unparsed +unpooled unrealiable unreplicated unresolvable diff --git a/utils/check-style/check-large-objects.sh b/utils/check-style/check-large-objects.sh index e2266e89556..2122cca911e 100755 --- a/utils/check-style/check-large-objects.sh +++ b/utils/check-style/check-large-objects.sh @@ -7,8 +7,6 @@ export LC_ALL=C # The "total" should be printed without localization TU_EXCLUDES=( AggregateFunctionUniq Aggregator - # FIXME: Exclude for now - FunctionsConversion ) if find $1 -name '*.o' | xargs wc -c | grep --regexp='\.o$' | sort -rn | awk '{ if ($1 > 50000000) print }' \