diff --git a/.github/ISSUE_TEMPLATE/85_bug-report.md b/.github/ISSUE_TEMPLATE/85_bug-report.md index 93b2342af70..6bf265260ac 100644 --- a/.github/ISSUE_TEMPLATE/85_bug-report.md +++ b/.github/ISSUE_TEMPLATE/85_bug-report.md @@ -17,7 +17,7 @@ assignees: '' > A link to reproducer in [https://fiddle.clickhouse.com/](https://fiddle.clickhouse.com/). -**Does it reproduce on recent release?** +**Does it reproduce on the most recent release?** [The list of releases](https://github.com/ClickHouse/ClickHouse/blob/master/utils/list-versions/version_date.tsv) @@ -34,11 +34,11 @@ assignees: '' **How to reproduce** * Which ClickHouse server version to use -* Which interface to use, if matters +* Which interface to use, if it matters * Non-default settings, if any * `CREATE TABLE` statements for all tables involved * Sample data for all these tables, use [clickhouse-obfuscator](https://github.com/ClickHouse/ClickHouse/blob/master/programs/obfuscator/Obfuscator.cpp#L42-L80) if necessary -* Queries to run that lead to unexpected result +* Queries to run that lead to an unexpected result **Expected behavior** diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index ef554a1b0ff..6b05f1fe9f4 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -138,19 +138,26 @@ jobs: ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ - DockerServerImages: + DockerServerImage: needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_test.yml with: - test_name: Docker server and keeper images + test_name: Docker server image runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 # It MUST BE THE SAME for all dependencies and the job itself run_command: | - cd "$GITHUB_WORKSPACE/tests/ci" python3 docker_server.py --release-type head --no-push \ --image-repo clickhouse/clickhouse-server --image-path docker/server --allow-build-reuse + DockerKeeperImage: + needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Docker keeper image + runner_type: style-checker + data: ${{ needs.RunConfig.outputs.data }} + run_command: | python3 docker_server.py --release-type head --no-push \ --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper --allow-build-reuse ############################################################################################ diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 6d150f37a27..24daca44da6 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -35,7 +35,7 @@ jobs: - name: PrepareRunConfig id: runconfig run: | - python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --rebuild-all-binaries --outfile ${{ runner.temp }}/ci_run_data.json + python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --outfile ${{ runner.temp }}/ci_run_data.json echo "::group::CI configuration" python3 -m json.tool ${{ runner.temp }}/ci_run_data.json @@ -55,7 +55,6 @@ jobs: uses: ./.github/workflows/reusable_docker.yml with: data: ${{ needs.RunConfig.outputs.data }} - set_latest: true StyleCheck: needs: [RunConfig, BuildDockers] if: ${{ 
!failure() && !cancelled() }} @@ -98,6 +97,14 @@ jobs: build_name: package_release checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + BuilderDebReleaseCoverage: + needs: [RunConfig, BuildDockers] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release_coverage + checkout_depth: 0 + data: ${{ needs.RunConfig.outputs.data }} BuilderDebAarch64: needs: [RunConfig, BuildDockers] if: ${{ !failure() && !cancelled() }} @@ -242,20 +249,26 @@ jobs: ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ - DockerServerImages: + DockerServerImage: needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_test.yml with: - test_name: Docker server and keeper images + test_name: Docker server image runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - # FIXME: avoid using 0 checkout - checkout_depth: 0 # It MUST BE THE SAME for all dependencies and the job itself run_command: | - cd "$GITHUB_WORKSPACE/tests/ci" python3 docker_server.py --release-type head \ --image-repo clickhouse/clickhouse-server --image-path docker/server --allow-build-reuse + DockerKeeperImage: + needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Docker keeper image + runner_type: style-checker + data: ${{ needs.RunConfig.outputs.data }} + run_command: | python3 docker_server.py --release-type head \ --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper --allow-build-reuse ############################################################################################ @@ -272,6 +285,7 @@ jobs: - BuilderDebDebug - BuilderDebMsan - BuilderDebRelease + - BuilderDebReleaseCoverage - BuilderDebTsan - BuilderDebUBsan uses: ./.github/workflows/reusable_test.yml @@ -313,7 +327,7 @@ jobs: run_command: | python3 build_report_check.py "$CHECK_NAME" MarkReleaseReady: - if: ${{ !failure() && !cancelled() }} + if: ${{ ! 
(contains(needs.*.result, 'skipped') || contains(needs.*.result, 'failure')) }} needs: - BuilderBinDarwin - BuilderBinDarwinAarch64 @@ -323,8 +337,6 @@ jobs: steps: - name: Check out repository code uses: ClickHouse/checkout@v1 - with: - clear-repository: true - name: Mark Commit Release Ready run: | cd "$GITHUB_WORKSPACE/tests/ci" @@ -363,14 +375,6 @@ jobs: test_name: Stateless tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestReleaseDatabaseOrdinary: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (release, DatabaseOrdinary) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} FunctionalStatelessTestReleaseDatabaseReplicated: needs: [RunConfig, BuilderDebRelease] if: ${{ !failure() && !cancelled() }} @@ -395,6 +399,22 @@ jobs: test_name: Stateless tests (release, s3 storage) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatelessTestS3Debug: + needs: [RunConfig, BuilderDebDebug] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateless tests (debug, s3 storage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatelessTestS3Tsan: + needs: [RunConfig, BuilderDebTsan] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateless tests (tsan, s3 storage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} FunctionalStatelessTestAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} @@ -503,6 +523,55 @@ jobs: test_name: Stateful tests (debug) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + # Parallel replicas + FunctionalStatefulTestDebugParallelReplicas: + needs: [RunConfig, BuilderDebDebug] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (debug, ParallelReplicas) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestUBsanParallelReplicas: + needs: [RunConfig, BuilderDebUBsan] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (ubsan, ParallelReplicas) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestMsanParallelReplicas: + needs: [RunConfig, BuilderDebMsan] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (msan, ParallelReplicas) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestTsanParallelReplicas: + needs: [RunConfig, BuilderDebTsan] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (tsan, ParallelReplicas) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestAsanParallelReplicas: + needs: [RunConfig, BuilderDebAsan] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (asan, ParallelReplicas) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestReleaseParallelReplicas: + needs: [RunConfig, BuilderDebRelease] + if: ${{ !failure() && !cancelled() }} + uses: 
./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (release, ParallelReplicas) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} ############################################################################################## ########################### ClickBench ####################################################### ############################################################################################## @@ -710,6 +779,28 @@ jobs: runner_type: func-tester-aarch64 data: ${{ needs.RunConfig.outputs.data }} ############################################################################################## +############################ SQLLOGIC TEST ################################################### +############################################################################################## + SQLLogicTestRelease: + needs: [RunConfig, BuilderDebRelease] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Sqllogic test (release) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} +############################################################################################## +##################################### SQL TEST ############################################### +############################################################################################## + SQLTest: + needs: [RunConfig, BuilderDebRelease] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: SQLTest + runner_type: fuzzer-unit-tester + data: ${{ needs.RunConfig.outputs.data }} +############################################################################################## ###################################### SQLANCER FUZZERS ###################################### ############################################################################################## SQLancerTestRelease: @@ -734,7 +825,6 @@ jobs: - MarkReleaseReady - FunctionalStatelessTestDebug - FunctionalStatelessTestRelease - - FunctionalStatelessTestReleaseDatabaseOrdinary - FunctionalStatelessTestReleaseDatabaseReplicated - FunctionalStatelessTestReleaseAnalyzer - FunctionalStatelessTestReleaseS3 @@ -743,6 +833,8 @@ jobs: - FunctionalStatelessTestTsan - FunctionalStatelessTestMsan - FunctionalStatelessTestUBsan + - FunctionalStatelessTestS3Debug + - FunctionalStatelessTestS3Tsan - FunctionalStatefulTestDebug - FunctionalStatefulTestRelease - FunctionalStatefulTestAarch64 @@ -750,6 +842,12 @@ jobs: - FunctionalStatefulTestTsan - FunctionalStatefulTestMsan - FunctionalStatefulTestUBsan + - FunctionalStatefulTestDebugParallelReplicas + - FunctionalStatefulTestUBsanParallelReplicas + - FunctionalStatefulTestMsanParallelReplicas + - FunctionalStatefulTestTsanParallelReplicas + - FunctionalStatefulTestAsanParallelReplicas + - FunctionalStatefulTestReleaseParallelReplicas - StressTestDebug - StressTestAsan - StressTestTsan @@ -775,6 +873,8 @@ jobs: - UnitTestsReleaseClang - SQLancerTestRelease - SQLancerTestDebug + - SQLLogicTestRelease + - SQLTest runs-on: [self-hosted, style-checker] steps: - name: Check out repository code diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 2774eae24cc..770e1ec3789 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -28,7 +28,7 @@ jobs: id: runconfig run: | echo "::group::configure CI run" - python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --skip-jobs --rebuild-all-docker --outfile ${{ 
runner.temp }}/ci_run_data.json + python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --skip-jobs --outfile ${{ runner.temp }}/ci_run_data.json echo "::endgroup::" echo "::group::CI run configure results" diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index b3ac2135e50..c9cf5ab90dd 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -104,7 +104,7 @@ jobs: if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_test.yml with: - test_name: Fast tests + test_name: Fast test runner_type: builder data: ${{ needs.RunConfig.outputs.data }} run_command: | @@ -147,6 +147,14 @@ jobs: build_name: package_release checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + BuilderDebReleaseCoverage: + needs: [RunConfig, FastTest] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release_coverage + checkout_depth: 0 + data: ${{ needs.RunConfig.outputs.data }} BuilderDebAarch64: needs: [RunConfig, FastTest] if: ${{ !failure() && !cancelled() }} @@ -273,19 +281,26 @@ jobs: ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ - DockerServerImages: + DockerServerImage: needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_test.yml with: - test_name: Docker server and keeper images + test_name: Docker server image runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 # It MUST BE THE SAME for all dependencies and the job itself run_command: | - cd "$GITHUB_WORKSPACE/tests/ci" python3 docker_server.py --release-type head --no-push \ --image-repo clickhouse/clickhouse-server --image-path docker/server --allow-build-reuse + DockerKeeperImage: + needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Docker keeper image + runner_type: style-checker + data: ${{ needs.RunConfig.outputs.data }} + run_command: | python3 docker_server.py --release-type head --no-push \ --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper --allow-build-reuse ############################################################################################ @@ -302,6 +317,7 @@ jobs: - BuilderDebDebug - BuilderDebMsan - BuilderDebRelease + - BuilderDebReleaseCoverage - BuilderDebTsan - BuilderDebUBsan uses: ./.github/workflows/reusable_test.yml @@ -476,21 +492,9 @@ jobs: if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_test.yml with: - test_name: tests bugfix validate check + test_name: Bugfix validation runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} - additional_envs: | - KILL_TIMEOUT=3600 - run_command: | - TEMP_PATH="${TEMP_PATH}/integration" \ - python3 integration_test_check.py "Integration $CHECK_NAME" \ - --validate-bugfix --post-commit-status=file || echo 'ignore exit code' - - TEMP_PATH="${TEMP_PATH}/stateless" \ - python3 functional_test_check.py "Stateless $CHECK_NAME" "$KILL_TIMEOUT" \ - --validate-bugfix --post-commit-status=file || echo 'ignore exit code' - - python3 bugfix_validate_check.py "${TEMP_PATH}/stateless/functional_commit_status.tsv" 
"${TEMP_PATH}/integration/integration_commit_status.tsv" ############################################################################################## ############################ FUNCTIONAl STATEFUL TESTS ####################################### ############################################################################################## @@ -778,6 +782,15 @@ jobs: test_name: Integration tests (release) runner_type: stress-tester data: ${{ needs.RunConfig.outputs.data }} + IntegrationTestsAarch64: + needs: [RunConfig, BuilderDebAarch64] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Integration tests (aarch64) + # FIXME: there is no stress-tester for aarch64. func-tester-aarch64 is ok? + runner_type: func-tester-aarch64 + data: ${{ needs.RunConfig.outputs.data }} IntegrationTestsFlakyCheck: needs: [RunConfig, BuilderDebAsan] if: ${{ !failure() && !cancelled() }} @@ -874,6 +887,7 @@ jobs: - BuilderSpecialReport - DocsCheck - FastTest + - TestsBugfixCheck - FunctionalStatelessTestDebug - FunctionalStatelessTestRelease - FunctionalStatelessTestReleaseDatabaseReplicated @@ -917,6 +931,7 @@ jobs: - IntegrationTestsAnalyzerAsan - IntegrationTestsTsan - IntegrationTestsRelease + - IntegrationTestsAarch64 - IntegrationTestsFlakyCheck - PerformanceComparisonX86 - PerformanceComparisonAarch @@ -985,7 +1000,7 @@ jobs: ####################################### libFuzzer ########################################### ############################################################################################# libFuzzer: - if: ${{ !failure() && !cancelled() && contains(github.event.pull_request.labels.*.name, 'libFuzzer') }} + if: ${{ !failure() && !cancelled() }} needs: [RunConfig, StyleCheck] uses: ./.github/workflows/libfuzzer.yml with: diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 69229ef75df..c076c2209ec 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -41,7 +41,7 @@ jobs: id: runconfig run: | echo "::group::configure CI run" - python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --rebuild-all-binaries --outfile ${{ runner.temp }}/ci_run_data.json + python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --outfile ${{ runner.temp }}/ci_run_data.json echo "::endgroup::" echo "::group::CI run configure results" python3 -m json.tool ${{ runner.temp }}/ci_run_data.json @@ -91,6 +91,8 @@ jobs: build_name: package_release checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + # always rebuild on release branches to be able to publish from any commit + force: true BuilderDebAarch64: needs: [RunConfig, BuildDockers] if: ${{ !failure() && !cancelled() }} @@ -99,6 +101,8 @@ jobs: build_name: package_aarch64 checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + # always rebuild on release branches to be able to publish from any commit + force: true BuilderDebAsan: needs: [RunConfig, BuildDockers] if: ${{ !failure() && !cancelled() }} @@ -142,6 +146,8 @@ jobs: build_name: binary_darwin checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + # always rebuild on release branches to be able to publish from any commit + force: true BuilderBinDarwinAarch64: needs: [RunConfig, BuildDockers] if: ${{ !failure() && !cancelled() }} @@ -150,22 +156,31 @@ jobs: build_name: binary_darwin_aarch64 checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + # always rebuild on release branches to be able to publish from any commit + force: true 
############################################################################################ ##################################### Docker images ####################################### ############################################################################################ - DockerServerImages: + DockerServerImage: needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_test.yml with: - test_name: Docker server and keeper images + test_name: Docker server image runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 run_command: | - cd "$GITHUB_WORKSPACE/tests/ci" python3 docker_server.py --release-type head --no-push \ --image-repo clickhouse/clickhouse-server --image-path docker/server --allow-build-reuse + DockerKeeperImage: + needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Docker keeper image + runner_type: style-checker + data: ${{ needs.RunConfig.outputs.data }} + run_command: | python3 docker_server.py --release-type head --no-push \ --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper --allow-build-reuse ############################################################################################ @@ -199,13 +214,8 @@ jobs: if: ${{ !cancelled() }} needs: - RunConfig - - BuilderDebRelease - - BuilderDebAarch64 - - BuilderDebAsan - - BuilderDebTsan - - BuilderDebUBsan - - BuilderDebMsan - - BuilderDebDebug + - BuilderBinDarwin + - BuilderBinDarwinAarch64 uses: ./.github/workflows/reusable_test.yml with: test_name: ClickHouse special build check @@ -218,7 +228,7 @@ jobs: run_command: | python3 build_report_check.py "$CHECK_NAME" MarkReleaseReady: - if: ${{ !failure() && !cancelled() }} + if: ${{ ! 
(contains(needs.*.result, 'skipped') || contains(needs.*.result, 'failure')) }} needs: - BuilderBinDarwin - BuilderBinDarwinAarch64 @@ -228,8 +238,6 @@ jobs: steps: - name: Check out repository code uses: ClickHouse/checkout@v1 - with: - clear-repository: true - name: Mark Commit Release Ready run: | cd "$GITHUB_WORKSPACE/tests/ci" @@ -456,7 +464,8 @@ jobs: FinishCheck: if: ${{ !failure() && !cancelled() }} needs: - - DockerServerImages + - DockerServerImage + - DockerKeeperImage - BuilderReport - BuilderSpecialReport - MarkReleaseReady diff --git a/.github/workflows/reusable_build.yml b/.github/workflows/reusable_build.yml index 2371579692f..80d78d93e1b 100644 --- a/.github/workflows/reusable_build.yml +++ b/.github/workflows/reusable_build.yml @@ -26,6 +26,10 @@ name: Build ClickHouse description: json ci data type: string required: true + force: + description: disallow job skipping + type: boolean + default: false additional_envs: description: additional ENV variables to setup the job type: string @@ -33,7 +37,7 @@ name: Build ClickHouse jobs: Build: name: Build-${{inputs.build_name}} - if: contains(fromJson(inputs.data).jobs_data.jobs_to_do, inputs.build_name) + if: ${{ contains(fromJson(inputs.data).jobs_data.jobs_to_do, inputs.build_name) || inputs.force }} env: GITHUB_JOB_OVERRIDDEN: Build-${{inputs.build_name}} runs-on: [self-hosted, '${{inputs.runner_type}}'] @@ -78,13 +82,15 @@ jobs: python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" \ --infile ${{ toJson(inputs.data) }} \ --job-name "$BUILD_NAME" \ - --run + --run \ + ${{ inputs.force && '--force' || '' }} - name: Post # it still be build report to upload for failed build job if: ${{ !cancelled() }} run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --post --job-name '${{inputs.build_name}}' - name: Mark as done + if: ${{ !cancelled() }} run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --mark-success --job-name '${{inputs.build_name}}' - name: Clean diff --git a/.github/workflows/reusable_docker.yml b/.github/workflows/reusable_docker.yml index 08a5740e7e0..3fe1a8883c6 100644 --- a/.github/workflows/reusable_docker.yml +++ b/.github/workflows/reusable_docker.yml @@ -46,7 +46,7 @@ jobs: needs: [DockerBuildAmd64, DockerBuildAarch64] runs-on: [self-hosted, style-checker] if: | - !failure() && !cancelled() && toJson(fromJson(inputs.data).docker_data.missing_multi) != '[]' + !failure() && !cancelled() && (toJson(fromJson(inputs.data).docker_data.missing_multi) != '[]' || inputs.set_latest) steps: - name: Check out repository code uses: ClickHouse/checkout@v1 @@ -55,14 +55,12 @@ jobs: - name: Build images run: | cd "$GITHUB_WORKSPACE/tests/ci" + FLAG_LATEST='' if [ "${{ inputs.set_latest }}" == "true" ]; then + FLAG_LATEST='--set-latest' echo "latest tag will be set for resulting manifests" - python3 docker_manifests_merge.py --suffix amd64 --suffix aarch64 \ - --image-tags '${{ toJson(fromJson(inputs.data).docker_data.images) }}' \ - --missing-images '${{ toJson(fromJson(inputs.data).docker_data.missing_multi) }}' \ - --set-latest - else - python3 docker_manifests_merge.py --suffix amd64 --suffix aarch64 \ - --image-tags '${{ toJson(fromJson(inputs.data).docker_data.images) }}' \ - --missing-images '${{ toJson(fromJson(inputs.data).docker_data.missing_multi) }}' fi + python3 docker_manifests_merge.py --suffix amd64 --suffix aarch64 \ + --image-tags '${{ toJson(fromJson(inputs.data).docker_data.images) }}' \ + --missing-images '${{ 
toJson(fromJson(inputs.data).docker_data.missing_multi) }}' \ + $FLAG_LATEST diff --git a/.github/workflows/reusable_test.yml b/.github/workflows/reusable_test.yml index 749f64d434e..e30ef863a86 100644 --- a/.github/workflows/reusable_test.yml +++ b/.github/workflows/reusable_test.yml @@ -107,6 +107,7 @@ jobs: run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --post --job-name '${{inputs.test_name}}' - name: Mark as done + if: ${{ !cancelled() }} run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --mark-success --job-name '${{inputs.test_name}}' --batch ${{matrix.batch}} - name: Clean diff --git a/.github/workflows/tags_stable.yml b/.github/workflows/tags_stable.yml index 0a3945829ca..e4fc9f0b1d3 100644 --- a/.github/workflows/tags_stable.yml +++ b/.github/workflows/tags_stable.yml @@ -55,7 +55,7 @@ jobs: python3 ./utils/security-generator/generate_security.py > SECURITY.md git diff HEAD - name: Create Pull Request - uses: peter-evans/create-pull-request@v3 + uses: peter-evans/create-pull-request@v6 with: author: "robot-clickhouse " token: ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }} diff --git a/.gitmessage b/.gitmessage index 098b66aab1c..760cfec97a4 100644 --- a/.gitmessage +++ b/.gitmessage @@ -1,6 +1,6 @@ -### CI modificators (add a leading space to apply): +### CI modificators (add a leading space to apply) ### ## To avoid a merge commit in CI: #no_merge_commit @@ -8,12 +8,21 @@ ## To discard CI cache: #no_ci_cache +## To not test (only style check): +#do_not_test + ## To run specified set of tests in CI: #ci_set_ #ci_set_reduced +#ci_set_arm +#ci_set_integration ## To run specified job in CI: #job_ #job_stateless_tests_release #job_package_debug #job_integration_tests_asan + +## To run only specified batches for multi-batch job(s) +#batch_2 +#btach_1_2_3 diff --git a/.gitmodules b/.gitmodules index 68016bf8c5b..a618104f364 100644 --- a/.gitmodules +++ b/.gitmodules @@ -99,7 +99,7 @@ url = https://github.com/awslabs/aws-c-event-stream [submodule "aws-c-common"] path = contrib/aws-c-common - url = https://github.com/ClickHouse/aws-c-common + url = https://github.com/awslabs/aws-c-common.git [submodule "aws-checksums"] path = contrib/aws-checksums url = https://github.com/awslabs/aws-checksums diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b36142cc9f..b3e5dd709ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,2164 +1,175 @@ ### Table of Contents -**[ClickHouse release v23.12, 2023-12-28](#2312)**
-**[ClickHouse release v23.11, 2023-12-06](#2311)**
-**[ClickHouse release v23.10, 2023-11-02](#2310)**
-**[ClickHouse release v23.9, 2023-09-28](#239)**
-**[ClickHouse release v23.8 LTS, 2023-08-31](#238)**
-**[ClickHouse release v23.7, 2023-07-27](#237)**
-**[ClickHouse release v23.6, 2023-06-30](#236)**
-**[ClickHouse release v23.5, 2023-06-08](#235)**
-**[ClickHouse release v23.4, 2023-04-26](#234)**
-**[ClickHouse release v23.3 LTS, 2023-03-30](#233)**
-**[ClickHouse release v23.2, 2023-02-23](#232)**
-**[ClickHouse release v23.1, 2023-01-25](#231)**
-**[Changelog for 2022](https://clickhouse.com/docs/en/whats-new/changelog/2022/)**
+**[ClickHouse release v24.1, 2024-01-30](#241)**
+**[Changelog for 2023](https://clickhouse.com/docs/en/whats-new/changelog/2023/)**
-# 2023 Changelog +# 2024 Changelog -### ClickHouse release 23.12, 2023-12-28 +### ClickHouse release 24.1, 2024-01-30 #### Backward Incompatible Change -* Fix check for non-deterministic functions in TTL expressions. Previously, you could create a TTL expression with non-deterministic functions in some cases, which could lead to undefined behavior later. This fixes [#37250](https://github.com/ClickHouse/ClickHouse/issues/37250). Disallow TTL expressions that don't depend on any columns of a table by default. It can be allowed back by `SET allow_suspicious_ttl_expressions = 1` or `SET compatibility = '23.11'`. Closes [#37286](https://github.com/ClickHouse/ClickHouse/issues/37286). [#51858](https://github.com/ClickHouse/ClickHouse/pull/51858) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The MergeTree setting `clean_deleted_rows` is deprecated, it has no effect anymore. The `CLEANUP` keyword for the `OPTIMIZE` is not allowed by default (it can be unlocked with the `allow_experimental_replacing_merge_with_cleanup` setting). [#58267](https://github.com/ClickHouse/ClickHouse/pull/58267) ([Alexander Tokmakov](https://github.com/tavplubix)). This fixes [#57930](https://github.com/ClickHouse/ClickHouse/issues/57930). This closes [#54988](https://github.com/ClickHouse/ClickHouse/issues/54988). This closes [#54570](https://github.com/ClickHouse/ClickHouse/issues/54570). This closes [#50346](https://github.com/ClickHouse/ClickHouse/issues/50346). This closes [#47579](https://github.com/ClickHouse/ClickHouse/issues/47579). The feature has to be removed because it is not good. We have to remove it as quickly as possible, because there is no other option. [#57932](https://github.com/ClickHouse/ClickHouse/pull/57932) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The setting `print_pretty_type_names` is turned on by default. You can turn it off to keep the old behavior or `SET compatibility = '23.12'`. [#57726](https://github.com/ClickHouse/ClickHouse/pull/57726) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The MergeTree setting `clean_deleted_rows` is deprecated, it has no effect anymore. The `CLEANUP` keyword for `OPTIMIZE` is not allowed by default (unless `allow_experimental_replacing_merge_with_cleanup` is enabled). [#58316](https://github.com/ClickHouse/ClickHouse/pull/58316) ([Alexander Tokmakov](https://github.com/tavplubix)). +* The function `reverseDNSQuery` is no longer available. This closes [#58368](https://github.com/ClickHouse/ClickHouse/issues/58368). [#58369](https://github.com/ClickHouse/ClickHouse/pull/58369) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Enable various changes to improve the access control in the configuration file. These changes affect the behavior, and you check the `config.xml` in the `access_control_improvements` section. In case you are not confident, keep the values in the configuration file as they were in the previous version. [#58584](https://github.com/ClickHouse/ClickHouse/pull/58584) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Improve the operation of `sumMapFiltered` with NaN values. NaN values are now placed at the end (instead of randomly) and considered different from any values. `-0` is now also treated as equal to `0`; since 0 values are discarded, `-0` values are discarded too. [#58959](https://github.com/ClickHouse/ClickHouse/pull/58959) ([Raúl Marín](https://github.com/Algunenano)). +* The function `visibleWidth` will behave according to the docs. 
In previous versions, it simply counted code points after string serialization, like the `lengthUTF8` function, but didn't consider zero-width and combining characters, full-width characters, tabs, and deletes. Now the behavior is changed accordingly. If you want to keep the old behavior, set `function_visible_width_behavior` to `0`, or set `compatibility` to `23.12` or lower. [#59022](https://github.com/ClickHouse/ClickHouse/pull/59022) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* `Kusto` dialect is disabled until these two bugs will be fixed: [#59037](https://github.com/ClickHouse/ClickHouse/issues/59037) and [#59036](https://github.com/ClickHouse/ClickHouse/issues/59036). [#59305](https://github.com/ClickHouse/ClickHouse/pull/59305) ([Alexey Milovidov](https://github.com/alexey-milovidov)). Any attempt to use `Kusto` will result in exception. +* More efficient implementation of the `FINAL` modifier no longer guarantees preserving the order even if `max_threads = 1`. If you counted on the previous behavior, set `enable_vertical_final` to 0 or `compatibility` to `23.12`. #### New Feature -* Implement Refreshable Materialized Views, requested in [#33919](https://github.com/ClickHouse/ClickHouse/issues/33919). [#56946](https://github.com/ClickHouse/ClickHouse/pull/56946) ([Michael Kolupaev](https://github.com/al13n321), [Michael Guzov](https://github.com/koloshmet)). -* Introduce `PASTE JOIN`, which allows users to join tables without `ON` clause simply by row numbers. Example: `SELECT * FROM (SELECT number AS a FROM numbers(2)) AS t1 PASTE JOIN (SELECT number AS a FROM numbers(2) ORDER BY a DESC) AS t2`. [#57995](https://github.com/ClickHouse/ClickHouse/pull/57995) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* The `ORDER BY` clause now supports specifying `ALL`, meaning that ClickHouse sorts by all columns in the `SELECT` clause. Example: `SELECT col1, col2 FROM tab WHERE [...] ORDER BY ALL`. [#57875](https://github.com/ClickHouse/ClickHouse/pull/57875) ([zhongyuankai](https://github.com/zhongyuankai)). -* Added a new mutation command `ALTER TABLE APPLY DELETED MASK`, which allows to enforce applying of mask written by lightweight delete and to remove rows marked as deleted from disk. [#57433](https://github.com/ClickHouse/ClickHouse/pull/57433) ([Anton Popov](https://github.com/CurtizJ)). -* A handler `/binary` opens a visual viewer of symbols inside the ClickHouse binary. [#58211](https://github.com/ClickHouse/ClickHouse/pull/58211) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Added a new SQL function `sqid` to generate Sqids (https://sqids.org/), example: `SELECT sqid(125, 126)`. [#57512](https://github.com/ClickHouse/ClickHouse/pull/57512) ([Robert Schulze](https://github.com/rschu1ze)). -* Add a new function `seriesPeriodDetectFFT` to detect series period using FFT. [#57574](https://github.com/ClickHouse/ClickHouse/pull/57574) ([Bhavna Jindal](https://github.com/bhavnajindal)). -* Add an HTTP endpoint for checking if Keeper is ready to accept traffic. [#55876](https://github.com/ClickHouse/ClickHouse/pull/55876) ([Konstantin Bogdanov](https://github.com/thevar1able)). -* Add 'union' mode for schema inference. In this mode the resulting table schema is the union of all files schemas (so schema is inferred from each file). The mode of schema inference is controlled by a setting `schema_inference_mode` with two possible values - `default` and `union`. Closes [#55428](https://github.com/ClickHouse/ClickHouse/issues/55428). 
[#55892](https://github.com/ClickHouse/ClickHouse/pull/55892) ([Kruglov Pavel](https://github.com/Avogar)). -* Add new setting `input_format_csv_try_infer_numbers_from_strings` that allows to infer numbers from strings in CSV format. Closes [#56455](https://github.com/ClickHouse/ClickHouse/issues/56455). [#56859](https://github.com/ClickHouse/ClickHouse/pull/56859) ([Kruglov Pavel](https://github.com/Avogar)). -* When the number of databases or tables exceeds a configurable threshold, show a warning to the user. [#57375](https://github.com/ClickHouse/ClickHouse/pull/57375) ([凌涛](https://github.com/lingtaolf)). -* Dictionary with `HASHED_ARRAY` (and `COMPLEX_KEY_HASHED_ARRAY`) layout supports `SHARDS` similarly to `HASHED`. [#57544](https://github.com/ClickHouse/ClickHouse/pull/57544) ([vdimir](https://github.com/vdimir)). -* Add asynchronous metrics for total primary key bytes and total allocated primary key bytes in memory. [#57551](https://github.com/ClickHouse/ClickHouse/pull/57551) ([Bharat Nallan](https://github.com/bharatnc)). -* Add `SHA512_256` function. [#57645](https://github.com/ClickHouse/ClickHouse/pull/57645) ([Bharat Nallan](https://github.com/bharatnc)). -* Add `FORMAT_BYTES` as an alias for `formatReadableSize`. [#57592](https://github.com/ClickHouse/ClickHouse/pull/57592) ([Bharat Nallan](https://github.com/bharatnc)). -* Allow passing optional session token to the `s3` table function. [#57850](https://github.com/ClickHouse/ClickHouse/pull/57850) ([Shani Elharrar](https://github.com/shanielh)). -* Introduce a new setting `http_make_head_request`. If it is turned off, the URL table engine will not do a HEAD request to determine the file size. This is needed to support inefficient, misconfigured, or not capable HTTP servers. [#54602](https://github.com/ClickHouse/ClickHouse/pull/54602) ([Fionera](https://github.com/fionera)). -* It is now possible to refer to ALIAS column in index (non-primary-key) definitions (issue [#55650](https://github.com/ClickHouse/ClickHouse/issues/55650)). Example: `CREATE TABLE tab(col UInt32, col_alias ALIAS col + 1, INDEX idx (col_alias) TYPE minmax) ENGINE = MergeTree ORDER BY col;`. [#57546](https://github.com/ClickHouse/ClickHouse/pull/57546) ([Robert Schulze](https://github.com/rschu1ze)). -* Added a new setting `readonly` which can be used to specify an S3 disk is read only. It can be useful to create a table on a disk of `s3_plain` type, while having read only access to the underlying S3 bucket. [#57977](https://github.com/ClickHouse/ClickHouse/pull/57977) ([Pengyuan Bian](https://github.com/bianpengyuan)). -* The primary key analysis in MergeTree tables will now be applied to predicates that include the virtual column `_part_offset` (optionally with `_part`). This feature can serve as a special kind of a secondary index. [#58224](https://github.com/ClickHouse/ClickHouse/pull/58224) ([Amos Bird](https://github.com/amosbird)). +* Implement Variant data type that represents a union of other data types. Type `Variant(T1, T2, ..., TN)` means that each row of this type has a value of either type `T1` or `T2` or ... or `TN` or none of them (`NULL` value). Variant type is available under a setting `allow_experimental_variant_type`. Reference: [#54864](https://github.com/ClickHouse/ClickHouse/issues/54864). [#58047](https://github.com/ClickHouse/ClickHouse/pull/58047) ([Kruglov Pavel](https://github.com/Avogar)). 
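A minimal sketch of the new `Variant` type as described in the entry above (the setting and type names are taken from the entry itself; the table, column names, and inserted values are illustrative):

```sql
-- Variant is experimental in 24.1 and gated behind this setting.
SET allow_experimental_variant_type = 1;

CREATE TABLE variant_demo
(
    id UInt64,
    value Variant(UInt64, String)  -- each row holds a UInt64, a String, or NULL
)
ENGINE = MergeTree
ORDER BY id;

-- Literals of different types end up in the Variant column.
INSERT INTO variant_demo VALUES (1, 42), (2, 'hello'), (3, NULL);
```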
+* Certain settings (currently `min_compress_block_size` and `max_compress_block_size`) can now be specified at column-level where they take precedence over the corresponding table-level setting. Example: `CREATE TABLE tab (col String SETTINGS (min_compress_block_size = 81920, max_compress_block_size = 163840)) ENGINE = MergeTree ORDER BY tuple();`. [#55201](https://github.com/ClickHouse/ClickHouse/pull/55201) ([Duc Canh Le](https://github.com/canhld94)).
+* Add `quantileDD` aggregate function as well as the corresponding `quantilesDD` and `medianDD`. It is based on the DDSketch https://www.vldb.org/pvldb/vol12/p2195-masson.pdf. [#56342](https://github.com/ClickHouse/ClickHouse/pull/56342) ([Srikanth Chekuri](https://github.com/srikanthccv)).
+* Allow to configure any kind of object storage with any kind of metadata type. [#58357](https://github.com/ClickHouse/ClickHouse/pull/58357) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* Added `null_status_on_timeout_only_active` and `throw_only_active` modes for `distributed_ddl_output_mode` that allow to avoid waiting for inactive replicas. [#58350](https://github.com/ClickHouse/ClickHouse/pull/58350) ([Alexander Tokmakov](https://github.com/tavplubix)).
+* Add function `arrayShingles` to compute subarrays, e.g. `arrayShingles([1, 2, 3, 4, 5], 3)` returns `[[1,2,3],[2,3,4],[3,4,5]]`. [#58396](https://github.com/ClickHouse/ClickHouse/pull/58396) ([Zheng Miao](https://github.com/zenmiao7)).
+* Added functions `punycodeEncode`, `punycodeDecode`, `idnaEncode` and `idnaDecode` which are useful for translating international domain names to an ASCII representation according to the IDNA standard. [#58454](https://github.com/ClickHouse/ClickHouse/pull/58454) ([Robert Schulze](https://github.com/rschu1ze)).
+* Added string similarity functions `damerauLevenshteinDistance`, `jaroSimilarity` and `jaroWinklerSimilarity`. [#58531](https://github.com/ClickHouse/ClickHouse/pull/58531) ([Robert Schulze](https://github.com/rschu1ze)).
+* Add two settings `output_format_compression_level` to change output compression level and `output_format_compression_zstd_window_log` to explicitly set compression window size and enable long-range mode for zstd compression if output compression method is `zstd`. Applied for `INTO OUTFILE` and when writing to table functions `file`, `url`, `hdfs`, `s3`, and `azureBlobStorage`. [#58539](https://github.com/ClickHouse/ClickHouse/pull/58539) ([Duc Canh Le](https://github.com/canhld94)).
+* Automatically disable ANSI escape sequences in Pretty formats if the output is not a terminal. Add new `auto` mode to setting `output_format_pretty_color`. [#58614](https://github.com/ClickHouse/ClickHouse/pull/58614) ([Shaun Struwig](https://github.com/Blargian)).
+* Added function `sqidDecode` which decodes [Sqids](https://sqids.org/). [#58544](https://github.com/ClickHouse/ClickHouse/pull/58544) ([Robert Schulze](https://github.com/rschu1ze)).
+* Allow to read Bool values into String in JSON input formats. It's done under a setting `input_format_json_read_bools_as_strings` that is enabled by default. [#58561](https://github.com/ClickHouse/ClickHouse/pull/58561) ([Kruglov Pavel](https://github.com/Avogar)).
+* Added function `seriesDecomposeSTL` which decomposes a time series into a season, a trend and a residual component. [#57078](https://github.com/ClickHouse/ClickHouse/pull/57078) ([Bhavna Jindal](https://github.com/bhavnajindal)).
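A few quick, illustrative calls to functions introduced above (a sketch only; the `arrayShingles` result is the one quoted in its entry, the other comments describe intent rather than verified output, and the Sqid argument is a made-up string):

```sql
SELECT arrayShingles([1, 2, 3, 4, 5], 3);                       -- [[1,2,3],[2,3,4],[3,4,5]]
SELECT punycodeEncode('münchen');                               -- ASCII (Punycode) form of an IDN label
SELECT damerauLevenshteinDistance('clickhouse', 'clikchouse');  -- edit distance that also counts transpositions
SELECT sqidDecode('gR3fE');                                     -- decodes a Sqid back into its numbers (hypothetical Sqid)
```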
+* Introduced MySQL Binlog Client for MaterializedMySQL: One binlog connection for many databases. [#57323](https://github.com/ClickHouse/ClickHouse/pull/57323) ([Val Doroshchuk](https://github.com/valbok)).
+* Intel QuickAssist Technology (QAT) provides hardware-accelerated compression and cryptography. ClickHouse got a new compression codec `ZSTD_QAT` which utilizes QAT for zstd compression. The codec uses [Intel's QATlib](https://github.com/intel/qatlib) and [Intel's QAT ZSTD Plugin](https://github.com/intel/QAT-ZSTD-Plugin). Right now, only compression can be accelerated in hardware (a software fallback kicks in if QAT could not be initialized), decompression always runs in software. [#57509](https://github.com/ClickHouse/ClickHouse/pull/57509) ([jasperzhu](https://github.com/jinjunzh)).
+* Implement a new way to generate object storage keys for s3 disks. The format can now be defined in terms of `re2` regex syntax with the `key_template` option in the disk description. [#57663](https://github.com/ClickHouse/ClickHouse/pull/57663) ([Sema Checherinda](https://github.com/CheSema)).
+* Table `system.dropped_tables_parts` contains parts of the tables listed in `system.dropped_tables` (dropped but not yet removed tables). [#58038](https://github.com/ClickHouse/ClickHouse/pull/58038) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
+* Add setting `max_materialized_views_size_for_table` to limit the number of materialized views attached to a table. [#58068](https://github.com/ClickHouse/ClickHouse/pull/58068) ([zhongyuankai](https://github.com/zhongyuankai)).
+* `clickhouse-format` improvements: support INSERT queries with `VALUES`; support comments (use `--comments` to output them); support `--max_line_length` option to format only long queries in multiline. [#58246](https://github.com/ClickHouse/ClickHouse/pull/58246) ([vdimir](https://github.com/vdimir)).
+* Attach all system tables in `clickhouse-local`, including `system.parts`. This closes [#58312](https://github.com/ClickHouse/ClickHouse/issues/58312). [#58359](https://github.com/ClickHouse/ClickHouse/pull/58359) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
+* Support for `Enum` data types in function `transform`. This closes [#58241](https://github.com/ClickHouse/ClickHouse/issues/58241). [#58360](https://github.com/ClickHouse/ClickHouse/pull/58360) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
+* Add table `system.database_engines`. [#58390](https://github.com/ClickHouse/ClickHouse/pull/58390) ([Bharat Nallan](https://github.com/bharatnc)). Allow registering database engines independently in the codebase. [#58365](https://github.com/ClickHouse/ClickHouse/pull/58365) ([Bharat Nallan](https://github.com/bharatnc)). Allow registering interpreters independently. [#58443](https://github.com/ClickHouse/ClickHouse/pull/58443) ([Bharat Nallan](https://github.com/bharatnc)).
+* Added `FROM ` modifier for the `SYSTEM SYNC REPLICA LIGHTWEIGHT` query. The `FROM` modifier ensures we wait for fetches and drop-ranges only for the specified source replicas, as well as any replica not in zookeeper or with an empty source_replica. [#58393](https://github.com/ClickHouse/ClickHouse/pull/58393) ([Jayme Bird](https://github.com/jaymebrd)).
+* Added setting `update_insert_deduplication_token_in_dependent_materialized_views`. This setting allows updating the insert deduplication token with the table identifier during inserts into dependent materialized views. Closes [#59165](https://github.com/ClickHouse/ClickHouse/issues/59165).
[#59238](https://github.com/ClickHouse/ClickHouse/pull/59238) ([Maksim Kita](https://github.com/kitaisreal)). +* Added statement `SYSTEM RELOAD ASYNCHRONOUS METRICS` which updates the asynchronous metrics. Mostly useful for testing and development. [#53710](https://github.com/ClickHouse/ClickHouse/pull/53710) ([Robert Schulze](https://github.com/rschu1ze)). #### Performance Improvement -* Extract non-intersecting parts ranges from MergeTree table during FINAL processing. That way we can avoid additional FINAL logic for this non-intersecting parts ranges. In case when amount of duplicate values with same primary key is low, performance will be almost the same as without FINAL. Improve reading performance for MergeTree FINAL when `do_not_merge_across_partitions_select_final` setting is set. [#58120](https://github.com/ClickHouse/ClickHouse/pull/58120) ([Maksim Kita](https://github.com/kitaisreal)). -* Made copy between s3 disks using a s3-server-side copy instead of copying through the buffer. Improves `BACKUP/RESTORE` operations and `clickhouse-disks copy` command. [#56744](https://github.com/ClickHouse/ClickHouse/pull/56744) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). -* Hash JOIN respects setting `max_joined_block_size_rows` and do not produce large blocks for `ALL JOIN`. [#56996](https://github.com/ClickHouse/ClickHouse/pull/56996) ([vdimir](https://github.com/vdimir)). -* Release memory for aggregation earlier. This may avoid unnecessary external aggregation. [#57691](https://github.com/ClickHouse/ClickHouse/pull/57691) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Improve performance of string serialization. [#57717](https://github.com/ClickHouse/ClickHouse/pull/57717) ([Maksim Kita](https://github.com/kitaisreal)). -* Support trivial count optimization for `Merge`-engine tables. [#57867](https://github.com/ClickHouse/ClickHouse/pull/57867) ([skyoct](https://github.com/skyoct)). -* Optimized aggregation in some cases. [#57872](https://github.com/ClickHouse/ClickHouse/pull/57872) ([Anton Popov](https://github.com/CurtizJ)). -* The `hasAny` function can now take advantage of the full-text skipping indices. [#57878](https://github.com/ClickHouse/ClickHouse/pull/57878) ([Jpnock](https://github.com/Jpnock)). -* Function `if(cond, then, else)` (and its alias `cond ? then : else`) were optimized to use branch-free evaluation. [#57885](https://github.com/ClickHouse/ClickHouse/pull/57885) ([zhanglistar](https://github.com/zhanglistar)). -* MergeTree automatically derive `do_not_merge_across_partitions_select_final` setting if partition key expression contains only columns from primary key expression. [#58218](https://github.com/ClickHouse/ClickHouse/pull/58218) ([Maksim Kita](https://github.com/kitaisreal)). -* Speedup `MIN` and `MAX` for native types. [#58231](https://github.com/ClickHouse/ClickHouse/pull/58231) ([Raúl Marín](https://github.com/Algunenano)). -* Implement `SLRU` cache policy for filesystem cache. [#57076](https://github.com/ClickHouse/ClickHouse/pull/57076) ([Kseniia Sumarokova](https://github.com/kssenii)). -* The limit for the number of connections per endpoint for background fetches was raised from `15` to the value of `background_fetches_pool_size` setting. - MergeTree-level setting `replicated_max_parallel_fetches_for_host` became obsolete - MergeTree-level settings `replicated_fetches_http_connection_timeout`, `replicated_fetches_http_send_timeout` and `replicated_fetches_http_receive_timeout` are moved to the Server-level. 
- Setting `keep_alive_timeout` is added to the list of Server-level settings. [#57523](https://github.com/ClickHouse/ClickHouse/pull/57523) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
-* Make querying `system.filesystem_cache` not memory intensive. [#57687](https://github.com/ClickHouse/ClickHouse/pull/57687) ([Kseniia Sumarokova](https://github.com/kssenii)).
-* Reduce memory usage on strings deserialization. [#57787](https://github.com/ClickHouse/ClickHouse/pull/57787) ([Maksim Kita](https://github.com/kitaisreal)).
-* More efficient constructor for Enum - it makes sense when Enum has a boatload of values. [#57887](https://github.com/ClickHouse/ClickHouse/pull/57887) ([Duc Canh Le](https://github.com/canhld94)).
-* An improvement for reading from the filesystem cache: always use `pread` method. [#57970](https://github.com/ClickHouse/ClickHouse/pull/57970) ([Nikita Taranov](https://github.com/nickitat)).
-* Add optimization for AND notEquals chain in logical expression optimizer. This optimization is only available with the experimental Analyzer enabled. [#58214](https://github.com/ClickHouse/ClickHouse/pull/58214) ([Kevin Mingtarja](https://github.com/kevinmingtarja)).
+* Coordination for parallel replicas is rewritten for better parallelism and cache locality. It has been tested for linear scalability on hundreds of replicas. It also got support for reading in order. [#57968](https://github.com/ClickHouse/ClickHouse/pull/57968) ([Nikita Taranov](https://github.com/nickitat)).
+* Replace HTTP outgoing buffering with the native ClickHouse buffers. Add bytes counting metrics for interfaces. [#56064](https://github.com/ClickHouse/ClickHouse/pull/56064) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
+* Large aggregation states of `uniqExact` will be merged in parallel in distributed queries. [#59009](https://github.com/ClickHouse/ClickHouse/pull/59009) ([Nikita Taranov](https://github.com/nickitat)).
+* Lower memory usage after reading from `MergeTree` tables. [#59290](https://github.com/ClickHouse/ClickHouse/pull/59290) ([Anton Popov](https://github.com/CurtizJ)).
+* Lower memory usage in vertical merges. [#59340](https://github.com/ClickHouse/ClickHouse/pull/59340) ([Anton Popov](https://github.com/CurtizJ)).
+* Avoid huge memory consumption during Keeper startup for more cases. [#58455](https://github.com/ClickHouse/ClickHouse/pull/58455) ([Antonio Andelic](https://github.com/antonio2368)).
+* Keeper improvement: reduce Keeper's memory usage for stored nodes. [#59002](https://github.com/ClickHouse/ClickHouse/pull/59002) ([Antonio Andelic](https://github.com/antonio2368)).
+* More cache-friendly final implementation. Note on the behaviour change: previously queries with `FINAL` modifier that read with a single stream (e.g. `max_threads = 1`) produced sorted output without explicitly provided `ORDER BY` clause. This is no longer guaranteed when `enable_vertical_final = true` (and it is so by default). [#54366](https://github.com/ClickHouse/ClickHouse/pull/54366) ([Duc Canh Le](https://github.com/canhld94)).
+* Bypass extra copying in `ReadBufferFromIStream` which is used, e.g., for reading from S3. [#56961](https://github.com/ClickHouse/ClickHouse/pull/56961) ([Nikita Taranov](https://github.com/nickitat)).
+* Optimize array element function when input is Array(Map)/Array(Array(Num))/Array(Array(String))/Array(BigInt)/Array(Decimal). The previous implementations did more allocations than needed.
The optimization speed up is up to ~6x especially when input type is Array(Map). [#56403](https://github.com/ClickHouse/ClickHouse/pull/56403) ([李扬](https://github.com/taiyang-li)). +* Read column once while reading more than one subcolumn from it in compact parts. [#57631](https://github.com/ClickHouse/ClickHouse/pull/57631) ([Kruglov Pavel](https://github.com/Avogar)). +* Rewrite the AST of `sum(column + constant)` function. This is available as an optimization pass for Analyzer [#57853](https://github.com/ClickHouse/ClickHouse/pull/57853) ([Jiebin Sun](https://github.com/jiebinn)). +* The evaluation of function `match` now utilizes skipping indices `ngrambf_v1` and `tokenbf_v1`. [#57882](https://github.com/ClickHouse/ClickHouse/pull/57882) ([凌涛](https://github.com/lingtaolf)). +* The evaluation of function `match` now utilizes inverted indices. [#58284](https://github.com/ClickHouse/ClickHouse/pull/58284) ([凌涛](https://github.com/lingtaolf)). +* MergeTree `FINAL` does not compare rows from same non-L0 part. [#58142](https://github.com/ClickHouse/ClickHouse/pull/58142) ([Duc Canh Le](https://github.com/canhld94)). +* Speed up iota calls (filling array with consecutive numbers). [#58271](https://github.com/ClickHouse/ClickHouse/pull/58271) ([Raúl Marín](https://github.com/Algunenano)). +* Speedup MIN/MAX for non-numeric types. [#58334](https://github.com/ClickHouse/ClickHouse/pull/58334) ([Raúl Marín](https://github.com/Algunenano)). +* Optimize the combination of filters (like in multi-stage PREWHERE) with BMI2/SSE intrinsics [#58800](https://github.com/ClickHouse/ClickHouse/pull/58800) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Use one thread less in `clickhouse-local`. [#58968](https://github.com/ClickHouse/ClickHouse/pull/58968) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Improve the `multiIf` function performance when the type is Nullable. [#57745](https://github.com/ClickHouse/ClickHouse/pull/57745) ([KevinyhZou](https://github.com/KevinyhZou)). +* Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages, `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling jemalloc profile if the profiler is enabled. Add jemalloc-related 4LW command in Keeper: `jmst` for dumping jemalloc stats, `jmfp`, `jmep`, `jmdp` for controlling jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)). +* Lower memory consumption in backups to S3. [#58962](https://github.com/ClickHouse/ClickHouse/pull/58962) ([Vitaly Baranov](https://github.com/vitlibar)). #### Improvement -* Support for soft memory limit in Keeper. It will refuse requests if the memory usage is close to the maximum. [#57271](https://github.com/ClickHouse/ClickHouse/pull/57271) ([Han Fei](https://github.com/hanfei1991)). [#57699](https://github.com/ClickHouse/ClickHouse/pull/57699) ([Han Fei](https://github.com/hanfei1991)). -* Make inserts into distributed tables handle updated cluster configuration properly. When the list of cluster nodes is dynamically updated, the Directory Monitor of the distribution table will update it. [#42826](https://github.com/ClickHouse/ClickHouse/pull/42826) ([zhongyuankai](https://github.com/zhongyuankai)). -* Do not allow creating a replicated table with inconsistent merge parameters. [#56833](https://github.com/ClickHouse/ClickHouse/pull/56833) ([Duc Canh Le](https://github.com/canhld94)). -* Show uncompressed size in `system.tables`. 
[#56618](https://github.com/ClickHouse/ClickHouse/issues/56618). [#57186](https://github.com/ClickHouse/ClickHouse/pull/57186) ([Chen Lixiang](https://github.com/chenlx0)). -* Add `skip_unavailable_shards` as a setting for `Distributed` tables that is similar to the corresponding query-level setting. Closes [#43666](https://github.com/ClickHouse/ClickHouse/issues/43666). [#57218](https://github.com/ClickHouse/ClickHouse/pull/57218) ([Gagan Goel](https://github.com/tntnatbry)). -* The function `substring` (aliases: `substr`, `mid`) can now be used with `Enum` types. Previously, the first function argument had to be a value of type `String` or `FixedString`. This improves compatibility with 3rd party tools such as Tableau via MySQL interface. [#57277](https://github.com/ClickHouse/ClickHouse/pull/57277) ([Serge Klochkov](https://github.com/slvrtrn)). -* Function `format` now supports arbitrary argument types (instead of only `String` and `FixedString` arguments). This is important to calculate `SELECT format('The {0} to all questions is {1}', 'answer', 42)`. [#57549](https://github.com/ClickHouse/ClickHouse/pull/57549) ([Robert Schulze](https://github.com/rschu1ze)). -* Allows to use the `date_trunc` function with a case-insensitive first argument. Both cases are now supported: `SELECT date_trunc('day', now())` and `SELECT date_trunc('DAY', now())`. [#57624](https://github.com/ClickHouse/ClickHouse/pull/57624) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Better hints when a table doesn't exist. [#57342](https://github.com/ClickHouse/ClickHouse/pull/57342) ([Bharat Nallan](https://github.com/bharatnc)). -* Allow to overwrite `max_partition_size_to_drop` and `max_table_size_to_drop` server settings in query time. [#57452](https://github.com/ClickHouse/ClickHouse/pull/57452) ([Jordi Villar](https://github.com/jrdi)). -* Slightly better inference of unnamed tupes in JSON formats. [#57751](https://github.com/ClickHouse/ClickHouse/pull/57751) ([Kruglov Pavel](https://github.com/Avogar)). -* Add support for read-only flag when connecting to Keeper (fixes [#53749](https://github.com/ClickHouse/ClickHouse/issues/53749)). [#57479](https://github.com/ClickHouse/ClickHouse/pull/57479) ([Mikhail Koviazin](https://github.com/mkmkme)). -* Fix possible distributed sends stuck due to "No such file or directory" (during recovering a batch from disk). Fix possible issues with `error_count` from `system.distribution_queue` (in case of `distributed_directory_monitor_max_sleep_time_ms` >5min). Introduce profile event to track async INSERT failures - `DistributedAsyncInsertionFailures`. [#57480](https://github.com/ClickHouse/ClickHouse/pull/57480) ([Azat Khuzhin](https://github.com/azat)). -* Support PostgreSQL generated columns and default column values in `MaterializedPostgreSQL` (experimental feature). Closes [#40449](https://github.com/ClickHouse/ClickHouse/issues/40449). [#57568](https://github.com/ClickHouse/ClickHouse/pull/57568) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Allow to apply some filesystem cache config settings changes without server restart. [#57578](https://github.com/ClickHouse/ClickHouse/pull/57578) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Properly handling PostgreSQL table structure with empty array. [#57618](https://github.com/ClickHouse/ClickHouse/pull/57618) ([Mike Kot](https://github.com/myrrc)). -* Expose the total number of errors occurred since last server restart as a `ClickHouseErrorMetric_ALL` metric. 
[#57627](https://github.com/ClickHouse/ClickHouse/pull/57627) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Allow nodes in the configuration file with `from_env`/`from_zk` reference and a non-empty element with `replace=1`. [#57628](https://github.com/ClickHouse/ClickHouse/pull/57628) ([Azat Khuzhin](https://github.com/azat)). -* Add a table function `fuzzJSON` that allows generating a lot of malformed JSON for fuzzing. [#57646](https://github.com/ClickHouse/ClickHouse/pull/57646) ([Julia Kartseva](https://github.com/jkartseva)). -* Allow IPv6 to UInt128 conversion and binary arithmetic. [#57707](https://github.com/ClickHouse/ClickHouse/pull/57707) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Add a setting for the `async inserts deduplication cache` - how long we wait for a cache update. Deprecate the setting `async_block_ids_cache_min_update_interval_ms`. Now the cache is updated only in case of conflicts. [#57743](https://github.com/ClickHouse/ClickHouse/pull/57743) ([alesapin](https://github.com/alesapin)). -* The `sleep()` function can now be cancelled with `KILL QUERY`. [#57746](https://github.com/ClickHouse/ClickHouse/pull/57746) ([Vitaly Baranov](https://github.com/vitlibar)). -* Forbid `CREATE TABLE ... AS SELECT` queries for `Replicated` table engines in the experimental `Replicated` database because they are not supported. Reference [#35408](https://github.com/ClickHouse/ClickHouse/issues/35408). [#57796](https://github.com/ClickHouse/ClickHouse/pull/57796) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix and improve transforming queries for external databases, to recursively obtain all compatible predicates. [#57888](https://github.com/ClickHouse/ClickHouse/pull/57888) ([flynn](https://github.com/ucasfl)). -* Support dynamic reloading of the filesystem cache size. Closes [#57866](https://github.com/ClickHouse/ClickHouse/issues/57866). [#57897](https://github.com/ClickHouse/ClickHouse/pull/57897) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Correctly support `system.stack_trace` for threads with blocked SIGRTMIN (these threads can exist in low-quality external libraries such as Apache rdkafka). [#57907](https://github.com/ClickHouse/ClickHouse/pull/57907) ([Azat Khuzhin](https://github.com/azat)). Also, send the signal to a thread only if it is not blocked, to avoid waiting `storage_system_stack_trace_pipe_read_timeout_ms` when it does not make any sense. [#58136](https://github.com/ClickHouse/ClickHouse/pull/58136) ([Azat Khuzhin](https://github.com/azat)). -* Tolerate keeper failures in the quorum inserts' check. [#57986](https://github.com/ClickHouse/ClickHouse/pull/57986) ([Raúl Marín](https://github.com/Algunenano)). -* Add max/peak RSS (`MemoryResidentMax`) into `system.asynchronous_metrics`. [#58095](https://github.com/ClickHouse/ClickHouse/pull/58095) ([Azat Khuzhin](https://github.com/azat)). -* This PR allows users to use S3-style links (`https://` and `s3://`) without specifying the region if it is not the default one, and also finds the correct region if the user specified a wrong one. [#58148](https://github.com/ClickHouse/ClickHouse/pull/58148) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* `clickhouse-format --obfuscate` will know about Settings, MergeTreeSettings, and time zones and keep their names unchanged. [#58179](https://github.com/ClickHouse/ClickHouse/pull/58179) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Added explicit `finalize()` function in `ZipArchiveWriter`.
Simplify too complicated code in `ZipArchiveWriter`. This fixes [#58074](https://github.com/ClickHouse/ClickHouse/issues/58074). [#58202](https://github.com/ClickHouse/ClickHouse/pull/58202) ([Vitaly Baranov](https://github.com/vitlibar)). -* Make caches with the same path use the same cache objects. This behaviour existed before, but was broken in 23.4. If such caches with the same path have different set of cache settings, an exception will be thrown, that this is not allowed. [#58264](https://github.com/ClickHouse/ClickHouse/pull/58264) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Parallel replicas (experimental feature): friendly settings [#57542](https://github.com/ClickHouse/ClickHouse/pull/57542) ([Igor Nikonov](https://github.com/devcrafter)). -* Parallel replicas (experimental feature): announcement response handling improvement [#57749](https://github.com/ClickHouse/ClickHouse/pull/57749) ([Igor Nikonov](https://github.com/devcrafter)). -* Parallel replicas (experimental feature): give more respect to `min_number_of_marks` in `ParallelReplicasReadingCoordinator` [#57763](https://github.com/ClickHouse/ClickHouse/pull/57763) ([Nikita Taranov](https://github.com/nickitat)). -* Parallel replicas (experimental feature): disable parallel replicas with IN (subquery) [#58133](https://github.com/ClickHouse/ClickHouse/pull/58133) ([Igor Nikonov](https://github.com/devcrafter)). -* Parallel replicas (experimental feature): add profile event 'ParallelReplicasUsedCount' [#58173](https://github.com/ClickHouse/ClickHouse/pull/58173) ([Igor Nikonov](https://github.com/devcrafter)). -* Non POST requests such as HEAD will be readonly similar to GET. [#58060](https://github.com/ClickHouse/ClickHouse/pull/58060) ([San](https://github.com/santrancisco)). -* Add `bytes_uncompressed` column to `system.part_log` [#58167](https://github.com/ClickHouse/ClickHouse/pull/58167) ([Jordi Villar](https://github.com/jrdi)). -* Add base backup name to `system.backups` and `system.backup_log` tables [#58178](https://github.com/ClickHouse/ClickHouse/pull/58178) ([Pradeep Chhetri](https://github.com/chhetripradeep)). -* Add support for specifying query parameters in the command line in clickhouse-local [#58210](https://github.com/ClickHouse/ClickHouse/pull/58210) ([Pradeep Chhetri](https://github.com/chhetripradeep)). +* Added comments (brief descriptions) to all columns of system tables. There are several reasons for this: - We use system tables a lot, and sometimes it could be very difficult for developer to understand the purpose and the meaning of a particular column. - We change (add new ones or modify existing) system tables a lot and the documentation for them is always outdated. For example take a look at the documentation page for [`system.parts`](https://clickhouse.com/docs/en/operations/system-tables/parts). It misses a lot of columns - We would like to eventually generate documentation directly from ClickHouse. [#58356](https://github.com/ClickHouse/ClickHouse/pull/58356) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Allow queries without aliases for subqueries for `PASTE JOIN`. [#58654](https://github.com/ClickHouse/ClickHouse/pull/58654) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Enable `MySQL`/`MariaDB` integration on macOS. This closes [#21191](https://github.com/ClickHouse/ClickHouse/issues/21191). 
[#46316](https://github.com/ClickHouse/ClickHouse/pull/46316) ([Alexey Milovidov](https://github.com/alexey-milovidov)) ([Robert Schulze](https://github.com/rschu1ze)). +* Disable `max_rows_in_set_to_optimize_join` by default. [#56396](https://github.com/ClickHouse/ClickHouse/pull/56396) ([vdimir](https://github.com/vdimir)). +* Add `` config parameter that allows avoiding resolving hostnames in ON CLUSTER DDL queries and Replicated database engines. This mitigates the possibility of the queue being stuck in case of a change in cluster definition. Closes [#57573](https://github.com/ClickHouse/ClickHouse/issues/57573). [#57603](https://github.com/ClickHouse/ClickHouse/pull/57603) ([Nikolay Degterinsky](https://github.com/evillique)). +* Increase `load_metadata_threads` to 16 for the filesystem cache. It will make the server start up faster. [#57732](https://github.com/ClickHouse/ClickHouse/pull/57732) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add ability to throttle merges/mutations (`max_mutations_bandwidth_for_server`/`max_merges_bandwidth_for_server`). [#57877](https://github.com/ClickHouse/ClickHouse/pull/57877) ([Azat Khuzhin](https://github.com/azat)). +* Replaced undocumented (boolean) column `is_hot_reloadable` in system table `system.server_settings` by (Enum8) column `changeable_without_restart` with possible values `No`, `Yes`, `IncreaseOnly` and `DecreaseOnly`. Also documented the column. [#58029](https://github.com/ClickHouse/ClickHouse/pull/58029) ([skyoct](https://github.com/skyoct)). +* Cluster discovery supports setting username and password, close [#58063](https://github.com/ClickHouse/ClickHouse/issues/58063). [#58123](https://github.com/ClickHouse/ClickHouse/pull/58123) ([vdimir](https://github.com/vdimir)). +* Support query parameters in `ALTER TABLE ... PART`. [#58297](https://github.com/ClickHouse/ClickHouse/pull/58297) ([Azat Khuzhin](https://github.com/azat)). +* Create consumers for Kafka tables on the fly (but keep them for some period - `kafka_consumers_pool_ttl_ms`, since last used), this should fix problem with statistics for `system.kafka_consumers` (that does not consumed when nobody reads from Kafka table, which leads to live memory leak and slow table detach) and also this PR enables stats for `system.kafka_consumers` by default again. [#58310](https://github.com/ClickHouse/ClickHouse/pull/58310) ([Azat Khuzhin](https://github.com/azat)). +* `sparkBar` as an alias to `sparkbar`. [#58335](https://github.com/ClickHouse/ClickHouse/pull/58335) ([凌涛](https://github.com/lingtaolf)). +* Avoid sending `ComposeObject` requests after upload to `GCS`. [#58343](https://github.com/ClickHouse/ClickHouse/pull/58343) ([Azat Khuzhin](https://github.com/azat)). +* Correctly handle keys with dot in the name in configurations XMLs. [#58354](https://github.com/ClickHouse/ClickHouse/pull/58354) ([Azat Khuzhin](https://github.com/azat)). +* Make function `format` return constant on constant arguments. This closes [#58355](https://github.com/ClickHouse/ClickHouse/issues/58355). [#58358](https://github.com/ClickHouse/ClickHouse/pull/58358) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Adding a setting `max_estimated_execution_time` to separate `max_execution_time` and `max_estimated_execution_time`. [#58402](https://github.com/ClickHouse/ClickHouse/pull/58402) ([Zhang Yifan](https://github.com/zhangyifan27)). +* Provide a hint when an invalid database engine name is used. 
[#58444](https://github.com/ClickHouse/ClickHouse/pull/58444) ([Bharat Nallan](https://github.com/bharatnc)). +* Add settings for better control of the index type in Arrow dictionaries. Use a signed integer type for indexes by default, as Arrow recommends. Closes [#57401](https://github.com/ClickHouse/ClickHouse/issues/57401). [#58519](https://github.com/ClickHouse/ClickHouse/pull/58519) ([Kruglov Pavel](https://github.com/Avogar)). +* Implement [#58575](https://github.com/ClickHouse/ClickHouse/issues/58575): support the `CLICKHOUSE_PASSWORD_FILE` environment variable when running the docker image. [#58583](https://github.com/ClickHouse/ClickHouse/pull/58583) ([Eyal Halpern Shalev](https://github.com/Eyal-Shalev)). +* When executing some queries which require a lot of streams for reading data, the error `"Paste JOIN requires sorted tables only"` was previously thrown. Now the number of streams is reduced to 1 in that case. [#58608](https://github.com/ClickHouse/ClickHouse/pull/58608) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Better message for the INVALID_IDENTIFIER error. [#58703](https://github.com/ClickHouse/ClickHouse/pull/58703) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Improved handling of signed numeric literals in normalizeQuery. [#58710](https://github.com/ClickHouse/ClickHouse/pull/58710) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Support Point data type for MySQL. [#58721](https://github.com/ClickHouse/ClickHouse/pull/58721) ([Kseniia Sumarokova](https://github.com/kssenii)). +* When comparing a Float32 column and a const string, read the string as Float32 (instead of Float64). [#58724](https://github.com/ClickHouse/ClickHouse/pull/58724) ([Raúl Marín](https://github.com/Algunenano)). +* Improve S3 compatibility, add ECloud EOS storage support. [#58786](https://github.com/ClickHouse/ClickHouse/pull/58786) ([xleoken](https://github.com/xleoken)). +* Allow `KILL QUERY` to cancel backups / restores. This PR also makes running backups and restores visible in `system.processes`. Also, there is a new setting in the server configuration now - `shutdown_wait_backups_and_restores` (default=true) which makes the server either wait on shutdown for all running backups and restores to finish or just cancel them. [#58804](https://github.com/ClickHouse/ClickHouse/pull/58804) ([Vitaly Baranov](https://github.com/vitlibar)). +* The Avro format now supports the ZSTD codec. Closes [#58735](https://github.com/ClickHouse/ClickHouse/issues/58735). [#58805](https://github.com/ClickHouse/ClickHouse/pull/58805) ([flynn](https://github.com/ucasfl)). +* MySQL interface gained support for `net_write_timeout` and `net_read_timeout` settings. `net_write_timeout` is translated into the native `send_timeout` ClickHouse setting and, similarly, `net_read_timeout` into `receive_timeout`. Fixed an issue where it was possible to set MySQL `sql_select_limit` setting only if the entire statement was in upper case. [#58835](https://github.com/ClickHouse/ClickHouse/pull/58835) ([Serge Klochkov](https://github.com/slvrtrn)). +* A better exception message when there is a conflict while creating a dictionary and a table with the same name. [#58841](https://github.com/ClickHouse/ClickHouse/pull/58841) ([Yarik Briukhovetskyi](https://github.com/yariks5s)).
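A rough sketch of the `KILL QUERY` for backups improvement above; the backup destination, table name, and query id below are placeholders, not taken from the PR:

```sql
-- Start a backup asynchronously; it now shows up in system.processes.
BACKUP TABLE default.hits TO Disk('backups', 'hits_backup.zip') ASYNC;

-- Find its query_id ...
SELECT query_id, query FROM system.processes WHERE query ILIKE 'BACKUP%';

-- ... and cancel it (substitute the real query_id).
KILL QUERY WHERE query_id = '<query_id>';
```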
+* Make sure that for custom (created from SQL) disks either `filesystem_caches_path` (a common directory prefix for all filesystem caches) or `custom_cached_disks_base_directory` (a common directory prefix for only filesystem caches created from custom disks) is specified in the server config. `custom_cached_disks_base_directory` has higher priority for custom disks over `filesystem_caches_path`, which is used if the former one is absent. The filesystem cache setting `path` must lie inside that directory, otherwise an exception will be thrown, preventing the disk from being created. This will not affect disks created on an older version if the server was upgraded - in that case the exception will not be thrown, to allow the server to start successfully. `custom_cached_disks_base_directory` is added to the default server config as `/var/lib/clickhouse/caches/`. Closes [#57825](https://github.com/ClickHouse/ClickHouse/issues/57825). [#58869](https://github.com/ClickHouse/ClickHouse/pull/58869) ([Kseniia Sumarokova](https://github.com/kssenii)). +* MySQL interface gained compatibility with `SHOW WARNINGS`/`SHOW COUNT(*) WARNINGS` queries, though the returned result is always an empty set. [#58929](https://github.com/ClickHouse/ClickHouse/pull/58929) ([Serge Klochkov](https://github.com/slvrtrn)). +* Skip unavailable replicas when executing parallel distributed `INSERT SELECT`. [#58931](https://github.com/ClickHouse/ClickHouse/pull/58931) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Display a word-descriptive log level when structured log formatting in JSON is enabled. [#58936](https://github.com/ClickHouse/ClickHouse/pull/58936) ([Tim Liou](https://github.com/wheatdog)). +* MySQL interface gained support for `CAST(x AS SIGNED)` and `CAST(x AS UNSIGNED)` statements via data type aliases: `SIGNED` for Int64, and `UNSIGNED` for UInt64. This improves compatibility with BI tools such as Looker Studio. [#58954](https://github.com/ClickHouse/ClickHouse/pull/58954) ([Serge Klochkov](https://github.com/slvrtrn)). +* Change the working directory to the data path in the Docker container. [#58975](https://github.com/ClickHouse/ClickHouse/pull/58975) ([cangyin](https://github.com/cangyin)). +* Added a setting `azure_max_unexpected_write_error_retries` for Azure Blob Storage; it can also be set from the config under the azure section. [#59001](https://github.com/ClickHouse/ClickHouse/pull/59001) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Allow the server to start with a broken data lake table. Closes [#58625](https://github.com/ClickHouse/ClickHouse/issues/58625). [#59080](https://github.com/ClickHouse/ClickHouse/pull/59080) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Allow ignoring schema evolution in the `Iceberg` table engine and reading all data using the schema specified by the user on table creation or the latest schema parsed from metadata on table creation. This is done under a setting `iceberg_engine_ignore_schema_evolution` that is disabled by default. Note that enabling this setting can lead to incorrect results, as in the case of an evolved schema all data files will be read using the same schema. [#59133](https://github.com/ClickHouse/ClickHouse/pull/59133) ([Kruglov Pavel](https://github.com/Avogar)). +* Prohibit mutable operations (`INSERT`/`ALTER`/`OPTIMIZE`/...) on read-only/write-once storages with a proper `TABLE_IS_READ_ONLY` error (to avoid leftovers). Avoid leaving leftovers on write-once disks (`format_version.txt`) on `CREATE`/`ATTACH`. Ignore `DROP` for `ReplicatedMergeTree` (so as for `MergeTree`).
Fix iterating over `s3_plain` (`MetadataStorageFromPlainObjectStorage::iterateDirectory`). Note read-only is `web` disk, and write-once is `s3_plain`. [#59170](https://github.com/ClickHouse/ClickHouse/pull/59170) ([Azat Khuzhin](https://github.com/azat)). +* Fix bug in the experimental `_block_number` column which could lead to logical error during complex combination of `ALTER`s and `merge`s. Fixes [#56202](https://github.com/ClickHouse/ClickHouse/issues/56202). Replaces [#58601](https://github.com/ClickHouse/ClickHouse/issues/58601). [#59295](https://github.com/ClickHouse/ClickHouse/pull/59295) ([alesapin](https://github.com/alesapin)). +* Play UI understands when an exception is returned inside JSON. Adjustment for [#52853](https://github.com/ClickHouse/ClickHouse/issues/52853). [#59303](https://github.com/ClickHouse/ClickHouse/pull/59303) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* `/binary` HTTP handler allows to specify user, host, and optionally, password in the query string. [#59311](https://github.com/ClickHouse/ClickHouse/pull/59311) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support backups for compressed in-memory tables. This closes [#57893](https://github.com/ClickHouse/ClickHouse/issues/57893). [#59315](https://github.com/ClickHouse/ClickHouse/pull/59315) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support the `FORMAT` clause in `BACKUP` and `RESTORE` queries. [#59338](https://github.com/ClickHouse/ClickHouse/pull/59338) ([Vitaly Baranov](https://github.com/vitlibar)). +* Function `concatWithSeparator` now supports arbitrary argument types (instead of only `String` and `FixedString` arguments). For example, `SELECT concatWithSeparator('.', 'number', 1)` now returns `number.1`. [#59341](https://github.com/ClickHouse/ClickHouse/pull/59341) ([Robert Schulze](https://github.com/rschu1ze)). #### Build/Testing/Packaging Improvement -* Randomize more settings [#39663](https://github.com/ClickHouse/ClickHouse/pull/39663) ([Anton Popov](https://github.com/CurtizJ)). -* Randomize disabled optimizations in CI [#57315](https://github.com/ClickHouse/ClickHouse/pull/57315) ([Raúl Marín](https://github.com/Algunenano)). -* Allow usage of Azure-related table engines/functions on macOS. [#51866](https://github.com/ClickHouse/ClickHouse/pull/51866) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* ClickHouse Fast Test now uses Musl instead of GLibc. [#57711](https://github.com/ClickHouse/ClickHouse/pull/57711) ([Alexey Milovidov](https://github.com/alexey-milovidov)). The fully-static Musl build is available to download from the CI. -* Run ClickBench for every commit. This closes [#57708](https://github.com/ClickHouse/ClickHouse/issues/57708). [#57712](https://github.com/ClickHouse/ClickHouse/pull/57712) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Remove the usage of a harmful C/POSIX `select` function from external libraries. [#57467](https://github.com/ClickHouse/ClickHouse/pull/57467) ([Igor Nikonov](https://github.com/devcrafter)). -* Settings only available in ClickHouse Cloud will be also present in the open-source ClickHouse build for convenience. [#57638](https://github.com/ClickHouse/ClickHouse/pull/57638) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). 
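The `concatWithSeparator` change above can be tried directly; the example restates the one given in the entry:

```sql
-- Non-String arguments are now accepted and converted to text.
SELECT concatWithSeparator('.', 'number', 1);  -- 'number.1'
```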
- -#### Bug Fix (user-visible misbehavior in an official stable release) -* Fixed a possibility of sorting order breakage in TTL GROUP BY [#49103](https://github.com/ClickHouse/ClickHouse/pull/49103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Fix: split `lttb` bucket strategy, first bucket and last bucket should only contain single point [#57003](https://github.com/ClickHouse/ClickHouse/pull/57003) ([FFish](https://github.com/wxybear)). -* Fix possible deadlock in the `Template` format during sync after error [#57004](https://github.com/ClickHouse/ClickHouse/pull/57004) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix early stop while parsing a file with skipping lots of errors [#57006](https://github.com/ClickHouse/ClickHouse/pull/57006) ([Kruglov Pavel](https://github.com/Avogar)). -* Prevent dictionary's ACL bypass via the `dictionary` table function [#57362](https://github.com/ClickHouse/ClickHouse/pull/57362) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Fix another case of a "non-ready set" error found by Fuzzer. [#57423](https://github.com/ClickHouse/ClickHouse/pull/57423) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix several issues regarding PostgreSQL `array_ndims` usage. [#57436](https://github.com/ClickHouse/ClickHouse/pull/57436) ([Ryan Jacobs](https://github.com/ryanmjacobs)). -* Fix RWLock inconsistency after write lock timeout [#57454](https://github.com/ClickHouse/ClickHouse/pull/57454) ([Vitaly Baranov](https://github.com/vitlibar)). Fix RWLock inconsistency after write lock timeout (again) [#57733](https://github.com/ClickHouse/ClickHouse/pull/57733) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix: don't exclude ephemeral column when building pushing to view chain [#57461](https://github.com/ClickHouse/ClickHouse/pull/57461) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* MaterializedPostgreSQL (experimental issue): fix issue [#41922](https://github.com/ClickHouse/ClickHouse/issues/41922), add test for [#41923](https://github.com/ClickHouse/ClickHouse/issues/41923) [#57515](https://github.com/ClickHouse/ClickHouse/pull/57515) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Ignore ON CLUSTER clause in grant/revoke queries for management of replicated access entities. [#57538](https://github.com/ClickHouse/ClickHouse/pull/57538) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). -* Fix crash in clickhouse-local [#57553](https://github.com/ClickHouse/ClickHouse/pull/57553) ([Nikolay Degterinsky](https://github.com/evillique)). -* A fix for Hash JOIN. [#57564](https://github.com/ClickHouse/ClickHouse/pull/57564) ([vdimir](https://github.com/vdimir)). -* Fix possible error in PostgreSQL source [#57567](https://github.com/ClickHouse/ClickHouse/pull/57567) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix type correction in Hash JOIN for nested LowCardinality. [#57614](https://github.com/ClickHouse/ClickHouse/pull/57614) ([vdimir](https://github.com/vdimir)). -* Avoid hangs of `system.stack_trace` by correctly prohibiting parallel reading from it. [#57641](https://github.com/ClickHouse/ClickHouse/pull/57641) ([Azat Khuzhin](https://github.com/azat)). -* Fix an error for aggregation of sparse columns with `any(...) RESPECT NULL` [#57710](https://github.com/ClickHouse/ClickHouse/pull/57710) ([Azat Khuzhin](https://github.com/azat)). -* Fix unary operators parsing [#57713](https://github.com/ClickHouse/ClickHouse/pull/57713) ([Nikolay Degterinsky](https://github.com/evillique)). 
-* Fix dependency loading for the experimental table engine `MaterializedPostgreSQL`. [#57754](https://github.com/ClickHouse/ClickHouse/pull/57754) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix retries for disconnected nodes for BACKUP/RESTORE ON CLUSTER [#57764](https://github.com/ClickHouse/ClickHouse/pull/57764) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix result of external aggregation in case of partially materialized projection [#57790](https://github.com/ClickHouse/ClickHouse/pull/57790) ([Anton Popov](https://github.com/CurtizJ)). -* Fix merge in aggregation functions with `*Map` combinator [#57795](https://github.com/ClickHouse/ClickHouse/pull/57795) ([Anton Popov](https://github.com/CurtizJ)). -* Disable `system.kafka_consumers` because it has a bug. [#57822](https://github.com/ClickHouse/ClickHouse/pull/57822) ([Azat Khuzhin](https://github.com/azat)). -* Fix LowCardinality keys support in Merge JOIN. [#57827](https://github.com/ClickHouse/ClickHouse/pull/57827) ([vdimir](https://github.com/vdimir)). -* A fix for `InterpreterCreateQuery` related to the sample block. [#57855](https://github.com/ClickHouse/ClickHouse/pull/57855) ([Maksim Kita](https://github.com/kitaisreal)). -* `addresses_expr` were ignored for named collections from PostgreSQL. [#57874](https://github.com/ClickHouse/ClickHouse/pull/57874) ([joelynch](https://github.com/joelynch)). -* Fix invalid memory access in BLAKE3 (Rust) [#57876](https://github.com/ClickHouse/ClickHouse/pull/57876) ([Raúl Marín](https://github.com/Algunenano)). Then it was rewritten from Rust to C++ for better [memory-safety](https://www.memorysafety.org/). [#57994](https://github.com/ClickHouse/ClickHouse/pull/57994) ([Raúl Marín](https://github.com/Algunenano)). -* Normalize function names in `CREATE INDEX` [#57906](https://github.com/ClickHouse/ClickHouse/pull/57906) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fix handling of unavailable replicas before first request happened [#57933](https://github.com/ClickHouse/ClickHouse/pull/57933) ([Nikita Taranov](https://github.com/nickitat)). -* Fix literal alias misclassification [#57988](https://github.com/ClickHouse/ClickHouse/pull/57988) ([Chen768959](https://github.com/Chen768959)). -* Fix invalid preprocessing on Keeper [#58069](https://github.com/ClickHouse/ClickHouse/pull/58069) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix integer overflow in the `Poco` library, related to `UTF32Encoding` [#58073](https://github.com/ClickHouse/ClickHouse/pull/58073) ([Andrey Fedotov](https://github.com/anfedotoff)). -* Fix parallel replicas (experimental feature) in presence of a scalar subquery with a big integer value [#58118](https://github.com/ClickHouse/ClickHouse/pull/58118) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix `accurateCastOrNull` for out-of-range `DateTime` [#58139](https://github.com/ClickHouse/ClickHouse/pull/58139) ([Andrey Zvonov](https://github.com/zvonand)). -* Fix possible `PARAMETER_OUT_OF_BOUND` error during subcolumns reading from a wide part in MergeTree [#58175](https://github.com/ClickHouse/ClickHouse/pull/58175) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix a slow-down of CREATE VIEW with an enormous number of subqueries [#58220](https://github.com/ClickHouse/ClickHouse/pull/58220) ([Tao Wang](https://github.com/wangtZJU)). -* Fix parallel parsing for JSONCompactEachRow [#58181](https://github.com/ClickHouse/ClickHouse/pull/58181) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
[#58250](https://github.com/ClickHouse/ClickHouse/pull/58250) ([Kruglov Pavel](https://github.com/Avogar)). - - -### ClickHouse release 23.11, 2023-12-06 - -#### Backward Incompatible Change -* The default ClickHouse server configuration file has enabled `access_management` (user manipulation by SQL queries) and `named_collection_control` (manipulation of named collection by SQL queries) for the `default` user by default. This closes [#56482](https://github.com/ClickHouse/ClickHouse/issues/56482). [#56619](https://github.com/ClickHouse/ClickHouse/pull/56619) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Multiple improvements for `RESPECT NULLS`/`IGNORE NULLS` for window functions. If you use them as aggregate functions and store the states of aggregate functions with these modifiers, they might become incompatible. [#57189](https://github.com/ClickHouse/ClickHouse/pull/57189) ([Raúl Marín](https://github.com/Algunenano)). -* Remove optimization `optimize_move_functions_out_of_any`. [#57190](https://github.com/ClickHouse/ClickHouse/pull/57190) ([Raúl Marín](https://github.com/Algunenano)). -* Formatters `%l`/`%k`/`%c` in function `parseDateTime` are now able to parse hours/months without leading zeros, e.g. `select parseDateTime('2023-11-26 8:14', '%F %k:%i')` now works. Set `parsedatetime_parse_without_leading_zeros = 0` to restore the previous behavior which required two digits. Function `formatDateTime` is now also able to print hours/months without leading zeros. This is controlled by setting `formatdatetime_format_without_leading_zeros` but off by default to not break existing use cases. [#55872](https://github.com/ClickHouse/ClickHouse/pull/55872) ([Azat Khuzhin](https://github.com/azat)). -* You can no longer use the aggregate function `avgWeighted` with arguments of type `Decimal`. Workaround: convert arguments to `Float64`. This closes [#43928](https://github.com/ClickHouse/ClickHouse/issues/43928). This closes [#31768](https://github.com/ClickHouse/ClickHouse/issues/31768). This closes [#56435](https://github.com/ClickHouse/ClickHouse/issues/56435). If you have used this function inside materialized views or projections with `Decimal` arguments, contact support@clickhouse.com. Fixed error in aggregate function `sumMap` and made it slower around 1.5..2 times. It does not matter because the function is garbage anyway. This closes [#54955](https://github.com/ClickHouse/ClickHouse/issues/54955). This closes [#53134](https://github.com/ClickHouse/ClickHouse/issues/53134). This closes [#55148](https://github.com/ClickHouse/ClickHouse/issues/55148). Fix a bug in function `groupArraySample` - it used the same random seed in case more than one aggregate state is generated in a query. [#56350](https://github.com/ClickHouse/ClickHouse/pull/56350) ([Alexey Milovidov](https://github.com/alexey-milovidov)). - -#### New Feature -* Added server setting `async_load_databases` for asynchronous loading of databases and tables. Speeds up the server start time. Applies to databases with `Ordinary`, `Atomic` and `Replicated` engines. Their tables load metadata asynchronously. Query to a table increases the priority of the load job and waits for it to be done. Added a new table `system.asynchronous_loader` for introspection. [#49351](https://github.com/ClickHouse/ClickHouse/pull/49351) ([Sergei Trifonov](https://github.com/serxa)). -* Add system table `blob_storage_log`. It allows auditing all the data written to S3 and other object storages. 
[#52918](https://github.com/ClickHouse/ClickHouse/pull/52918) ([vdimir](https://github.com/vdimir)). -* Use statistics to order prewhere conditions better. [#53240](https://github.com/ClickHouse/ClickHouse/pull/53240) ([Han Fei](https://github.com/hanfei1991)). -* Added support for compression in the Keeper's protocol. It can be enabled on the ClickHouse side by using this flag `use_compression` inside `zookeeper` section. Keep in mind that only ClickHouse Keeper supports compression, while Apache ZooKeeper does not. Resolves [#49507](https://github.com/ClickHouse/ClickHouse/issues/49507). [#54957](https://github.com/ClickHouse/ClickHouse/pull/54957) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Introduce the feature `storage_metadata_write_full_object_key`. If it is set as `true` then metadata files are written with the new format. With that format ClickHouse stores full remote object key in the metadata file which allows better flexibility and optimization. [#55566](https://github.com/ClickHouse/ClickHouse/pull/55566) ([Sema Checherinda](https://github.com/CheSema)). -* Add new settings and syntax to protect named collections' fields from being overridden. This is meant to prevent a malicious user from obtaining unauthorized access to secrets. [#55782](https://github.com/ClickHouse/ClickHouse/pull/55782) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Add `hostname` column to all system log tables - it is useful if you make the system tables replicated, shared, or distributed. [#55894](https://github.com/ClickHouse/ClickHouse/pull/55894) ([Bharat Nallan](https://github.com/bharatnc)). -* Add `CHECK ALL TABLES` query. [#56022](https://github.com/ClickHouse/ClickHouse/pull/56022) ([vdimir](https://github.com/vdimir)). -* Added function `fromDaysSinceYearZero` which is similar to MySQL's `FROM_DAYS`. E.g. `SELECT fromDaysSinceYearZero(739136)` returns `2023-09-08`. [#56088](https://github.com/ClickHouse/ClickHouse/pull/56088) ([Joanna Hulboj](https://github.com/jh0x)). -* Add an external Python tool to view backups and to extract information from them without using ClickHouse. [#56268](https://github.com/ClickHouse/ClickHouse/pull/56268) ([Vitaly Baranov](https://github.com/vitlibar)). -* Implement a new setting called `preferred_optimize_projection_name`. If it is set to a non-empty string, the specified projection would be used if possible instead of choosing from all the candidates. [#56309](https://github.com/ClickHouse/ClickHouse/pull/56309) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Add 4-letter command for yielding/resigning leadership (https://github.com/ClickHouse/ClickHouse/issues/56352). [#56354](https://github.com/ClickHouse/ClickHouse/pull/56354) ([Pradeep Chhetri](https://github.com/chhetripradeep)). [#56620](https://github.com/ClickHouse/ClickHouse/pull/56620) ([Pradeep Chhetri](https://github.com/chhetripradeep)). -* Added a new SQL function, `arrayRandomSample(arr, k)` which returns a sample of k elements from the input array. Similar functionality could previously be achieved only with less convenient syntax, e.g. `SELECT arrayReduce('groupArraySample(3)', range(10))`. [#56416](https://github.com/ClickHouse/ClickHouse/pull/56416) ([Robert Schulze](https://github.com/rschu1ze)). -* Added support for `Float16` type data to use in `.npy` files. Closes [#56344](https://github.com/ClickHouse/ClickHouse/issues/56344). [#56424](https://github.com/ClickHouse/ClickHouse/pull/56424) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). 
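Two of the new functions above can be tried as-is; both examples restate the ones given in the entries:

```sql
-- Similar to MySQL's FROM_DAYS.
SELECT fromDaysSinceYearZero(739136);   -- 2023-09-08

-- A random sample of 3 elements from the input array.
SELECT arrayRandomSample(range(10), 3);
```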
-* Added a system view `information_schema.statistics` for better compatibility with Tableau Online. [#56425](https://github.com/ClickHouse/ClickHouse/pull/56425) ([Serge Klochkov](https://github.com/slvrtrn)). -* Add a `system.symbols` table, useful for introspection of the binary. [#56548](https://github.com/ClickHouse/ClickHouse/pull/56548) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Configurable dashboards. Queries for charts are now loaded using a query, which by default uses a new `system.dashboards` table. [#56771](https://github.com/ClickHouse/ClickHouse/pull/56771) ([Sergei Trifonov](https://github.com/serxa)). -* Introduce `fileCluster` table function - it is useful if you mount a shared filesystem (NFS and similar) into the `user_files` directory. [#56868](https://github.com/ClickHouse/ClickHouse/pull/56868) ([Andrey Zvonov](https://github.com/zvonand)). -* Add `_size` virtual column with file size in bytes to `s3/file/hdfs/url/azureBlobStorage` engines. [#57126](https://github.com/ClickHouse/ClickHouse/pull/57126) ([Kruglov Pavel](https://github.com/Avogar)). -* Expose the number of errors for each error code that occurred on the server since the last restart from the Prometheus endpoint. [#57209](https://github.com/ClickHouse/ClickHouse/pull/57209) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* ClickHouse Keeper reports its running availability zone at the `/keeper/availability-zone` path. This can be configured in the Keeper configuration (for example, `us-west-1a`). [#56715](https://github.com/ClickHouse/ClickHouse/pull/56715) ([Jianfei Hu](https://github.com/incfly)). -* Make `ALTER materialized_view MODIFY QUERY` non-experimental and deprecate the `allow_experimental_alter_materialized_view_structure` setting. Fixes [#15206](https://github.com/ClickHouse/ClickHouse/issues/15206). [#57311](https://github.com/ClickHouse/ClickHouse/pull/57311) ([alesapin](https://github.com/alesapin)). -* Setting `join_algorithm` respects the specified order. [#51745](https://github.com/ClickHouse/ClickHouse/pull/51745) ([vdimir](https://github.com/vdimir)). -* Add support for the [well-known Protobuf types](https://protobuf.dev/reference/protobuf/google.protobuf/) in the Protobuf format. [#56741](https://github.com/ClickHouse/ClickHouse/pull/56741) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). - -#### Performance Improvement -* Adaptive timeouts for interacting with S3. The first attempt is made with low send and receive timeouts. [#56314](https://github.com/ClickHouse/ClickHouse/pull/56314) ([Sema Checherinda](https://github.com/CheSema)). -* Increase the default value of `max_concurrent_queries` from 100 to 1000. This makes sense when there is a large number of connecting clients, which are slowly sending or receiving data, so the server is not limited by CPU, or when the number of CPU cores is larger than 100. Also, enable the concurrency control by default, and set the desired number of query processing threads in total as twice the number of CPU cores. It improves performance in scenarios with a very large number of concurrent queries. [#46927](https://github.com/ClickHouse/ClickHouse/pull/46927) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Support parallel evaluation of window functions. Fixes [#34688](https://github.com/ClickHouse/ClickHouse/issues/34688). [#39631](https://github.com/ClickHouse/ClickHouse/pull/39631) ([Dmitry Novik](https://github.com/novikd)).
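A minimal sketch of the now non-experimental materialized view `MODIFY QUERY` mentioned above; the view, table, and column names are hypothetical:

```sql
-- Change the SELECT query of an existing materialized view.
ALTER TABLE daily_hits_mv
    MODIFY QUERY
    SELECT toDate(event_time) AS day, count() AS hits
    FROM hits
    GROUP BY day;
```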
-* `Numbers` table engine (of the `system.numbers` table) now analyzes the condition to generate the needed subset of data, like a table's index. [#50909](https://github.com/ClickHouse/ClickHouse/pull/50909) ([JackyWoo](https://github.com/JackyWoo)). -* Improved the performance of filtering by `IN (...)` condition for `Merge` table engine. [#54905](https://github.com/ClickHouse/ClickHouse/pull/54905) ([Nikita Taranov](https://github.com/nickitat)). -* An improvement which takes place when the filesystem cache is full and there are big reads. [#55158](https://github.com/ClickHouse/ClickHouse/pull/55158) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Add ability to disable checksums for S3 to avoid an excessive pass over the file (this is controlled by the setting `s3_disable_checksum`). [#55559](https://github.com/ClickHouse/ClickHouse/pull/55559) ([Azat Khuzhin](https://github.com/azat)). -* Now we read synchronously from remote tables when data is in page cache (like we do for local tables). It is faster, it doesn't require synchronisation inside the thread pool, and doesn't hesitate to do `seek`-s on local FS, and reduces CPU wait. [#55841](https://github.com/ClickHouse/ClickHouse/pull/55841) ([Nikita Taranov](https://github.com/nickitat)). -* Optimization for getting values from `map` and `arrayElement`. It brings about a 30% speedup by reducing the reserved memory and the number of `resize` calls. [#55957](https://github.com/ClickHouse/ClickHouse/pull/55957) ([lgbo](https://github.com/lgbo-ustc)). -* Optimization of multi-stage filtering with AVX-512. The performance experiments of the OnTime dataset on the ICX device (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads) show that this change could bring improvements of 7.4%, 5.9%, 4.7%, 3.0%, and 4.6% to the QPS of queries Q2, Q3, Q4, Q5 and Q6 respectively while having no impact on others. [#56079](https://github.com/ClickHouse/ClickHouse/pull/56079) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Limit the number of threads busy inside the query profiler. If there are more - they will skip profiling. [#56105](https://github.com/ClickHouse/ClickHouse/pull/56105) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Decrease the amount of virtual function calls in window functions. [#56120](https://github.com/ClickHouse/ClickHouse/pull/56120) ([Maksim Kita](https://github.com/kitaisreal)). -* Allow recursive Tuple field pruning in ORC data format to speed up scanning. [#56122](https://github.com/ClickHouse/ClickHouse/pull/56122) ([李扬](https://github.com/taiyang-li)). -* Trivial count optimization for `Npy` data format: queries like `select count() from 'data.npy'` will work much faster because of caching the results. [#56304](https://github.com/ClickHouse/ClickHouse/pull/56304) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Queries with aggregation and a large number of streams will use less memory during the query plan's construction. [#57074](https://github.com/ClickHouse/ClickHouse/pull/57074) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Improve performance of executing queries for use cases with many users and highly concurrent queries (>2000 QPS) by optimizing the access to ProcessList. [#57106](https://github.com/ClickHouse/ClickHouse/pull/57106) ([Andrej Hoos](https://github.com/adikus)). -* Trivial improvement of array join: reuse some intermediate results. [#57183](https://github.com/ClickHouse/ClickHouse/pull/57183) ([李扬](https://github.com/taiyang-li)).
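To make the `system.numbers` condition analysis above concrete: with this change a range condition is used to generate only the needed subset instead of enumerating the whole sequence (an illustrative query, not taken from the PR):

```sql
SELECT number
FROM system.numbers
WHERE number BETWEEN 1000000 AND 1000009
LIMIT 10;
```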
-* There are cases when stack unwinding was slow. Not anymore. [#57221](https://github.com/ClickHouse/ClickHouse/pull/57221) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Now we use default read pool for reading from external storage when `max_streams = 1`. It is beneficial when read prefetches are enabled. [#57334](https://github.com/ClickHouse/ClickHouse/pull/57334) ([Nikita Taranov](https://github.com/nickitat)). -* Keeper improvement: improve memory-usage during startup by delaying log preprocessing. [#55660](https://github.com/ClickHouse/ClickHouse/pull/55660) ([Antonio Andelic](https://github.com/antonio2368)). -* Improved performance of glob matching for `File` and `HDFS` storages. [#56141](https://github.com/ClickHouse/ClickHouse/pull/56141) ([Andrey Zvonov](https://github.com/zvonand)). -* Posting lists in experimental full text indexes are now compressed which reduces their size by 10-30%. [#56226](https://github.com/ClickHouse/ClickHouse/pull/56226) ([Harry Lee](https://github.com/HarryLeeIBM)). -* Parallelise `BackupEntriesCollector` in backups. [#56312](https://github.com/ClickHouse/ClickHouse/pull/56312) ([Kseniia Sumarokova](https://github.com/kssenii)). - -#### Improvement -* Add a new `MergeTree` setting `add_implicit_sign_column_constraint_for_collapsing_engine` (disabled by default). When enabled, it adds an implicit CHECK constraint for `CollapsingMergeTree` tables that restricts the value of the `Sign` column to be only -1 or 1. [#56701](https://github.com/ClickHouse/ClickHouse/issues/56701). [#56986](https://github.com/ClickHouse/ClickHouse/pull/56986) ([Kevin Mingtarja](https://github.com/kevinmingtarja)). -* Enable adding new disk to storage configuration without restart. [#56367](https://github.com/ClickHouse/ClickHouse/pull/56367) ([Duc Canh Le](https://github.com/canhld94)). -* Support creating and materializing index in the same alter query, also support "modify TTL" and "materialize TTL" in the same query. Closes [#55651](https://github.com/ClickHouse/ClickHouse/issues/55651). [#56331](https://github.com/ClickHouse/ClickHouse/pull/56331) ([flynn](https://github.com/ucasfl)). -* Add a new table function named `fuzzJSON` with rows containing perturbed versions of the source JSON string with random variations. [#56490](https://github.com/ClickHouse/ClickHouse/pull/56490) ([Julia Kartseva](https://github.com/jkartseva)). -* Engine `Merge` filters the records according to the row policies of the underlying tables, so you don't have to create another row policy on a `Merge` table. [#50209](https://github.com/ClickHouse/ClickHouse/pull/50209) ([Ilya Golshtein](https://github.com/ilejn)). -* Add a setting `max_execution_time_leaf` to limit the execution time on shard for distributed query, and `timeout_overflow_mode_leaf` to control the behaviour if timeout happens. [#51823](https://github.com/ClickHouse/ClickHouse/pull/51823) ([Duc Canh Le](https://github.com/canhld94)). -* Add ClickHouse setting to disable tunneling for HTTPS requests over HTTP proxy. [#55033](https://github.com/ClickHouse/ClickHouse/pull/55033) ([Arthur Passos](https://github.com/arthurpassos)). -* Set `background_fetches_pool_size` to 16, background_schedule_pool_size to 512 that is better for production usage with frequent small insertions. [#54327](https://github.com/ClickHouse/ClickHouse/pull/54327) ([Denny Crane](https://github.com/den-crane)). 
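A sketch of the `add_implicit_sign_column_constraint_for_collapsing_engine` setting described at the top of this Improvement list; the table and column names are hypothetical:

```sql
CREATE TABLE page_views
(
    page_id UInt64,
    views   UInt64,
    Sign    Int8
)
ENGINE = CollapsingMergeTree(Sign)
ORDER BY page_id
SETTINGS add_implicit_sign_column_constraint_for_collapsing_engine = 1;
-- With the setting enabled, Sign values other than -1 or 1 are rejected by the implicit CHECK constraint.
```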
-* When reading data from a CSV file whose line ends with `\r` not followed by `\n`, ClickHouse used to throw the exception `Cannot parse CSV format: found \r (CR) not followed by \n (LF). Line must end by \n (LF) or \r\n (CR LF) or \n\r.` because in ClickHouse the CSV end of line must be `\n`, `\r\n` or `\n\r`, so `\r` must be followed by `\n`. However, in some situations the CSV input data is abnormal and `\r` appears at the end of a line; such input can now be handled. [#54340](https://github.com/ClickHouse/ClickHouse/pull/54340) ([KevinyhZou](https://github.com/KevinyhZou)). -* Update Arrow library to release-13.0.0 that supports new encodings. Closes [#44505](https://github.com/ClickHouse/ClickHouse/issues/44505). [#54800](https://github.com/ClickHouse/ClickHouse/pull/54800) ([Kruglov Pavel](https://github.com/Avogar)). -* Improve performance of ON CLUSTER queries by removing heavy system calls to get all network interfaces when looking for the local IP address in the DDL entry hosts list. [#54909](https://github.com/ClickHouse/ClickHouse/pull/54909) ([Duc Canh Le](https://github.com/canhld94)). -* Fixed accounting of memory allocated before attaching a thread to a query or a user. [#56089](https://github.com/ClickHouse/ClickHouse/pull/56089) ([Nikita Taranov](https://github.com/nickitat)). -* Add support for `LARGE_LIST` in Apache Arrow formats. [#56118](https://github.com/ClickHouse/ClickHouse/pull/56118) ([edef](https://github.com/edef1c)). -* Allow manual compaction of `EmbeddedRocksDB` via `OPTIMIZE` query. [#56225](https://github.com/ClickHouse/ClickHouse/pull/56225) ([Azat Khuzhin](https://github.com/azat)). -* Add ability to specify BlockBasedTableOptions for `EmbeddedRocksDB` tables. [#56264](https://github.com/ClickHouse/ClickHouse/pull/56264) ([Azat Khuzhin](https://github.com/azat)). -* `SHOW COLUMNS` now displays MySQL's equivalent data type name when the connection was made through the MySQL protocol. Previously, this was the case when setting `use_mysql_types_in_show_columns = 1`. The setting is retained but made obsolete. [#56277](https://github.com/ClickHouse/ClickHouse/pull/56277) ([Robert Schulze](https://github.com/rschu1ze)). -* Fixed possible `The local set of parts of table doesn't look like the set of parts in ZooKeeper` error if the server was restarted just after `TRUNCATE` or `DROP PARTITION`. [#56282](https://github.com/ClickHouse/ClickHouse/pull/56282) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fixed handling of non-const query strings in functions `formatQuery`/`formatQuerySingleLine`. Also added `OrNull` variants of both functions that return a NULL when a query cannot be parsed instead of throwing an exception. [#56327](https://github.com/ClickHouse/ClickHouse/pull/56327) ([Robert Schulze](https://github.com/rschu1ze)). -* Allow backup of a materialized view with a dropped inner table instead of failing the backup. [#56387](https://github.com/ClickHouse/ClickHouse/pull/56387) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Queries to `system.replicas` initiate requests to ZooKeeper when certain columns are queried. When there are thousands of tables these requests might produce a considerable load on ZooKeeper. If there are multiple simultaneous queries to `system.replicas` they perform the same requests multiple times. The change is to "deduplicate" requests from concurrent queries. [#56420](https://github.com/ClickHouse/ClickHouse/pull/56420) ([Alexander Gololobov](https://github.com/davenger)).
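The `formatQuery` entry above can be illustrated as follows; the outputs in the comments are indicative:

```sql
SELECT formatQuery('select 1   +2');      -- roughly 'SELECT 1 + 2'
SELECT formatQueryOrNull('not a query');  -- NULL instead of an exception
```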
-* Fix translation to a MySQL-compatible query when querying external databases. [#56456](https://github.com/ClickHouse/ClickHouse/pull/56456) ([flynn](https://github.com/ucasfl)). -* Add support for backing up and restoring tables using `KeeperMap` engine. [#56460](https://github.com/ClickHouse/ClickHouse/pull/56460) ([Antonio Andelic](https://github.com/antonio2368)). -* A 404 response for CompleteMultipartUpload has to be rechecked: the operation could have succeeded on the server even if the client got a timeout or other network errors, so the next retry of CompleteMultipartUpload receives a 404 response. If the object key exists, the operation is considered successful. [#56475](https://github.com/ClickHouse/ClickHouse/pull/56475) ([Sema Checherinda](https://github.com/CheSema)). -* Enable the HTTP OPTIONS method by default - it simplifies requesting ClickHouse from a web browser. [#56483](https://github.com/ClickHouse/ClickHouse/pull/56483) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The value for `dns_max_consecutive_failures` was changed by mistake in [#46550](https://github.com/ClickHouse/ClickHouse/issues/46550) - this is reverted and adjusted to a better value. Also, increased the HTTP keep-alive timeout to a reasonable value from production. [#56485](https://github.com/ClickHouse/ClickHouse/pull/56485) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Load base backups lazily (a base backup won't be loaded until it's needed). Also add some log messages and profile events for backups. [#56516](https://github.com/ClickHouse/ClickHouse/pull/56516) ([Vitaly Baranov](https://github.com/vitlibar)). -* Setting `query_cache_store_results_of_queries_with_nondeterministic_functions` (with values `false` or `true`) was marked obsolete. It was replaced by setting `query_cache_nondeterministic_function_handling`, a three-valued enum that controls how the query cache handles queries with non-deterministic functions: a) throw an exception (default behavior), b) save the non-deterministic query result regardless, or c) ignore, i.e. don't throw an exception and don't cache the result. [#56519](https://github.com/ClickHouse/ClickHouse/pull/56519) ([Robert Schulze](https://github.com/rschu1ze)). -* Rewrite equality with `is null` check in JOIN ON section. Experimental *Analyzer only*. [#56538](https://github.com/ClickHouse/ClickHouse/pull/56538) ([vdimir](https://github.com/vdimir)). -* Function `concat` now supports arbitrary argument types (instead of only String and FixedString arguments). This makes it behave more similarly to the MySQL `concat` implementation. For example, `SELECT concat('ab', 42)` now returns `ab42`. [#56540](https://github.com/ClickHouse/ClickHouse/pull/56540) ([Serge Klochkov](https://github.com/slvrtrn)). -* Allow getting cache configuration from the 'named_collection' section in the config or from SQL-created named collections. [#56541](https://github.com/ClickHouse/ClickHouse/pull/56541) ([Kseniia Sumarokova](https://github.com/kssenii)). -* PostgreSQL database engine: Make the removal of outdated tables less aggressive when the PostgreSQL connection is unsuccessful. [#56609](https://github.com/ClickHouse/ClickHouse/pull/56609) ([jsc0218](https://github.com/jsc0218)). -* It took too much time to connect to PostgreSQL when the URL was not right, so the relevant query got stuck there and then cancelled. [#56648](https://github.com/ClickHouse/ClickHouse/pull/56648) ([jsc0218](https://github.com/jsc0218)). -* Keeper improvement: disable compressed logs by default in Keeper.
[#56763](https://github.com/ClickHouse/ClickHouse/pull/56763) ([Antonio Andelic](https://github.com/antonio2368)). -* Add a config setting `wait_dictionaries_load_at_startup`. [#56782](https://github.com/ClickHouse/ClickHouse/pull/56782) ([Vitaly Baranov](https://github.com/vitlibar)). -* There was a potential vulnerability in previous ClickHouse versions: if a user has connected and unsuccessfully tried to authenticate with the "interserver secret" method, the server didn't terminate the connection immediately but continued to receive and ignore the leftover packets from the client. While these packets are ignored, they are still parsed, and if they use a compression method with another known vulnerability, it will lead to exploitation of it without authentication. This issue was found with [ClickHouse Bug Bounty Program](https://github.com/ClickHouse/ClickHouse/issues/38986) by https://twitter.com/malacupa. [#56794](https://github.com/ClickHouse/ClickHouse/pull/56794) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fetching a part now waits until that part is fully committed on the remote replica. It is better not to send a part in the PreActive state; in the case of zero-copy replication this is a mandatory restriction. [#56808](https://github.com/ClickHouse/ClickHouse/pull/56808) ([Sema Checherinda](https://github.com/CheSema)). -* Fix a possible PostgreSQL logical replication conversion error when using experimental `MaterializedPostgreSQL`. [#53721](https://github.com/ClickHouse/ClickHouse/pull/53721) ([takakawa](https://github.com/takakawa)). -* Implement user-level setting `alter_move_to_space_execute_async` which allows executing queries `ALTER TABLE ... MOVE PARTITION|PART TO DISK|VOLUME` asynchronously. The size of the pool for background executions is controlled by `background_move_pool_size`. The default behavior is synchronous execution. Fixes [#47643](https://github.com/ClickHouse/ClickHouse/issues/47643). [#56809](https://github.com/ClickHouse/ClickHouse/pull/56809) ([alesapin](https://github.com/alesapin)). -* Allow filtering by engine when scanning `system.tables` to avoid unnecessary (potentially time-consuming) connections. [#56813](https://github.com/ClickHouse/ClickHouse/pull/56813) ([jsc0218](https://github.com/jsc0218)). -* Show `total_bytes` and `total_rows` in system tables for RocksDB storage. [#56816](https://github.com/ClickHouse/ClickHouse/pull/56816) ([Aleksandr Musorin](https://github.com/AVMusorin)). -* Allow basic commands in ALTER for TEMPORARY tables. [#56892](https://github.com/ClickHouse/ClickHouse/pull/56892) ([Sergey](https://github.com/icuken)). -* LZ4 compression: buffer the compressed block in the rare case when the output buffer capacity is not enough for writing the compressed block directly to the output buffer. [#56938](https://github.com/ClickHouse/ClickHouse/pull/56938) ([Sema Checherinda](https://github.com/CheSema)). -* Add metrics for the number of queued jobs, which is useful for the IO thread pool. [#56958](https://github.com/ClickHouse/ClickHouse/pull/56958) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add a setting for the PostgreSQL table engine in the config file, along with a check for the setting and documentation around it. [#56959](https://github.com/ClickHouse/ClickHouse/pull/56959) ([Peignon Melvyn](https://github.com/melvynator)). -* Function `concat` can now be called with a single argument, e.g., `SELECT concat('abc')`. This makes its behavior more consistent with MySQL's concat implementation.
[#57000](https://github.com/ClickHouse/ClickHouse/pull/57000) ([Serge Klochkov](https://github.com/slvrtrn)). -* Sign all `x-amz-*` headers as required by the AWS S3 docs. [#57001](https://github.com/ClickHouse/ClickHouse/pull/57001) ([Arthur Passos](https://github.com/arthurpassos)). -* Function `fromDaysSinceYearZero` (alias: `FROM_DAYS`) can now be used with unsigned and signed integer types (previously, it had to be an unsigned integer). This improves compatibility with 3rd party tools such as Tableau Online. [#57002](https://github.com/ClickHouse/ClickHouse/pull/57002) ([Serge Klochkov](https://github.com/slvrtrn)). -* Add `system.s3queue_log` to the default config. [#57036](https://github.com/ClickHouse/ClickHouse/pull/57036) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Change the default for `wait_dictionaries_load_at_startup` to true, and use this setting only if `dictionaries_lazy_load` is false. [#57133](https://github.com/ClickHouse/ClickHouse/pull/57133) ([Vitaly Baranov](https://github.com/vitlibar)). -* Check dictionary source type on creation even if `dictionaries_lazy_load` is enabled. [#57134](https://github.com/ClickHouse/ClickHouse/pull/57134) ([Vitaly Baranov](https://github.com/vitlibar)). -* Plan-level optimizations can now be enabled/disabled individually. Previously, it was only possible to disable them all. The setting which previously did that (`query_plan_enable_optimizations`) is retained and can still be used to disable all optimizations. [#57152](https://github.com/ClickHouse/ClickHouse/pull/57152) ([Robert Schulze](https://github.com/rschu1ze)). -* The server's exit code will correspond to the exception code. For example, if the server cannot start due to a memory limit, it will exit with the code 241 = MEMORY_LIMIT_EXCEEDED. In previous versions, the exit code for exceptions was always 70 = Poco::Util::ExitCode::EXIT_SOFTWARE. [#57153](https://github.com/ClickHouse/ClickHouse/pull/57153) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Do not demangle and symbolize stack frames from the `functional` C++ header. [#57201](https://github.com/ClickHouse/ClickHouse/pull/57201) ([Mike Kot](https://github.com/myrrc)). -* HTTP server page `/dashboard` now supports charts with multiple lines. [#57236](https://github.com/ClickHouse/ClickHouse/pull/57236) ([Sergei Trifonov](https://github.com/serxa)). -* The `max_memory_usage_in_client` command line option supports a string value with a suffix (K, M, G, etc). Closes [#56879](https://github.com/ClickHouse/ClickHouse/issues/56879). [#57273](https://github.com/ClickHouse/ClickHouse/pull/57273) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Bumped Intel QPL (used by codec `DEFLATE_QPL`) from v1.2.0 to v1.3.1. Also fixed a bug in the case of BOF (Block On Fault) = 0: changed to handle page faults by falling back to the SW path. [#57291](https://github.com/ClickHouse/ClickHouse/pull/57291) ([jasperzhu](https://github.com/jinjunzh)). -* Increase the default `replicated_deduplication_window` MergeTree setting from 100 to 1k. [#57335](https://github.com/ClickHouse/ClickHouse/pull/57335) ([sichenzhao](https://github.com/sichenzhao)). -* Stop using `INCONSISTENT_METADATA_FOR_BACKUP` that much. If possible, prefer to continue scanning instead of stopping and restarting the scan for backup from the beginning. [#57385](https://github.com/ClickHouse/ClickHouse/pull/57385) ([Vitaly Baranov](https://github.com/vitlibar)). - -#### Build/Testing/Packaging Improvement -* Add SQLLogic test.
[#56078](https://github.com/ClickHouse/ClickHouse/pull/56078) ([Han Fei](https://github.com/hanfei1991)). -* Make `clickhouse-local` and `clickhouse-client` available under short names (`ch`, `chl`, `chc`) for usability. [#56634](https://github.com/ClickHouse/ClickHouse/pull/56634) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Optimized build size further by removing unused code from external libraries. [#56786](https://github.com/ClickHouse/ClickHouse/pull/56786) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add automatic check that there are no large translation units. [#56559](https://github.com/ClickHouse/ClickHouse/pull/56559) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Lower the size of the single-binary distribution. This closes [#55181](https://github.com/ClickHouse/ClickHouse/issues/55181). [#56617](https://github.com/ClickHouse/ClickHouse/pull/56617) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Information about the sizes of every translation unit and binary file after each build will be sent to the CI database in ClickHouse Cloud. This closes [#56107](https://github.com/ClickHouse/ClickHouse/issues/56107). [#56636](https://github.com/ClickHouse/ClickHouse/pull/56636) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Certain files of "Apache Arrow" library (which we use only for non-essential things like parsing the arrow format) were rebuilt all the time regardless of the build cache. This is fixed. [#56657](https://github.com/ClickHouse/ClickHouse/pull/56657) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Avoid recompiling translation units depending on the autogenerated source file about version. [#56660](https://github.com/ClickHouse/ClickHouse/pull/56660) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Tracing data of the linker invocations will be sent to the CI database in ClickHouse Cloud. [#56725](https://github.com/ClickHouse/ClickHouse/pull/56725) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Use DWARF 5 debug symbols for the clickhouse binary (was DWARF 4 previously). [#56770](https://github.com/ClickHouse/ClickHouse/pull/56770) ([Michael Kolupaev](https://github.com/al13n321)). -* Add a new build option `SANITIZE_COVERAGE`. If it is enabled, the code is instrumented to track the coverage. The collected information is available inside ClickHouse with: (1) a new function `coverage` that returns an array of unique addresses in the code found after the previous coverage reset; (2) `SYSTEM RESET COVERAGE` query that resets the accumulated data. This allows us to compare the coverage of different tests, including differential code coverage. Continuation of [#20539](https://github.com/ClickHouse/ClickHouse/issues/20539). [#56102](https://github.com/ClickHouse/ClickHouse/pull/56102) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Some of the stack frames might not be resolved when collecting stacks. In such cases the raw address might be helpful. [#56267](https://github.com/ClickHouse/ClickHouse/pull/56267) ([Alexander Gololobov](https://github.com/davenger)). -* Add an option to disable `libssh`. [#56333](https://github.com/ClickHouse/ClickHouse/pull/56333) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Enable temporary_data_in_cache in S3 tests in CI. [#48425](https://github.com/ClickHouse/ClickHouse/pull/48425) ([vdimir](https://github.com/vdimir)). -* Set the max memory usage for clickhouse-client (`1G`) in the CI. 
[#56873](https://github.com/ClickHouse/ClickHouse/pull/56873) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). - -#### Bug Fix (user-visible misbehavior in an official stable release) -* Fix experimental Analyzer - insertion from a select with a subquery referencing the insertion table should process only the insertion block. [#50857](https://github.com/ClickHouse/ClickHouse/pull/50857) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix a bug in `str_to_map` function. [#56423](https://github.com/ClickHouse/ClickHouse/pull/56423) ([Arthur Passos](https://github.com/arthurpassos)). -* Keeper `reconfig`: add timeout before yielding/taking leadership [#53481](https://github.com/ClickHouse/ClickHouse/pull/53481) ([Mike Kot](https://github.com/myrrc)). -* Fix incorrect header in grace hash join and filter pushdown [#53922](https://github.com/ClickHouse/ClickHouse/pull/53922) ([vdimir](https://github.com/vdimir)). -* Select from system tables when the table is based on a table function. [#55540](https://github.com/ClickHouse/ClickHouse/pull/55540) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). -* RFC: Fix "Cannot find column X in source stream" for Distributed queries with LIMIT BY [#55836](https://github.com/ClickHouse/ClickHouse/pull/55836) ([Azat Khuzhin](https://github.com/azat)). -* Fix 'Cannot read from file:' while running the client in the background [#55976](https://github.com/ClickHouse/ClickHouse/pull/55976) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix clickhouse-local exit on bad send_logs_level setting [#55994](https://github.com/ClickHouse/ClickHouse/pull/55994) ([Kruglov Pavel](https://github.com/Avogar)). -* Bug fix explain ast with parameterized view [#56004](https://github.com/ClickHouse/ClickHouse/pull/56004) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix a crash during table loading on startup [#56232](https://github.com/ClickHouse/ClickHouse/pull/56232) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix ClickHouse-sourced dictionaries with an explicit query [#56236](https://github.com/ClickHouse/ClickHouse/pull/56236) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix segfault in signal handler for Keeper [#56266](https://github.com/ClickHouse/ClickHouse/pull/56266) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix incomplete query result for UNION in view() function. [#56274](https://github.com/ClickHouse/ClickHouse/pull/56274) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix inconsistency of "cast('0' as DateTime64(3))" and "cast('0' as Nullable(DateTime64(3)))" [#56286](https://github.com/ClickHouse/ClickHouse/pull/56286) ([李扬](https://github.com/taiyang-li)). -* Fix rare race condition related to Memory allocation failure [#56303](https://github.com/ClickHouse/ClickHouse/pull/56303) ([alesapin](https://github.com/alesapin)). -* Fix restore from backup with `flatten_nested` and `data_type_default_nullable` [#56306](https://github.com/ClickHouse/ClickHouse/pull/56306) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix crash in case of adding a column with type Object(JSON) [#56307](https://github.com/ClickHouse/ClickHouse/pull/56307) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Fix crash in filterPushDown [#56380](https://github.com/ClickHouse/ClickHouse/pull/56380) ([vdimir](https://github.com/vdimir)). 
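As a hedged illustration of the `SANITIZE_COVERAGE` build option from the Build/Testing/Packaging section above: the `coverage` function and the `SYSTEM RESET COVERAGE` query are described in that entry, while the surrounding test workflow below is only an assumption.

```sql
-- In a server built with the SANITIZE_COVERAGE option (sketch only):
SYSTEM RESET COVERAGE;                       -- drop the addresses accumulated so far
-- ... run the test whose coverage you want to measure ...
SELECT length(coverage()) AS addresses_hit;  -- coverage() returns an array of unique code addresses
```
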
-* Fix restore from backup with mat view and dropped source table [#56383](https://github.com/ClickHouse/ClickHouse/pull/56383) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix segfault during Kerberos initialization [#56401](https://github.com/ClickHouse/ClickHouse/pull/56401) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix buffer overflow in T64 [#56434](https://github.com/ClickHouse/ClickHouse/pull/56434) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix nullable primary key in final (2) [#56452](https://github.com/ClickHouse/ClickHouse/pull/56452) ([Amos Bird](https://github.com/amosbird)). -* Fix ON CLUSTER queries without database on initial node [#56484](https://github.com/ClickHouse/ClickHouse/pull/56484) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix startup failure due to TTL dependency [#56489](https://github.com/ClickHouse/ClickHouse/pull/56489) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix ALTER COMMENT queries ON CLUSTER [#56491](https://github.com/ClickHouse/ClickHouse/pull/56491) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix ALTER COLUMN with ALIAS [#56493](https://github.com/ClickHouse/ClickHouse/pull/56493) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix empty NAMED COLLECTIONs [#56494](https://github.com/ClickHouse/ClickHouse/pull/56494) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix two cases of projection analysis. [#56502](https://github.com/ClickHouse/ClickHouse/pull/56502) ([Amos Bird](https://github.com/amosbird)). -* Fix handling of aliases in query cache [#56545](https://github.com/ClickHouse/ClickHouse/pull/56545) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix conversion from `Nullable(Enum)` to `Nullable(String)` [#56644](https://github.com/ClickHouse/ClickHouse/pull/56644) ([Nikolay Degterinsky](https://github.com/evillique)). -* More reliable log handling in Keeper [#56670](https://github.com/ClickHouse/ClickHouse/pull/56670) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix configuration merge for nodes with substitution attributes [#56694](https://github.com/ClickHouse/ClickHouse/pull/56694) ([Konstantin Bogdanov](https://github.com/thevar1able)). -* Fix duplicate usage of table function input(). [#56695](https://github.com/ClickHouse/ClickHouse/pull/56695) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix: RabbitMQ OpenSSL dynamic loading issue [#56703](https://github.com/ClickHouse/ClickHouse/pull/56703) ([Igor Nikonov](https://github.com/devcrafter)). -* Fix crash in GCD codec in case when zeros present in data [#56704](https://github.com/ClickHouse/ClickHouse/pull/56704) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Fix 'mutex lock failed: Invalid argument' in clickhouse-local during insert into function [#56710](https://github.com/ClickHouse/ClickHouse/pull/56710) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix Date text parsing in optimistic path [#56765](https://github.com/ClickHouse/ClickHouse/pull/56765) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix crash in FPC codec [#56795](https://github.com/ClickHouse/ClickHouse/pull/56795) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* DatabaseReplicated: fix DDL query timeout after recovering a replica [#56796](https://github.com/ClickHouse/ClickHouse/pull/56796) ([Alexander Tokmakov](https://github.com/tavplubix)). 
-* Fix incorrect nullable columns reporting in MySQL binary protocol [#56799](https://github.com/ClickHouse/ClickHouse/pull/56799) ([Serge Klochkov](https://github.com/slvrtrn)). -* Support Iceberg metadata files for metastore tables [#56810](https://github.com/ClickHouse/ClickHouse/pull/56810) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix TSAN report under transform [#56817](https://github.com/ClickHouse/ClickHouse/pull/56817) ([Raúl Marín](https://github.com/Algunenano)). -* Fix SET query and SETTINGS formatting [#56825](https://github.com/ClickHouse/ClickHouse/pull/56825) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix failure to start due to table dependency in joinGet [#56828](https://github.com/ClickHouse/ClickHouse/pull/56828) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix flattening existing Nested columns during ADD COLUMN [#56830](https://github.com/ClickHouse/ClickHouse/pull/56830) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix allow cr end of line for csv [#56901](https://github.com/ClickHouse/ClickHouse/pull/56901) ([KevinyhZou](https://github.com/KevinyhZou)). -* Fix `tryBase64Decode` with invalid input [#56913](https://github.com/ClickHouse/ClickHouse/pull/56913) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix generating deep nested columns in CapnProto/Protobuf schemas [#56941](https://github.com/ClickHouse/ClickHouse/pull/56941) ([Kruglov Pavel](https://github.com/Avogar)). -* Prevent incompatible ALTER of projection columns [#56948](https://github.com/ClickHouse/ClickHouse/pull/56948) ([Amos Bird](https://github.com/amosbird)). -* Fix sqlite file path validation [#56984](https://github.com/ClickHouse/ClickHouse/pull/56984) ([San](https://github.com/santrancisco)). -* S3Queue: fix metadata reference increment [#56990](https://github.com/ClickHouse/ClickHouse/pull/56990) ([Kseniia Sumarokova](https://github.com/kssenii)). -* S3Queue minor fix [#56999](https://github.com/ClickHouse/ClickHouse/pull/56999) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix file path validation for DatabaseFileSystem [#57029](https://github.com/ClickHouse/ClickHouse/pull/57029) ([San](https://github.com/santrancisco)). -* Fix `fuzzBits` with `ARRAY JOIN` [#57033](https://github.com/ClickHouse/ClickHouse/pull/57033) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix Nullptr dereference in partial merge join with joined_subquery_re… [#57048](https://github.com/ClickHouse/ClickHouse/pull/57048) ([vdimir](https://github.com/vdimir)). -* Fix race condition in RemoteSource [#57052](https://github.com/ClickHouse/ClickHouse/pull/57052) ([Raúl Marín](https://github.com/Algunenano)). -* Implement `bitHammingDistance` for big integers [#57073](https://github.com/ClickHouse/ClickHouse/pull/57073) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* S3-style links bug fix [#57075](https://github.com/ClickHouse/ClickHouse/pull/57075) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Fix JSON_QUERY function with multiple numeric paths [#57096](https://github.com/ClickHouse/ClickHouse/pull/57096) ([KevinyhZou](https://github.com/KevinyhZou)). -* Fix buffer overflow in Gorilla codec [#57107](https://github.com/ClickHouse/ClickHouse/pull/57107) ([Nikolay Degterinsky](https://github.com/evillique)). -* Close interserver connection on any exception before authentication [#57142](https://github.com/ClickHouse/ClickHouse/pull/57142) ([Antonio Andelic](https://github.com/antonio2368)). 
-* Fix segfault after ALTER UPDATE with Nullable MATERIALIZED column [#57147](https://github.com/ClickHouse/ClickHouse/pull/57147) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix incorrect JOIN plan optimization with partially materialized normal projection [#57196](https://github.com/ClickHouse/ClickHouse/pull/57196) ([Amos Bird](https://github.com/amosbird)). -* Ignore comments when comparing column descriptions [#57259](https://github.com/ClickHouse/ClickHouse/pull/57259) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix `ReadonlyReplica` metric for all cases [#57267](https://github.com/ClickHouse/ClickHouse/pull/57267) ([Antonio Andelic](https://github.com/antonio2368)). -* Background merges correctly use temporary data storage in the cache [#57275](https://github.com/ClickHouse/ClickHouse/pull/57275) ([vdimir](https://github.com/vdimir)). -* Keeper fix for changelog and snapshots [#57299](https://github.com/ClickHouse/ClickHouse/pull/57299) ([Antonio Andelic](https://github.com/antonio2368)). -* Ignore finished ON CLUSTER tasks if hostname changed [#57339](https://github.com/ClickHouse/ClickHouse/pull/57339) ([Alexander Tokmakov](https://github.com/tavplubix)). -* MergeTree mutations reuse source part index granularity [#57352](https://github.com/ClickHouse/ClickHouse/pull/57352) ([Maksim Kita](https://github.com/kitaisreal)). -* FS cache: add a limit for background download [#57424](https://github.com/ClickHouse/ClickHouse/pull/57424) ([Kseniia Sumarokova](https://github.com/kssenii)). - - -### ClickHouse release 23.10, 2023-11-02 - -#### Backward Incompatible Change -* There is no longer an option to automatically remove broken data parts. This closes [#55174](https://github.com/ClickHouse/ClickHouse/issues/55174). [#55184](https://github.com/ClickHouse/ClickHouse/pull/55184) ([Alexey Milovidov](https://github.com/alexey-milovidov)). [#55557](https://github.com/ClickHouse/ClickHouse/pull/55557) ([Jihyuk Bok](https://github.com/tomahawk28)). -* The obsolete in-memory data parts can no longer be read from the write-ahead log. If you have configured in-memory parts before, they have to be removed before the upgrade. [#55186](https://github.com/ClickHouse/ClickHouse/pull/55186) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Remove the integration with Meilisearch. Reason: it was compatible only with the old version 0.18. The recent version of Meilisearch changed the protocol and does not work anymore. Note: we would appreciate it if you help to bring it back. [#55189](https://github.com/ClickHouse/ClickHouse/pull/55189) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Rename the directory monitor concept to background INSERT. All the settings `*directory_monitor*` have been renamed to `distributed_background_insert*`. *Backward compatibility should be preserved* (since the old settings have been added as aliases). [#55978](https://github.com/ClickHouse/ClickHouse/pull/55978) ([Azat Khuzhin](https://github.com/azat)). -* Do not interpret the `send_timeout` set on the client side as the `receive_timeout` on the server side and vice versa. [#56035](https://github.com/ClickHouse/ClickHouse/pull/56035) ([Azat Khuzhin](https://github.com/azat)). -* Comparison of time intervals with different units will throw an exception. This closes [#55942](https://github.com/ClickHouse/ClickHouse/issues/55942). You might have occasionally relied on the previous behavior, where the underlying numeric values were compared regardless of the units. 
[#56090](https://github.com/ClickHouse/ClickHouse/pull/56090) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Rewrote the experimental `S3Queue` table engine completely: changed the way we keep information in ZooKeeper, which allows making fewer ZooKeeper requests, added caching of ZooKeeper state in cases when we know the state will not change, improved the S3 polling process to make it less aggressive, changed the way the TTL and the max set for tracked files are maintained; now it is a background process. Added `system.s3queue` and `system.s3queue_log` tables. Closes [#54998](https://github.com/ClickHouse/ClickHouse/issues/54998). [#54422](https://github.com/ClickHouse/ClickHouse/pull/54422) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Arbitrary paths on the HTTP endpoint are no longer interpreted as a request to the `/query` endpoint. [#55521](https://github.com/ClickHouse/ClickHouse/pull/55521) ([Konstantin Bogdanov](https://github.com/thevar1able)). - -#### New Feature -* Add function `arrayFold(accumulator, x1, ..., xn -> expression, initial, array1, ..., arrayn)` which applies a lambda function to multiple arrays of the same cardinality and collects the result in an accumulator. [#49794](https://github.com/ClickHouse/ClickHouse/pull/49794) ([Lirikl](https://github.com/Lirikl)). -* Support for `Npy` format. `SELECT * FROM file('example_array.npy', Npy)`. [#55982](https://github.com/ClickHouse/ClickHouse/pull/55982) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* If a table has a space-filling curve in its key, e.g., `ORDER BY mortonEncode(x, y)`, the conditions on its arguments, e.g., `x >= 10 AND x <= 20 AND y >= 20 AND y <= 30`, can be used for indexing (see the sketch below). A setting `analyze_index_with_space_filling_curves` is added to enable or disable this analysis. This closes [#41195](https://github.com/ClickHouse/ClickHouse/issue/41195). Continuation of [#4538](https://github.com/ClickHouse/ClickHouse/pull/4538). Continuation of [#6286](https://github.com/ClickHouse/ClickHouse/pull/6286). Continuation of [#28130](https://github.com/ClickHouse/ClickHouse/pull/28130). Continuation of [#41753](https://github.com/ClickHouse/ClickHouse/pull/#41753). [#55642](https://github.com/ClickHouse/ClickHouse/pull/55642) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* A new setting called `force_optimize_projection_name`: it takes the name of a projection as an argument. If its value is set to a non-empty string, ClickHouse checks that this projection is used in the query at least once. Closes [#55331](https://github.com/ClickHouse/ClickHouse/issues/55331). [#56134](https://github.com/ClickHouse/ClickHouse/pull/56134) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Support asynchronous inserts with external data via the native protocol. Previously it worked only if the data was inlined into the query. [#54730](https://github.com/ClickHouse/ClickHouse/pull/54730) ([Anton Popov](https://github.com/CurtizJ)). -* Added aggregation function `lttb` which uses the [Largest-Triangle-Three-Buckets](https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf) algorithm for downsampling data for visualization. [#53145](https://github.com/ClickHouse/ClickHouse/pull/53145) ([Sinan](https://github.com/sinsinan)). -* Query `CHECK TABLE` has better performance and usability (sends progress updates, cancellable). Support checking a particular part with `CHECK TABLE ... PART 'part_name'`. [#53404](https://github.com/ClickHouse/ClickHouse/pull/53404) ([vdimir](https://github.com/vdimir)). 
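A minimal sketch of the space-filling-curve index analysis described above; the table and the concrete values are made up for illustration.

```sql
-- A table keyed by a Morton curve over two columns, as in the entry above:
CREATE TABLE points
(
    x UInt32,
    y UInt32
)
ENGINE = MergeTree
ORDER BY mortonEncode(x, y);

-- With analyze_index_with_space_filling_curves enabled, range conditions on the
-- curve's arguments can be used for index analysis:
SELECT count()
FROM points
WHERE x >= 10 AND x <= 20 AND y >= 20 AND y <= 30;
```
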
-* Added function `jsonMergePatch`. When working with JSON data as strings, it provides a way to merge these strings (of JSON objects) together to form a single string containing a single JSON object (see the sketch below). [#54364](https://github.com/ClickHouse/ClickHouse/pull/54364) ([Memo](https://github.com/Joeywzr)). -* The second part of Kusto Query Language dialect support. [Phase 1 implementation](https://github.com/ClickHouse/ClickHouse/pull/37961) has been merged. [#42510](https://github.com/ClickHouse/ClickHouse/pull/42510) ([larryluogit](https://github.com/larryluogit)). -* Added a new SQL function, `arrayRandomSample(arr, k)`, which returns a sample of `k` elements from the input array. Similar functionality could previously be achieved only with less convenient syntax, e.g. `SELECT arrayReduce('groupArraySample(3)', range(10))`. [#54391](https://github.com/ClickHouse/ClickHouse/pull/54391) ([itayisraelov](https://github.com/itayisraelov)). -* Introduce `-ArgMin`/`-ArgMax` aggregate combinators which allow aggregating by min/max values only. One use case can be found in [#54818](https://github.com/ClickHouse/ClickHouse/issues/54818). This PR also reorganizes combinators into a dedicated folder. [#54947](https://github.com/ClickHouse/ClickHouse/pull/54947) ([Amos Bird](https://github.com/amosbird)). -* Allow dropping the cache for the Protobuf format with `SYSTEM DROP SCHEMA FORMAT CACHE [FOR Protobuf]`. [#55064](https://github.com/ClickHouse/ClickHouse/pull/55064) ([Aleksandr Musorin](https://github.com/AVMusorin)). -* Add external HTTP Basic authenticator. [#55199](https://github.com/ClickHouse/ClickHouse/pull/55199) ([Aleksei Filatov](https://github.com/aalexfvk)). -* Added function `byteSwap` which reverses the bytes of unsigned integers. This is particularly useful for reversing values of types which are represented as unsigned integers internally, such as IPv4. [#55211](https://github.com/ClickHouse/ClickHouse/pull/55211) ([Priyansh Agrawal](https://github.com/Priyansh121096)). -* Added function `formatQuery` which returns a formatted version (possibly spanning multiple lines) of a SQL query string. Also added function `formatQuerySingleLine` which does the same but the returned string will not contain linebreaks. [#55239](https://github.com/ClickHouse/ClickHouse/pull/55239) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Added `DWARF` input format that reads debug symbols from an ELF executable/library/object file. [#55450](https://github.com/ClickHouse/ClickHouse/pull/55450) ([Michael Kolupaev](https://github.com/al13n321)). -* Allow saving unparsed records and errors in the RabbitMQ, NATS and FileLog engines. Add virtual columns `_error` and `_raw_message` (for NATS and RabbitMQ), `_raw_record` (for FileLog) that are filled when ClickHouse fails to parse a new record. The behaviour is controlled by the storage settings `nats_handle_error_mode` for NATS, `rabbitmq_handle_error_mode` for RabbitMQ, `handle_error_mode` for FileLog, similar to `kafka_handle_error_mode`. If it's set to `default`, an exception will be thrown when ClickHouse fails to parse a record; if it's set to `stream`, the error and raw record will be saved into the virtual columns. Closes [#36035](https://github.com/ClickHouse/ClickHouse/issues/36035). [#55477](https://github.com/ClickHouse/ClickHouse/pull/55477) ([Kruglov Pavel](https://github.com/Avogar)). -* Keeper client improvement: add `get_all_children_number` command that returns the number of all children nodes under a specific path. 
[#55485](https://github.com/ClickHouse/ClickHouse/pull/55485) ([guoxiaolong](https://github.com/guoxiaolongzte)). -* Keeper client improvement: add `get_direct_children_number` command that returns number of direct children nodes under a path. [#55898](https://github.com/ClickHouse/ClickHouse/pull/55898) ([xuzifu666](https://github.com/xuzifu666)). -* Add statement `SHOW SETTING setting_name` which is a simpler version of existing statement `SHOW SETTINGS`. [#55979](https://github.com/ClickHouse/ClickHouse/pull/55979) ([Maksim Kita](https://github.com/kitaisreal)). -* Added fields `substreams` and `filenames` to the `system.parts_columns` table. [#55108](https://github.com/ClickHouse/ClickHouse/pull/55108) ([Anton Popov](https://github.com/CurtizJ)). -* Add support for `SHOW MERGES` query. [#55815](https://github.com/ClickHouse/ClickHouse/pull/55815) ([megao](https://github.com/jetgm)). -* Introduce a setting `create_table_empty_primary_key_by_default` for default `ORDER BY ()`. [#55899](https://github.com/ClickHouse/ClickHouse/pull/55899) ([Srikanth Chekuri](https://github.com/srikanthccv)). - -#### Performance Improvement -* Add option `query_plan_preserve_num_streams_after_window_functions` to preserve the number of streams after evaluating window functions to allow parallel stream processing. [#50771](https://github.com/ClickHouse/ClickHouse/pull/50771) ([frinkr](https://github.com/frinkr)). -* Release more streams if data is small. [#53867](https://github.com/ClickHouse/ClickHouse/pull/53867) ([Jiebin Sun](https://github.com/jiebinn)). -* RoaringBitmaps being optimized before serialization. [#55044](https://github.com/ClickHouse/ClickHouse/pull/55044) ([UnamedRus](https://github.com/UnamedRus)). -* Posting lists in inverted indexes are now optimized to use the smallest possible representation for internal bitmaps. Depending on the repetitiveness of the data, this may significantly reduce the space consumption of inverted indexes. [#55069](https://github.com/ClickHouse/ClickHouse/pull/55069) ([Harry Lee](https://github.com/HarryLeeIBM)). -* Fix contention on Context lock, this significantly improves performance for a lot of short-running concurrent queries. [#55121](https://github.com/ClickHouse/ClickHouse/pull/55121) ([Maksim Kita](https://github.com/kitaisreal)). -* Improved the performance of inverted index creation by 30%. This was achieved by replacing `std::unordered_map` with `absl::flat_hash_map`. [#55210](https://github.com/ClickHouse/ClickHouse/pull/55210) ([Harry Lee](https://github.com/HarryLeeIBM)). -* Support ORC filter push down (rowgroup level). [#55330](https://github.com/ClickHouse/ClickHouse/pull/55330) ([李扬](https://github.com/taiyang-li)). -* Improve performance of external aggregation with a lot of temporary files. [#55489](https://github.com/ClickHouse/ClickHouse/pull/55489) ([Maksim Kita](https://github.com/kitaisreal)). -* Set a reasonable size for the marks cache for secondary indices by default to avoid loading the marks over and over again. [#55654](https://github.com/ClickHouse/ClickHouse/pull/55654) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Avoid unnecessary reconstruction of index granules when reading skip indexes. This addresses [#55653](https://github.com/ClickHouse/ClickHouse/issues/55653#issuecomment-1763766009). [#55683](https://github.com/ClickHouse/ClickHouse/pull/55683) ([Amos Bird](https://github.com/amosbird)). 
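For the `jsonMergePatch` and `arrayRandomSample` entries above, a couple of hedged usage sketches; the inputs are made up and the exact result formatting may differ.

```sql
-- Merge two JSON object strings into a single JSON object string:
SELECT jsonMergePatch('{"a": 1, "b": 2}', '{"b": 3, "c": 4}') AS merged;

-- Take 3 random elements from an array (the sample changes from run to run):
SELECT arrayRandomSample(range(10), 3) AS sample;
```
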
-* Cache CAST function in set during execution to improve the performance of function `IN` when the set element type doesn't exactly match the column type. [#55712](https://github.com/ClickHouse/ClickHouse/pull/55712) ([Duc Canh Le](https://github.com/canhld94)). -* Performance improvement for `ColumnVector::insertMany` and `ColumnVector::insertManyFrom`. [#55714](https://github.com/ClickHouse/ClickHouse/pull/55714) ([frinkr](https://github.com/frinkr)). -* Optimized Map subscript operations by predicting the next row's key position and reducing the number of comparisons. [#55929](https://github.com/ClickHouse/ClickHouse/pull/55929) ([lgbo](https://github.com/lgbo-ustc)). -* Support struct fields pruning in Parquet (in previous versions it didn't work in some cases). [#56117](https://github.com/ClickHouse/ClickHouse/pull/56117) ([lgbo](https://github.com/lgbo-ustc)). -* Add the ability to tune the number of parallel replicas used in a query execution based on the estimation of rows to read. [#51692](https://github.com/ClickHouse/ClickHouse/pull/51692) ([Raúl Marín](https://github.com/Algunenano)). -* Optimized external aggregation memory consumption in the case when many temporary files were generated. [#54798](https://github.com/ClickHouse/ClickHouse/pull/54798) ([Nikita Taranov](https://github.com/nickitat)). -* Distributed queries executed in `async_socket_for_remote` mode (default) now respect the `max_threads` limit. Previously, some queries could create excessive threads (up to `max_distributed_connections`), causing server performance issues. [#53504](https://github.com/ClickHouse/ClickHouse/pull/53504) ([filimonov](https://github.com/filimonov)). -* Cache skippable entries while executing DDL from the ZooKeeper distributed DDL queue. [#54828](https://github.com/ClickHouse/ClickHouse/pull/54828) ([Duc Canh Le](https://github.com/canhld94)). -* Experimental inverted indexes do not store tokens with too many matches (i.e. row ids in the posting list). This saves space and avoids ineffective index lookups when sequential scans would be equally fast or faster. The previous heuristic (the `density` parameter passed to the index definition) that controlled when tokens would not be stored was too confusing for users. A much simpler heuristic based on the parameter `max_rows_per_postings_list` (default: 64k) is introduced, which directly controls the maximum allowed number of row ids in a postings list. [#55616](https://github.com/ClickHouse/ClickHouse/pull/55616) ([Harry Lee](https://github.com/HarryLeeIBM)). -* Improve write performance to `EmbeddedRocksDB` tables. [#55732](https://github.com/ClickHouse/ClickHouse/pull/55732) ([Duc Canh Le](https://github.com/canhld94)). -* Improved overall resilience for ClickHouse in the case of many parts within a partition (more than 1000). It might reduce the number of `TOO_MANY_PARTS` errors. [#55526](https://github.com/ClickHouse/ClickHouse/pull/55526) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Reduced memory consumption during loading of hierarchical dictionaries. [#55838](https://github.com/ClickHouse/ClickHouse/pull/55838) ([Nikita Taranov](https://github.com/nickitat)). -* All dictionaries support setting `dictionary_use_async_executor`. [#55839](https://github.com/ClickHouse/ClickHouse/pull/55839) ([vdimir](https://github.com/vdimir)). -* Prevent excessive memory usage when deserializing AggregateFunctionTopKGenericData. [#55947](https://github.com/ClickHouse/ClickHouse/pull/55947) ([Raúl Marín](https://github.com/Algunenano)). 
-* On a Keeper with lots of watches, AsyncMetrics threads can consume 100% of CPU for a noticeable time in `DB::KeeperStorage::getSessionsWithWatchesCount`. The fix is to avoid traversing heavy `watches` and `list_watches` sets. [#56054](https://github.com/ClickHouse/ClickHouse/pull/56054) ([Alexander Gololobov](https://github.com/davenger)). -* Add setting `optimize_trivial_approximate_count_query` to use `count` approximation for storage EmbeddedRocksDB. Enable trivial count for StorageJoin. [#55806](https://github.com/ClickHouse/ClickHouse/pull/55806) ([Duc Canh Le](https://github.com/canhld94)). - -#### Improvement -* Functions `toDayOfWeek` (MySQL alias: `DAYOFWEEK`), `toYearWeek` (`YEARWEEK`) and `toWeek` (`WEEK`) now support `String` arguments. This makes their behavior consistent with MySQL's behavior. [#55589](https://github.com/ClickHouse/ClickHouse/pull/55589) ([Robert Schulze](https://github.com/rschu1ze)). -* Introduced setting `date_time_overflow_behavior` with possible values `ignore`, `throw`, `saturate` that controls the overflow behavior when converting from Date, Date32, DateTime64, Integer or Float to Date, Date32, DateTime or DateTime64. [#55696](https://github.com/ClickHouse/ClickHouse/pull/55696) ([Andrey Zvonov](https://github.com/zvonand)). -* Implement query parameters support for `ALTER TABLE ... ACTION PARTITION [ID] {parameter_name:ParameterType}`. Merges [#49516](https://github.com/ClickHouse/ClickHouse/issues/49516). Closes [#49449](https://github.com/ClickHouse/ClickHouse/issues/49449). [#55604](https://github.com/ClickHouse/ClickHouse/pull/55604) ([alesapin](https://github.com/alesapin)). -* Print processor ids in a prettier manner in EXPLAIN. [#48852](https://github.com/ClickHouse/ClickHouse/pull/48852) ([Vlad Seliverstov](https://github.com/behebot)). -* Creating a direct dictionary with a lifetime field will be rejected at create time (as the lifetime does not make sense for direct dictionaries). Fixes: [#27861](https://github.com/ClickHouse/ClickHouse/issues/27861). [#49043](https://github.com/ClickHouse/ClickHouse/pull/49043) ([Rory Crispin](https://github.com/RoryCrispin)). -* Allow parameters in queries with partitions like `ALTER TABLE t DROP PARTITION`. Closes [#49449](https://github.com/ClickHouse/ClickHouse/issues/49449). [#49516](https://github.com/ClickHouse/ClickHouse/pull/49516) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add a new column `xid` to `system.zookeeper_connection`. [#50702](https://github.com/ClickHouse/ClickHouse/pull/50702) ([helifu](https://github.com/helifu)). -* Display the correct server settings in `system.server_settings` after configuration reload. [#53774](https://github.com/ClickHouse/ClickHouse/pull/53774) ([helifu](https://github.com/helifu)). -* Add support for mathematical minus `−` character in queries, similar to `-`. [#54100](https://github.com/ClickHouse/ClickHouse/pull/54100) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add replica groups to the experimental `Replicated` database engine. Closes [#53620](https://github.com/ClickHouse/ClickHouse/issues/53620). [#54421](https://github.com/ClickHouse/ClickHouse/pull/54421) ([Nikolay Degterinsky](https://github.com/evillique)). -* It is better to retry retriable S3 errors than to fail the query entirely. Set a bigger default value for `s3_retry_attempts`. [#54770](https://github.com/ClickHouse/ClickHouse/pull/54770) ([Sema Checherinda](https://github.com/CheSema)). -* Add load balancing mode `hostname_levenshtein_distance`. 
[#54826](https://github.com/ClickHouse/ClickHouse/pull/54826) ([JackyWoo](https://github.com/JackyWoo)). -* Improve hiding secrets in logs. [#55089](https://github.com/ClickHouse/ClickHouse/pull/55089) ([Vitaly Baranov](https://github.com/vitlibar)). -* For now the projection analysis will be performed only on top of query plan. The setting `query_plan_optimize_projection` became obsolete (it was enabled by default long time ago). [#55112](https://github.com/ClickHouse/ClickHouse/pull/55112) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* When function `untuple` is now called on a tuple with named elements and itself has an alias (e.g. `select untuple(tuple(1)::Tuple(element_alias Int)) AS untuple_alias`), then the result column name is now generated from the untuple alias and the tuple element alias (in the example: "untuple_alias.element_alias"). [#55123](https://github.com/ClickHouse/ClickHouse/pull/55123) ([garcher22](https://github.com/garcher22)). -* Added setting `describe_include_virtual_columns`, which allows to include virtual columns of table into result of `DESCRIBE` query. Added setting `describe_compact_output`. If it is set to `true`, `DESCRIBE` query returns only names and types of columns without extra information. [#55129](https://github.com/ClickHouse/ClickHouse/pull/55129) ([Anton Popov](https://github.com/CurtizJ)). -* Sometimes `OPTIMIZE` with `optimize_throw_if_noop=1` may fail with an error `unknown reason` while the real cause of it - different projections in different parts. This behavior is fixed. [#55130](https://github.com/ClickHouse/ClickHouse/pull/55130) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Allow to have several `MaterializedPostgreSQL` tables following the same Postgres table. By default this behaviour is not enabled (for compatibility, because it is a backward-incompatible change), but can be turned on with setting `materialized_postgresql_use_unique_replication_consumer_identifier`. Closes [#54918](https://github.com/ClickHouse/ClickHouse/issues/54918). [#55145](https://github.com/ClickHouse/ClickHouse/pull/55145) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Allow to parse negative `DateTime64` and `DateTime` with fractional part from short strings. [#55146](https://github.com/ClickHouse/ClickHouse/pull/55146) ([Andrey Zvonov](https://github.com/zvonand)). -* To improve compatibility with MySQL, 1. `information_schema.tables` now includes the new field `table_rows`, and 2. `information_schema.columns` now includes the new field `extra`. [#55215](https://github.com/ClickHouse/ClickHouse/pull/55215) ([Robert Schulze](https://github.com/rschu1ze)). -* Clickhouse-client won't show "0 rows in set" if it is zero and if exception was thrown. [#55240](https://github.com/ClickHouse/ClickHouse/pull/55240) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Support rename table without keyword `TABLE` like `RENAME db.t1 to db.t2`. [#55373](https://github.com/ClickHouse/ClickHouse/pull/55373) ([凌涛](https://github.com/lingtaolf)). -* Add `internal_replication` to `system.clusters`. [#55377](https://github.com/ClickHouse/ClickHouse/pull/55377) ([Konstantin Morozov](https://github.com/k-morozov)). -* Select remote proxy resolver based on request protocol, add proxy feature docs and remove `DB::ProxyConfiguration::Protocol::ANY`. [#55430](https://github.com/ClickHouse/ClickHouse/pull/55430) ([Arthur Passos](https://github.com/arthurpassos)). -* Avoid retrying keeper operations on INSERT after table shutdown. 
[#55519](https://github.com/ClickHouse/ClickHouse/pull/55519) ([Azat Khuzhin](https://github.com/azat)). -* `SHOW COLUMNS` now correctly reports type `FixedString` as `BLOB` if setting `use_mysql_types_in_show_columns` is on. Also added two new settings, `mysql_map_string_to_text_in_show_columns` and `mysql_map_fixed_string_to_text_in_show_columns`, to switch the output for types `String` and `FixedString` to `TEXT` or `BLOB`. [#55617](https://github.com/ClickHouse/ClickHouse/pull/55617) ([Serge Klochkov](https://github.com/slvrtrn)). -* During ReplicatedMergeTree table startup, the ClickHouse server checks the set of parts for unexpected parts (which exist locally, but not in ZooKeeper). All unexpected parts are moved to the detached directory, and instead of them the server tries to restore some ancestor (covered) parts. Now the server tries to restore the closest ancestors instead of random covered parts. [#55645](https://github.com/ClickHouse/ClickHouse/pull/55645) ([alesapin](https://github.com/alesapin)). -* The advanced dashboard now supports draggable charts on touch devices. This closes [#54206](https://github.com/ClickHouse/ClickHouse/issues/54206). [#55649](https://github.com/ClickHouse/ClickHouse/pull/55649) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Use the default query format, if declared, when outputting an exception with `http_write_exception_in_output_format`. [#55739](https://github.com/ClickHouse/ClickHouse/pull/55739) ([Raúl Marín](https://github.com/Algunenano)). -* Provide a better message for common MATERIALIZED VIEW pitfalls. [#55826](https://github.com/ClickHouse/ClickHouse/pull/55826) ([Raúl Marín](https://github.com/Algunenano)). -* If you dropped the current database, you will still be able to run some queries in `clickhouse-local` and switch to another database. This makes the behavior consistent with `clickhouse-client`. This closes [#55834](https://github.com/ClickHouse/ClickHouse/issues/55834). [#55853](https://github.com/ClickHouse/ClickHouse/pull/55853) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Functions `(add|subtract)(Year|Quarter|Month|Week|Day|Hour|Minute|Second|Millisecond|Microsecond|Nanosecond)` now support string-encoded date arguments, e.g. `SELECT addDays('2023-10-22', 1)`. This increases compatibility with MySQL and is needed by Tableau Online. [#55869](https://github.com/ClickHouse/ClickHouse/pull/55869) ([Robert Schulze](https://github.com/rschu1ze)). -* The setting `apply_deleted_mask`, when disabled, allows reading rows that were marked as deleted by lightweight DELETE queries. This is useful for debugging. [#55952](https://github.com/ClickHouse/ClickHouse/pull/55952) ([Alexander Gololobov](https://github.com/davenger)). -* Allow skipping `null` values when serializing Tuple to JSON objects, which makes it possible to keep compatibility with Spark's `to_json` function and is also useful for Gluten. [#55956](https://github.com/ClickHouse/ClickHouse/pull/55956) ([李扬](https://github.com/taiyang-li)). -* Functions `(add|sub)Date` now support string-encoded date arguments, e.g. `SELECT addDate('2023-10-22 11:12:13', INTERVAL 5 MINUTE)`. The same support for string-encoded date arguments is added to the plus and minus operators, e.g. `SELECT '2023-10-23' + INTERVAL 1 DAY`. This increases compatibility with MySQL and is needed by Tableau Online. [#55960](https://github.com/ClickHouse/ClickHouse/pull/55960) ([Robert Schulze](https://github.com/rschu1ze)). -* Allow unquoted strings with CR (`\r`) in CSV format. 
Closes [#39930](https://github.com/ClickHouse/ClickHouse/issues/39930). [#56046](https://github.com/ClickHouse/ClickHouse/pull/56046) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow to run `clickhouse-keeper` using embedded config. [#56086](https://github.com/ClickHouse/ClickHouse/pull/56086) ([Maksim Kita](https://github.com/kitaisreal)). -* Set limit of the maximum configuration value for `queued.min.messages` to avoid problem with start fetching data with Kafka. [#56121](https://github.com/ClickHouse/ClickHouse/pull/56121) ([Stas Morozov](https://github.com/r3b-fish)). -* Fixed a typo in SQL function `minSampleSizeContinous` (renamed `minSampleSizeContinuous`). Old name is preserved for backward compatibility. This closes: [#56139](https://github.com/ClickHouse/ClickHouse/issues/56139). [#56143](https://github.com/ClickHouse/ClickHouse/pull/56143) ([Dorota Szeremeta](https://github.com/orotaday)). -* Print path for broken parts on disk before shutting down the server. Before this change if a part is corrupted on disk and server cannot start, it was almost impossible to understand which part is broken. This is fixed. [#56181](https://github.com/ClickHouse/ClickHouse/pull/56181) ([Duc Canh Le](https://github.com/canhld94)). - -#### Build/Testing/Packaging Improvement -* If the database in Docker is already initialized, it doesn't need to be initialized again upon subsequent launches. This can potentially fix the issue of infinite container restarts when the database fails to load within 1000 attempts (relevant for very large databases and multi-node setups). [#50724](https://github.com/ClickHouse/ClickHouse/pull/50724) ([Alexander Nikolaev](https://github.com/AlexNik)). -* Resource with source code including submodules is built in Darwin special build task. It may be used to build ClickHouse without checking out the submodules. [#51435](https://github.com/ClickHouse/ClickHouse/pull/51435) ([Ilya Yatsishin](https://github.com/qoega)). -* An error was occuring when building ClickHouse with the AVX series of instructions enabled globally (which isn't recommended). The reason is that snappy does not enable `SNAPPY_HAVE_X86_CRC32`. [#55049](https://github.com/ClickHouse/ClickHouse/pull/55049) ([monchickey](https://github.com/monchickey)). -* Solve issue with launching standalone `clickhouse-keeper` from `clickhouse-server` package. [#55226](https://github.com/ClickHouse/ClickHouse/pull/55226) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* In the tests, RabbitMQ version is updated to 3.12.6. Improved logs collection for RabbitMQ tests. [#55424](https://github.com/ClickHouse/ClickHouse/pull/55424) ([Ilya Yatsishin](https://github.com/qoega)). -* Modified the error message difference between openssl and boringssl to fix the functional test. [#55975](https://github.com/ClickHouse/ClickHouse/pull/55975) ([MeenaRenganathan22](https://github.com/MeenaRenganathan22)). -* Use upstream repo for apache datasketches. [#55787](https://github.com/ClickHouse/ClickHouse/pull/55787) ([Nikita Taranov](https://github.com/nickitat)). - -#### Bug Fix (user-visible misbehavior in an official stable release) -* Skip hardlinking inverted index files in mutation [#47663](https://github.com/ClickHouse/ClickHouse/pull/47663) ([cangyin](https://github.com/cangyin)). -* Fixed bug of `match` function (regex) with pattern containing alternation produces incorrect key condition. Closes #53222. 
[#54696](https://github.com/ClickHouse/ClickHouse/pull/54696) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix 'Cannot find column' in read-in-order optimization with ARRAY JOIN [#51746](https://github.com/ClickHouse/ClickHouse/pull/51746) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Support missed experimental `Object(Nullable(json))` subcolumns in query. [#54052](https://github.com/ClickHouse/ClickHouse/pull/54052) ([zps](https://github.com/VanDarkholme7)). -* Re-add fix for `accurateCastOrNull` [#54629](https://github.com/ClickHouse/ClickHouse/pull/54629) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Fix detecting `DEFAULT` for columns of a Distributed table created without AS [#55060](https://github.com/ClickHouse/ClickHouse/pull/55060) ([Vitaly Baranov](https://github.com/vitlibar)). -* Proper cleanup in case of exception in ctor of ShellCommandSource [#55103](https://github.com/ClickHouse/ClickHouse/pull/55103) ([Alexander Gololobov](https://github.com/davenger)). -* Fix deadlock in LDAP assigned role update [#55119](https://github.com/ClickHouse/ClickHouse/pull/55119) ([Julian Maicher](https://github.com/jmaicher)). -* Suppress error statistics update for internal exceptions [#55128](https://github.com/ClickHouse/ClickHouse/pull/55128) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix deadlock in backups [#55132](https://github.com/ClickHouse/ClickHouse/pull/55132) ([alesapin](https://github.com/alesapin)). -* Fix storage Iceberg files retrieval [#55144](https://github.com/ClickHouse/ClickHouse/pull/55144) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix partition pruning of extra columns in set. [#55172](https://github.com/ClickHouse/ClickHouse/pull/55172) ([Amos Bird](https://github.com/amosbird)). -* Fix recalculation of skip indexes in ALTER UPDATE queries when table has adaptive granularity [#55202](https://github.com/ClickHouse/ClickHouse/pull/55202) ([Duc Canh Le](https://github.com/canhld94)). -* Fix for background download in fs cache [#55252](https://github.com/ClickHouse/ClickHouse/pull/55252) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Avoid possible memory leaks in compressors in case of missing buffer finalization [#55262](https://github.com/ClickHouse/ClickHouse/pull/55262) ([Azat Khuzhin](https://github.com/azat)). -* Fix functions execution over sparse columns [#55275](https://github.com/ClickHouse/ClickHouse/pull/55275) ([Azat Khuzhin](https://github.com/azat)). -* Fix incorrect merging of Nested for SELECT FINAL FROM SummingMergeTree [#55276](https://github.com/ClickHouse/ClickHouse/pull/55276) ([Azat Khuzhin](https://github.com/azat)). -* Fix bug with inability to drop detached partition in replicated merge tree on top of S3 without zero copy [#55309](https://github.com/ClickHouse/ClickHouse/pull/55309) ([alesapin](https://github.com/alesapin)). -* Fix a crash in MergeSortingPartialResultTransform (due to zero chunks after `remerge`) [#55335](https://github.com/ClickHouse/ClickHouse/pull/55335) ([Azat Khuzhin](https://github.com/azat)). -* Fix data-race in CreatingSetsTransform (on errors) due to throwing shared exception [#55338](https://github.com/ClickHouse/ClickHouse/pull/55338) ([Azat Khuzhin](https://github.com/azat)). -* Fix trash optimization (up to a certain extent) [#55353](https://github.com/ClickHouse/ClickHouse/pull/55353) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
-* Fix leak in StorageHDFS [#55370](https://github.com/ClickHouse/ClickHouse/pull/55370) ([Azat Khuzhin](https://github.com/azat)). -* Fix parsing of arrays in cast operator [#55417](https://github.com/ClickHouse/ClickHouse/pull/55417) ([Anton Popov](https://github.com/CurtizJ)). -* Fix filtering by virtual columns with OR filter in query [#55418](https://github.com/ClickHouse/ClickHouse/pull/55418) ([Azat Khuzhin](https://github.com/azat)). -* Fix MongoDB connection issues [#55419](https://github.com/ClickHouse/ClickHouse/pull/55419) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix MySQL interface boolean representation [#55427](https://github.com/ClickHouse/ClickHouse/pull/55427) ([Serge Klochkov](https://github.com/slvrtrn)). -* Fix MySQL text protocol DateTime formatting and LowCardinality(Nullable(T)) types reporting [#55479](https://github.com/ClickHouse/ClickHouse/pull/55479) ([Serge Klochkov](https://github.com/slvrtrn)). -* Make `use_mysql_types_in_show_columns` affect only `SHOW COLUMNS` [#55481](https://github.com/ClickHouse/ClickHouse/pull/55481) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix stack symbolizer parsing `DW_FORM_ref_addr` incorrectly and sometimes crashing [#55483](https://github.com/ClickHouse/ClickHouse/pull/55483) ([Michael Kolupaev](https://github.com/al13n321)). -* Destroy fiber in case of exception in cancelBefore in AsyncTaskExecutor [#55516](https://github.com/ClickHouse/ClickHouse/pull/55516) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix Query Parameters not working with custom HTTP handlers [#55521](https://github.com/ClickHouse/ClickHouse/pull/55521) ([Konstantin Bogdanov](https://github.com/thevar1able)). -* Fix checking of non handled data for Values format [#55527](https://github.com/ClickHouse/ClickHouse/pull/55527) ([Azat Khuzhin](https://github.com/azat)). -* Fix 'Invalid cursor state' in odbc interacting with MS SQL Server [#55558](https://github.com/ClickHouse/ClickHouse/pull/55558) ([vdimir](https://github.com/vdimir)). -* Fix max execution time and 'break' overflow mode [#55577](https://github.com/ClickHouse/ClickHouse/pull/55577) ([Alexander Gololobov](https://github.com/davenger)). -* Fix crash in QueryNormalizer with cyclic aliases [#55602](https://github.com/ClickHouse/ClickHouse/pull/55602) ([vdimir](https://github.com/vdimir)). -* Disable wrong optimization and add a test [#55609](https://github.com/ClickHouse/ClickHouse/pull/55609) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Merging [#52352](https://github.com/ClickHouse/ClickHouse/issues/52352) [#55621](https://github.com/ClickHouse/ClickHouse/pull/55621) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add a test to avoid incorrect decimal sorting [#55662](https://github.com/ClickHouse/ClickHouse/pull/55662) ([Amos Bird](https://github.com/amosbird)). -* Fix progress bar for s3 and azure Cluster functions with url without globs [#55666](https://github.com/ClickHouse/ClickHouse/pull/55666) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix filtering by virtual columns with OR filter in query (resubmit) [#55678](https://github.com/ClickHouse/ClickHouse/pull/55678) ([Azat Khuzhin](https://github.com/azat)). -* Fixes and improvements for Iceberg storage [#55695](https://github.com/ClickHouse/ClickHouse/pull/55695) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix data race in CreatingSetsTransform (v2) [#55786](https://github.com/ClickHouse/ClickHouse/pull/55786) ([Azat Khuzhin](https://github.com/azat)). 
-* Throw exception when parsing illegal string as float if precise_float_parsing is true [#55861](https://github.com/ClickHouse/ClickHouse/pull/55861) ([李扬](https://github.com/taiyang-li)). -* Disable predicate pushdown if the CTE contains stateful functions [#55871](https://github.com/ClickHouse/ClickHouse/pull/55871) ([Raúl Marín](https://github.com/Algunenano)). -* Fix normalize ASTSelectWithUnionQuery, as it was stripping `FORMAT` from the query [#55887](https://github.com/ClickHouse/ClickHouse/pull/55887) ([flynn](https://github.com/ucasfl)). -* Try to fix possible segfault in Native ORC input format [#55891](https://github.com/ClickHouse/ClickHouse/pull/55891) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix window functions in case of sparse columns. [#55895](https://github.com/ClickHouse/ClickHouse/pull/55895) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). -* fix: StorageNull supports subcolumns [#55912](https://github.com/ClickHouse/ClickHouse/pull/55912) ([FFish](https://github.com/wxybear)). -* Do not write retriable errors for Replicated mutate/merge into error log [#55944](https://github.com/ClickHouse/ClickHouse/pull/55944) ([Azat Khuzhin](https://github.com/azat)). -* Fix `SHOW DATABASES LIMIT ` [#55962](https://github.com/ClickHouse/ClickHouse/pull/55962) ([Raúl Marín](https://github.com/Algunenano)). -* Fix autogenerated Protobuf schema with fields with underscore [#55974](https://github.com/ClickHouse/ClickHouse/pull/55974) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix dateTime64ToSnowflake64() with non-default scale [#55983](https://github.com/ClickHouse/ClickHouse/pull/55983) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix output/input of Arrow dictionary column [#55989](https://github.com/ClickHouse/ClickHouse/pull/55989) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix fetching schema from schema registry in AvroConfluent [#55991](https://github.com/ClickHouse/ClickHouse/pull/55991) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix 'Block structure mismatch' on concurrent ALTER and INSERTs in Buffer table [#55995](https://github.com/ClickHouse/ClickHouse/pull/55995) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix incorrect free space accounting for least_used JBOD policy [#56030](https://github.com/ClickHouse/ClickHouse/pull/56030) ([Azat Khuzhin](https://github.com/azat)). -* Fix missing scalar issue when evaluating subqueries inside table functions [#56057](https://github.com/ClickHouse/ClickHouse/pull/56057) ([Amos Bird](https://github.com/amosbird)). -* Fix wrong query result when http_write_exception_in_output_format=1 [#56135](https://github.com/ClickHouse/ClickHouse/pull/56135) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix schema cache for fallback JSON->JSONEachRow with changed settings [#56172](https://github.com/ClickHouse/ClickHouse/pull/56172) ([Kruglov Pavel](https://github.com/Avogar)). -* Add error handler to odbc-bridge [#56185](https://github.com/ClickHouse/ClickHouse/pull/56185) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). - - -### ClickHouse release 23.9, 2023-09-28 - -#### Backward Incompatible Change -* Remove the `status_info` configuration option and dictionaries status from the default Prometheus handler. [#54090](https://github.com/ClickHouse/ClickHouse/pull/54090) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The experimental parts metadata cache is removed from the codebase. 
[#54215](https://github.com/ClickHouse/ClickHouse/pull/54215) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Disable setting `input_format_json_try_infer_numbers_from_strings` by default, so we don't try to infer numbers from strings in JSON formats, to avoid possible parsing errors when sample data contains strings that look like a number. [#55099](https://github.com/ClickHouse/ClickHouse/pull/55099) ([Kruglov Pavel](https://github.com/Avogar)). - -#### New Feature -* Improve schema inference from JSON formats: 1) Now it's possible to infer named Tuples from JSON objects without the experimental JSON type under the setting `input_format_json_try_infer_named_tuples_from_objects` in JSON formats. Previously, without the experimental JSON type, we could only infer JSON objects as Strings or Maps; now we can infer named Tuples. The resulting Tuple type will contain all keys of objects that were read in the data sample during schema inference. It can be useful for reading structured JSON data without sparse objects. The setting is enabled by default. 2) Allow parsing a JSON array into a column with type String under the setting `input_format_json_read_arrays_as_strings`. It can help with reading arrays with values of different types. 3) Allow using type String for JSON keys with unknown types (`null`/`[]`/`{}`) in sample data under the setting `input_format_json_infer_incomplete_types_as_strings`. Now in JSON formats we can read any value into a String column, and we can avoid the error `Cannot determine type for column 'column_name' by first 25000 rows of data, most likely this column contains only Nulls or empty Arrays/Maps` during schema inference by using type String for unknown types, so the data will be read successfully (see the sketch below). [#54427](https://github.com/ClickHouse/ClickHouse/pull/54427) ([Kruglov Pavel](https://github.com/Avogar)). -* Added IO scheduling support for remote disks. Storage configuration for disk types `s3`, `s3_plain`, `hdfs` and `azure_blob_storage` can now contain `read_resource` and `write_resource` elements holding resource names. Scheduling policies for these resources can be configured in a separate server configuration section `resources`. Queries can be marked using setting `workload` and classified using server configuration section `workload_classifiers` to achieve diverse resource scheduling goals. More details in [the docs](https://clickhouse.com/docs/en/operations/workload-scheduling). [#47009](https://github.com/ClickHouse/ClickHouse/pull/47009) ([Sergei Trifonov](https://github.com/serxa)). Added "bandwidth_limit" IO scheduling node type. It allows you to specify `max_speed` and `max_burst` constraints on traffic passing through this node. [#54618](https://github.com/ClickHouse/ClickHouse/pull/54618) ([Sergei Trifonov](https://github.com/serxa)). -* Added a new type of authentication based on SSH keys. It works only for the native TCP protocol. [#41109](https://github.com/ClickHouse/ClickHouse/pull/41109) ([George Gamezardashvili](https://github.com/InfJoker)). -* Added a new column `_block_number` for MergeTree tables. [#44532](https://github.com/ClickHouse/ClickHouse/issues/44532). [#47532](https://github.com/ClickHouse/ClickHouse/pull/47532) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Add `IF EMPTY` clause for `DROP TABLE` queries. [#48915](https://github.com/ClickHouse/ClickHouse/pull/48915) ([Pavel Novitskiy](https://github.com/pnovitskiy)). 
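A hedged sketch of the named-Tuple inference described in the schema-inference entry above; the `format` table function and the sample row are used here only for illustration, and the exact inferred type may differ.

```sql
SET input_format_json_try_infer_named_tuples_from_objects = 1;  -- enabled by default per the entry above

DESCRIBE format(JSONEachRow, '{"obj" : {"a" : 42, "b" : "text"}}');
-- expected to infer something like: obj  Tuple(a Nullable(Int64), b Nullable(String))
```
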
-* SQL functions `toString(datetime, timezone)` and `formatDateTime(datetime, format, timezone)` now support non-constant timezone arguments. [#53680](https://github.com/ClickHouse/ClickHouse/pull/53680) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Add support for `ALTER TABLE MODIFY COMMENT`. Note: something similar was added by an external contributor a long time ago, but the feature did not work at all and only confused users. This closes [#36377](https://github.com/ClickHouse/ClickHouse/issues/36377). [#51304](https://github.com/ClickHouse/ClickHouse/pull/51304) ([Alexey Milovidov](https://github.com/alexey-milovidov)). Note: this command does not propagate between replicas, so the replicas of a table could have different comments. -* Added `GCD` a.k.a. "greatest common divisor" as a new data compression codec. The codec computes the GCD of all column values, and then divides each value by the GCD. The GCD codec is a data preparation codec (similar to Delta and DoubleDelta) and cannot be used stand-alone. It works with integer, decimal and date/time data types. A viable use case for the GCD codec is column values that change (increase/decrease) in multiples of the GCD, e.g. 24 - 28 - 16 - 24 - 8 - 24 (assuming GCD = 4). [#53149](https://github.com/ClickHouse/ClickHouse/pull/53149) ([Alexander Nam](https://github.com/seshWCS)). -* Two new type aliases `DECIMAL(P)` (as a shortcut for `DECIMAL(P, 0)`) and `DECIMAL` (as a shortcut for `DECIMAL(10, 0)`) were added. This makes ClickHouse more compatible with MySQL's SQL dialect. [#53328](https://github.com/ClickHouse/ClickHouse/pull/53328) ([Val Doroshchuk](https://github.com/valbok)). -* Added a new system log table `backup_log` to track all `BACKUP` and `RESTORE` operations. [#53638](https://github.com/ClickHouse/ClickHouse/pull/53638) ([Victor Krasnov](https://github.com/sirvickr)). -* Added a format setting `output_format_markdown_escape_special_characters` (default: false). The setting controls whether special characters like `!`, `#`, `$` etc. are escaped (i.e. prefixed by a backslash) in the `Markdown` output format. [#53860](https://github.com/ClickHouse/ClickHouse/pull/53860) ([irenjj](https://github.com/irenjj)). -* Add function `decodeHTMLComponent`. [#54097](https://github.com/ClickHouse/ClickHouse/pull/54097) ([Bharat Nallan](https://github.com/bharatnc)). -* Added `peak_threads_usage` to the `query_log` table. [#54335](https://github.com/ClickHouse/ClickHouse/pull/54335) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* Add `SHOW FUNCTIONS` support to clickhouse-client. [#54337](https://github.com/ClickHouse/ClickHouse/pull/54337) ([Julia Kartseva](https://github.com/wat-ze-hex)). -* Added function `toDaysSinceYearZero` with alias `TO_DAYS` (for compatibility with MySQL) which returns the number of days passed since `0001-01-01` (in the Proleptic Gregorian Calendar). [#54479](https://github.com/ClickHouse/ClickHouse/pull/54479) ([Robert Schulze](https://github.com/rschu1ze)). Function `toDaysSinceYearZero` now supports arguments of type `DateTime` and `DateTime64`. [#54856](https://github.com/ClickHouse/ClickHouse/pull/54856) ([Serge Klochkov](https://github.com/slvrtrn)). -* Added functions `YYYYMMDDtoDate`, `YYYYMMDDtoDate32`, `YYYYMMDDhhmmssToDateTime` and `YYYYMMDDhhmmssToDateTime64`. They convert a date or date with time encoded as an integer (e.g. 20230911) into a native date or date with time.
As such, they provide the opposite functionality of the existing functions `toYYYYMMDD` and `toYYYYMMDDhhmmss`. [#54509](https://github.com/ClickHouse/ClickHouse/pull/54509) ([Quanfa Fu](https://github.com/dentiscalprum)) ([Robert Schulze](https://github.com/rschu1ze)). -* Add several string distance functions, including `byteHammingDistance`, `editDistance`. [#54935](https://github.com/ClickHouse/ClickHouse/pull/54935) ([flynn](https://github.com/ucasfl)). -* Allow specifying the expiration date and, optionally, the time for user credentials with the `VALID UNTIL datetime` clause. [#51261](https://github.com/ClickHouse/ClickHouse/pull/51261) ([Nikolay Degterinsky](https://github.com/evillique)). -* Allow S3-style URLs for table functions `s3`, `gcs`, `oss`. The URL is automatically converted to HTTP. Example: `'s3://clickhouse-public-datasets/hits.csv'` is converted to `'https://clickhouse-public-datasets.s3.amazonaws.com/hits.csv'`. [#54931](https://github.com/ClickHouse/ClickHouse/pull/54931) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Add new setting `print_pretty_type_names` to print pretty deeply nested types like Tuples/Maps/Arrays. [#55095](https://github.com/ClickHouse/ClickHouse/pull/55095) ([Kruglov Pavel](https://github.com/Avogar)). - -#### Performance Improvement -* Speed up reading from S3 by enabling prefetches by default. [#53709](https://github.com/ClickHouse/ClickHouse/pull/53709) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Do not implicitly read PK and version columns in lonely parts if unnecessary for queries with FINAL. [#53919](https://github.com/ClickHouse/ClickHouse/pull/53919) ([Duc Canh Le](https://github.com/canhld94)). -* Optimize GROUP BY with constant keys. Will optimize queries with group by `_file/_path` after https://github.com/ClickHouse/ClickHouse/pull/53529. [#53549](https://github.com/ClickHouse/ClickHouse/pull/53549) ([Kruglov Pavel](https://github.com/Avogar)). -* Improve performance of sorting for `Decimal` columns. Improve performance of insertion into `MergeTree` if ORDER BY contains a `Decimal` column. Improve performance of sorting when data is already sorted or almost sorted. [#35961](https://github.com/ClickHouse/ClickHouse/pull/35961) ([Maksim Kita](https://github.com/kitaisreal)). -* Improve performance of huge query analysis. Fixes [#51224](https://github.com/ClickHouse/ClickHouse/issues/51224). [#51469](https://github.com/ClickHouse/ClickHouse/pull/51469) ([frinkr](https://github.com/frinkr)). -* An optimization to rewrite `COUNT(DISTINCT ...)` and various `uniq` variants to `count` if it is selected from a subquery with GROUP BY. [#52082](https://github.com/ClickHouse/ClickHouse/pull/52082) [#52645](https://github.com/ClickHouse/ClickHouse/pull/52645) ([JackyWoo](https://github.com/JackyWoo)). -* Remove manual calls to `mmap/mremap/munmap` and delegate all this work to `jemalloc` - and it slightly improves performance. [#52792](https://github.com/ClickHouse/ClickHouse/pull/52792) ([Nikita Taranov](https://github.com/nickitat)). -* Fixed high CPU consumption when working with NATS. [#54399](https://github.com/ClickHouse/ClickHouse/pull/54399) ([Vasilev Pyotr](https://github.com/vahpetr)). -* Since we use separate instructions for executing `toString` with a datetime argument, it is possible to improve performance a bit for non-datetime arguments and have some parts of the code cleaner. Follows up [#53680](https://github.com/ClickHouse/ClickHouse/issues/53680).
[#54443](https://github.com/ClickHouse/ClickHouse/pull/54443) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Instead of serializing JSON elements into a `std::stringstream`, this PR tries to put the serialization result into `ColumnString` directly. [#54613](https://github.com/ClickHouse/ClickHouse/pull/54613) ([lgbo](https://github.com/lgbo-ustc)). -* Enable ORDER BY optimization for reading data in corresponding order from a MergeTree table in case the table is behind a view. [#54628](https://github.com/ClickHouse/ClickHouse/pull/54628) ([Vitaly Baranov](https://github.com/vitlibar)). -* Improve JSON SQL functions by reusing `GeneratorJSONPath` and removing several shared pointers. [#54735](https://github.com/ClickHouse/ClickHouse/pull/54735) ([lgbo](https://github.com/lgbo-ustc)). -* Keeper tries to batch flush requests for better performance. [#53049](https://github.com/ClickHouse/ClickHouse/pull/53049) ([Antonio Andelic](https://github.com/antonio2368)). -* Now `clickhouse-client` processes files in parallel in case of `INFILE 'glob_expression'` (see the example below). Closes [#54218](https://github.com/ClickHouse/ClickHouse/issues/54218). [#54533](https://github.com/ClickHouse/ClickHouse/pull/54533) ([Max K.](https://github.com/mkaynov)). -* Allow using the primary key for the IN function when the primary key column types are different from the column types on the right side of `IN`. Example: `SELECT id FROM test_table WHERE id IN (SELECT '5')`. Closes [#48936](https://github.com/ClickHouse/ClickHouse/issues/48936). [#54544](https://github.com/ClickHouse/ClickHouse/pull/54544) ([Maksim Kita](https://github.com/kitaisreal)). -* Hash JOIN tries to shrink internal buffers consuming half of the maximal available memory (set by `max_bytes_in_join`). [#54584](https://github.com/ClickHouse/ClickHouse/pull/54584) ([vdimir](https://github.com/vdimir)). -* Respect `max_block_size` for array join to avoid possible OOM. Closes [#54290](https://github.com/ClickHouse/ClickHouse/issues/54290). [#54664](https://github.com/ClickHouse/ClickHouse/pull/54664) ([李扬](https://github.com/taiyang-li)). -* Reuse HTTP connections in the `s3` table function. [#54812](https://github.com/ClickHouse/ClickHouse/pull/54812) ([Michael Kolupaev](https://github.com/al13n321)). -* Replace the linear search in `MergeTreeRangeReader::Stream::ceilRowsToCompleteGranules` with a binary search. [#54869](https://github.com/ClickHouse/ClickHouse/pull/54869) ([usurai](https://github.com/usurai)). - -#### Experimental Feature -* The creation of `Annoy` indexes can now be parallelized using setting `max_threads_for_annoy_index_creation`. [#54047](https://github.com/ClickHouse/ClickHouse/pull/54047) ([Robert Schulze](https://github.com/rschu1ze)). -* Parallel replicas over distributed tables don't read from all replicas. [#54199](https://github.com/ClickHouse/ClickHouse/pull/54199) ([Igor Nikonov](https://github.com/devcrafter)). - -#### Improvement -* Allow replacing long names of files of columns in `MergeTree` data parts with hashes of names. It helps to avoid the `File name too long` error in some cases. [#50612](https://github.com/ClickHouse/ClickHouse/pull/50612) ([Anton Popov](https://github.com/CurtizJ)). -* Parse data in the `JSON` format as `JSONEachRow` if parsing the metadata failed. It allows reading files with the `.json` extension even if the real format is JSONEachRow. Closes [#45740](https://github.com/ClickHouse/ClickHouse/issues/45740). [#54405](https://github.com/ClickHouse/ClickHouse/pull/54405) ([Kruglov Pavel](https://github.com/Avogar)).
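As a quick, hedged illustration of the parallel `INFILE` glob handling noted in the performance list above (file and table names are hypothetical; run it from `clickhouse-client`):

```sql
-- All files matching the glob are read; the client now processes them in parallel.
INSERT INTO hits FROM INFILE 'chunk_*.csv' FORMAT CSV;
```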
-* Output valid JSON/XML on exception during HTTP query execution. Add setting `http_write_exception_in_output_format` to enable/disable this behaviour (enabled by default). [#52853](https://github.com/ClickHouse/ClickHouse/pull/52853) ([Kruglov Pavel](https://github.com/Avogar)). -* View `information_schema.tables` now has a new field `data_length` which shows the approximate size of the data on disk. Required to run queries generated by Amazon QuickSight. [#55037](https://github.com/ClickHouse/ClickHouse/pull/55037) ([Robert Schulze](https://github.com/rschu1ze)). -* The MySQL interface gained a minimal implementation of prepared statements, just enough to allow a connection from Tableau Online to ClickHouse via the MySQL connector. [#54115](https://github.com/ClickHouse/ClickHouse/pull/54115) ([Serge Klochkov](https://github.com/slvrtrn)). Please note: the prepared statements implementation is pretty minimal; we do not support argument binding yet, as it is not required in this particular Tableau Online use case. It will be implemented as a follow-up if necessary after extensive testing of Tableau Online in case we discover issues. -* Support case-insensitive and dot-all matching modes in `regexp_tree` dictionaries. [#50906](https://github.com/ClickHouse/ClickHouse/pull/50906) ([Johann Gan](https://github.com/johanngan)). -* Keeper improvement: Add a `createIfNotExists` Keeper command. [#48855](https://github.com/ClickHouse/ClickHouse/pull/48855) ([Konstantin Bogdanov](https://github.com/thevar1able)). -* More precise integer type inference, fix [#51236](https://github.com/ClickHouse/ClickHouse/issues/51236). [#53003](https://github.com/ClickHouse/ClickHouse/pull/53003) ([Chen768959](https://github.com/Chen768959)). -* Introduced resolving of charsets in the string literals for MaterializedMySQL. [#53220](https://github.com/ClickHouse/ClickHouse/pull/53220) ([Val Doroshchuk](https://github.com/valbok)). -* Fix a subtle issue with a rarely used `EmbeddedRocksDB` table engine in an extremely rare scenario: sometimes the `EmbeddedRocksDB` table engine does not close files correctly in NFS after running `DROP TABLE`. [#53502](https://github.com/ClickHouse/ClickHouse/pull/53502) ([Mingliang Pan](https://github.com/liangliangpan)). -* `RESTORE TABLE ON CLUSTER` must create replicated tables with a matching UUID on hosts. Otherwise the macro `{uuid}` in the ZooKeeper path can't work correctly after RESTORE. This PR implements that. [#53765](https://github.com/ClickHouse/ClickHouse/pull/53765) ([Vitaly Baranov](https://github.com/vitlibar)). -* Added restore setting `restore_broken_parts_as_detached` (see the sketch below): if it's true, the RESTORE process won't stop on broken parts while restoring; instead, all the broken parts will be copied to the `detached` folder with the prefix `broken-from-backup`. If it's false, the RESTORE process will stop on the first broken part (if any). The default value is false. [#53877](https://github.com/ClickHouse/ClickHouse/pull/53877) ([Vitaly Baranov](https://github.com/vitlibar)). -* Add `elapsed_ns` field to HTTP headers X-ClickHouse-Progress and X-ClickHouse-Summary. [#54179](https://github.com/ClickHouse/ClickHouse/pull/54179) ([joelynch](https://github.com/joelynch)). -* Implementation of `reconfig` (https://github.com/ClickHouse/ClickHouse/pull/49450), `sync`, and `exists` commands for keeper-client. [#54201](https://github.com/ClickHouse/ClickHouse/pull/54201) ([pufit](https://github.com/pufit)).
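A rough sketch of the `restore_broken_parts_as_detached` setting described above; the backup disk name, backup path, and table name are assumptions for the example:

```sql
-- Broken parts, if any, are copied to `detached` with a `broken-from-backup` prefix
-- instead of aborting the whole restore.
RESTORE TABLE db.events FROM Disk('backups', 'nightly/')
SETTINGS restore_broken_parts_as_detached = true;
```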
-* `clickhouse-local` and `clickhouse-client` now allow specifying the `--query` parameter multiple times, e.g. `./clickhouse-client --query "SELECT 1" --query "SELECT 2"`. This syntax is slightly more intuitive than `./clickhouse-client --multiquery "SELECT 1; SELECT 2"`, a bit easier to script (e.g. `queries.push_back('--query "$q"')`) and more consistent with the behavior of the existing parameter `--queries-file` (e.g. `./clickhouse client --queries-file queries1.sql --queries-file queries2.sql`). [#54249](https://github.com/ClickHouse/ClickHouse/pull/54249) ([Robert Schulze](https://github.com/rschu1ze)). -* Add sub-second precision to `formatReadableTimeDelta`. [#54250](https://github.com/ClickHouse/ClickHouse/pull/54250) ([Andrey Zvonov](https://github.com/zvonand)). -* Enable `allow_remove_stale_moving_parts` by default. [#54260](https://github.com/ClickHouse/ClickHouse/pull/54260) ([vdimir](https://github.com/vdimir)). -* Fix using count from cache and improve progress bar for reading from archives. [#54271](https://github.com/ClickHouse/ClickHouse/pull/54271) ([Kruglov Pavel](https://github.com/Avogar)). -* Add support for S3 credentials using SSO. To define a profile to be used with SSO, set the `AWS_PROFILE` environment variable. [#54347](https://github.com/ClickHouse/ClickHouse/pull/54347) ([Antonio Andelic](https://github.com/antonio2368)). -* Support NULL as default for nested types Array/Tuple/Map for input formats. Closes [#51100](https://github.com/ClickHouse/ClickHouse/issues/51100). [#54351](https://github.com/ClickHouse/ClickHouse/pull/54351) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow reading some unusual configurations of chunks from Arrow/Parquet formats. [#54370](https://github.com/ClickHouse/ClickHouse/pull/54370) ([Arthur Passos](https://github.com/arthurpassos)). -* Add `STD` alias to the `stddevPop` function for MySQL compatibility. Closes [#54274](https://github.com/ClickHouse/ClickHouse/issues/54274). [#54382](https://github.com/ClickHouse/ClickHouse/pull/54382) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add `addDate` function for compatibility with MySQL and `subDate` for consistency. Reference [#54275](https://github.com/ClickHouse/ClickHouse/issues/54275). [#54400](https://github.com/ClickHouse/ClickHouse/pull/54400) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add `modification_time` into `system.detached_parts`. [#54506](https://github.com/ClickHouse/ClickHouse/pull/54506) ([Azat Khuzhin](https://github.com/azat)). -* Added a setting `splitby_max_substrings_includes_remaining_string` which controls whether functions `splitBy*()` with argument `max_substring` > 0 include the remaining string (if any) in the result array (Python/Spark semantics) or not. The default behavior does not change. [#54518](https://github.com/ClickHouse/ClickHouse/pull/54518) ([Robert Schulze](https://github.com/rschu1ze)). -* Better integer type inference for `Int64`/`UInt64` fields. Continuation of [#53003](https://github.com/ClickHouse/ClickHouse/pull/53003). Now it also works for nested types like Arrays of Arrays and for functions like `map/tuple`. Issue: [#51236](https://github.com/ClickHouse/ClickHouse/issues/51236). [#54553](https://github.com/ClickHouse/ClickHouse/pull/54553) ([Kruglov Pavel](https://github.com/Avogar)). -* Added array operations for multiplying, dividing and modulo on a scalar. They work both ways, for example `5 * [5, 5]` and `[5, 5] * 5` - both cases are possible.
[#54608](https://github.com/ClickHouse/ClickHouse/pull/54608) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Add optional `version` argument to `rm` command in `keeper-client` to support safer deletes. [#54708](https://github.com/ClickHouse/ClickHouse/pull/54708) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). -* Disable killing the server by systemd (that may lead to data loss when using Buffer tables). [#54744](https://github.com/ClickHouse/ClickHouse/pull/54744) ([Azat Khuzhin](https://github.com/azat)). -* Added field `is_deterministic` to system table `system.functions` which indicates whether the result of a function is stable between two invocations (given exactly the same inputs) or not. [#54766](https://github.com/ClickHouse/ClickHouse/pull/54766) [#55035](https://github.com/ClickHouse/ClickHouse/pull/55035) ([Robert Schulze](https://github.com/rschu1ze)). -* Made the views in schema `information_schema` more compatible with the equivalent views in MySQL (i.e. modified and extended them) up to a point where Tableau Online is able to connect to ClickHouse. More specifically: 1. The type of field `information_schema.tables.table_type` changed from Enum8 to String. 2. Added fields `table_comment` and `table_collation` to view `information_schema.table`. 3. Added views `information_schema.key_column_usage` and `referential_constraints`. 4. Replaced uppercase aliases in `information_schema` views with concrete uppercase columns. [#54773](https://github.com/ClickHouse/ClickHouse/pull/54773) ([Serge Klochkov](https://github.com/slvrtrn)). -* The query cache now returns an error if the user tries to cache the result of a query with a non-deterministic function such as `now`, `randomString` and `dictGet`. Compared to the previous behavior (silently don't cache the result), this reduces confusion and surprise for users. [#54801](https://github.com/ClickHouse/ClickHouse/pull/54801) ([Robert Schulze](https://github.com/rschu1ze)). -* Forbid special columns like materialized/ephemeral/alias for `file`/`s3`/`url`/... storages, fix insert into ephemeral columns from files. Closes [#53477](https://github.com/ClickHouse/ClickHouse/issues/53477). [#54803](https://github.com/ClickHouse/ClickHouse/pull/54803) ([Kruglov Pavel](https://github.com/Avogar)). -* More configurable collecting metadata for backup. [#54804](https://github.com/ClickHouse/ClickHouse/pull/54804) ([Vitaly Baranov](https://github.com/vitlibar)). -* `clickhouse-local`'s log file (if enabled with --server_logs_file flag) will now prefix each line with timestamp, thread id, etc, just like `clickhouse-server`. [#54807](https://github.com/ClickHouse/ClickHouse/pull/54807) ([Michael Kolupaev](https://github.com/al13n321)). -* Field `is_obsolete` in the `system.merge_tree_settings` table - it is now 1 for obsolete merge tree settings. Previously, only the description indicated that the setting is obsolete. [#54837](https://github.com/ClickHouse/ClickHouse/pull/54837) ([Robert Schulze](https://github.com/rschu1ze)). -* Make it possible to use plural when using interval literals. `INTERVAL 2 HOURS` should be equivalent to `INTERVAL 2 HOUR`. [#54860](https://github.com/ClickHouse/ClickHouse/pull/54860) ([Jordi Villar](https://github.com/jrdi)). -* Always allow the creation of a projection with `Nullable` PK. This fixes [#54814](https://github.com/ClickHouse/ClickHouse/issues/54814). [#54895](https://github.com/ClickHouse/ClickHouse/pull/54895) ([Amos Bird](https://github.com/amosbird)). 
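A quick sanity check of the plural interval literals mentioned in the improvement list above; both spellings should parse to the same interval:

```sql
SELECT now() + INTERVAL 2 HOURS AS plural,
       now() + INTERVAL 2 HOUR  AS singular;
```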
-* Retry backup's S3 operations after connection reset failure. [#54900](https://github.com/ClickHouse/ClickHouse/pull/54900) ([Vitaly Baranov](https://github.com/vitlibar)). -* Make the exception message exact in case the maximum value of a setting is less than the minimum value. [#54925](https://github.com/ClickHouse/ClickHouse/pull/54925) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). -* `LIKE`, `match`, and other regular expression matching functions now allow matching with patterns containing non-UTF-8 substrings by falling back to binary matching. Example: you can use `string LIKE '\xFE\xFF%'` to detect BOM. This closes [#54486](https://github.com/ClickHouse/ClickHouse/issues/54486). [#54942](https://github.com/ClickHouse/ClickHouse/pull/54942) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Added `ContextLockWaitMicroseconds` profile event. [#55029](https://github.com/ClickHouse/ClickHouse/pull/55029) ([Maksim Kita](https://github.com/kitaisreal)). -* The Keeper dynamically adjusts log levels. [#50372](https://github.com/ClickHouse/ClickHouse/pull/50372) ([helifu](https://github.com/helifu)). -* Added function `timestamp` for compatibility with MySQL (see the sketch below). Closes [#54275](https://github.com/ClickHouse/ClickHouse/issues/54275). [#54639](https://github.com/ClickHouse/ClickHouse/pull/54639) ([Nikolay Degterinsky](https://github.com/evillique)). - -#### Build/Testing/Packaging Improvement -* Bumped the compiler of official and continuous integration builds of ClickHouse from Clang 16 to 17. [#53831](https://github.com/ClickHouse/ClickHouse/pull/53831) ([Robert Schulze](https://github.com/rschu1ze)). -* Regenerated tld data for lookups (`tldLookup.generated.cpp`). [#54269](https://github.com/ClickHouse/ClickHouse/pull/54269) ([Bharat Nallan](https://github.com/bharatnc)). -* Remove the redundant `clickhouse-keeper-client` symlink. [#54587](https://github.com/ClickHouse/ClickHouse/pull/54587) ([Tomas Barton](https://github.com/deric)). -* Use `/usr/bin/env` to resolve bash - now it supports NixOS. [#54603](https://github.com/ClickHouse/ClickHouse/pull/54603) ([Fionera](https://github.com/fionera)). -* CMake added the `PROFILE_CPU` option needed to perform `perf record` without using a DWARF call graph. [#54917](https://github.com/ClickHouse/ClickHouse/pull/54917) ([Maksim Kita](https://github.com/kitaisreal)). -* If the linker is different from LLD, stop with a fatal error. [#55036](https://github.com/ClickHouse/ClickHouse/pull/55036) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Replaced the library to handle (encode/decode) base64 values from Turbo-Base64 to aklomp-base64. Both are SIMD-accelerated on x86 and ARM, but 1. the license of the latter (BSD-2) is more favorable for ClickHouse (Turbo-Base64 switched in the meantime to GPL-3), 2. with more GitHub stars, aklomp-base64 seems more future-proof, 3. aklomp-base64 has a slightly nicer API (which is arguably subjective), and 4. aklomp-base64 does not require us to hack around bugs (like non-threadsafe initialization). Note: aklomp-base64 rejects unpadded base64 values whereas Turbo-Base64 decodes them on a best-effort basis. RFC-4648 leaves it open whether padding is mandatory or not, but depending on the context this may be a behavioral change to be aware of. [#54119](https://github.com/ClickHouse/ClickHouse/pull/54119) ([Mikhail Koviazin](https://github.com/mkmkme)).
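A small, hedged example of the MySQL-compatibility `timestamp` function from the improvement list above (the exact return type and precision are not spelled out in the entry, so treat this as a sketch):

```sql
-- Parses the string into a date-with-time value, mirroring MySQL's TIMESTAMP().
SELECT timestamp('2023-09-28 12:34:56') AS ts;
```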
- -#### Bug Fix (user-visible misbehavior in an official stable release) -* Fix REPLACE/MOVE PARTITION with zero-copy replication (note: "zero-copy replication" is an experimental feature) [#54193](https://github.com/ClickHouse/ClickHouse/pull/54193) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fix zero copy locks with hardlinks (note: "zero-copy replication" is an experimental feature) [#54859](https://github.com/ClickHouse/ClickHouse/pull/54859) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fix zero copy garbage (note: "zero-copy replication" is an experimental feature) [#54550](https://github.com/ClickHouse/ClickHouse/pull/54550) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Pass HTTP retry timeout as milliseconds (it was incorrect before). [#54438](https://github.com/ClickHouse/ClickHouse/pull/54438) ([Duc Canh Le](https://github.com/canhld94)). -* Fix misleading error message in OUTFILE with `CapnProto`/`Protobuf` [#52870](https://github.com/ClickHouse/ClickHouse/pull/52870) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix summary reporting with parallel replicas with LIMIT [#53050](https://github.com/ClickHouse/ClickHouse/pull/53050) ([Raúl Marín](https://github.com/Algunenano)). -* Fix throttling of BACKUPs from/to S3 (in case native copy was not used) and in some other places as well [#53336](https://github.com/ClickHouse/ClickHouse/pull/53336) ([Azat Khuzhin](https://github.com/azat)). -* Fix IO throttling during copying whole directories [#53338](https://github.com/ClickHouse/ClickHouse/pull/53338) ([Azat Khuzhin](https://github.com/azat)). -* Fix: moved to prewhere condition actions can lose column [#53492](https://github.com/ClickHouse/ClickHouse/pull/53492) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fixed internal error when replacing with byte-equal parts [#53735](https://github.com/ClickHouse/ClickHouse/pull/53735) ([Pedro Riera](https://github.com/priera)). -* Fix: require columns participating in interpolate expression [#53754](https://github.com/ClickHouse/ClickHouse/pull/53754) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix cluster discovery initialization + setting up fail points in config [#54113](https://github.com/ClickHouse/ClickHouse/pull/54113) ([vdimir](https://github.com/vdimir)). -* Fix issues in `accurateCastOrNull` [#54136](https://github.com/ClickHouse/ClickHouse/pull/54136) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Fix nullable primary key with the FINAL modifier [#54164](https://github.com/ClickHouse/ClickHouse/pull/54164) ([Amos Bird](https://github.com/amosbird)). -* Fixed error that prevented insertion in replicated materialized view of new data in presence of duplicated data. [#54184](https://github.com/ClickHouse/ClickHouse/pull/54184) ([Pedro Riera](https://github.com/priera)). -* Fix: allow `IPv6` for bloom filter [#54200](https://github.com/ClickHouse/ClickHouse/pull/54200) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* fix possible type mismatch with `IPv4` [#54212](https://github.com/ClickHouse/ClickHouse/pull/54212) ([Bharat Nallan](https://github.com/bharatnc)). -* Fix `system.data_skipping_indices` for recreated indices [#54225](https://github.com/ClickHouse/ClickHouse/pull/54225) ([Artur Malchanau](https://github.com/Hexta)). -* fix name clash for multiple join rewriter v2 [#54240](https://github.com/ClickHouse/ClickHouse/pull/54240) ([Tao Wang](https://github.com/wangtZJU)). 
-* Fix unexpected errors in `system.errors` after join [#54306](https://github.com/ClickHouse/ClickHouse/pull/54306) ([vdimir](https://github.com/vdimir)). -* Fix `isZeroOrNull(NULL)` [#54316](https://github.com/ClickHouse/ClickHouse/pull/54316) ([flynn](https://github.com/ucasfl)). -* Fix: parallel replicas over distributed with `prefer_localhost_replica` = 1 [#54334](https://github.com/ClickHouse/ClickHouse/pull/54334) ([Igor Nikonov](https://github.com/devcrafter)). -* Fix logical error in vertical merge + replacing merge tree + optimize cleanup [#54368](https://github.com/ClickHouse/ClickHouse/pull/54368) ([alesapin](https://github.com/alesapin)). -* Fix possible error `URI contains invalid characters` in the `s3` table function [#54373](https://github.com/ClickHouse/ClickHouse/pull/54373) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix segfault in AST optimization of `arrayExists` function [#54379](https://github.com/ClickHouse/ClickHouse/pull/54379) ([Nikolay Degterinsky](https://github.com/evillique)). -* Check for overflow before addition in `analysisOfVariance` function [#54385](https://github.com/ClickHouse/ClickHouse/pull/54385) ([Antonio Andelic](https://github.com/antonio2368)). -* Reproduce and fix the bug in removeSharedRecursive [#54430](https://github.com/ClickHouse/ClickHouse/pull/54430) ([Sema Checherinda](https://github.com/CheSema)). -* Fix possible incorrect result with SimpleAggregateFunction in PREWHERE and FINAL [#54436](https://github.com/ClickHouse/ClickHouse/pull/54436) ([Azat Khuzhin](https://github.com/azat)). -* Fix filtering parts with indexHint for non analyzer [#54449](https://github.com/ClickHouse/ClickHouse/pull/54449) ([Azat Khuzhin](https://github.com/azat)). -* Fix aggregate projections with normalized states [#54480](https://github.com/ClickHouse/ClickHouse/pull/54480) ([Amos Bird](https://github.com/amosbird)). -* `clickhouse-local`: something for multiquery parameter [#54498](https://github.com/ClickHouse/ClickHouse/pull/54498) ([CuiShuoGuo](https://github.com/bakam412)). -* `clickhouse-local` supports `--database` command line argument [#54503](https://github.com/ClickHouse/ClickHouse/pull/54503) ([vdimir](https://github.com/vdimir)). -* Fix possible parsing error in `-WithNames` formats with disabled `input_format_with_names_use_header` [#54513](https://github.com/ClickHouse/ClickHouse/pull/54513) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix rare case of CHECKSUM_DOESNT_MATCH error [#54549](https://github.com/ClickHouse/ClickHouse/pull/54549) ([alesapin](https://github.com/alesapin)). -* Fix sorting of UNION ALL of already sorted results [#54564](https://github.com/ClickHouse/ClickHouse/pull/54564) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix snapshot install in Keeper [#54572](https://github.com/ClickHouse/ClickHouse/pull/54572) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix race in `ColumnUnique` [#54575](https://github.com/ClickHouse/ClickHouse/pull/54575) ([Nikita Taranov](https://github.com/nickitat)). -* Annoy/Usearch index: Fix LOGICAL_ERROR during build-up with default values [#54600](https://github.com/ClickHouse/ClickHouse/pull/54600) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix serialization of `ColumnDecimal` [#54601](https://github.com/ClickHouse/ClickHouse/pull/54601) ([Nikita Taranov](https://github.com/nickitat)). 
-* Fix schema inference for *Cluster functions for column names with spaces [#54635](https://github.com/ClickHouse/ClickHouse/pull/54635) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix using structure from insertion tables in case of defaults and explicit insert columns [#54655](https://github.com/ClickHouse/ClickHouse/pull/54655) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix: avoid using regex match, possibly containing alternation, as a key condition. [#54696](https://github.com/ClickHouse/ClickHouse/pull/54696) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix ReplacingMergeTree with vertical merge and cleanup [#54706](https://github.com/ClickHouse/ClickHouse/pull/54706) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix virtual columns having incorrect values after ORDER BY [#54811](https://github.com/ClickHouse/ClickHouse/pull/54811) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix filtering parts with indexHint for non analyzer [#54825](https://github.com/ClickHouse/ClickHouse/pull/54825) [#54449](https://github.com/ClickHouse/ClickHouse/pull/54449) ([Azat Khuzhin](https://github.com/azat)). -* Fix Keeper segfault during shutdown [#54841](https://github.com/ClickHouse/ClickHouse/pull/54841) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix `Invalid number of rows in Chunk` in MaterializedPostgreSQL [#54844](https://github.com/ClickHouse/ClickHouse/pull/54844) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Move obsolete format settings to separate section [#54855](https://github.com/ClickHouse/ClickHouse/pull/54855) ([Kruglov Pavel](https://github.com/Avogar)). -* Rebuild `minmax_count_projection` when partition key gets modified [#54943](https://github.com/ClickHouse/ClickHouse/pull/54943) ([Amos Bird](https://github.com/amosbird)). -* Fix bad cast to `ColumnVector` in function `if` [#55019](https://github.com/ClickHouse/ClickHouse/pull/55019) ([Kruglov Pavel](https://github.com/Avogar)). -* Prevent attaching parts from tables with different projections or indices [#55062](https://github.com/ClickHouse/ClickHouse/pull/55062) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). -* Store NULL in scalar result map for empty subquery result [#52240](https://github.com/ClickHouse/ClickHouse/pull/52240) ([vdimir](https://github.com/vdimir)). -* Fix `FINAL` produces invalid read ranges in a rare case [#54934](https://github.com/ClickHouse/ClickHouse/pull/54934) ([Nikita Taranov](https://github.com/nickitat)). -* Fix: insert quorum w/o keeper retries [#55026](https://github.com/ClickHouse/ClickHouse/pull/55026) ([Igor Nikonov](https://github.com/devcrafter)). -* Fix simple state with nullable [#55030](https://github.com/ClickHouse/ClickHouse/pull/55030) ([Pedro Riera](https://github.com/priera)). - - -### ClickHouse release 23.8 LTS, 2023-08-31 - -#### Backward Incompatible Change -* If a dynamic disk contains a name, it should be specified as `disk = disk(name = 'disk_name', ...)` in disk function arguments. In the previous version, it could be specified as `disk = disk_<disk_name>(...)`, which is no longer supported. [#52820](https://github.com/ClickHouse/ClickHouse/pull/52820) ([Kseniia Sumarokova](https://github.com/kssenii)). -* `clickhouse-benchmark` will establish connections in parallel when invoked with `--concurrency` greater than one. Previously it was unusable if you ran it with 1000 concurrent connections from Europe to the US. Correct calculation of QPS for connections with high latency.
Backward incompatible change: the option for JSON output of `clickhouse-benchmark` is removed. If you've used this option, you can also extract data from the `system.query_log` in JSON format as a workaround. [#53293](https://github.com/ClickHouse/ClickHouse/pull/53293) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The `microseconds` column is removed from the `system.text_log`, and the `milliseconds` column is removed from the `system.metric_log`, because they are redundant in the presence of the `event_time_microseconds` column. [#53601](https://github.com/ClickHouse/ClickHouse/pull/53601) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Deprecate the metadata cache feature. It is experimental and we have never used it. The feature is dangerous: [#51182](https://github.com/ClickHouse/ClickHouse/issues/51182). Remove the `system.merge_tree_metadata_cache` system table. The metadata cache is still available in this version but will be removed soon. This closes [#39197](https://github.com/ClickHouse/ClickHouse/issues/39197). [#51303](https://github.com/ClickHouse/ClickHouse/pull/51303) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Disable support for 3DES in TLS connections. [#52893](https://github.com/ClickHouse/ClickHouse/pull/52893) ([Kenji Noguchi](https://github.com/knoguchi)). - -#### New Feature -* Direct import from zip/7z/tar archives. Example: `file('*.zip :: *.csv')`. [#50321](https://github.com/ClickHouse/ClickHouse/pull/50321) ([nikitakeba](https://github.com/nikitakeba)). -* Add column `ptr` to `system.trace_log` for `trace_type = 'MemorySample'`. This column contains the address of the allocation. Added function `flameGraph` which can build a flame graph containing allocated and not released memory. Reworking of [#38391](https://github.com/ClickHouse/ClickHouse/issues/38391). [#45322](https://github.com/ClickHouse/ClickHouse/pull/45322) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Added table function `azureBlobStorageCluster`. The supported set of features is very similar to table function `s3Cluster`. [#50795](https://github.com/ClickHouse/ClickHouse/pull/50795) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Allow using `cluster`, `clusterAllReplicas`, `remote`, and `remoteSecure` without a table name (issue [#50808](https://github.com/ClickHouse/ClickHouse/issues/50808)). [#50848](https://github.com/ClickHouse/ClickHouse/pull/50848) ([Yangkuan Liu](https://github.com/LiuYangkuan)). -* A system table to monitor Kafka consumers. [#50999](https://github.com/ClickHouse/ClickHouse/pull/50999) ([Ilya Golshtein](https://github.com/ilejn)). -* Added `max_sessions_for_user` setting. [#51724](https://github.com/ClickHouse/ClickHouse/pull/51724) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* New functions `toUTCTimestamp/fromUTCTimestamp` that act the same as Spark's `to_utc_timestamp/from_utc_timestamp`. [#52117](https://github.com/ClickHouse/ClickHouse/pull/52117) ([KevinyhZou](https://github.com/KevinyhZou)). -* Add new functions `structureToCapnProtoSchema`/`structureToProtobufSchema` that convert a ClickHouse table structure to a CapnProto/Protobuf format schema. Allow to input/output data in CapnProto/Protobuf format without an external format schema, using the autogenerated schema from the table structure (controlled by settings `format_capn_proto_use_autogenerated_schema`/`format_protobuf_use_autogenerated_schema`). Allow to export the autogenerated schema during input/output using the setting `output_format_schema`.
[#52278](https://github.com/ClickHouse/ClickHouse/pull/52278) ([Kruglov Pavel](https://github.com/Avogar)). -* A new field `query_cache_usage` in `system.query_log` now shows if and how the query cache was used. [#52384](https://github.com/ClickHouse/ClickHouse/pull/52384) ([Robert Schulze](https://github.com/rschu1ze)). -* Add new function `startsWithUTF8` and `endsWithUTF8`. [#52555](https://github.com/ClickHouse/ClickHouse/pull/52555) ([李扬](https://github.com/taiyang-li)). -* Allow variable number of columns in TSV/CustomSeparated/JSONCompactEachRow, make schema inference work with variable number of columns. Add settings `input_format_tsv_allow_variable_number_of_columns`, `input_format_custom_allow_variable_number_of_columns`, `input_format_json_compact_allow_variable_number_of_columns`. [#52692](https://github.com/ClickHouse/ClickHouse/pull/52692) ([Kruglov Pavel](https://github.com/Avogar)). -* Added `SYSTEM STOP/START PULLING REPLICATION LOG` queries (for testing `ReplicatedMergeTree`). [#52881](https://github.com/ClickHouse/ClickHouse/pull/52881) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Allow to execute constant non-deterministic functions in mutations on initiator. [#53129](https://github.com/ClickHouse/ClickHouse/pull/53129) ([Anton Popov](https://github.com/CurtizJ)). -* Add input format `One` that doesn't read any data and always returns single row with column `dummy` with type `UInt8` and value `0` like `system.one`. It can be used together with `_file/_path` virtual columns to list files in file/s3/url/hdfs/etc table functions without reading any data. [#53209](https://github.com/ClickHouse/ClickHouse/pull/53209) ([Kruglov Pavel](https://github.com/Avogar)). -* Add `tupleConcat` function. Closes [#52759](https://github.com/ClickHouse/ClickHouse/issues/52759). [#53239](https://github.com/ClickHouse/ClickHouse/pull/53239) ([Nikolay Degterinsky](https://github.com/evillique)). -* Support `TRUNCATE DATABASE` operation. [#53261](https://github.com/ClickHouse/ClickHouse/pull/53261) ([Bharat Nallan](https://github.com/bharatnc)). -* Add `max_threads_for_indexes` setting to limit number of threads used for primary key processing. [#53313](https://github.com/ClickHouse/ClickHouse/pull/53313) ([jorisgio](https://github.com/jorisgio)). -* Re-add SipHash keyed functions. [#53525](https://github.com/ClickHouse/ClickHouse/pull/53525) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* ([#52755](https://github.com/ClickHouse/ClickHouse/issues/52755) , [#52895](https://github.com/ClickHouse/ClickHouse/issues/52895)) Added functions `arrayRotateLeft`, `arrayRotateRight`, `arrayShiftLeft`, `arrayShiftRight`. [#53557](https://github.com/ClickHouse/ClickHouse/pull/53557) ([Mikhail Koviazin](https://github.com/mkmkme)). -* Add column `name` to `system.clusters` as an alias to cluster. [#53605](https://github.com/ClickHouse/ClickHouse/pull/53605) ([irenjj](https://github.com/irenjj)). -* The advanced dashboard now allows mass editing (save/load). [#53608](https://github.com/ClickHouse/ClickHouse/pull/53608) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The advanced dashboard now has an option to maximize charts and move them around. [#53622](https://github.com/ClickHouse/ClickHouse/pull/53622) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Added support for adding and subtracting arrays: `[5,2] + [1,7]`. Division and multiplication were not implemented due to confusion between pointwise multiplication and the scalar product of arguments. 
Closes [#49939](https://github.com/ClickHouse/ClickHouse/issues/49939). [#52625](https://github.com/ClickHouse/ClickHouse/pull/52625) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Add support for string literals as table names. Closes [#52178](https://github.com/ClickHouse/ClickHouse/issues/52178). [#52635](https://github.com/ClickHouse/ClickHouse/pull/52635) ([hendrik-m](https://github.com/hendrik-m)). - -#### Experimental Feature -* Add new table engine `S3Queue` for streaming data import from s3. Closes [#37012](https://github.com/ClickHouse/ClickHouse/issues/37012). [#49086](https://github.com/ClickHouse/ClickHouse/pull/49086) ([s-kat](https://github.com/s-kat)). It is not ready to use. Do not use it. -* Enable parallel reading from replicas over distributed table. Related to [#49708](https://github.com/ClickHouse/ClickHouse/issues/49708). [#53005](https://github.com/ClickHouse/ClickHouse/pull/53005) ([Igor Nikonov](https://github.com/devcrafter)). -* Add experimental support for HNSW as approximate neighbor search method. [#53447](https://github.com/ClickHouse/ClickHouse/pull/53447) ([Davit Vardanyan](https://github.com/davvard)). This is currently intended for those who continue working on the implementation. Do not use it. - -#### Performance Improvement -* Parquet filter pushdown. I.e. when reading Parquet files, row groups (chunks of the file) are skipped based on the WHERE condition and the min/max values in each column. In particular, if the file is roughly sorted by some column, queries that filter by a short range of that column will be much faster. [#52951](https://github.com/ClickHouse/ClickHouse/pull/52951) ([Michael Kolupaev](https://github.com/al13n321)). -* Optimize reading small row groups by batching them together in Parquet. Closes [#53069](https://github.com/ClickHouse/ClickHouse/issues/53069). [#53281](https://github.com/ClickHouse/ClickHouse/pull/53281) ([Kruglov Pavel](https://github.com/Avogar)). -* Optimize count from files in most input formats. Closes [#44334](https://github.com/ClickHouse/ClickHouse/issues/44334). [#53637](https://github.com/ClickHouse/ClickHouse/pull/53637) ([Kruglov Pavel](https://github.com/Avogar)). -* Use filter by file/path before reading in `url`/`file`/`hdfs` table functions. [#53529](https://github.com/ClickHouse/ClickHouse/pull/53529) ([Kruglov Pavel](https://github.com/Avogar)). -* Enable JIT compilation for AArch64, PowerPC, SystemZ, RISC-V. [#38217](https://github.com/ClickHouse/ClickHouse/pull/38217) ([Maksim Kita](https://github.com/kitaisreal)). -* Add setting `rewrite_count_distinct_if_with_count_distinct_implementation` to rewrite `countDistinctIf` with `count_distinct_implementation`. Closes [#30642](https://github.com/ClickHouse/ClickHouse/issues/30642). [#46051](https://github.com/ClickHouse/ClickHouse/pull/46051) ([flynn](https://github.com/ucasfl)). -* Speed up merging of states of `uniq` and `uniqExact` aggregate functions by parallelizing conversion before merge. [#50748](https://github.com/ClickHouse/ClickHouse/pull/50748) ([Jiebin Sun](https://github.com/jiebinn)). -* Optimize aggregation performance of nullable string key when using a large number of variable length keys. [#51399](https://github.com/ClickHouse/ClickHouse/pull/51399) ([LiuNeng](https://github.com/liuneng1994)). -* Add a pass in Analyzer for time filter optimization with preimage. 
The performance experiments of SSB on the ICX device (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads) show that this change could bring an improvement of 8.5% to the geomean QPS when the experimental analyzer is enabled. [#52091](https://github.com/ClickHouse/ClickHouse/pull/52091) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Optimize the merge if all hash sets are single-level in the `uniqExact` (COUNT DISTINCT) function. [#52973](https://github.com/ClickHouse/ClickHouse/pull/52973) ([Jiebin Sun](https://github.com/jiebinn)). -* `Join` table engine: do not clone hash join data structure with all columns. [#53046](https://github.com/ClickHouse/ClickHouse/pull/53046) ([Duc Canh Le](https://github.com/canhld94)). -* Implement native `ORC` input format without the "apache arrow" library to improve performance. [#53324](https://github.com/ClickHouse/ClickHouse/pull/53324) ([李扬](https://github.com/taiyang-li)). -* The dashboard will tell the server to compress the data, which is useful for large time frames over slow internet connections. For example, one chart with 86400 points can be 1.5 MB uncompressed and 60 KB compressed with `br`. [#53569](https://github.com/ClickHouse/ClickHouse/pull/53569) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Better utilization of thread pool for BACKUPs and RESTOREs. [#53649](https://github.com/ClickHouse/ClickHouse/pull/53649) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Load filesystem cache metadata on startup in parallel. Configured by `load_metadata_threads` (default: 1) cache config setting. Related to [#52037](https://github.com/ClickHouse/ClickHouse/issues/52037). [#52943](https://github.com/ClickHouse/ClickHouse/pull/52943) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Improve `move_primary_key_columns_to_end_of_prewhere`. [#53337](https://github.com/ClickHouse/ClickHouse/pull/53337) ([Han Fei](https://github.com/hanfei1991)). -* This optimizes the interaction with ClickHouse Keeper. Previously the caller could register the same watch callback multiple times. In that case each entry was consuming memory and the same callback was called multiple times which didn't make much sense. In order to avoid this the caller could have some logic to not add the same watch multiple times. With this change this deduplication is done internally if the watch callback is passed via shared_ptr. [#53452](https://github.com/ClickHouse/ClickHouse/pull/53452) ([Alexander Gololobov](https://github.com/davenger)). -* Cache number of rows in files for count in file/s3/url/hdfs/azure functions. The cache can be enabled/disabled by setting `use_cache_for_count_from_files` (enabled by default). Continuation of https://github.com/ClickHouse/ClickHouse/pull/53637. [#53692](https://github.com/ClickHouse/ClickHouse/pull/53692) ([Kruglov Pavel](https://github.com/Avogar)). -* More careful thread management will improve the speed of the S3 table function over a large number of files by more than ~25%. [#53668](https://github.com/ClickHouse/ClickHouse/pull/53668) ([pufit](https://github.com/pufit)). - -#### Improvement -* Add `stderr_reaction` configuration/setting to control the reaction (none, log or throw) when external command stderr has data. This helps make debugging external command easier. [#43210](https://github.com/ClickHouse/ClickHouse/pull/43210) ([Amos Bird](https://github.com/amosbird)). -* Add `partition` column to the `system part_log` and merge table. 
[#48990](https://github.com/ClickHouse/ClickHouse/pull/48990) ([Jianfei Hu](https://github.com/incfly)). -* The sizes of the (index) uncompressed/mark, mmap and query caches can now be configured dynamically at runtime (without server restart). [#51446](https://github.com/ClickHouse/ClickHouse/pull/51446) ([Robert Schulze](https://github.com/rschu1ze)). -* If a dictionary is created with a complex key, automatically choose the "complex key" layout variant. [#49587](https://github.com/ClickHouse/ClickHouse/pull/49587) ([xiebin](https://github.com/xbthink)). -* Add setting `use_concurrency_control` for better testing of the new concurrency control feature. [#49618](https://github.com/ClickHouse/ClickHouse/pull/49618) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Added suggestions for mistyped names for databases and tables. [#49801](https://github.com/ClickHouse/ClickHouse/pull/49801) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Reading small files from HDFS via Gluten took more time than querying the same data directly with Spark; this has been improved. [#50063](https://github.com/ClickHouse/ClickHouse/pull/50063) ([KevinyhZou](https://github.com/KevinyhZou)). -* There were too many worthless error logs after session expiration, which we didn't like. [#50171](https://github.com/ClickHouse/ClickHouse/pull/50171) ([helifu](https://github.com/helifu)). -* Introduce fallback ZooKeeper sessions which are time-bound. Fixed the `index` column in `system.zookeeper_connection` for DNS addresses. [#50424](https://github.com/ClickHouse/ClickHouse/pull/50424) ([Anton Kozlov](https://github.com/tonickkozlov)). -* Add the ability to log when `max_partitions_per_insert_block` is reached. [#50948](https://github.com/ClickHouse/ClickHouse/pull/50948) ([Sean Haynes](https://github.com/seandhaynes)). -* Added a bunch of custom commands to clickhouse-keeper-client (mostly to make ClickHouse debugging easier). [#51117](https://github.com/ClickHouse/ClickHouse/pull/51117) ([pufit](https://github.com/pufit)). -* Updated the check for the connection string in the `azureBlobStorage` table function, as a connection string with "sas" does not always begin with the default endpoint, and updated the connection URL to include the "sas" token after adding Azure's container to the URL. [#51141](https://github.com/ClickHouse/ClickHouse/pull/51141) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix description for filtering sets in the `full_sorting_merge` JOIN algorithm. [#51329](https://github.com/ClickHouse/ClickHouse/pull/51329) ([Tanay Tummalapalli](https://github.com/ttanay)). -* Fixed memory consumption in `Aggregator` when `max_block_size` is huge. [#51566](https://github.com/ClickHouse/ClickHouse/pull/51566) ([Nikita Taranov](https://github.com/nickitat)). -* Add `SYSTEM SYNC FILESYSTEM CACHE` command. It will compare the in-memory state of the filesystem cache with what it has on disk and fix the in-memory state if needed. This is only needed if you are making manual interventions in on-disk data, which is highly discouraged. [#51622](https://github.com/ClickHouse/ClickHouse/pull/51622) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Attempt to create a generic proxy resolver for ClickHouse while keeping backwards compatibility with the existing S3 storage configuration proxy resolver. [#51749](https://github.com/ClickHouse/ClickHouse/pull/51749) ([Arthur Passos](https://github.com/arthurpassos)). -* Support reading tuple subcolumns from file/s3/hdfs/url/azureBlobStorage table functions.
[#51806](https://github.com/ClickHouse/ClickHouse/pull/51806) ([Kruglov Pavel](https://github.com/Avogar)). -* Function `arrayIntersect` now returns the values in the order corresponding to the first argument. Closes [#27622](https://github.com/ClickHouse/ClickHouse/issues/27622). [#51850](https://github.com/ClickHouse/ClickHouse/pull/51850) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Add new queries which allow creating/dropping access entities in a specified access storage, or moving access entities from one access storage to another. [#51912](https://github.com/ClickHouse/ClickHouse/pull/51912) ([pufit](https://github.com/pufit)). -* Make `ALTER TABLE FREEZE` queries not replicated in the Replicated database engine. [#52064](https://github.com/ClickHouse/ClickHouse/pull/52064) ([Mike Kot](https://github.com/myrrc)). -* Added the possibility to flush system tables on unexpected shutdown. [#52174](https://github.com/ClickHouse/ClickHouse/pull/52174) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* Fix the case when the `s3` table function refused to work with pre-signed URLs. Closes [#50846](https://github.com/ClickHouse/ClickHouse/issues/50846). [#52310](https://github.com/ClickHouse/ClickHouse/pull/52310) ([chen](https://github.com/xiedeyantu)). -* Add column `name` as an alias to `event` and `metric` in the `system.events` and `system.metrics` tables. Closes [#51257](https://github.com/ClickHouse/ClickHouse/issues/51257). [#52315](https://github.com/ClickHouse/ClickHouse/pull/52315) ([chen](https://github.com/xiedeyantu)). -* Added support for the syntax `CREATE UNIQUE INDEX` in the parser as a no-op for better SQL compatibility. `UNIQUE` index is not supported. Set `create_index_ignore_unique = 1` to ignore the UNIQUE keyword in queries. [#52320](https://github.com/ClickHouse/ClickHouse/pull/52320) ([Ilya Yatsishin](https://github.com/qoega)). -* Add support for the predefined macros `{database}` and `{table}` in some Kafka engine settings: topic, consumer, client_id, etc. [#52386](https://github.com/ClickHouse/ClickHouse/pull/52386) ([Yury Bogomolov](https://github.com/ybogo)). -* Disable updating the filesystem cache during backup/restore. The filesystem cache must not be updated during backup/restore; it just slows down the process without any profit (because the BACKUP command can read a lot of data, and it's no use to put all the data into the filesystem cache and immediately evict it). [#52402](https://github.com/ClickHouse/ClickHouse/pull/52402) ([Vitaly Baranov](https://github.com/vitlibar)). -* The configuration of an S3 endpoint now allows using it from the root, and appends '/' automatically if needed. [#47809](https://github.com/ClickHouse/ClickHouse/issues/47809). [#52600](https://github.com/ClickHouse/ClickHouse/pull/52600) ([xiaolei565](https://github.com/xiaolei565)). -* For clickhouse-local, allow positional options and populate global UDF settings (`user_scripts_path` and `user_defined_executable_functions_config`). [#52643](https://github.com/ClickHouse/ClickHouse/pull/52643) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* `system.asynchronous_metrics` now includes metrics "QueryCacheEntries" and "QueryCacheBytes" to inspect the query cache. [#52650](https://github.com/ClickHouse/ClickHouse/pull/52650) ([Robert Schulze](https://github.com/rschu1ze)). -* Added the possibility to use the `s3_storage_class` parameter in the `SETTINGS` clause of the `BACKUP` statement for backups to S3.
[#52658](https://github.com/ClickHouse/ClickHouse/pull/52658) ([Roman Vasin](https://github.com/rvasin)). -* Add utility `print-backup-info.py` which parses a backup metadata file and prints information about the backup. [#52690](https://github.com/ClickHouse/ClickHouse/pull/52690) ([Vitaly Baranov](https://github.com/vitlibar)). -* Closes [#49510](https://github.com/ClickHouse/ClickHouse/issues/49510). Currently, database and table names are case-sensitive, but BI tools query `information_schema` sometimes in lowercase, sometimes in uppercase. For this reason, we have the `information_schema` database, containing lowercase tables, such as `information_schema.tables`, and the `INFORMATION_SCHEMA` database, containing uppercase tables, such as `INFORMATION_SCHEMA.TABLES`. But some tools are querying `INFORMATION_SCHEMA.tables` and `information_schema.TABLES`. The proposed solution is to duplicate both lowercase and uppercase tables in the lowercase and uppercase `information_schema` databases. [#52695](https://github.com/ClickHouse/ClickHouse/pull/52695) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* The query `CHECK TABLE` has better performance and usability (sends progress updates, cancellable). [#52745](https://github.com/ClickHouse/ClickHouse/pull/52745) ([vdimir](https://github.com/vdimir)). -* Add support for `modulo`, `intDiv`, `intDivOrZero` for tuples by distributing them across the tuple's elements. [#52758](https://github.com/ClickHouse/ClickHouse/pull/52758) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Search for default `yaml` and `yml` configs in clickhouse-client after `xml`. [#52767](https://github.com/ClickHouse/ClickHouse/pull/52767) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* When merging into a non-'clickhouse' rooted configuration, configs with a different root node name are simply bypassed without an exception. [#52770](https://github.com/ClickHouse/ClickHouse/pull/52770) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Now it's possible to specify the min (`memory_profiler_sample_min_allocation_size`) and max (`memory_profiler_sample_max_allocation_size`) size for allocations to be tracked with the sampling memory profiler. [#52779](https://github.com/ClickHouse/ClickHouse/pull/52779) ([alesapin](https://github.com/alesapin)). -* Add `precise_float_parsing` setting to switch float parsing methods (fast/precise); see the example below. [#52791](https://github.com/ClickHouse/ClickHouse/pull/52791) ([Andrey Zvonov](https://github.com/zvonand)). -* Use the same default paths for `clickhouse-keeper` (symlink) as for `clickhouse-keeper` (executable). [#52861](https://github.com/ClickHouse/ClickHouse/pull/52861) ([Vitaly Baranov](https://github.com/vitlibar)). -* Improve the error message for the table function `remote`. Closes [#40220](https://github.com/ClickHouse/ClickHouse/issues/40220). [#52959](https://github.com/ClickHouse/ClickHouse/pull/52959) ([jiyoungyoooo](https://github.com/jiyoungyoooo)). -* Added the possibility to specify a custom storage policy in the `SETTINGS` clause of `RESTORE` queries. [#52970](https://github.com/ClickHouse/ClickHouse/pull/52970) ([Victor Krasnov](https://github.com/sirvickr)). -* Add the ability to throttle the S3 requests on backup operations (`BACKUP` and `RESTORE` commands now honor `s3_max_[get/put]_[rps/burst]`). [#52974](https://github.com/ClickHouse/ClickHouse/pull/52974) ([Daniel Pozo Escalona](https://github.com/danipozo)).
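As a rough illustration of the `precise_float_parsing` setting added above (the literal is arbitrary; the setting only switches the parsing algorithm):

```sql
-- Use the precise (slower, correctly rounded) float parser for this query only.
SELECT toFloat64('1.7091') AS x SETTINGS precise_float_parsing = 1;
```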
-* Add settings to ignore ON CLUSTER clause in queries for management of replicated user-defined functions or access control entities with replicated storage. [#52975](https://github.com/ClickHouse/ClickHouse/pull/52975) ([Aleksei Filatov](https://github.com/aalexfvk)). -* EXPLAIN actions for JOIN step. [#53006](https://github.com/ClickHouse/ClickHouse/pull/53006) ([Maksim Kita](https://github.com/kitaisreal)). -* Make `hasTokenOrNull` and `hasTokenCaseInsensitiveOrNull` return null for empty needles. [#53059](https://github.com/ClickHouse/ClickHouse/pull/53059) ([ltrk2](https://github.com/ltrk2)). -* Allow to restrict allowed paths for filesystem caches. Mainly useful for dynamic disks. If in server config `filesystem_caches_path` is specified, all filesystem caches' paths will be restricted to this directory. E.g. if the `path` in cache config is relative - it will be put in `filesystem_caches_path`; if `path` in cache config is absolute, it will be required to lie inside `filesystem_caches_path`. If `filesystem_caches_path` is not specified in config, then behaviour will be the same as in earlier versions. [#53124](https://github.com/ClickHouse/ClickHouse/pull/53124) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Added a bunch of custom commands (mostly to make ClickHouse debugging easier). [#53127](https://github.com/ClickHouse/ClickHouse/pull/53127) ([pufit](https://github.com/pufit)). -* Add diagnostic info about file name during schema inference - it helps when you process multiple files with globs. [#53135](https://github.com/ClickHouse/ClickHouse/pull/53135) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Client will load suggestions using the main connection if the second connection is not allowed to create a session. [#53177](https://github.com/ClickHouse/ClickHouse/pull/53177) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* Add EXCEPT clause to `SYSTEM STOP/START LISTEN QUERIES [ALL/DEFAULT/CUSTOM]` query, for example `SYSTEM STOP LISTEN QUERIES ALL EXCEPT TCP, HTTP`. [#53280](https://github.com/ClickHouse/ClickHouse/pull/53280) ([Nikolay Degterinsky](https://github.com/evillique)). -* Change the default of `max_concurrent_queries` from 100 to 1000. It's ok to have many concurrent queries if they are not heavy, and mostly waiting for the network. Note: don't confuse concurrent queries and QPS: for example, ClickHouse server can do tens of thousands of QPS with less than 100 concurrent queries. [#53285](https://github.com/ClickHouse/ClickHouse/pull/53285) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Limit number of concurrent background partition optimize merges. [#53405](https://github.com/ClickHouse/ClickHouse/pull/53405) ([Duc Canh Le](https://github.com/canhld94)). -* Added a setting `allow_moving_table_directory_to_trash` that allows to ignore `Directory for table data already exists` error when replicating/recovering a `Replicated` database. [#53425](https://github.com/ClickHouse/ClickHouse/pull/53425) ([Alexander Tokmakov](https://github.com/tavplubix)). -* If server settings `asynchronous_metrics_update_period_s` and `asynchronous_heavy_metrics_update_period_s` are misconfigured to 0, it will now fail gracefully instead of terminating the application. [#53428](https://github.com/ClickHouse/ClickHouse/pull/53428) ([Robert Schulze](https://github.com/rschu1ze)). -* The ClickHouse server now respects memory limits changed via cgroups when reloading its configuration. 
[#53455](https://github.com/ClickHouse/ClickHouse/pull/53455) ([Robert Schulze](https://github.com/rschu1ze)). -* Add the ability to turn off the flush of Distributed tables on `DETACH`, `DROP`, or server shutdown. [#53501](https://github.com/ClickHouse/ClickHouse/pull/53501) ([Azat Khuzhin](https://github.com/azat)). -* The `domainRFC` function now supports IPv6 in square brackets. [#53506](https://github.com/ClickHouse/ClickHouse/pull/53506) ([Chen768959](https://github.com/Chen768959)). -* Use a longer timeout for S3 CopyObject requests, which are used in backups. [#53533](https://github.com/ClickHouse/ClickHouse/pull/53533) ([Michael Kolupaev](https://github.com/al13n321)). -* Added server setting `aggregate_function_group_array_max_element_size`. This setting is used to limit the array size for the `groupArray` function at serialization. The default value is `16777215`. [#53550](https://github.com/ClickHouse/ClickHouse/pull/53550) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* `SCHEMA` was added as an alias for `DATABASE` to improve MySQL compatibility. [#53587](https://github.com/ClickHouse/ClickHouse/pull/53587) ([Daniël van Eeden](https://github.com/dveeden)). -* Add asynchronous metrics about tables in the system database. For example, `TotalBytesOfMergeTreeTablesSystem`. This closes [#53603](https://github.com/ClickHouse/ClickHouse/issues/53603). [#53604](https://github.com/ClickHouse/ClickHouse/pull/53604) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The SQL editor in the Play UI and Dashboard will not use Grammarly. [#53614](https://github.com/ClickHouse/ClickHouse/pull/53614) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* As expert-level settings, it is now possible to (1) configure the size_ratio (i.e. the relative size of the protected queue) of the [index] mark/uncompressed caches, (2) configure the cache policy of the index mark and index uncompressed caches. [#53657](https://github.com/ClickHouse/ClickHouse/pull/53657) ([Robert Schulze](https://github.com/rschu1ze)). -* Added client info validation to the query packet in TCPHandler. [#53673](https://github.com/ClickHouse/ClickHouse/pull/53673) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* Retry loading parts in case of network errors while interacting with Microsoft Azure. [#53750](https://github.com/ClickHouse/ClickHouse/pull/53750) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Stack traces for exceptions and Materialized View exceptions are propagated. [#53766](https://github.com/ClickHouse/ClickHouse/pull/53766) ([Ilya Golshtein](https://github.com/ilejn)). -* If no hostname or port were specified, keeper client will try to search for a connection string in ClickHouse's config.xml. [#53769](https://github.com/ClickHouse/ClickHouse/pull/53769) ([pufit](https://github.com/pufit)). -* Add profile event `PartsLockMicroseconds` which shows the number of microseconds we hold the data parts lock in the MergeTree table engine family. [#53797](https://github.com/ClickHouse/ClickHouse/pull/53797) ([alesapin](https://github.com/alesapin)). -* Make the reconnect limit in RAFT configurable for Keeper. This configuration can help Keeper rebuild connections with peers more quickly if the current connection is broken. [#53817](https://github.com/ClickHouse/ClickHouse/pull/53817) ([Pengyuan Bian](https://github.com/bianpengyuan)).
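To illustrate the `SCHEMA` alias for `DATABASE` mentioned above, here is a small sketch; it assumes the alias applies to the usual database DDL statements, and `test` is a placeholder name:

```sql
-- With the MySQL-compatibility alias, SCHEMA should behave like DATABASE in DDL.
CREATE SCHEMA IF NOT EXISTS test;
DROP SCHEMA IF EXISTS test;
```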
-* Ignore foreign keys in table definitions to improve compatibility with MySQL, so users don't need to rewrite the foreign-key parts of their SQL, ref [#53380](https://github.com/ClickHouse/ClickHouse/issues/53380). [#53864](https://github.com/ClickHouse/ClickHouse/pull/53864) ([jsc0218](https://github.com/jsc0218)). - -#### Build/Testing/Packaging Improvement -* Don't expose symbols from the ClickHouse binary to the dynamic linker. It might fix [#43933](https://github.com/ClickHouse/ClickHouse/issues/43933). [#47475](https://github.com/ClickHouse/ClickHouse/pull/47475) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add `clickhouse-keeper-client` symlink to the clickhouse-server package. [#51882](https://github.com/ClickHouse/ClickHouse/pull/51882) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Add https://github.com/elliotchance/sqltest to CI to report the SQL 2016 conformance. [#52293](https://github.com/ClickHouse/ClickHouse/pull/52293) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Upgrade PRQL to 0.9.3. [#53060](https://github.com/ClickHouse/ClickHouse/pull/53060) ([Maximilian Roos](https://github.com/max-sixty)). -* System tables from CI checks are exported to ClickHouse Cloud. [#53086](https://github.com/ClickHouse/ClickHouse/pull/53086) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The compiler's profile data (`-ftime-trace`) is uploaded to ClickHouse Cloud. [#53100](https://github.com/ClickHouse/ClickHouse/pull/53100) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Speed up Debug and Tidy builds. [#53178](https://github.com/ClickHouse/ClickHouse/pull/53178) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Speed up the build by removing tons and tonnes of garbage. One of the frequently included headers was poisoned by boost. [#53180](https://github.com/ClickHouse/ClickHouse/pull/53180) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Remove even more garbage. [#53182](https://github.com/ClickHouse/ClickHouse/pull/53182) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The function `arrayAUC` was using heavy C++ templates - ditched them. [#53183](https://github.com/ClickHouse/ClickHouse/pull/53183) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Some translation units were always rebuilt regardless of ccache. The culprit was found and fixed. [#53184](https://github.com/ClickHouse/ClickHouse/pull/53184) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The compiler's profile data (`-ftime-trace`) is uploaded to ClickHouse Cloud, the second attempt after [#53100](https://github.com/ClickHouse/ClickHouse/issues/53100). [#53213](https://github.com/ClickHouse/ClickHouse/pull/53213) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Export logs from CI in stateful tests to ClickHouse Cloud. [#53351](https://github.com/ClickHouse/ClickHouse/pull/53351) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Export logs from CI in stress tests. [#53353](https://github.com/ClickHouse/ClickHouse/pull/53353) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Export logs from CI in fuzzer. [#53354](https://github.com/ClickHouse/ClickHouse/pull/53354) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Preserve environment parameters in the `clickhouse start` command. Fixes [#51962](https://github.com/ClickHouse/ClickHouse/issues/51962). [#53418](https://github.com/ClickHouse/ClickHouse/pull/53418) ([Mikhail f.
Shiryaev](https://github.com/Felixoid)). -* Follow up for [#53418](https://github.com/ClickHouse/ClickHouse/issues/53418). Small improvements for install_check.py, adding tests for proper ENV parameter passing to the main process on `init.d start`. [#53457](https://github.com/ClickHouse/ClickHouse/pull/53457) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Reorganize file management in CMake to prevent potential duplications. For instance, `indexHint.cpp` is duplicated in both `dbms_sources` and `clickhouse_functions_sources`. [#53621](https://github.com/ClickHouse/ClickHouse/pull/53621) ([Amos Bird](https://github.com/amosbird)). -* Upgrade snappy to 1.1.10. [#53672](https://github.com/ClickHouse/ClickHouse/pull/53672) ([李扬](https://github.com/taiyang-li)). -* Slightly improve the cmake build by sanitizing some dependencies and removing some duplicates. Each commit includes a short description of the changes made. [#53759](https://github.com/ClickHouse/ClickHouse/pull/53759) ([Amos Bird](https://github.com/amosbird)). - -#### Bug Fix (user-visible misbehavior in an official stable release) -* Do not reset (experimental) Annoy index during build-up with more than one mark [#51325](https://github.com/ClickHouse/ClickHouse/pull/51325) ([Tian Xinhui](https://github.com/xinhuitian)). -* Fix usage of temporary directories during RESTORE [#51493](https://github.com/ClickHouse/ClickHouse/pull/51493) ([Azat Khuzhin](https://github.com/azat)). -* Fix binary arithmetic for Nullable(IPv4) [#51642](https://github.com/ClickHouse/ClickHouse/pull/51642) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Support IPv4 and IPv6 data types as dictionary attributes [#51756](https://github.com/ClickHouse/ClickHouse/pull/51756) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* A fix for the checksum of compressed marks [#51777](https://github.com/ClickHouse/ClickHouse/pull/51777) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix a comma mistakenly parsed as part of a datetime in CSV best-effort parsing [#51950](https://github.com/ClickHouse/ClickHouse/pull/51950) ([Kruglov Pavel](https://github.com/Avogar)). -* Don't throw an exception when an executable UDF has parameters [#51961](https://github.com/ClickHouse/ClickHouse/pull/51961) ([Nikita Taranov](https://github.com/nickitat)). -* Fix recalculation of skip indexes and projections in `ALTER DELETE` queries [#52530](https://github.com/ClickHouse/ClickHouse/pull/52530) ([Anton Popov](https://github.com/CurtizJ)). -* MaterializedMySQL: Fix the infinite loop in ReadBuffer::read [#52621](https://github.com/ClickHouse/ClickHouse/pull/52621) ([Val Doroshchuk](https://github.com/valbok)). -* Load suggestion only with `clickhouse` dialect [#52628](https://github.com/ClickHouse/ClickHouse/pull/52628) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). -* Init and destroy ares channel on demand. [#52634](https://github.com/ClickHouse/ClickHouse/pull/52634) ([Arthur Passos](https://github.com/arthurpassos)). -* Fix filtering by virtual columns with OR expression [#52653](https://github.com/ClickHouse/ClickHouse/pull/52653) ([Azat Khuzhin](https://github.com/azat)). -* Fix crash in function `tuple` with one sparse column argument [#52659](https://github.com/ClickHouse/ClickHouse/pull/52659) ([Anton Popov](https://github.com/CurtizJ)). -* Fix named collections on cluster [#52687](https://github.com/ClickHouse/ClickHouse/pull/52687) ([Al Korgun](https://github.com/alkorgun)).
-* Fix reading of unnecessary column in case of multistage `PREWHERE` [#52689](https://github.com/ClickHouse/ClickHouse/pull/52689) ([Anton Popov](https://github.com/CurtizJ)). -* Fix unexpected sort result on multi columns with nulls first direction [#52761](https://github.com/ClickHouse/ClickHouse/pull/52761) ([copperybean](https://github.com/copperybean)). -* Fix data race in Keeper reconfiguration [#52804](https://github.com/ClickHouse/ClickHouse/pull/52804) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix sorting of sparse columns with large limit [#52827](https://github.com/ClickHouse/ClickHouse/pull/52827) ([Anton Popov](https://github.com/CurtizJ)). -* clickhouse-keeper: fix implementation of server with poll. [#52833](https://github.com/ClickHouse/ClickHouse/pull/52833) ([Andy Fiddaman](https://github.com/citrus-it)). -* Make regexp analyzer recognize named capturing groups [#52840](https://github.com/ClickHouse/ClickHouse/pull/52840) ([Han Fei](https://github.com/hanfei1991)). -* Fix possible assert in `~PushingAsyncPipelineExecutor` in clickhouse-local [#52862](https://github.com/ClickHouse/ClickHouse/pull/52862) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix reading of empty `Nested(Array(LowCardinality(...)))` [#52949](https://github.com/ClickHouse/ClickHouse/pull/52949) ([Anton Popov](https://github.com/CurtizJ)). -* Added new tests for session_log and fixed the inconsistency between login and logout. [#52958](https://github.com/ClickHouse/ClickHouse/pull/52958) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* Fix password leak in show create mysql table [#52962](https://github.com/ClickHouse/ClickHouse/pull/52962) ([Duc Canh Le](https://github.com/canhld94)). -* Convert sparse column format to full in CreateSetAndFilterOnTheFlyStep [#53000](https://github.com/ClickHouse/ClickHouse/pull/53000) ([vdimir](https://github.com/vdimir)). -* Fix rare race condition with empty key prefix directory deletion in fs cache [#53055](https://github.com/ClickHouse/ClickHouse/pull/53055) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix ZstdDeflatingWriteBuffer truncating the output sometimes [#53064](https://github.com/ClickHouse/ClickHouse/pull/53064) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix query_id in part_log with async flush queries [#53103](https://github.com/ClickHouse/ClickHouse/pull/53103) ([Raúl Marín](https://github.com/Algunenano)). -* Fix possible error from cache "Read unexpected size" [#53121](https://github.com/ClickHouse/ClickHouse/pull/53121) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Disable the new parquet encoder [#53130](https://github.com/ClickHouse/ClickHouse/pull/53130) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix "Not-ready Set" exception [#53162](https://github.com/ClickHouse/ClickHouse/pull/53162) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix character escaping in the PostgreSQL engine [#53250](https://github.com/ClickHouse/ClickHouse/pull/53250) ([Nikolay Degterinsky](https://github.com/evillique)). -* Experimental session_log table: Added new tests for session_log and fixed the inconsistency between login and logout. [#53255](https://github.com/ClickHouse/ClickHouse/pull/53255) ([Alexey Gerasimchuck](https://github.com/Demilivor)). Fixed inconsistency between login success and logout [#53302](https://github.com/ClickHouse/ClickHouse/pull/53302) ([Alexey Gerasimchuck](https://github.com/Demilivor)). 
-* Fix adding sub-second intervals to DateTime [#53309](https://github.com/ClickHouse/ClickHouse/pull/53309) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix "Context has expired" error in dictionaries [#53342](https://github.com/ClickHouse/ClickHouse/pull/53342) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix incorrect normal projection AST format [#53347](https://github.com/ClickHouse/ClickHouse/pull/53347) ([Amos Bird](https://github.com/amosbird)). -* Forbid use_structure_from_insertion_table_in_table_functions when executing scalar subqueries [#53348](https://github.com/ClickHouse/ClickHouse/pull/53348) ([flynn](https://github.com/ucasfl)). -* Fix loading lazy database during system.table select query [#53372](https://github.com/ClickHouse/ClickHouse/pull/53372) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fixed system.data_skipping_indices for MaterializedMySQL [#53381](https://github.com/ClickHouse/ClickHouse/pull/53381) ([Filipp Ozinov](https://github.com/bakwc)). -* Fix processing of a single carriage return in the TSV file segmentation engine [#53407](https://github.com/ClickHouse/ClickHouse/pull/53407) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix `Context has expired` error properly [#53433](https://github.com/ClickHouse/ClickHouse/pull/53433) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix `timeout_overflow_mode` when having a subquery in the rhs of IN [#53439](https://github.com/ClickHouse/ClickHouse/pull/53439) ([Duc Canh Le](https://github.com/canhld94)). -* Fix an unexpected behavior in [#53152](https://github.com/ClickHouse/ClickHouse/issues/53152) [#53440](https://github.com/ClickHouse/ClickHouse/pull/53440) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Fix JSON_QUERY function parse error when the path is all numbers [#53470](https://github.com/ClickHouse/ClickHouse/pull/53470) ([KevinyhZou](https://github.com/KevinyhZou)). -* Fix wrong column order for queries with parallel FINAL. [#53489](https://github.com/ClickHouse/ClickHouse/pull/53489) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fixed SELECTing from ReplacingMergeTree with do_not_merge_across_partitions_select_final [#53511](https://github.com/ClickHouse/ClickHouse/pull/53511) ([Vasily Nemkov](https://github.com/Enmk)). -* Flush the async insert queue first on shutdown [#53547](https://github.com/ClickHouse/ClickHouse/pull/53547) ([joelynch](https://github.com/joelynch)). -* Fix crash in join on sparse column [#53548](https://github.com/ClickHouse/ClickHouse/pull/53548) ([vdimir](https://github.com/vdimir)). -* Fix possible UB in Set skipping index for functions with incorrect args [#53559](https://github.com/ClickHouse/ClickHouse/pull/53559) ([Azat Khuzhin](https://github.com/azat)). -* Fix possible UB in inverted indexes (experimental feature) [#53560](https://github.com/ClickHouse/ClickHouse/pull/53560) ([Azat Khuzhin](https://github.com/azat)). -* Fix: the interpolate expression takes the source column instead of the same-name alias from the select expression. [#53572](https://github.com/ClickHouse/ClickHouse/pull/53572) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix the number of dropped granules in EXPLAIN PLAN index=1 [#53616](https://github.com/ClickHouse/ClickHouse/pull/53616) ([wangxiaobo](https://github.com/wzb5212)). -* Correctly handle totals and extremes with `DelayedSource` [#53644](https://github.com/ClickHouse/ClickHouse/pull/53644) ([Antonio Andelic](https://github.com/antonio2368)).
-* Prepared set cache in mutation pipeline stuck [#53645](https://github.com/ClickHouse/ClickHouse/pull/53645) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix bug on mutations with subcolumns of type JSON in predicates of UPDATE and DELETE queries. [#53677](https://github.com/ClickHouse/ClickHouse/pull/53677) ([VanDarkholme7](https://github.com/VanDarkholme7)). -* Fix filter pushdown for full_sorting_merge join [#53699](https://github.com/ClickHouse/ClickHouse/pull/53699) ([vdimir](https://github.com/vdimir)). -* Try to fix bug with `NULL::LowCardinality(Nullable(...)) NOT IN` [#53706](https://github.com/ClickHouse/ClickHouse/pull/53706) ([Andrey Zvonov](https://github.com/zvonand)). -* Fix: sorted distinct with sparse columns [#53711](https://github.com/ClickHouse/ClickHouse/pull/53711) ([Igor Nikonov](https://github.com/devcrafter)). -* `transform`: correctly handle default column with multiple rows [#53742](https://github.com/ClickHouse/ClickHouse/pull/53742) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Fix fuzzer crash in parseDateTime [#53764](https://github.com/ClickHouse/ClickHouse/pull/53764) ([Robert Schulze](https://github.com/rschu1ze)). -* MaterializedPostgreSQL: fix uncaught exception in getCreateTableQueryImpl [#53832](https://github.com/ClickHouse/ClickHouse/pull/53832) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix possible segfault while using the PostgreSQL engine [#53847](https://github.com/ClickHouse/ClickHouse/pull/53847) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix named_collection_admin alias [#54066](https://github.com/ClickHouse/ClickHouse/pull/54066) ([Kseniia Sumarokova](https://github.com/kssenii)). - -### ClickHouse release 23.7, 2023-07-27 - -#### Backward Incompatible Change -* Add `NAMED COLLECTION` access type (aliases `USE NAMED COLLECTION`, `NAMED COLLECTION USAGE`). This PR is backward incompatible because this access type is disabled by default (because a parent access type `NAMED COLLECTION ADMIN` is disabled by default as well). Proposed in [#50277](https://github.com/ClickHouse/ClickHouse/issues/50277). To grant, use `GRANT NAMED COLLECTION ON collection_name TO user` or `GRANT NAMED COLLECTION ON * TO user`; to be able to give these grants, `named_collection_admin` is required in the config (previously it was named `named_collection_control`, so it will remain as an alias). [#50625](https://github.com/ClickHouse/ClickHouse/pull/50625) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fixed a typo in the `system.parts` column name `last_removal_attemp_time`. Now it is named `last_removal_attempt_time`. [#52104](https://github.com/ClickHouse/ClickHouse/pull/52104) ([filimonov](https://github.com/filimonov)). -* Bump the version of the distributed_ddl_entry_format_version to 5 by default (enables opentelemetry and initial_query_id pass-through). This makes it impossible to process existing distributed DDL entries after a *downgrade* (but note that usually there should be no such unprocessed entries). [#52128](https://github.com/ClickHouse/ClickHouse/pull/52128) ([Azat Khuzhin](https://github.com/azat)). -* Check projection metadata the same way we check ordinary metadata. This change may prevent the server from starting in case there was a table with an invalid projection. An example is a projection that created positional columns in the PK (e.g. `projection p (select * order by 1, 4)`, which is not allowed in a table PK and can cause a crash during insert/merge). Drop such projections before the update.
Fixes [#52353](https://github.com/ClickHouse/ClickHouse/issues/52353). [#52361](https://github.com/ClickHouse/ClickHouse/pull/52361) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* The experimental feature `hashid` is removed due to a bug. The quality of the implementation was questionable at the start, and it didn't get through the experimental status. This closes [#52406](https://github.com/ClickHouse/ClickHouse/issues/52406). [#52449](https://github.com/ClickHouse/ClickHouse/pull/52449) ([Alexey Milovidov](https://github.com/alexey-milovidov)). - -#### New Feature -* Added `Overlay` database engine to combine multiple databases into one. Added `Filesystem` database engine to represent a directory in the filesystem as a set of implicitly available tables with auto-detected formats and structures. A new `S3` database engine allows read-only interaction with S3 storage by representing a prefix as a set of tables. A new `HDFS` database engine allows interacting with HDFS storage in the same way. [#48821](https://github.com/ClickHouse/ClickHouse/pull/48821) ([alekseygolub](https://github.com/alekseygolub)). -* Add support for external disks in Keeper for storing snapshots and logs. [#50098](https://github.com/ClickHouse/ClickHouse/pull/50098) ([Antonio Andelic](https://github.com/antonio2368)). -* Add support for multi-directory selection (`{}`) globs. [#50559](https://github.com/ClickHouse/ClickHouse/pull/50559) ([Andrey Zvonov](https://github.com/zvonand)). -* The Kafka connector can fetch the Avro schema from the schema registry with basic authentication using URL-encoded credentials. [#49664](https://github.com/ClickHouse/ClickHouse/pull/49664) ([Ilya Golshtein](https://github.com/ilejn)). -* Add function `arrayJaccardIndex` which computes the Jaccard similarity between two arrays. [#50076](https://github.com/ClickHouse/ClickHouse/pull/50076) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). -* Add a column `is_obsolete` to `system.settings` and similar tables. Closes [#50819](https://github.com/ClickHouse/ClickHouse/issues/50819). [#50826](https://github.com/ClickHouse/ClickHouse/pull/50826) ([flynn](https://github.com/ucasfl)). -* Implement support for encrypted elements in the configuration file. Added the possibility to use encrypted text in leaf elements of the configuration file. The text is encrypted using encryption codecs from the `encryption_codecs` section. [#50986](https://github.com/ClickHouse/ClickHouse/pull/50986) ([Roman Vasin](https://github.com/rvasin)). -* The Grace Hash Join algorithm is now applicable to FULL and RIGHT JOINs. [#49483](https://github.com/ClickHouse/ClickHouse/issues/49483). [#51013](https://github.com/ClickHouse/ClickHouse/pull/51013) ([lgbo](https://github.com/lgbo-ustc)). -* Add `SYSTEM STOP LISTEN` query for more graceful termination. Closes [#47972](https://github.com/ClickHouse/ClickHouse/issues/47972). [#51016](https://github.com/ClickHouse/ClickHouse/pull/51016) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add the `input_format_csv_allow_variable_number_of_columns` option. [#51273](https://github.com/ClickHouse/ClickHouse/pull/51273) ([Dmitry Kardymon](https://github.com/kardymonds)). -* Another boring feature: add function `substring_index`, as in Spark or MySQL. [#51472](https://github.com/ClickHouse/ClickHouse/pull/51472) ([李扬](https://github.com/taiyang-li)). -* A system table `jemalloc_bins` to show stats for jemalloc bins.
Example `SELECT *, size * (nmalloc - ndalloc) AS allocated_bytes FROM system.jemalloc_bins WHERE allocated_bytes > 0 ORDER BY allocated_bytes DESC LIMIT 10`. Enjoy. [#51674](https://github.com/ClickHouse/ClickHouse/pull/51674) ([Alexander Gololobov](https://github.com/davenger)). -* Add `RowBinaryWithDefaults` format with an extra byte before each column as a flag for using the column's default value. Closes [#50854](https://github.com/ClickHouse/ClickHouse/issues/50854). [#51695](https://github.com/ClickHouse/ClickHouse/pull/51695) ([Kruglov Pavel](https://github.com/Avogar)). -* Added `default_temporary_table_engine` setting. Same as `default_table_engine` but for temporary tables. [#51292](https://github.com/ClickHouse/ClickHouse/issues/51292). [#51708](https://github.com/ClickHouse/ClickHouse/pull/51708) ([velavokr](https://github.com/velavokr)). -* Added new `initcap` / `initcapUTF8` functions which convert the first letter of each word to upper case and the rest to lower case. [#51735](https://github.com/ClickHouse/ClickHouse/pull/51735) ([Dmitry Kardymon](https://github.com/kardymonds)). -* CREATE TABLE now supports `PRIMARY KEY` syntax in the column definition. Columns are added to the primary index in the same order the columns are defined. [#51881](https://github.com/ClickHouse/ClickHouse/pull/51881) ([Ilya Yatsishin](https://github.com/qoega)). -* Added the possibility to use date and time format specifiers in log and error log file names, either in config files (`log` and `errorlog` tags) or command line arguments (`--log-file` and `--errorlog-file`). [#51945](https://github.com/ClickHouse/ClickHouse/pull/51945) ([Victor Krasnov](https://github.com/sirvickr)). -* Added Peak Memory Usage statistic to HTTP headers. [#51946](https://github.com/ClickHouse/ClickHouse/pull/51946) ([Dmitry Kardymon](https://github.com/kardymonds)). -* Added new `hasSubsequence` (+`CaseInsensitive` and `UTF8` versions) functions to match subsequences in strings. [#52050](https://github.com/ClickHouse/ClickHouse/pull/52050) ([Dmitry Kardymon](https://github.com/kardymonds)). -* Add `array_agg` as an alias of `groupArray` for PostgreSQL compatibility. Closes [#52100](https://github.com/ClickHouse/ClickHouse/issues/52100). [#52135](https://github.com/ClickHouse/ClickHouse/pull/52135) ([flynn](https://github.com/ucasfl)). -* Add `any_value` as a compatibility alias for the `any` aggregate function. Closes [#52140](https://github.com/ClickHouse/ClickHouse/issues/52140). [#52147](https://github.com/ClickHouse/ClickHouse/pull/52147) ([flynn](https://github.com/ucasfl)). -* Add aggregate function `array_concat_agg` for compatibility with BigQuery; it's an alias of `groupArrayArray`. Closes [#52139](https://github.com/ClickHouse/ClickHouse/issues/52139). [#52149](https://github.com/ClickHouse/ClickHouse/pull/52149) ([flynn](https://github.com/ucasfl)). -* Add `OCTET_LENGTH` as an alias to `length`. Closes [#52153](https://github.com/ClickHouse/ClickHouse/issues/52153). [#52176](https://github.com/ClickHouse/ClickHouse/pull/52176) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). -* Added `firstLine` function to extract the first line from a multi-line string. This closes [#51172](https://github.com/ClickHouse/ClickHouse/issues/51172). [#52209](https://github.com/ClickHouse/ClickHouse/pull/52209) ([Mikhail Koviazin](https://github.com/mkmkme)). -* Implement KQL-style formatting for the `Interval` data type. This is only needed for compatibility with the `Kusto` query language.
[#45671](https://github.com/ClickHouse/ClickHouse/pull/45671) ([ltrk2](https://github.com/ltrk2)). -* Added query `SYSTEM FLUSH ASYNC INSERT QUEUE` which flushes all pending asynchronous inserts to the destination tables. Added a server-side setting `async_insert_queue_flush_on_shutdown` (`true` by default) which determines whether to flush the queue of asynchronous inserts on graceful shutdown. Setting `async_insert_threads` is now a server-side setting. [#49160](https://github.com/ClickHouse/ClickHouse/pull/49160) ([Anton Popov](https://github.com/CurtizJ)). -* Added the alias `current_database` and a new function `current_schemas` for compatibility with PostgreSQL. [#51076](https://github.com/ClickHouse/ClickHouse/pull/51076) ([Pedro Riera](https://github.com/priera)). -* Add aliases for the functions `today` (now available under the `curdate`/`current_date` names) and `now` (`current_timestamp`). [#52106](https://github.com/ClickHouse/ClickHouse/pull/52106) ([Lloyd-Pottiger](https://github.com/Lloyd-Pottiger)). -* Support `async_deduplication_token` for async insert. [#52136](https://github.com/ClickHouse/ClickHouse/pull/52136) ([Han Fei](https://github.com/hanfei1991)). -* Add a new setting `disable_url_encoding` that allows disabling decoding/encoding of the path in the URI in the URL engine. [#52337](https://github.com/ClickHouse/ClickHouse/pull/52337) ([Kruglov Pavel](https://github.com/Avogar)). - -#### Performance Improvement -* Enable automatic selection of the sparse serialization format by default. It improves performance. The format has been supported since version 22.1. After this change, downgrading to versions older than 22.1 might not be possible. A downgrade may require setting `ratio_of_defaults_for_sparse_serialization=0.9375`, see [#55153](https://github.com/ClickHouse/ClickHouse/issues/55153). You can turn off the usage of the sparse serialization format by providing the `ratio_of_defaults_for_sparse_serialization = 1` setting for your MergeTree tables. [#49631](https://github.com/ClickHouse/ClickHouse/pull/49631) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Enable `move_all_conditions_to_prewhere` and `enable_multiple_prewhere_read_steps` settings by default. [#46365](https://github.com/ClickHouse/ClickHouse/pull/46365) ([Alexander Gololobov](https://github.com/davenger)). -* Improve performance of some queries by tuning the allocator. [#46416](https://github.com/ClickHouse/ClickHouse/pull/46416) ([Azat Khuzhin](https://github.com/azat)). -* Now we use fixed-size tasks in `MergeTreePrefetchedReadPool` as in `MergeTreeReadPool`. Also, from now on, we use a connection pool for S3 requests. [#49732](https://github.com/ClickHouse/ClickHouse/pull/49732) ([Nikita Taranov](https://github.com/nickitat)). -* More pushdown to the right side of join. [#50532](https://github.com/ClickHouse/ClickHouse/pull/50532) ([Nikita Taranov](https://github.com/nickitat)). -* Improve grace_hash join by reserving the hash table's size (resubmit). [#50875](https://github.com/ClickHouse/ClickHouse/pull/50875) ([lgbo](https://github.com/lgbo-ustc)). -* Waiting on a lock in `OpenedFileCache` could be noticeable sometimes. We sharded it into multiple sub-maps (each with its own lock) to avoid contention. [#51341](https://github.com/ClickHouse/ClickHouse/pull/51341) ([Nikita Taranov](https://github.com/nickitat)). -* Move conditions with primary key columns to the end of the PREWHERE chain. The idea is that conditions with PK columns are likely to be used in PK analysis and will not contribute much more to PREWHERE filtering.
[#51958](https://github.com/ClickHouse/ClickHouse/pull/51958) ([Alexander Gololobov](https://github.com/davenger)). -* Speed up `COUNT(DISTINCT)` for String types by inlining SipHash. The performance experiments of *OnTime* on the ICX device (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads) show that this change could bring an improvement of *11.6%* to the QPS of the query *Q8* while having no impact on others. [#52036](https://github.com/ClickHouse/ClickHouse/pull/52036) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Enable `allow_vertical_merges_from_compact_to_wide_parts` by default. It will save memory usage during merges. [#52295](https://github.com/ClickHouse/ClickHouse/pull/52295) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix incorrect projection analysis which invalidates primary keys. This issue only exists when `query_plan_optimize_primary_key = 1, query_plan_optimize_projection = 1`. This fixes [#48823](https://github.com/ClickHouse/ClickHouse/issues/48823). This fixes [#51173](https://github.com/ClickHouse/ClickHouse/issues/51173). [#52308](https://github.com/ClickHouse/ClickHouse/pull/52308) ([Amos Bird](https://github.com/amosbird)). -* Reduce the number of syscalls in `FileCache::loadMetadata` - this speeds up server startup if the filesystem cache is configured. [#52435](https://github.com/ClickHouse/ClickHouse/pull/52435) ([Raúl Marín](https://github.com/Algunenano)). -* Allow a strict lower boundary for the file segment size by downloading remaining data in the background. The minimum size of a file segment (if the actual file size is bigger) is configured via the cache configuration setting `boundary_alignment`, `4Mi` by default. The number of background threads is configured via the cache configuration setting `background_download_threads`, `2` by default. Also, `max_file_segment_size` was increased from `8Mi` to `32Mi` in this PR. [#51000](https://github.com/ClickHouse/ClickHouse/pull/51000) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Decreased default timeouts for S3 from 30 seconds to 3 seconds, and for other HTTP from 180 seconds to 30 seconds. [#51171](https://github.com/ClickHouse/ClickHouse/pull/51171) ([Michael Kolupaev](https://github.com/al13n321)). -* New setting `merge_tree_determine_task_size_by_prewhere_columns` added. If set to `true`, only the sizes of the columns from the `PREWHERE` section will be considered to determine the reading task size. Otherwise, all the columns from the query are considered. [#52606](https://github.com/ClickHouse/ClickHouse/pull/52606) ([Nikita Taranov](https://github.com/nickitat)). - -#### Improvement -* Use read_bytes/total_bytes_to_read for the progress bar in s3/file/url/... table functions for better progress indication. [#51286](https://github.com/ClickHouse/ClickHouse/pull/51286) ([Kruglov Pavel](https://github.com/Avogar)). -* Introduce a table setting `wait_for_unique_parts_send_before_shutdown_ms` which specifies the amount of time a replica will wait before closing the interserver handler for replicated sends. Also fix an inconsistency with the shutdown of tables and interserver handlers: now the server shuts down tables first and only then shuts down interserver handlers. [#51851](https://github.com/ClickHouse/ClickHouse/pull/51851) ([alesapin](https://github.com/alesapin)). -* Allow SQL standard `FETCH` without `OFFSET`. See https://antonz.org/sql-fetch/. [#51293](https://github.com/ClickHouse/ClickHouse/pull/51293) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
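The last entry above, SQL-standard `FETCH` without `OFFSET`, can be illustrated with a short sketch (table `t` and column `x` are placeholders):

```sql
-- Standard-SQL alternative to LIMIT 10; previously the FETCH clause required an OFFSET.
SELECT x
FROM t
ORDER BY x
FETCH FIRST 10 ROWS ONLY;
```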
-* Allow filtering HTTP headers for the URL/S3 table functions with the new `http_forbid_headers` section in the config. Both exact matching and regexp filters are available. [#51038](https://github.com/ClickHouse/ClickHouse/pull/51038) ([Nikolay Degterinsky](https://github.com/evillique)). -* Don't show messages about `16 EiB` free space in logs, as they don't make sense. This closes [#49320](https://github.com/ClickHouse/ClickHouse/issues/49320). [#49342](https://github.com/ClickHouse/ClickHouse/pull/49342) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Properly check the limit for the `sleepEachRow` function. Add a setting `function_sleep_max_microseconds_per_block`. This is needed for the generic query fuzzer. [#49343](https://github.com/ClickHouse/ClickHouse/pull/49343) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix two issues in `geoHash` functions. [#50066](https://github.com/ClickHouse/ClickHouse/pull/50066) ([李扬](https://github.com/taiyang-li)). -* Log async insert flush queries into `system.query_log`. [#51160](https://github.com/ClickHouse/ClickHouse/pull/51160) ([Raúl Marín](https://github.com/Algunenano)). -* Functions `date_diff` and `age` now support millisecond/microsecond units and work with microsecond precision. [#51291](https://github.com/ClickHouse/ClickHouse/pull/51291) ([Dmitry Kardymon](https://github.com/kardymonds)). -* Improve parsing of the path in clickhouse-keeper-client. [#51359](https://github.com/ClickHouse/ClickHouse/pull/51359) ([Azat Khuzhin](https://github.com/azat)). -* A third-party product depending on ClickHouse (Gluten: a Plugin to Double SparkSQL's Performance) had a bug. This fix avoids heap overflow in that third-party product while reading from HDFS. [#51386](https://github.com/ClickHouse/ClickHouse/pull/51386) ([李扬](https://github.com/taiyang-li)). -* Add the ability to disable native copy for S3 (setting for BACKUP/RESTORE `allow_s3_native_copy`, and `s3_allow_native_copy` for `s3`/`s3_plain` disks). [#51448](https://github.com/ClickHouse/ClickHouse/pull/51448) ([Azat Khuzhin](https://github.com/azat)). -* Add column `primary_key_size` to the `system.parts` table to show the compressed primary key size on disk. Closes [#51400](https://github.com/ClickHouse/ClickHouse/issues/51400). [#51496](https://github.com/ClickHouse/ClickHouse/pull/51496) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). -* Allow running `clickhouse-local` without procfs, without a home directory existing, and without name resolution plugins from glibc. [#51518](https://github.com/ClickHouse/ClickHouse/pull/51518) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add placeholder `%a` for the full filename in the `rename_files_after_processing` setting. [#51603](https://github.com/ClickHouse/ClickHouse/pull/51603) ([Kruglov Pavel](https://github.com/Avogar)). -* Add column `modification_time` into `system.parts_columns`. [#51685](https://github.com/ClickHouse/ClickHouse/pull/51685) ([Azat Khuzhin](https://github.com/azat)). -* Add new setting `input_format_csv_use_default_on_bad_values` to the CSV format that allows inserting a default value when parsing of a single field fails. [#51716](https://github.com/ClickHouse/ClickHouse/pull/51716) ([KevinyhZou](https://github.com/KevinyhZou)). -* Added a crash log flush to disk after an unexpected crash. [#51720](https://github.com/ClickHouse/ClickHouse/pull/51720) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* Fix behavior in the dashboard page where errors unrelated to authentication are not shown.
Also fix 'overlapping' chart behavior. [#51744](https://github.com/ClickHouse/ClickHouse/pull/51744) ([Zach Naimon](https://github.com/ArctypeZach)). -* Allow UUID to UInt128 conversion. [#51765](https://github.com/ClickHouse/ClickHouse/pull/51765) ([Dmitry Kardymon](https://github.com/kardymonds)). -* Added support for the function `range` with Nullable arguments. [#51767](https://github.com/ClickHouse/ClickHouse/pull/51767) ([Dmitry Kardymon](https://github.com/kardymonds)). -* Convert conditions like `toYear(x) = c` to `c1 <= x < c2`. [#51795](https://github.com/ClickHouse/ClickHouse/pull/51795) ([Han Fei](https://github.com/hanfei1991)). -* Improve MySQL compatibility of the statement `SHOW INDEX`. [#51796](https://github.com/ClickHouse/ClickHouse/pull/51796) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix `use_structure_from_insertion_table_in_table_functions` not working with `MATERIALIZED` and `ALIAS` columns. Closes [#51817](https://github.com/ClickHouse/ClickHouse/issues/51817). Closes [#51019](https://github.com/ClickHouse/ClickHouse/issues/51019). [#51825](https://github.com/ClickHouse/ClickHouse/pull/51825) ([flynn](https://github.com/ucasfl)). -* Cache dictionary now requests only unique keys from the source. Closes [#51762](https://github.com/ClickHouse/ClickHouse/issues/51762). [#51853](https://github.com/ClickHouse/ClickHouse/pull/51853) ([Maksim Kita](https://github.com/kitaisreal)). -* Fixed the case when settings were not applied for an EXPLAIN query when FORMAT was provided. [#51859](https://github.com/ClickHouse/ClickHouse/pull/51859) ([Nikita Taranov](https://github.com/nickitat)). -* Allow SETTINGS before FORMAT in a DESCRIBE TABLE query for compatibility with the SELECT query. Closes [#51544](https://github.com/ClickHouse/ClickHouse/issues/51544). [#51899](https://github.com/ClickHouse/ClickHouse/pull/51899) ([Nikolay Degterinsky](https://github.com/evillique)). -* Var-Int encoded integers (e.g. used by the native protocol) can now use the full 64-bit range. 3rd party clients are advised to update their var-int code accordingly. [#51905](https://github.com/ClickHouse/ClickHouse/pull/51905) ([Robert Schulze](https://github.com/rschu1ze)). -* Update certificates when they change without the need to manually run SYSTEM RELOAD CONFIG. [#52030](https://github.com/ClickHouse/ClickHouse/pull/52030) ([Mike Kot](https://github.com/myrrc)). -* Added the `allow_create_index_without_type` setting that allows ignoring `ADD INDEX` queries without a specified `TYPE`. Standard SQL queries will just succeed without changing the table schema. [#52056](https://github.com/ClickHouse/ClickHouse/pull/52056) ([Ilya Yatsishin](https://github.com/qoega)). -* Log messages are written to `system.text_log` from server startup. [#52113](https://github.com/ClickHouse/ClickHouse/pull/52113) ([Dmitry Kardymon](https://github.com/kardymonds)). -* In cases where the HTTP endpoint has multiple IP addresses and the first of them is unreachable, a timeout exception was thrown. Session creation now handles all resolved endpoints. [#52116](https://github.com/ClickHouse/ClickHouse/pull/52116) ([Aleksei Filatov](https://github.com/aalexfvk)). -* The Avro input format now supports Union even if it contains only a single type. Closes [#52131](https://github.com/ClickHouse/ClickHouse/issues/52131). [#52137](https://github.com/ClickHouse/ClickHouse/pull/52137) ([flynn](https://github.com/ucasfl)). -* Add setting `optimize_use_implicit_projections` to disable implicit projections (currently only the `min_max_count` projection).
[#52152](https://github.com/ClickHouse/ClickHouse/pull/52152) ([Amos Bird](https://github.com/amosbird)). -* It was possible to use the function `hasToken` to cause an infinite loop. Now this possibility is removed. This closes [#52156](https://github.com/ClickHouse/ClickHouse/issues/52156). [#52160](https://github.com/ClickHouse/ClickHouse/pull/52160) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Create ZK ancestors optimistically. [#52195](https://github.com/ClickHouse/ClickHouse/pull/52195) ([Raúl Marín](https://github.com/Algunenano)). -* Fix [#50582](https://github.com/ClickHouse/ClickHouse/issues/50582). Avoid the `Not found column ... in block` error in some cases of reading in-order and constants. [#52259](https://github.com/ClickHouse/ClickHouse/pull/52259) ([Chen768959](https://github.com/Chen768959)). -* Check whether S2 geo primitives are invalid as early as possible on the ClickHouse side. This closes: [#27090](https://github.com/ClickHouse/ClickHouse/issues/27090). [#52260](https://github.com/ClickHouse/ClickHouse/pull/52260) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Add back missing projection QueryAccessInfo when `query_plan_optimize_projection = 1`. This fixes [#50183](https://github.com/ClickHouse/ClickHouse/issues/50183). This fixes [#50093](https://github.com/ClickHouse/ClickHouse/issues/50093). [#52327](https://github.com/ClickHouse/ClickHouse/pull/52327) ([Amos Bird](https://github.com/amosbird)). -* When `ZooKeeperRetriesControl` rethrows an error, it's more useful to see its original stack trace, not the one from `ZooKeeperRetriesControl` itself. [#52347](https://github.com/ClickHouse/ClickHouse/pull/52347) ([Vitaly Baranov](https://github.com/vitlibar)). -* Wait for the zero copy replication lock even if some disks don't support it. [#52376](https://github.com/ClickHouse/ClickHouse/pull/52376) ([Raúl Marín](https://github.com/Algunenano)). -* Now the interserver port will be closed only after tables are shut down. [#52498](https://github.com/ClickHouse/ClickHouse/pull/52498) ([alesapin](https://github.com/alesapin)). - -#### Experimental Feature -* Writing Parquet files is 10x faster; it is multi-threaded now. Almost the same speed as reading. [#49367](https://github.com/ClickHouse/ClickHouse/pull/49367) ([Michael Kolupaev](https://github.com/al13n321)). This is controlled by the setting `output_format_parquet_use_custom_encoder`, which is disabled by default because the feature is non-ideal. -* Added support for [PRQL](https://prql-lang.org/) as a query language. [#50686](https://github.com/ClickHouse/ClickHouse/pull/50686) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). -* Allow adding a disk name for custom disks. Previously, custom disks would use an internally generated disk name. Now it is possible with `disk = disk_<name>(...)` (e.g. the disk will have the name `name`). [#51552](https://github.com/ClickHouse/ClickHouse/pull/51552) ([Kseniia Sumarokova](https://github.com/kssenii)). This syntax can be changed in this release. -* (experimental MaterializedMySQL) Fixed crash when `mysqlxx::Pool::Entry` is used after it was disconnected. [#52063](https://github.com/ClickHouse/ClickHouse/pull/52063) ([Val Doroshchuk](https://github.com/valbok)). -* (experimental MaterializedMySQL) `CREATE TABLE ... AS SELECT` is now supported in MaterializedMySQL. [#52067](https://github.com/ClickHouse/ClickHouse/pull/52067) ([Val Doroshchuk](https://github.com/valbok)).
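A rough illustration of the experimental multi-threaded Parquet writer mentioned in the Experimental Feature entries above, controlled by `output_format_parquet_use_custom_encoder` (disabled by default); the output path and query are placeholders:

```sql
-- Enable the experimental Parquet encoder for this session and write a file with it
-- (INTO OUTFILE is executed on the client, e.g. clickhouse-client or clickhouse-local).
SET output_format_parquet_use_custom_encoder = 1;
SELECT number, number * 2 AS doubled
FROM system.numbers
LIMIT 1000000
INTO OUTFILE '/tmp/sample.parquet'
FORMAT Parquet;
```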
-* (experimental MaterializedMySQL) Introduced automatic conversion of text types to utf8 for MaterializedMySQL. [#52084](https://github.com/ClickHouse/ClickHouse/pull/52084) ([Val Doroshchuk](https://github.com/valbok)). -* (experimental MaterializedMySQL) Now unquoted UTF-8 strings are supported in DDL for MaterializedMySQL. [#52318](https://github.com/ClickHouse/ClickHouse/pull/52318) ([Val Doroshchuk](https://github.com/valbok)). -* (experimental MaterializedMySQL) Now double-quoted comments are supported in MaterializedMySQL. [#52355](https://github.com/ClickHouse/ClickHouse/pull/52355) ([Val Doroshchuk](https://github.com/valbok)). -* Upgrade Intel QPL from v1.1.0 to v1.2.0, upgrade Intel accel-config from v3.5 to v4.0, and fix an issue where Device IOTLB misses had a big performance impact for IAA accelerators. [#52180](https://github.com/ClickHouse/ClickHouse/pull/52180) ([jasperzhu](https://github.com/jinjunzh)). -* The `session_timezone` setting (new in version 23.6) is demoted to experimental. [#52445](https://github.com/ClickHouse/ClickHouse/pull/52445) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Support the ZooKeeper `reconfig` command for ClickHouse Keeper with incremental reconfiguration, which can be enabled via the `keeper_server.enable_reconfiguration` setting. Support adding servers, removing servers, and changing server priorities. [#49450](https://github.com/ClickHouse/ClickHouse/pull/49450) ([Mike Kot](https://github.com/myrrc)). It is suspected that this feature is incomplete. - -#### Build/Testing/Packaging Improvement -* Add experimental ClickHouse builds for Linux RISC-V 64 to CI. [#31398](https://github.com/ClickHouse/ClickHouse/pull/31398) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add an integration test check with the Analyzer enabled. [#50926](https://github.com/ClickHouse/ClickHouse/pull/50926) [#52210](https://github.com/ClickHouse/ClickHouse/pull/52210) ([Dmitry Novik](https://github.com/novikd)). -* Reproducible builds for Rust. [#52395](https://github.com/ClickHouse/ClickHouse/pull/52395) ([Azat Khuzhin](https://github.com/azat)). -* Update Cargo dependencies. [#51721](https://github.com/ClickHouse/ClickHouse/pull/51721) ([Raúl Marín](https://github.com/Algunenano)). -* Make the function `CHColumnToArrowColumn::fillArrowArrayWithArrayColumnData` work with nullable arrays, which are not possible in ClickHouse but are needed for Gluten. [#52112](https://github.com/ClickHouse/ClickHouse/pull/52112) ([李扬](https://github.com/taiyang-li)). -* We've updated the CCTZ library to master, but there are no user-visible changes. [#52124](https://github.com/ClickHouse/ClickHouse/pull/52124) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The `system.licenses` table now includes the hard-forked library Poco. This closes [#52066](https://github.com/ClickHouse/ClickHouse/issues/52066). [#52127](https://github.com/ClickHouse/ClickHouse/pull/52127) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Check that there are no cases of bad punctuation: whitespace before a comma like `Hello ,world` instead of `Hello, world`. [#52549](https://github.com/ClickHouse/ClickHouse/pull/52549) ([Alexey Milovidov](https://github.com/alexey-milovidov)). - -#### Bug Fix (user-visible misbehavior in an official stable release) -* Fix MaterializedPostgreSQL syncTables [#49698](https://github.com/ClickHouse/ClickHouse/pull/49698) ([Kseniia Sumarokova](https://github.com/kssenii)).
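A small sketch of the `session_timezone` setting referenced in the entries above (demoted to experimental in this release; the timezone value is just an example):

```sql
-- With session_timezone set, time functions such as now() use the session timezone
-- instead of the server default; timezone() reports the effective value.
SET session_timezone = 'Europe/Berlin';
SELECT timezone(), now();
```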
-* Fix projection with optimize_aggregators_of_group_by_keys [#49709](https://github.com/ClickHouse/ClickHouse/pull/49709) ([Amos Bird](https://github.com/amosbird)). -* Fix optimize_skip_unused_shards with JOINs [#51037](https://github.com/ClickHouse/ClickHouse/pull/51037) ([Azat Khuzhin](https://github.com/azat)). -* Fix formatDateTime() with fractional negative datetime64 [#51290](https://github.com/ClickHouse/ClickHouse/pull/51290) ([Dmitry Kardymon](https://github.com/kardymonds)). -* Functions `hasToken*` were totally wrong. Add a test for [#43358](https://github.com/ClickHouse/ClickHouse/issues/43358) [#51378](https://github.com/ClickHouse/ClickHouse/pull/51378) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix optimization to move functions before sorting. [#51481](https://github.com/ClickHouse/ClickHouse/pull/51481) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix Block structure mismatch in Pipe::unitePipes for FINAL [#51492](https://github.com/ClickHouse/ClickHouse/pull/51492) ([Nikita Taranov](https://github.com/nickitat)). -* Fix SIGSEGV for clusters with zero weight across all shards (fixes INSERT INTO FUNCTION clusterAllReplicas()) [#51545](https://github.com/ClickHouse/ClickHouse/pull/51545) ([Azat Khuzhin](https://github.com/azat)). -* Fix timeout for hedged requests [#51582](https://github.com/ClickHouse/ClickHouse/pull/51582) ([Azat Khuzhin](https://github.com/azat)). -* Fix logical error in ANTI join with NULL [#51601](https://github.com/ClickHouse/ClickHouse/pull/51601) ([vdimir](https://github.com/vdimir)). -* Fix for moving 'IN' conditions to PREWHERE [#51610](https://github.com/ClickHouse/ClickHouse/pull/51610) ([Alexander Gololobov](https://github.com/davenger)). -* Do not apply PredicateExpressionsOptimizer for ASOF/ANTI join [#51633](https://github.com/ClickHouse/ClickHouse/pull/51633) ([vdimir](https://github.com/vdimir)). -* Fix async insert with deduplication for ReplicatedMergeTree using merging algorithms [#51676](https://github.com/ClickHouse/ClickHouse/pull/51676) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix reading from empty column in `parseSipHashKey` [#51804](https://github.com/ClickHouse/ClickHouse/pull/51804) ([Nikita Taranov](https://github.com/nickitat)). -* Fix segfault when create invalid EmbeddedRocksdb table [#51847](https://github.com/ClickHouse/ClickHouse/pull/51847) ([Duc Canh Le](https://github.com/canhld94)). -* Fix inserts into MongoDB tables [#51876](https://github.com/ClickHouse/ClickHouse/pull/51876) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix deadlock on DatabaseCatalog shutdown [#51908](https://github.com/ClickHouse/ClickHouse/pull/51908) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fix error in subquery operators [#51922](https://github.com/ClickHouse/ClickHouse/pull/51922) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix async connect to hosts with multiple ips [#51934](https://github.com/ClickHouse/ClickHouse/pull/51934) ([Kruglov Pavel](https://github.com/Avogar)). -* Do not remove inputs after ActionsDAG::merge [#51947](https://github.com/ClickHouse/ClickHouse/pull/51947) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Check refcount in `RemoveManyObjectStorageOperation::finalize` instead of `execute` [#51954](https://github.com/ClickHouse/ClickHouse/pull/51954) ([vdimir](https://github.com/vdimir)). 
-* Allow parametric UDFs [#51964](https://github.com/ClickHouse/ClickHouse/pull/51964) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Small fix for toDateTime64() for dates after 2283-12-31 [#52130](https://github.com/ClickHouse/ClickHouse/pull/52130) ([Andrey Zvonov](https://github.com/zvonand)). -* Fix ORDER BY tuple of WINDOW functions [#52145](https://github.com/ClickHouse/ClickHouse/pull/52145) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix incorrect projection analysis when aggregation expression contains monotonic functions [#52151](https://github.com/ClickHouse/ClickHouse/pull/52151) ([Amos Bird](https://github.com/amosbird)). -* Fix error in `groupArrayMoving` functions [#52161](https://github.com/ClickHouse/ClickHouse/pull/52161) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Disable direct join for range dictionary [#52187](https://github.com/ClickHouse/ClickHouse/pull/52187) ([Duc Canh Le](https://github.com/canhld94)). -* Fix sticky mutations test (and extremely rare race condition) [#52197](https://github.com/ClickHouse/ClickHouse/pull/52197) ([alesapin](https://github.com/alesapin)). -* Fix race in Web disk [#52211](https://github.com/ClickHouse/ClickHouse/pull/52211) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix data race in Connection::setAsyncCallback on unknown packet from server [#52219](https://github.com/ClickHouse/ClickHouse/pull/52219) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix temp data deletion on startup, add test [#52275](https://github.com/ClickHouse/ClickHouse/pull/52275) ([vdimir](https://github.com/vdimir)). -* Don't use minmax_count projections when counting nullable columns [#52297](https://github.com/ClickHouse/ClickHouse/pull/52297) ([Amos Bird](https://github.com/amosbird)). -* MergeTree/ReplicatedMergeTree should use server timezone for log entries [#52325](https://github.com/ClickHouse/ClickHouse/pull/52325) ([Azat Khuzhin](https://github.com/azat)). -* Fix parameterized view with cte and multiple usage [#52328](https://github.com/ClickHouse/ClickHouse/pull/52328) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Disable expression templates for time intervals [#52335](https://github.com/ClickHouse/ClickHouse/pull/52335) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fix `apply_snapshot` in Keeper [#52358](https://github.com/ClickHouse/ClickHouse/pull/52358) ([Antonio Andelic](https://github.com/antonio2368)). -* Update build-osx.md [#52377](https://github.com/ClickHouse/ClickHouse/pull/52377) ([AlexBykovski](https://github.com/AlexBykovski)). -* Fix `countSubstrings` hang with empty needle and a column haystack [#52409](https://github.com/ClickHouse/ClickHouse/pull/52409) ([Sergei Trifonov](https://github.com/serxa)). -* Fix normal projection with merge table [#52432](https://github.com/ClickHouse/ClickHouse/pull/52432) ([Amos Bird](https://github.com/amosbird)). -* Fix possible double-free in Aggregator [#52439](https://github.com/ClickHouse/ClickHouse/pull/52439) ([Nikita Taranov](https://github.com/nickitat)). -* Fixed inserting into Buffer engine [#52440](https://github.com/ClickHouse/ClickHouse/pull/52440) ([Vasily Nemkov](https://github.com/Enmk)). -* The implementation of AnyHash was non-conformant. [#52448](https://github.com/ClickHouse/ClickHouse/pull/52448) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
-* Check recursion depth in OptimizedRegularExpression [#52451](https://github.com/ClickHouse/ClickHouse/pull/52451) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix data-race DatabaseReplicated::startupTables()/canExecuteReplicatedMetadataAlter() [#52490](https://github.com/ClickHouse/ClickHouse/pull/52490) ([Azat Khuzhin](https://github.com/azat)). -* Fix abort in function `transform` [#52513](https://github.com/ClickHouse/ClickHouse/pull/52513) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix lightweight delete after drop of projection [#52517](https://github.com/ClickHouse/ClickHouse/pull/52517) ([Anton Popov](https://github.com/CurtizJ)). -* Fix possible error "Cannot drain connections: cancel first" [#52585](https://github.com/ClickHouse/ClickHouse/pull/52585) ([Kruglov Pavel](https://github.com/Avogar)). - - -### ClickHouse release 23.6, 2023-06-29 - -#### Backward Incompatible Change -* Delete feature `do_not_evict_index_and_mark_files` in the fs cache. This feature was only making things worse. [#51253](https://github.com/ClickHouse/ClickHouse/pull/51253) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Remove ALTER support for experimental LIVE VIEW. [#51287](https://github.com/ClickHouse/ClickHouse/pull/51287) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Decrease the default values for `http_max_field_value_size` and `http_max_field_name_size` to 128 KiB. [#51163](https://github.com/ClickHouse/ClickHouse/pull/51163) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* CGroups metrics related to CPU are replaced with one metric, `CGroupMaxCPU` for better usability. The `Normalized` CPU usage metrics will be normalized to CGroups limits instead of the total number of CPUs when they are set. This closes [#50836](https://github.com/ClickHouse/ClickHouse/issues/50836). [#50835](https://github.com/ClickHouse/ClickHouse/pull/50835) ([Alexey Milovidov](https://github.com/alexey-milovidov)). - -#### New Feature -* The function `transform` as well as `CASE` with value matching started to support all data types. This closes [#29730](https://github.com/ClickHouse/ClickHouse/issues/29730). This closes [#32387](https://github.com/ClickHouse/ClickHouse/issues/32387). This closes [#50827](https://github.com/ClickHouse/ClickHouse/issues/50827). This closes [#31336](https://github.com/ClickHouse/ClickHouse/issues/31336). This closes [#40493](https://github.com/ClickHouse/ClickHouse/issues/40493). [#51351](https://github.com/ClickHouse/ClickHouse/pull/51351) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Added option `--rename_files_after_processing `. This closes [#34207](https://github.com/ClickHouse/ClickHouse/issues/34207). [#49626](https://github.com/ClickHouse/ClickHouse/pull/49626) ([alekseygolub](https://github.com/alekseygolub)). -* Add support for `TRUNCATE` modifier in `INTO OUTFILE` clause. Suggest using `APPEND` or `TRUNCATE` for `INTO OUTFILE` when file exists. [#50950](https://github.com/ClickHouse/ClickHouse/pull/50950) ([alekar](https://github.com/alekar)). -* Add table engine `Redis` and table function `redis`. It allows querying external Redis servers. [#50150](https://github.com/ClickHouse/ClickHouse/pull/50150) ([JackyWoo](https://github.com/JackyWoo)). -* Allow to skip empty files in file/s3/url/hdfs table functions using settings `s3_skip_empty_files`, `hdfs_skip_empty_files`, `engine_file_skip_empty_files`, `engine_url_skip_empty_files`. 
[#50364](https://github.com/ClickHouse/ClickHouse/pull/50364) ([Kruglov Pavel](https://github.com/Avogar)). -* Add a new setting named `use_mysql_types_in_show_columns` to alter the `SHOW COLUMNS` SQL statement to display MySQL equivalent types when a client is connected via the MySQL compatibility port. [#49577](https://github.com/ClickHouse/ClickHouse/pull/49577) ([Thomas Panetti](https://github.com/tpanetti)). -* Clickhouse-client can now be called with a connection string instead of "--host", "--port", "--user" etc. [#50689](https://github.com/ClickHouse/ClickHouse/pull/50689) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* Add setting `session_timezone`; it is used as the default timezone for a session when not explicitly specified. [#44149](https://github.com/ClickHouse/ClickHouse/pull/44149) ([Andrey Zvonov](https://github.com/zvonand)). -* Codec DEFLATE_QPL is now controlled via server setting "enable_deflate_qpl_codec" (default: false) instead of setting "allow_experimental_codecs". This marks DEFLATE_QPL non-experimental. [#50775](https://github.com/ClickHouse/ClickHouse/pull/50775) ([Robert Schulze](https://github.com/rschu1ze)). - -#### Performance Improvement -* Improved scheduling of merge selecting and cleanup tasks in `ReplicatedMergeTree`. The tasks will not be executed too frequently when there's nothing to merge or cleanup. Added settings `max_merge_selecting_sleep_ms`, `merge_selecting_sleep_slowdown_factor`, `max_cleanup_delay_period` and `cleanup_thread_preferred_points_per_iteration`. It should close [#31919](https://github.com/ClickHouse/ClickHouse/issues/31919). [#50107](https://github.com/ClickHouse/ClickHouse/pull/50107) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Make filter push down through cross join. [#50605](https://github.com/ClickHouse/ClickHouse/pull/50605) ([Han Fei](https://github.com/hanfei1991)). -* Improve performance with enabled QueryProfiler using thread-local timer_id instead of global object. [#48778](https://github.com/ClickHouse/ClickHouse/pull/48778) ([Jiebin Sun](https://github.com/jiebinn)). -* Rewrite CapnProto input/output format to improve its performance. Map column names and CapnProto fields case insensitive, fix reading/writing of nested structure fields. [#49752](https://github.com/ClickHouse/ClickHouse/pull/49752) ([Kruglov Pavel](https://github.com/Avogar)). -* Optimize parquet write performance for parallel threads. [#50102](https://github.com/ClickHouse/ClickHouse/pull/50102) ([Hongbin Ma](https://github.com/binmahone)). -* Disable `parallelize_output_from_storages` for processing MATERIALIZED VIEWs and storages with one block only. [#50214](https://github.com/ClickHouse/ClickHouse/pull/50214) ([Azat Khuzhin](https://github.com/azat)). -* Merge PR [#46558](https://github.com/ClickHouse/ClickHouse/pull/46558). Avoid block permutation during sort if the block is already sorted. [#50697](https://github.com/ClickHouse/ClickHouse/pull/50697) ([Alexey Milovidov](https://github.com/alexey-milovidov), [Maksim Kita](https://github.com/kitaisreal)). -* Make multiple list requests to ZooKeeper in parallel to speed up reading from system.zookeeper table. [#51042](https://github.com/ClickHouse/ClickHouse/pull/51042) ([Alexander Gololobov](https://github.com/davenger)). -* Speedup initialization of DateTime lookup tables for time zones. This should reduce startup/connect time of clickhouse-client especially in debug build as it is rather heavy. 
[#51347](https://github.com/ClickHouse/ClickHouse/pull/51347) ([Alexander Gololobov](https://github.com/davenger)). -* Fix data lakes slowness because of synchronous head requests. (Related to Iceberg/Deltalake/Hudi being slow with a lot of files). [#50976](https://github.com/ClickHouse/ClickHouse/pull/50976) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). - -#### Experimental Feature -* Support parallel replicas with the analyzer. [#50441](https://github.com/ClickHouse/ClickHouse/pull/50441) ([Raúl Marín](https://github.com/Algunenano)). -* Add random sleep before large merges/mutations execution to split load more evenly between replicas in case of zero-copy replication. [#51282](https://github.com/ClickHouse/ClickHouse/pull/51282) ([alesapin](https://github.com/alesapin)). -* Do not replicate `ALTER PARTITION` queries and mutations through `Replicated` database if it has only one shard and the underlying table is `ReplicatedMergeTree`. [#51049](https://github.com/ClickHouse/ClickHouse/pull/51049) ([Alexander Tokmakov](https://github.com/tavplubix)). - -#### Improvement -* Relax the thresholds for "too many parts" to be more modern. Return the backpressure during long-running insert queries. [#50856](https://github.com/ClickHouse/ClickHouse/pull/50856) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Allow to cast IPv6 to IPv4 address for CIDR ::ffff:0:0/96 (IPv4-mapped addresses). [#49759](https://github.com/ClickHouse/ClickHouse/pull/49759) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Update MongoDB protocol to support MongoDB 5.1 version and newer. Support for the versions with the old protocol (<3.6) is preserved. Closes [#45621](https://github.com/ClickHouse/ClickHouse/issues/45621), [#49879](https://github.com/ClickHouse/ClickHouse/issues/49879). [#50061](https://github.com/ClickHouse/ClickHouse/pull/50061) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add setting `input_format_max_bytes_to_read_for_schema_inference` to limit the number of bytes to read in schema inference. Closes [#50577](https://github.com/ClickHouse/ClickHouse/issues/50577). [#50592](https://github.com/ClickHouse/ClickHouse/pull/50592) ([Kruglov Pavel](https://github.com/Avogar)). -* Respect setting `input_format_null_as_default` in schema inference. [#50602](https://github.com/ClickHouse/ClickHouse/pull/50602) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow to skip trailing empty lines in CSV/TSV/CustomSeparated formats via settings `input_format_csv_skip_trailing_empty_lines`, `input_format_tsv_skip_trailing_empty_lines` and `input_format_custom_skip_trailing_empty_lines` (disabled by default). Closes [#49315](https://github.com/ClickHouse/ClickHouse/issues/49315). [#50635](https://github.com/ClickHouse/ClickHouse/pull/50635) ([Kruglov Pavel](https://github.com/Avogar)). -* Functions "toDateOrDefault|OrNull" and "accuateCast[OrDefault|OrNull]" now correctly parse numeric arguments. [#50709](https://github.com/ClickHouse/ClickHouse/pull/50709) ([Dmitry Kardymon](https://github.com/kardymonds)). -* Support CSV with whitespace or `\t` field delimiters, and these delimiters are supported in Spark. [#50712](https://github.com/ClickHouse/ClickHouse/pull/50712) ([KevinyhZou](https://github.com/KevinyhZou)). 
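The CSV delimiter entry above can be pictured with a small sketch. The file name and column structure below are placeholders, and `format_csv_delimiter` is shown only to illustrate using a tab as the field separator; the exact setting that gates whitespace/tab delimiters may differ in a given version.

```sql
-- Read a tab-delimited file (e.g. one produced by Spark) through the CSV format.
-- 'events.csv' and the column list are hypothetical.
SELECT *
FROM file('events.csv', 'CSV', 'id UInt64, name String')
SETTINGS format_csv_delimiter = '\t';
```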
-* Settings `number_of_mutations_to_delay` and `number_of_mutations_to_throw` are enabled by default now with values 500 and 1000 respectively. [#50726](https://github.com/ClickHouse/ClickHouse/pull/50726) ([Anton Popov](https://github.com/CurtizJ)). -* The dashboard correctly shows missing values. This closes [#50831](https://github.com/ClickHouse/ClickHouse/issues/50831). [#50832](https://github.com/ClickHouse/ClickHouse/pull/50832) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Added the possibility to use date and time arguments in the syslog timestamp format in functions `parseDateTimeBestEffort*` and `parseDateTime64BestEffort*`. [#50925](https://github.com/ClickHouse/ClickHouse/pull/50925) ([Victor Krasnov](https://github.com/sirvickr)). -* Command line parameter "--password" in clickhouse-client can now be specified only once. [#50966](https://github.com/ClickHouse/ClickHouse/pull/50966) ([Alexey Gerasimchuck](https://github.com/Demilivor)). -* Use `hash_of_all_files` from `system.parts` to check identity of parts during on-cluster backups. [#50997](https://github.com/ClickHouse/ClickHouse/pull/50997) ([Vitaly Baranov](https://github.com/vitlibar)). -* The system table zookeeper_connection connected_time identifies the time when the connection is established (standard format), and session_uptime_elapsed_seconds is added, which labels the duration of the established connection session (in seconds). [#51026](https://github.com/ClickHouse/ClickHouse/pull/51026) ([郭小龙](https://github.com/guoxiaolongzte)). -* Improve the progress bar for file/s3/hdfs/url table functions by using chunk size from source data and using incremental total size counting in each thread. Fix the progress bar for *Cluster functions. This closes [#47250](https://github.com/ClickHouse/ClickHouse/issues/47250). [#51088](https://github.com/ClickHouse/ClickHouse/pull/51088) ([Kruglov Pavel](https://github.com/Avogar)). -* Add total_bytes_to_read to the Progress packet in TCP protocol for better Progress bar. [#51158](https://github.com/ClickHouse/ClickHouse/pull/51158) ([Kruglov Pavel](https://github.com/Avogar)). -* Better checking of data parts on disks with filesystem cache. [#51164](https://github.com/ClickHouse/ClickHouse/pull/51164) ([Anton Popov](https://github.com/CurtizJ)). -* Fix sometimes not correct current_elements_num in fs cache. [#51242](https://github.com/ClickHouse/ClickHouse/pull/51242) ([Kseniia Sumarokova](https://github.com/kssenii)). - -#### Build/Testing/Packaging Improvement -* Add embedded keeper-client to standalone keeper binary. [#50964](https://github.com/ClickHouse/ClickHouse/pull/50964) ([pufit](https://github.com/pufit)). -* Actual LZ4 version is used now. [#50621](https://github.com/ClickHouse/ClickHouse/pull/50621) ([Nikita Taranov](https://github.com/nickitat)). -* ClickHouse server will print the list of changed settings on fatal errors. This closes [#51137](https://github.com/ClickHouse/ClickHouse/issues/51137). [#51138](https://github.com/ClickHouse/ClickHouse/pull/51138) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Allow building ClickHouse with clang-17. [#51300](https://github.com/ClickHouse/ClickHouse/pull/51300) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* [SQLancer](https://github.com/sqlancer/sqlancer) check is considered stable as bugs that were triggered by it are fixed. Now failures of SQLancer check will be reported as failed check status. 
[#51340](https://github.com/ClickHouse/ClickHouse/pull/51340) ([Ilya Yatsishin](https://github.com/qoega)). -* Split huge `RUN` in Dockerfile into smaller conditional steps. Install the necessary tools on demand in the same `RUN` layer, and remove them after that. Upgrade the OS only once at the beginning. Use a modern way to check the signed repository. Downgrade the base repo to ubuntu:20.04 to address the issues on older docker versions. Upgrade golang version to address golang vulnerabilities. [#51504](https://github.com/ClickHouse/ClickHouse/pull/51504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Improve aliases for the clickhouse binary (now `ch`/`clickhouse` is `clickhouse-local` or `clickhouse` depending on the arguments) and add bash completion for the new aliases. [#58344](https://github.com/ClickHouse/ClickHouse/pull/58344) ([Azat Khuzhin](https://github.com/azat)). +* Add a settings-changes check to CI to verify that all settings changes are reflected in the settings changes history. [#58555](https://github.com/ClickHouse/ClickHouse/pull/58555) ([Kruglov Pavel](https://github.com/Avogar)). +* Use tables directly attached from S3 in stateful tests. [#58791](https://github.com/ClickHouse/ClickHouse/pull/58791) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Save the whole `fuzzer.log` as an archive instead of the last 100k lines. `tail -n 100000` often removes lines with table definitions. [#58821](https://github.com/ClickHouse/ClickHouse/pull/58821) ([Dmitry Novik](https://github.com/novikd)). +* Enable Rust on macOS with AArch64 (this will add fuzzy search in the client with skim and the PRQL language, though I don't think there are many people who host ClickHouse on Darwin, so it is mostly for fuzzy search in the client). [#59272](https://github.com/ClickHouse/ClickHouse/pull/59272) ([Azat Khuzhin](https://github.com/azat)). +* Fix aggregation issue in mixed x86_64 and ARM clusters [#59132](https://github.com/ClickHouse/ClickHouse/pull/59132) ([Harry Lee](https://github.com/HarryLeeIBM)). #### Bug Fix (user-visible misbehavior in an official stable release) -* Report loading status for executable dictionaries correctly [#48775](https://github.com/ClickHouse/ClickHouse/pull/48775) ([Anton Kozlov](https://github.com/tonickkozlov)). -* Proper mutation of skip indices and projections [#50104](https://github.com/ClickHouse/ClickHouse/pull/50104) ([Amos Bird](https://github.com/amosbird)). -* Cleanup moving parts [#50489](https://github.com/ClickHouse/ClickHouse/pull/50489) ([vdimir](https://github.com/vdimir)). -* Fix backward compatibility for IP types hashing in aggregate functions [#50551](https://github.com/ClickHouse/ClickHouse/pull/50551) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix Log family table return wrong rows count after truncate [#50585](https://github.com/ClickHouse/ClickHouse/pull/50585) ([flynn](https://github.com/ucasfl)). -* Fix bug in `uniqExact` parallel merging [#50590](https://github.com/ClickHouse/ClickHouse/pull/50590) ([Nikita Taranov](https://github.com/nickitat)). -* Revert recent grace hash join changes [#50699](https://github.com/ClickHouse/ClickHouse/pull/50699) ([vdimir](https://github.com/vdimir)). -* Query Cache: Try to fix bad cast from `ColumnConst` to `ColumnVector` [#50704](https://github.com/ClickHouse/ClickHouse/pull/50704) ([Robert Schulze](https://github.com/rschu1ze)).
-* Avoid storing logs in Keeper containing unknown operation [#50751](https://github.com/ClickHouse/ClickHouse/pull/50751) ([Antonio Andelic](https://github.com/antonio2368)). -* SummingMergeTree support for DateTime64 [#50797](https://github.com/ClickHouse/ClickHouse/pull/50797) ([Jordi Villar](https://github.com/jrdi)). -* Add compatibility setting for non-const timezones [#50834](https://github.com/ClickHouse/ClickHouse/pull/50834) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix hashing of LDAP params in the cache entries [#50865](https://github.com/ClickHouse/ClickHouse/pull/50865) ([Julian Maicher](https://github.com/jmaicher)). -* Fallback to parsing big integer from String instead of exception in Parquet format [#50873](https://github.com/ClickHouse/ClickHouse/pull/50873) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix checking the lock file too often while writing a backup [#50889](https://github.com/ClickHouse/ClickHouse/pull/50889) ([Vitaly Baranov](https://github.com/vitlibar)). -* Do not apply projection if read-in-order was enabled. [#50923](https://github.com/ClickHouse/ClickHouse/pull/50923) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix race in the Azure blob storage iterator [#50936](https://github.com/ClickHouse/ClickHouse/pull/50936) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix erroneous `sort_description` propagation in `CreatingSets` [#50955](https://github.com/ClickHouse/ClickHouse/pull/50955) ([Nikita Taranov](https://github.com/nickitat)). -* Fix Iceberg v2 optional metadata parsing [#50974](https://github.com/ClickHouse/ClickHouse/pull/50974) ([Kseniia Sumarokova](https://github.com/kssenii)). -* MaterializedMySQL: Keep parentheses for empty table overrides [#50977](https://github.com/ClickHouse/ClickHouse/pull/50977) ([Val Doroshchuk](https://github.com/valbok)). -* Fix crash in BackupCoordinationStageSync::setError() [#51012](https://github.com/ClickHouse/ClickHouse/pull/51012) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix subtly broken copy-on-write of ColumnLowCardinality dictionary [#51064](https://github.com/ClickHouse/ClickHouse/pull/51064) ([Michael Kolupaev](https://github.com/al13n321)). -* Generate safe IVs [#51086](https://github.com/ClickHouse/ClickHouse/pull/51086) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Fix ineffective query cache for SELECTs with subqueries [#51132](https://github.com/ClickHouse/ClickHouse/pull/51132) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix Set index with constant nullable comparison. [#51205](https://github.com/ClickHouse/ClickHouse/pull/51205) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix a crash in s3 and s3Cluster functions [#51209](https://github.com/ClickHouse/ClickHouse/pull/51209) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix a crash with compiled expressions [#51231](https://github.com/ClickHouse/ClickHouse/pull/51231) ([LiuNeng](https://github.com/liuneng1994)). -* Fix use-after-free in StorageURL when switching URLs [#51260](https://github.com/ClickHouse/ClickHouse/pull/51260) ([Michael Kolupaev](https://github.com/al13n321)). -* Updated check for parameterized view [#51272](https://github.com/ClickHouse/ClickHouse/pull/51272) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix multiple writing of same file to backup [#51299](https://github.com/ClickHouse/ClickHouse/pull/51299) ([Vitaly Baranov](https://github.com/vitlibar)). 
-* Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Remove garbage from function `transform` [#51350](https://github.com/ClickHouse/ClickHouse/pull/51350) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add join keys conversion for nested LowCardinality [#51550](https://github.com/ClickHouse/ClickHouse/pull/51550) ([vdimir](https://github.com/vdimir)). +* Flatten only true Nested type if flatten_nested=1, not all Array(Tuple) [#56132](https://github.com/ClickHouse/ClickHouse/pull/56132) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix a bug with projections and the `aggregate_functions_null_for_empty` setting during insertion. [#56944](https://github.com/ClickHouse/ClickHouse/pull/56944) ([Amos Bird](https://github.com/amosbird)). +* Fixed potential exception due to stale profile UUID [#57263](https://github.com/ClickHouse/ClickHouse/pull/57263) ([Vasily Nemkov](https://github.com/Enmk)). +* Fix working with read buffers in StreamingFormatExecutor [#57438](https://github.com/ClickHouse/ClickHouse/pull/57438) ([Kruglov Pavel](https://github.com/Avogar)). +* Ignore MVs with dropped target table during pushing to views [#57520](https://github.com/ClickHouse/ClickHouse/pull/57520) ([Kruglov Pavel](https://github.com/Avogar)). +* Eliminate possible race between ALTER_METADATA and MERGE_PARTS [#57755](https://github.com/ClickHouse/ClickHouse/pull/57755) ([Azat Khuzhin](https://github.com/azat)). +* Fix the expressions order bug in group by with rollup [#57786](https://github.com/ClickHouse/ClickHouse/pull/57786) ([Chen768959](https://github.com/Chen768959)). +* A fix for the obsolete "zero-copy" replication feature: Fix lost blobs after dropping a replica with broken detached parts [#58333](https://github.com/ClickHouse/ClickHouse/pull/58333) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Allow users to work with symlinks in user_files_path [#58447](https://github.com/ClickHouse/ClickHouse/pull/58447) ([Duc Canh Le](https://github.com/canhld94)). +* Fix a crash when graphite table does not have an agg function [#58453](https://github.com/ClickHouse/ClickHouse/pull/58453) ([Duc Canh Le](https://github.com/canhld94)). +* Delay reading from StorageKafka to allow multiple reads in materialized views [#58477](https://github.com/ClickHouse/ClickHouse/pull/58477) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix a stupid case of intersecting parts [#58482](https://github.com/ClickHouse/ClickHouse/pull/58482) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Disable MergeTreePrefetchedReadPool for LIMIT-only queries [#58505](https://github.com/ClickHouse/ClickHouse/pull/58505) ([Maksim Kita](https://github.com/kitaisreal)). +* Enable ordinary databases during restoration [#58520](https://github.com/ClickHouse/ClickHouse/pull/58520) ([Jihyuk Bok](https://github.com/tomahawk28)). +* Fix Apache Hive threadpool reading for ORC/Parquet/... [#58537](https://github.com/ClickHouse/ClickHouse/pull/58537) ([sunny](https://github.com/sunny19930321)). +* Hide credentials in `system.backup_log`'s `base_backup_name` column [#58550](https://github.com/ClickHouse/ClickHouse/pull/58550) ([Daniel Pozo Escalona](https://github.com/danipozo)). +* Fix `toStartOfInterval` rounding for milli- and microsecond values [#58557](https://github.com/ClickHouse/ClickHouse/pull/58557) ([Yarik Briukhovetskyi](https://github.com/yariks5s)).
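To illustrate the `toStartOfInterval` fix above, here is a minimal sketch of sub-second bucketing, assuming a version where millisecond intervals are supported; the timestamp and interval size are arbitrary.

```sql
-- Round a DateTime64(3) value down to a 25-millisecond boundary.
SELECT toStartOfInterval(
    toDateTime64('2024-01-15 14:30:02.176', 3),
    INTERVAL 25 MILLISECOND) AS bucket;
```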
+* Disable `max_joined_block_rows` in ConcurrentHashJoin [#58595](https://github.com/ClickHouse/ClickHouse/pull/58595) ([vdimir](https://github.com/vdimir)). +* Fix `JOIN USING` with Nullable columns in the old analyzer [#58596](https://github.com/ClickHouse/ClickHouse/pull/58596) ([vdimir](https://github.com/vdimir)). +* `makeDateTime64`: Allow non-const fraction argument [#58597](https://github.com/ClickHouse/ClickHouse/pull/58597) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix possible NULL dereference during symbolizing inline frames [#58607](https://github.com/ClickHouse/ClickHouse/pull/58607) ([Azat Khuzhin](https://github.com/azat)). +* Improve isolation of query cache entries under re-created users or role switches [#58611](https://github.com/ClickHouse/ClickHouse/pull/58611) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix broken partition key analysis when doing projection optimization [#58638](https://github.com/ClickHouse/ClickHouse/pull/58638) ([Amos Bird](https://github.com/amosbird)). +* Query cache: Fix per-user quota [#58731](https://github.com/ClickHouse/ClickHouse/pull/58731) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix stream partitioning in parallel window functions [#58739](https://github.com/ClickHouse/ClickHouse/pull/58739) ([Dmitry Novik](https://github.com/novikd)). +* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)). +* Don't process requests in Keeper during shutdown [#58765](https://github.com/ClickHouse/ClickHouse/pull/58765) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix a null pointer dereference in `SlabsPolygonIndex::find` [#58771](https://github.com/ClickHouse/ClickHouse/pull/58771) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Fix JSONExtract function for LowCardinality(Nullable) columns [#58808](https://github.com/ClickHouse/ClickHouse/pull/58808) ([vdimir](https://github.com/vdimir)). +* A fix for unexpected accumulation of memory usage while creating a huge number of tables by CREATE and DROP. [#58831](https://github.com/ClickHouse/ClickHouse/pull/58831) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix multiple reads from FileLog storage in materialized views [#58877](https://github.com/ClickHouse/ClickHouse/pull/58877) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Add a restriction on the access key ID for S3. [#58900](https://github.com/ClickHouse/ClickHouse/pull/58900) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* Fix possible crash in clickhouse-local during loading suggestions [#58907](https://github.com/ClickHouse/ClickHouse/pull/58907) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix crash when `indexHint` is used [#58911](https://github.com/ClickHouse/ClickHouse/pull/58911) ([Dmitry Novik](https://github.com/novikd)). +* Fix StorageURL forgetting headers on server restart [#58933](https://github.com/ClickHouse/ClickHouse/pull/58933) ([Michael Kolupaev](https://github.com/al13n321)). +* Analyzer: fix storage replacement with insertion block [#58958](https://github.com/ClickHouse/ClickHouse/pull/58958) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix seek in ReadBufferFromZipArchive [#58966](https://github.com/ClickHouse/ClickHouse/pull/58966) ([Michael Kolupaev](https://github.com/al13n321)).
+* A fix for experimental inverted indices (don't use in production): `DROP INDEX` of inverted index now removes all relevant files from persistence [#59040](https://github.com/ClickHouse/ClickHouse/pull/59040) ([mochi](https://github.com/MochiXu)). +* Fix data race on query_factories_info [#59049](https://github.com/ClickHouse/ClickHouse/pull/59049) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Disable "Too many redirects" error retry [#59099](https://github.com/ClickHouse/ClickHouse/pull/59099) ([skyoct](https://github.com/skyoct)). +* Fix not started database shutdown deadlock [#59137](https://github.com/ClickHouse/ClickHouse/pull/59137) ([Sergei Trifonov](https://github.com/serxa)). +* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix crash with nullable timezone for `toString` [#59190](https://github.com/ClickHouse/ClickHouse/pull/59190) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Fix abort in iceberg metadata on bad file paths [#59275](https://github.com/ClickHouse/ClickHouse/pull/59275) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix architecture name in select of Rust target [#59307](https://github.com/ClickHouse/ClickHouse/pull/59307) ([p1rattttt](https://github.com/p1rattttt)). +* Fix a logical error about "not-ready set" for querying from `system.tables` with a subquery in the IN clause. [#59351](https://github.com/ClickHouse/ClickHouse/pull/59351) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). - -### ClickHouse release 23.5, 2023-06-08 - -#### Upgrade Notes -* Compress marks and primary key by default. It significantly reduces the cold query time. Upgrade notes: the support for compressed marks and primary key has been added in version 22.9. If you turned on compressed marks or primary key or installed version 23.5 or newer, which has compressed marks or primary key on by default, you will not be able to downgrade to version 22.8 or earlier. You can also explicitly disable compressed marks or primary keys by specifying the `compress_marks` and `compress_primary_key` settings in the `` section of the server configuration file. **Upgrade notes:** If you upgrade from versions prior to 22.9, you should either upgrade all replicas at once or disable the compression before upgrade, or upgrade through an intermediate version, where the compressed marks are supported but not enabled by default, such as 23.3. [#42587](https://github.com/ClickHouse/ClickHouse/pull/42587) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Make local object storage work consistently with s3 object storage, fix problem with append (closes [#48465](https://github.com/ClickHouse/ClickHouse/issues/48465)), make it configurable as independent storage. The change is backward incompatible because the cache on top of local object storage is not compatible to previous versions. [#48791](https://github.com/ClickHouse/ClickHouse/pull/48791) ([Kseniia Sumarokova](https://github.com/kssenii)). -* The experimental feature "in-memory data parts" is removed. The data format is still supported, but the settings are no-op, and compact or wide parts will be used instead. This closes [#45409](https://github.com/ClickHouse/ClickHouse/issues/45409). [#49429](https://github.com/ClickHouse/ClickHouse/pull/49429) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
-* Changed default values of settings `parallelize_output_from_storages` and `input_format_parquet_preserve_order`. This allows ClickHouse to reorder rows when reading from files (e.g. CSV or Parquet), greatly improving performance in many cases. To restore the old behavior of preserving order, use `parallelize_output_from_storages = 0`, `input_format_parquet_preserve_order = 1`. [#49479](https://github.com/ClickHouse/ClickHouse/pull/49479) ([Michael Kolupaev](https://github.com/al13n321)). -* Make projections production-ready. Add the `optimize_use_projections` setting to control whether the projections will be selected for SELECT queries. The setting `allow_experimental_projection_optimization` is obsolete and does nothing. [#49719](https://github.com/ClickHouse/ClickHouse/pull/49719) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Mark `joinGet` as non-deterministic (so as `dictGet`). It allows using them in mutations without an extra setting. [#49843](https://github.com/ClickHouse/ClickHouse/pull/49843) ([Azat Khuzhin](https://github.com/azat)). -* Revert the "`groupArray` returns cannot be nullable" change (due to binary compatibility breakage for `groupArray`/`groupArrayLast`/`groupArraySample` over `Nullable` types, which likely will lead to `TOO_LARGE_ARRAY_SIZE` or `CANNOT_READ_ALL_DATA`). [#49971](https://github.com/ClickHouse/ClickHouse/pull/49971) ([Azat Khuzhin](https://github.com/azat)). -* Setting `enable_memory_bound_merging_of_aggregation_results` is enabled by default. If you update from version prior to 22.12, we recommend to set this flag to `false` until update is finished. [#50319](https://github.com/ClickHouse/ClickHouse/pull/50319) ([Nikita Taranov](https://github.com/nickitat)). - -#### New Feature -* Added storage engine AzureBlobStorage and azureBlobStorage table function. The supported set of features is very similar to storage/table function S3 [#50604](https://github.com/ClickHouse/ClickHouse/pull/50604) ([alesapin](https://github.com/alesapin)) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Added native ClickHouse Keeper CLI Client, it is available as `clickhouse keeper-client` [#47414](https://github.com/ClickHouse/ClickHouse/pull/47414) ([pufit](https://github.com/pufit)). -* Add `urlCluster` table function. Refactor all *Cluster table functions to reduce code duplication. Make schema inference work for all possible *Cluster function signatures and for named collections. Closes [#38499](https://github.com/ClickHouse/ClickHouse/issues/38499). [#45427](https://github.com/ClickHouse/ClickHouse/pull/45427) ([attack204](https://github.com/attack204)), Pavel Kruglov. -* The query cache can now be used for production workloads. [#47977](https://github.com/ClickHouse/ClickHouse/pull/47977) ([Robert Schulze](https://github.com/rschu1ze)). The query cache can now support queries with totals and extremes modifier. [#48853](https://github.com/ClickHouse/ClickHouse/pull/48853) ([Robert Schulze](https://github.com/rschu1ze)). Make `allow_experimental_query_cache` setting as obsolete for backward-compatibility. It was removed in https://github.com/ClickHouse/ClickHouse/pull/47977. [#49934](https://github.com/ClickHouse/ClickHouse/pull/49934) ([Timur Solodovnikov](https://github.com/tsolodov)). -* Geographical data types (`Point`, `Ring`, `Polygon`, and `MultiPolygon`) are production-ready. [#50022](https://github.com/ClickHouse/ClickHouse/pull/50022) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
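Since the geographical data types above are declared production-ready, a small usage sketch may help; the table name and coordinates are illustrative only.

```sql
-- Point is a pair of Float64 coordinates (Ring, Polygon and MultiPolygon build on it).
CREATE TABLE places (name String, location Point) ENGINE = Memory;
INSERT INTO places VALUES ('origin', (0, 0)), ('north_east', (10.5, 20.25));
SELECT name, location FROM places ORDER BY name;
```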
-* Add schema inference to PostgreSQL, MySQL, MeiliSearch, and SQLite table engines. Closes [#49972](https://github.com/ClickHouse/ClickHouse/issues/49972). [#50000](https://github.com/ClickHouse/ClickHouse/pull/50000) ([Nikolay Degterinsky](https://github.com/evillique)). -* Password type in queries like `CREATE USER u IDENTIFIED BY 'p'` will be automatically set according to the setting `default_password_type` in the `config.xml` on the server. Closes [#42915](https://github.com/ClickHouse/ClickHouse/issues/42915). [#44674](https://github.com/ClickHouse/ClickHouse/pull/44674) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add bcrypt password authentication type. Closes [#34599](https://github.com/ClickHouse/ClickHouse/issues/34599). [#44905](https://github.com/ClickHouse/ClickHouse/pull/44905) ([Nikolay Degterinsky](https://github.com/evillique)). -* Introduces new keyword `INTO OUTFILE 'file.txt' APPEND`. [#48880](https://github.com/ClickHouse/ClickHouse/pull/48880) ([alekar](https://github.com/alekar)). -* Added `system.zookeeper_connection` table that shows information about Keeper connections. [#45245](https://github.com/ClickHouse/ClickHouse/pull/45245) ([mateng915](https://github.com/mateng0915)). -* Add new function `generateRandomStructure` that generates random table structure. It can be used in combination with table function `generateRandom`. [#47409](https://github.com/ClickHouse/ClickHouse/pull/47409) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow the use of `CASE` without an `ELSE` branch and extended `transform` to deal with more types. Also fix some issues that made transform() return incorrect results when decimal types were mixed with other numeric types. [#48300](https://github.com/ClickHouse/ClickHouse/pull/48300) ([Salvatore Mesoraca](https://github.com/aiven-sal)). This closes #2655. This closes #9596. This closes #38666. -* Added [server-side encryption using KMS keys](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) with S3 tables, and the `header` setting with S3 disks. Closes [#48723](https://github.com/ClickHouse/ClickHouse/issues/48723). [#48724](https://github.com/ClickHouse/ClickHouse/pull/48724) ([Johann Gan](https://github.com/johanngan)). -* Add MemoryTracker for the background tasks (merges and mutation). Introduces `merges_mutations_memory_usage_soft_limit` and `merges_mutations_memory_usage_to_ram_ratio` settings that represent the soft memory limit for merges and mutations. If this limit is reached ClickHouse won't schedule new merge or mutation tasks. Also `MergesMutationsMemoryTracking` metric is introduced to allow observing current memory usage of background tasks. Resubmit [#46089](https://github.com/ClickHouse/ClickHouse/issues/46089). Closes [#48774](https://github.com/ClickHouse/ClickHouse/issues/48774). [#48787](https://github.com/ClickHouse/ClickHouse/pull/48787) ([Dmitry Novik](https://github.com/novikd)). -* Function `dotProduct` work for array. [#49050](https://github.com/ClickHouse/ClickHouse/pull/49050) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). -* Support statement `SHOW INDEX` to improve compatibility with MySQL. [#49158](https://github.com/ClickHouse/ClickHouse/pull/49158) ([Robert Schulze](https://github.com/rschu1ze)). -* Add virtual column `_file` and `_path` support to table function `url`. - Improve error message for table function `url`. 
- resolves [#49231](https://github.com/ClickHouse/ClickHouse/issues/49231) - resolves [#49232](https://github.com/ClickHouse/ClickHouse/issues/49232). [#49356](https://github.com/ClickHouse/ClickHouse/pull/49356) ([Ziyi Tan](https://github.com/Ziy1-Tan)). -* Adding the `grants` field in the users.xml file, which allows specifying grants for users. [#49381](https://github.com/ClickHouse/ClickHouse/pull/49381) ([pufit](https://github.com/pufit)). -* Support full/right join by using grace hash join algorithm. [#49483](https://github.com/ClickHouse/ClickHouse/pull/49483) ([lgbo](https://github.com/lgbo-ustc)). -* `WITH FILL` modifier groups filling by sorting prefix. Controlled by `use_with_fill_by_sorting_prefix` setting (enabled by default). Related to [#33203](https://github.com/ClickHouse/ClickHouse/issues/33203)#issuecomment-1418736794. [#49503](https://github.com/ClickHouse/ClickHouse/pull/49503) ([Igor Nikonov](https://github.com/devcrafter)). -* Clickhouse-client now accepts queries after "--multiquery" when "--query" (or "-q") is absent. example: clickhouse-client --multiquery "select 1; select 2;". [#49870](https://github.com/ClickHouse/ClickHouse/pull/49870) ([Alexey Gerasimchuk](https://github.com/Demilivor)). -* Add separate `handshake_timeout` for receiving Hello packet from replica. Closes [#48854](https://github.com/ClickHouse/ClickHouse/issues/48854). [#49948](https://github.com/ClickHouse/ClickHouse/pull/49948) ([Kruglov Pavel](https://github.com/Avogar)). -* Added a function "space" which repeats a space as many times as specified. [#50103](https://github.com/ClickHouse/ClickHouse/pull/50103) ([Robert Schulze](https://github.com/rschu1ze)). -* Added --input_format_csv_trim_whitespaces option. [#50215](https://github.com/ClickHouse/ClickHouse/pull/50215) ([Alexey Gerasimchuk](https://github.com/Demilivor)). -* Allow the `dictGetAll` function for regexp tree dictionaries to return values from multiple matches as arrays. Closes [#50254](https://github.com/ClickHouse/ClickHouse/issues/50254). [#50255](https://github.com/ClickHouse/ClickHouse/pull/50255) ([Johann Gan](https://github.com/johanngan)). -* Added `toLastDayOfWeek` function to round a date or a date with time up to the nearest Saturday or Sunday. [#50315](https://github.com/ClickHouse/ClickHouse/pull/50315) ([Victor Krasnov](https://github.com/sirvickr)). -* Ability to ignore a skip index by specifying `ignore_data_skipping_indices`. [#50329](https://github.com/ClickHouse/ClickHouse/pull/50329) ([Boris Kuschel](https://github.com/bkuschel)). -* Add `system.user_processes` table and `SHOW USER PROCESSES` query to show memory info and ProfileEvents on user level. [#50492](https://github.com/ClickHouse/ClickHouse/pull/50492) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). -* Add server and format settings `display_secrets_in_show_and_select` for displaying secrets of tables, databases, table functions, and dictionaries. Add privilege `displaySecretsInShowAndSelect` controlling which users can view secrets. [#46528](https://github.com/ClickHouse/ClickHouse/pull/46528) ([Mike Kot](https://github.com/myrrc)). -* Allow to set up a ROW POLICY for all tables that belong to a DATABASE. [#47640](https://github.com/ClickHouse/ClickHouse/pull/47640) ([Ilya Golshtein](https://github.com/ilejn)). - -#### Performance Improvement -* Compress marks and primary key by default. It significantly reduces the cold query time. Upgrade notes: the support for compressed marks and primary key has been added in version 22.9. 
If you turned on compressed marks or primary key or installed version 23.5 or newer, which has compressed marks or primary key on by default, you will not be able to downgrade to version 22.8 or earlier. You can also explicitly disable compressed marks or primary keys by specifying the `compress_marks` and `compress_primary_key` settings in the `` section of the server configuration file. [#42587](https://github.com/ClickHouse/ClickHouse/pull/42587) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* New setting s3_max_inflight_parts_for_one_file sets the limit of concurrently loaded parts with multipart upload request in scope of one file. [#49961](https://github.com/ClickHouse/ClickHouse/pull/49961) ([Sema Checherinda](https://github.com/CheSema)). -* When reading from multiple files reduce parallel parsing threads for each file. Resolves [#42192](https://github.com/ClickHouse/ClickHouse/issues/42192). [#46661](https://github.com/ClickHouse/ClickHouse/pull/46661) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Use aggregate projection only if it reads fewer granules than normal reading. It should help in case if query hits the PK of the table, but not the projection. Fixes [#49150](https://github.com/ClickHouse/ClickHouse/issues/49150). [#49417](https://github.com/ClickHouse/ClickHouse/pull/49417) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Do not store blocks in `ANY` hash join if nothing is inserted. [#48633](https://github.com/ClickHouse/ClickHouse/pull/48633) ([vdimir](https://github.com/vdimir)). -* Fixes aggregate combinator `-If` when JIT compiled, and enable JIT compilation for aggregate functions. Closes [#48120](https://github.com/ClickHouse/ClickHouse/issues/48120). [#49083](https://github.com/ClickHouse/ClickHouse/pull/49083) ([Igor Nikonov](https://github.com/devcrafter)). -* For reading from remote tables we use smaller tasks (instead of reading the whole part) to make tasks stealing work * task size is determined by size of columns to read * always use 1mb buffers for reading from s3 * boundaries of cache segments aligned to 1mb so they have decent size even with small tasks. it also should prevent fragmentation. [#49287](https://github.com/ClickHouse/ClickHouse/pull/49287) ([Nikita Taranov](https://github.com/nickitat)). -* Introduced settings: - `merge_max_block_size_bytes` to limit the amount of memory used for background operations. - `vertical_merge_algorithm_min_bytes_to_activate` to add another condition to activate vertical merges. [#49313](https://github.com/ClickHouse/ClickHouse/pull/49313) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Default size of a read buffer for reading from local filesystem changed to a slightly better value. Also two new settings are introduced: `max_read_buffer_size_local_fs` and `max_read_buffer_size_remote_fs`. [#49321](https://github.com/ClickHouse/ClickHouse/pull/49321) ([Nikita Taranov](https://github.com/nickitat)). -* Improve memory usage and speed of `SPARSE_HASHED`/`HASHED` dictionaries (e.g. `SPARSE_HASHED` now eats 2.6x less memory, and is ~2x faster). [#49380](https://github.com/ClickHouse/ClickHouse/pull/49380) ([Azat Khuzhin](https://github.com/azat)). -* Optimize the `system.query_log` and `system.query_thread_log` tables by applying `LowCardinality` when appropriate. The queries over these tables will be faster. [#49530](https://github.com/ClickHouse/ClickHouse/pull/49530) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
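For context on the `system.query_log` optimization above, the kind of introspection query that benefits might look like the sketch below; the columns are standard ones and the filter values are arbitrary.

```sql
-- Today's slowest finished queries; scans over system.query_log like this one
-- benefit from the LowCardinality change.
SELECT type, event_time, query_duration_ms, query
FROM system.query_log
WHERE event_date = today() AND type = 'QueryFinish'
ORDER BY query_duration_ms DESC
LIMIT 5;
```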
-* Better performance when reading local `Parquet` files (through parallel reading). [#49539](https://github.com/ClickHouse/ClickHouse/pull/49539) ([Michael Kolupaev](https://github.com/al13n321)). -* Improve the performance of `RIGHT/FULL JOIN` by up to 2 times in certain scenarios, especially when joining a small left table with a large right table. [#49585](https://github.com/ClickHouse/ClickHouse/pull/49585) ([lgbo](https://github.com/lgbo-ustc)). -* Improve performance of BLAKE3 by 11% by enabling LTO for Rust. [#49600](https://github.com/ClickHouse/ClickHouse/pull/49600) ([Azat Khuzhin](https://github.com/azat)). Now it is on par with C++. -* Optimize the structure of the `system.opentelemetry_span_log`. Use `LowCardinality` where appropriate. Although this table is generally stupid (it is using the Map data type even for common attributes), it will be slightly better. [#49647](https://github.com/ClickHouse/ClickHouse/pull/49647) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Try to reserve hash table's size in `grace_hash` join. [#49816](https://github.com/ClickHouse/ClickHouse/pull/49816) ([lgbo](https://github.com/lgbo-ustc)). -* Parallel merge of `uniqExactIf` states. Closes [#49885](https://github.com/ClickHouse/ClickHouse/issues/49885). [#50285](https://github.com/ClickHouse/ClickHouse/pull/50285) ([flynn](https://github.com/ucasfl)). -* Keeper improvement: add `CheckNotExists` request to Keeper, which allows to improve the performance of Replicated tables. [#48897](https://github.com/ClickHouse/ClickHouse/pull/48897) ([Antonio Andelic](https://github.com/antonio2368)). -* Keeper performance improvements: avoid serializing same request twice while processing. Cache deserialization results of large requests. Controlled by new coordination setting `min_request_size_for_cache`. [#49004](https://github.com/ClickHouse/ClickHouse/pull/49004) ([Antonio Andelic](https://github.com/antonio2368)). -* Reduced number of `List` ZooKeeper requests when selecting parts to merge and a lot of partitions do not have anything to merge. [#49637](https://github.com/ClickHouse/ClickHouse/pull/49637) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Rework locking in the FS cache [#44985](https://github.com/ClickHouse/ClickHouse/pull/44985) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Disable pure parallel replicas if trivial count optimization is possible. [#50594](https://github.com/ClickHouse/ClickHouse/pull/50594) ([Raúl Marín](https://github.com/Algunenano)). -* Don't send head request for all keys in Iceberg schema inference, only for keys that are used for reaing data. [#50203](https://github.com/ClickHouse/ClickHouse/pull/50203) ([Kruglov Pavel](https://github.com/Avogar)). -* Setting `enable_memory_bound_merging_of_aggregation_results` is enabled by default. [#50319](https://github.com/ClickHouse/ClickHouse/pull/50319) ([Nikita Taranov](https://github.com/nickitat)). - -#### Experimental Feature -* `DEFLATE_QPL` codec lower the minimum simd version to SSE 4.2. [doc change in qpl](https://github.com/intel/qpl/commit/3f8f5cea27739f5261e8fd577dc233ffe88bf679) - Intel® QPL relies on a run-time kernels dispatcher and cpuid check to choose the best available implementation(sse/avx2/avx512) - restructured cmakefile for qpl build in clickhouse to align with latest upstream qpl. [#49811](https://github.com/ClickHouse/ClickHouse/pull/49811) ([jasperzhu](https://github.com/jinjunzh)). -* Add initial support to do JOINs with pure parallel replicas. 
[#49544](https://github.com/ClickHouse/ClickHouse/pull/49544) ([Raúl Marín](https://github.com/Algunenano)). -* More parallelism on `Outdated` parts removal with "zero-copy replication". [#49630](https://github.com/ClickHouse/ClickHouse/pull/49630) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Parallel Replicas: 1) Fixed an error `NOT_FOUND_COLUMN_IN_BLOCK` in case of using parallel replicas with non-replicated storage with disabled setting `parallel_replicas_for_non_replicated_merge_tree` 2) Now `allow_experimental_parallel_reading_from_replicas` have 3 possible values - 0, 1 and 2. 0 - disabled, 1 - enabled, silently disable them in case of failure (in case of FINAL or JOIN), 2 - enabled, throw an exception in case of failure. 3) If FINAL modifier is used in SELECT query and parallel replicas are enabled, ClickHouse will try to disable them if `allow_experimental_parallel_reading_from_replicas` is set to 1 and throw an exception otherwise. [#50195](https://github.com/ClickHouse/ClickHouse/pull/50195) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* When parallel replicas are enabled they will always skip unavailable servers (the behavior is controlled by the setting `skip_unavailable_shards`, enabled by default and can be only disabled). This closes: [#48565](https://github.com/ClickHouse/ClickHouse/issues/48565). [#50293](https://github.com/ClickHouse/ClickHouse/pull/50293) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). - -#### Improvement -* The `BACKUP` command will not decrypt data from encrypted disks while making a backup. Instead the data will be stored in a backup in encrypted form. Such backups can be restored only to an encrypted disk with the same (or extended) list of encryption keys. [#48896](https://github.com/ClickHouse/ClickHouse/pull/48896) ([Vitaly Baranov](https://github.com/vitlibar)). -* Added possibility to use temporary tables in FROM part of ATTACH PARTITION FROM and REPLACE PARTITION FROM. [#49436](https://github.com/ClickHouse/ClickHouse/pull/49436) ([Roman Vasin](https://github.com/rvasin)). -* Added setting `async_insert` for `MergeTree` tables. It has the same meaning as query-level setting `async_insert` and enables asynchronous inserts for specific table. Note: it doesn't take effect for insert queries from `clickhouse-client`, use query-level setting in that case. [#49122](https://github.com/ClickHouse/ClickHouse/pull/49122) ([Anton Popov](https://github.com/CurtizJ)). -* Add support for size suffixes in quota creation statement parameters. [#49087](https://github.com/ClickHouse/ClickHouse/pull/49087) ([Eridanus](https://github.com/Eridanus117)). -* Extend `first_value` and `last_value` to accept NULL. [#46467](https://github.com/ClickHouse/ClickHouse/pull/46467) ([lgbo](https://github.com/lgbo-ustc)). -* Add alias `str_to_map` and `mapFromString` for `extractKeyValuePairs`. closes https://github.com/clickhouse/clickhouse/issues/47185. [#49466](https://github.com/ClickHouse/ClickHouse/pull/49466) ([flynn](https://github.com/ucasfl)). -* Add support for CGroup version 2 for asynchronous metrics about the memory usage and availability. This closes [#37983](https://github.com/ClickHouse/ClickHouse/issues/37983). [#45999](https://github.com/ClickHouse/ClickHouse/pull/45999) ([sichenzhao](https://github.com/sichenzhao)). -* Cluster table functions should always skip unavailable shards. close [#46314](https://github.com/ClickHouse/ClickHouse/issues/46314). 
[#46765](https://github.com/ClickHouse/ClickHouse/pull/46765) ([zk_kiger](https://github.com/zk-kiger)). -* Allow CSV file to contain empty columns in its header. [#47496](https://github.com/ClickHouse/ClickHouse/pull/47496) ([你不要过来啊](https://github.com/iiiuwioajdks)). -* Add Google Cloud Storage S3 compatible table function `gcs`. Like the `oss` and `cosn` functions, it is just an alias over the `s3` table function, and it does not bring any new features. [#47815](https://github.com/ClickHouse/ClickHouse/pull/47815) ([Kuba Kaflik](https://github.com/jkaflik)). -* Add ability to use strict parts size for S3 (compatibility with CloudFlare R2 S3 Storage). [#48492](https://github.com/ClickHouse/ClickHouse/pull/48492) ([Azat Khuzhin](https://github.com/azat)). -* Added new columns with info about `Replicated` database replicas to `system.clusters`: `database_shard_name`, `database_replica_name`, `is_active`. Added an optional `FROM SHARD` clause to `SYSTEM DROP DATABASE REPLICA` query. [#48548](https://github.com/ClickHouse/ClickHouse/pull/48548) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Add a new column `zookeeper_name` in system.replicas, to indicate on which (auxiliary) zookeeper cluster the replicated table's metadata is stored. [#48549](https://github.com/ClickHouse/ClickHouse/pull/48549) ([cangyin](https://github.com/cangyin)). -* `IN` operator support the comparison of `Date` and `Date32`. Closes [#48736](https://github.com/ClickHouse/ClickHouse/issues/48736). [#48806](https://github.com/ClickHouse/ClickHouse/pull/48806) ([flynn](https://github.com/ucasfl)). -* Support for erasure codes in `HDFS`, author: @M1eyu2018, @tomscut. [#48833](https://github.com/ClickHouse/ClickHouse/pull/48833) ([M1eyu](https://github.com/M1eyu2018)). -* Implement SYSTEM DROP REPLICA from auxiliary ZooKeeper clusters, may be close [#48931](https://github.com/ClickHouse/ClickHouse/issues/48931). [#48932](https://github.com/ClickHouse/ClickHouse/pull/48932) ([wangxiaobo](https://github.com/wzb5212)). -* Add Array data type to MongoDB. Closes [#48598](https://github.com/ClickHouse/ClickHouse/issues/48598). [#48983](https://github.com/ClickHouse/ClickHouse/pull/48983) ([Nikolay Degterinsky](https://github.com/evillique)). -* Support storing `Interval` data types in tables. [#49085](https://github.com/ClickHouse/ClickHouse/pull/49085) ([larryluogit](https://github.com/larryluogit)). -* Allow using `ntile` window function without explicit window frame definition: `ntile(3) OVER (ORDER BY a)`, close [#46763](https://github.com/ClickHouse/ClickHouse/issues/46763). [#49093](https://github.com/ClickHouse/ClickHouse/pull/49093) ([vdimir](https://github.com/vdimir)). -* Added settings (`number_of_mutations_to_delay`, `number_of_mutations_to_throw`) to delay or throw `ALTER` queries that create mutations (`ALTER UPDATE`, `ALTER DELETE`, `ALTER MODIFY COLUMN`, ...) in case when table already has a lot of unfinished mutations. [#49117](https://github.com/ClickHouse/ClickHouse/pull/49117) ([Anton Popov](https://github.com/CurtizJ)). -* Catch exception from `create_directories` in filesystem cache. [#49203](https://github.com/ClickHouse/ClickHouse/pull/49203) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Copies embedded examples to a new field `example` in `system.functions` to supplement the field `description`. [#49222](https://github.com/ClickHouse/ClickHouse/pull/49222) ([Dan Roscigno](https://github.com/DanRoscigno)). -* Enable connection options for the MongoDB dictionary. 
For example, a source configured with host `localhost`, port `27017`, database `test`, collection `dictionary_source`, and connection options `ssl=true`. [#49225](https://github.com/ClickHouse/ClickHouse/pull/49225) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). -* Added an alias `asymptotic` for `asymp` computational method for `kolmogorovSmirnovTest`. Improved documentation. [#49286](https://github.com/ClickHouse/ClickHouse/pull/49286) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Aggregation functions groupBitAnd/Or/Xor now work on signed integer data. This makes them consistent with the behavior of scalar functions bitAnd/Or/Xor. [#49292](https://github.com/ClickHouse/ClickHouse/pull/49292) ([exmy](https://github.com/exmy)). -* Split function-documentation into more fine-granular fields. [#49300](https://github.com/ClickHouse/ClickHouse/pull/49300) ([Robert Schulze](https://github.com/rschu1ze)). -* Use multiple threads shared between all tables within a server to load outdated data parts. The size of the pool and its queue is controlled by `max_outdated_parts_loading_thread_pool_size` and `outdated_part_loading_thread_pool_queue_size` settings. [#49317](https://github.com/ClickHouse/ClickHouse/pull/49317) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Don't overestimate the size of processed data for `LowCardinality` columns when they share dictionaries between blocks. This closes [#49322](https://github.com/ClickHouse/ClickHouse/issues/49322). See also [#48745](https://github.com/ClickHouse/ClickHouse/issues/48745). [#49323](https://github.com/ClickHouse/ClickHouse/pull/49323) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Parquet writer now uses reasonable row group size when invoked through `OUTFILE`. [#49325](https://github.com/ClickHouse/ClickHouse/pull/49325) ([Michael Kolupaev](https://github.com/al13n321)). -* Allow restricted keywords like `ARRAY` as an alias if the alias is quoted. Closes [#49324](https://github.com/ClickHouse/ClickHouse/issues/49324). [#49360](https://github.com/ClickHouse/ClickHouse/pull/49360) ([Nikolay Degterinsky](https://github.com/evillique)). -* Data parts loading and deletion jobs were moved to shared server-wide pools instead of per-table pools. Pool sizes are controlled via settings `max_active_parts_loading_thread_pool_size`, `max_outdated_parts_loading_thread_pool_size` and `max_parts_cleaning_thread_pool_size` in top-level config. Table-level settings `max_part_loading_threads` and `max_part_removal_threads` became obsolete. [#49474](https://github.com/ClickHouse/ClickHouse/pull/49474) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Allow `?password=pass` in URL of the Play UI. Password is replaced in browser history. [#49505](https://github.com/ClickHouse/ClickHouse/pull/49505) ([Mike Kot](https://github.com/myrrc)). -* Allow reading zero-size objects from remote filesystems. (because empty files are not backed up, so we might end up with zero blobs in the metadata file). Closes [#49480](https://github.com/ClickHouse/ClickHouse/issues/49480). [#49519](https://github.com/ClickHouse/ClickHouse/pull/49519) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Attach thread MemoryTracker to `total_memory_tracker` after `ThreadGroup` detached. [#49527](https://github.com/ClickHouse/ClickHouse/pull/49527) ([Dmitry Novik](https://github.com/novikd)). -* Fix parameterized views when a query parameter is used multiple times in the query.
[#49556](https://github.com/ClickHouse/ClickHouse/pull/49556) ([Azat Khuzhin](https://github.com/azat)). -* Release memory allocated for the last sent ProfileEvents snapshot in the context of a query. Followup [#47564](https://github.com/ClickHouse/ClickHouse/issues/47564). [#49561](https://github.com/ClickHouse/ClickHouse/pull/49561) ([Dmitry Novik](https://github.com/novikd)). -* Function "makeDate" now provides a MySQL-compatible overload (year & day of the year argument). [#49603](https://github.com/ClickHouse/ClickHouse/pull/49603) ([Robert Schulze](https://github.com/rschu1ze)). -* Support `dictionary` table function for `RegExpTreeDictionary`. [#49666](https://github.com/ClickHouse/ClickHouse/pull/49666) ([Han Fei](https://github.com/hanfei1991)). -* Added weighted fair IO scheduling policy. Added dynamic resource manager, which allows IO scheduling hierarchy to be updated in runtime w/o server restarts. [#49671](https://github.com/ClickHouse/ClickHouse/pull/49671) ([Sergei Trifonov](https://github.com/serxa)). -* Add compose request after multipart upload to GCS. This enables the usage of copy operation on objects uploaded with the multipart upload. It's recommended to set `s3_strict_upload_part_size` to some value because compose request can fail on objects created with parts of different sizes. [#49693](https://github.com/ClickHouse/ClickHouse/pull/49693) ([Antonio Andelic](https://github.com/antonio2368)). -* For the `extractKeyValuePairs` function: improve the "best-effort" parsing logic to accept `key_value_delimiter` as a valid part of the value. This also simplifies branching and might even speed up things a bit. [#49760](https://github.com/ClickHouse/ClickHouse/pull/49760) ([Arthur Passos](https://github.com/arthurpassos)). -* Add `initial_query_id` field for system.processors_profile_log [#49777](https://github.com/ClickHouse/ClickHouse/pull/49777) ([helifu](https://github.com/helifu)). -* System log tables can now have custom sorting keys. [#49778](https://github.com/ClickHouse/ClickHouse/pull/49778) ([helifu](https://github.com/helifu)). -* A new field `partitions` to `system.query_log` is used to indicate which partitions are participating in the calculation. [#49779](https://github.com/ClickHouse/ClickHouse/pull/49779) ([helifu](https://github.com/helifu)). -* Added `enable_the_endpoint_id_with_zookeeper_name_prefix` setting for `ReplicatedMergeTree` (disabled by default). When enabled, it adds ZooKeeper cluster name to table's interserver communication endpoint. It avoids `Duplicate interserver IO endpoint` errors when having replicated tables with the same path, but different auxiliary ZooKeepers. [#49780](https://github.com/ClickHouse/ClickHouse/pull/49780) ([helifu](https://github.com/helifu)). -* Add query parameters to `clickhouse-local`. Closes [#46561](https://github.com/ClickHouse/ClickHouse/issues/46561). [#49785](https://github.com/ClickHouse/ClickHouse/pull/49785) ([Nikolay Degterinsky](https://github.com/evillique)). -* Allow loading dictionaries and functions from YAML by default. In previous versions, it required editing the `dictionaries_config` or `user_defined_executable_functions_config` in the configuration file, as they expected `*.xml` files. [#49812](https://github.com/ClickHouse/ClickHouse/pull/49812) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The Kafka table engine now allows to use alias columns. [#49824](https://github.com/ClickHouse/ClickHouse/pull/49824) ([Aleksandr Musorin](https://github.com/AVMusorin)). 
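A minimal sketch of the preceding entry about alias columns in the Kafka table engine; the broker address, topic, consumer group, and column names are hypothetical:

```sql
-- Kafka engine table that declares an ALIAS column (now accepted by the engine).
-- Broker, topic, and consumer group names are placeholders.
CREATE TABLE kafka_events
(
    `ts`          UInt64,
    `level`       String,
    `message`     String,
    `level_upper` String ALIAS upper(level)  -- derived, not read from the Kafka message
)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'localhost:9092',
         kafka_topic_list = 'events',
         kafka_group_name = 'clickhouse-consumer',
         kafka_format = 'JSONEachRow';
```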
-* Add a setting to limit the maximum number of pairs produced by `extractKeyValuePairs`, a safeguard to avoid using way too much memory. [#49836](https://github.com/ClickHouse/ClickHouse/pull/49836) ([Arthur Passos](https://github.com/arthurpassos)). -* Add support for the (unusual) case where the arguments in the `IN` operator are single-element tuples. [#49844](https://github.com/ClickHouse/ClickHouse/pull/49844) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). -* The `bitHammingDistance` function now supports the `String` and `FixedString` data types. Closes [#48827](https://github.com/ClickHouse/ClickHouse/issues/48827). [#49858](https://github.com/ClickHouse/ClickHouse/pull/49858) ([flynn](https://github.com/ucasfl)). -* Fix timeout resetting errors in the client on OS X. [#49863](https://github.com/ClickHouse/ClickHouse/pull/49863) ([alekar](https://github.com/alekar)). -* Add support for big integers, such as UInt128, Int128, UInt256, and Int256 in the function `bitCount`. This enables Hamming distance over large bit masks for AI applications. [#49867](https://github.com/ClickHouse/ClickHouse/pull/49867) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fingerprints are now used instead of key IDs in encrypted disks. This simplifies the configuration of encrypted disks. [#49882](https://github.com/ClickHouse/ClickHouse/pull/49882) ([Vitaly Baranov](https://github.com/vitlibar)). -* Add the UUID data type to PostgreSQL. Closes [#49739](https://github.com/ClickHouse/ClickHouse/issues/49739). [#49894](https://github.com/ClickHouse/ClickHouse/pull/49894) ([Nikolay Degterinsky](https://github.com/evillique)). -* Function `toUnixTimestamp` now accepts `Date` and `Date32` arguments. [#49989](https://github.com/ClickHouse/ClickHouse/pull/49989) ([Victor Krasnov](https://github.com/sirvickr)). -* Charge only server memory for dictionaries. [#49995](https://github.com/ClickHouse/ClickHouse/pull/49995) ([Azat Khuzhin](https://github.com/azat)). -* The server will allow using the `SQL_*` settings such as `SQL_AUTO_IS_NULL` as no-ops for MySQL compatibility. This closes [#49927](https://github.com/ClickHouse/ClickHouse/issues/49927). [#50013](https://github.com/ClickHouse/ClickHouse/pull/50013) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Preserve `initial_query_id` for `ON CLUSTER` queries, which is useful for introspection (under `distributed_ddl_entry_format_version=5`). [#50015](https://github.com/ClickHouse/ClickHouse/pull/50015) ([Azat Khuzhin](https://github.com/azat)). -* Preserve backward compatibility for renamed settings by using aliases (`allow_experimental_projection_optimization` for `optimize_use_projections`, `allow_experimental_lightweight_delete` for `enable_lightweight_delete`). [#50044](https://github.com/ClickHouse/ClickHouse/pull/50044) ([Azat Khuzhin](https://github.com/azat)). -* Support passing an FQDN through the `my_hostname` setting to register a cluster node in Keeper. Add an `invisible` setting to support multiple compute groups: a compute group, as a cluster, is invisible to other compute groups. [#50186](https://github.com/ClickHouse/ClickHouse/pull/50186) ([Yangkuan Liu](https://github.com/LiuYangkuan)). -* Fix PostgreSQL reading all the data even when `LIMIT n` is specified. [#50187](https://github.com/ClickHouse/ClickHouse/pull/50187) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Add new profile events for queries with subqueries (`QueriesWithSubqueries`/`SelectQueriesWithSubqueries`/`InsertQueriesWithSubqueries`).
[#50204](https://github.com/ClickHouse/ClickHouse/pull/50204) ([Azat Khuzhin](https://github.com/azat)). -* Add the `roles` field to the `users.xml` file, which allows specifying roles with grants via a config file. [#50278](https://github.com/ClickHouse/ClickHouse/pull/50278) ([pufit](https://github.com/pufit)). -* Report `CGroupCpuCfsPeriod` and `CGroupCpuCfsQuota` in AsynchronousMetrics. - Respect cgroup v2 memory limits during server startup. [#50379](https://github.com/ClickHouse/ClickHouse/pull/50379) ([alekar](https://github.com/alekar)). -* Add a signal handler for SIGQUIT to work the same way as SIGINT. Closes [#50298](https://github.com/ClickHouse/ClickHouse/issues/50298). [#50435](https://github.com/ClickHouse/ClickHouse/pull/50435) ([Nikolay Degterinsky](https://github.com/evillique)). -* If JSON parsing fails due to the large size of the object, output the last position to allow debugging. [#50474](https://github.com/ClickHouse/ClickHouse/pull/50474) ([Valentin Alexeev](https://github.com/valentinalexeev)). -* Support decimals with non-fixed size. Closes [#49130](https://github.com/ClickHouse/ClickHouse/issues/49130). [#50586](https://github.com/ClickHouse/ClickHouse/pull/50586) ([Kruglov Pavel](https://github.com/Avogar)). - -#### Build/Testing/Packaging Improvement -* New and improved `keeper-bench`. Everything can be customized from a YAML/XML file: request generators (each type of request generator can have a specific set of fields); multi requests can be generated simply by nesting requests under the `multi` key; for each request or subrequest in a multi request, a `weight` field can be defined to control the distribution; trees that need to be set up for a test run can be defined; hosts can be defined with all timeouts customizable, and it is possible to control how many sessions to generate for each host; integers defined with `min_value` and `max_value` fields are random number generators. [#48547](https://github.com/ClickHouse/ClickHouse/pull/48547) ([Antonio Andelic](https://github.com/antonio2368)). -* `io_uring` is not supported on macOS, so don't choose it when running tests locally, to avoid occasional failures. [#49250](https://github.com/ClickHouse/ClickHouse/pull/49250) ([Frank Chen](https://github.com/FrankChen021)). -* Support named fault injection for testing. [#49361](https://github.com/ClickHouse/ClickHouse/pull/49361) ([Han Fei](https://github.com/hanfei1991)). -* Allow running ClickHouse in an OS where the `prctl` (process control) syscall is not available, such as AWS Lambda. [#49538](https://github.com/ClickHouse/ClickHouse/pull/49538) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fixed the issue of a build conflict between contrib/isa-l and isa-l in QPL [#49296](https://github.com/ClickHouse/ClickHouse/issues/49296). [#49584](https://github.com/ClickHouse/ClickHouse/pull/49584) ([jasperzhu](https://github.com/jinjunzh)). -* Utilities are now only built if explicitly requested (`-DENABLE_UTILS=1`) instead of by default; this reduces link times in typical development builds. [#49620](https://github.com/ClickHouse/ClickHouse/pull/49620) ([Robert Schulze](https://github.com/rschu1ze)). -* Pull the build description of idxd-config into a separate CMake file to avoid accidental removal in the future. [#49651](https://github.com/ClickHouse/ClickHouse/pull/49651) ([jasperzhu](https://github.com/jinjunzh)). -* Add a CI check with the analyzer enabled on master. Follow-up [#49562](https://github.com/ClickHouse/ClickHouse/issues/49562).
[#49668](https://github.com/ClickHouse/ClickHouse/pull/49668) ([Dmitry Novik](https://github.com/novikd)). -* Switch to LLVM/clang 16. [#49678](https://github.com/ClickHouse/ClickHouse/pull/49678) ([Azat Khuzhin](https://github.com/azat)). -* Allow building ClickHouse with clang-17. [#49851](https://github.com/ClickHouse/ClickHouse/pull/49851) ([Alexey Milovidov](https://github.com/alexey-milovidov)). [#50410](https://github.com/ClickHouse/ClickHouse/pull/50410) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* ClickHouse is now easier to be integrated into other cmake projects. [#49991](https://github.com/ClickHouse/ClickHouse/pull/49991) ([Amos Bird](https://github.com/amosbird)). (Which is strongly discouraged - Alexey Milovidov). -* Fix strange additional QEMU logging after [#47151](https://github.com/ClickHouse/ClickHouse/issues/47151), see https://s3.amazonaws.com/clickhouse-test-reports/50078/a4743996ee4f3583884d07bcd6501df0cfdaa346/stateless_tests__release__databasereplicated__[3_4].html. [#50442](https://github.com/ClickHouse/ClickHouse/pull/50442) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* ClickHouse can work on Linux RISC-V 6.1.22. This closes [#50456](https://github.com/ClickHouse/ClickHouse/issues/50456). [#50457](https://github.com/ClickHouse/ClickHouse/pull/50457) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Bump internal protobuf to v3.18 (fixes bogus CVE-2022-1941). [#50400](https://github.com/ClickHouse/ClickHouse/pull/50400) ([Robert Schulze](https://github.com/rschu1ze)). -* Bump internal libxml2 to v2.10.4 (fixes bogus CVE-2023-28484 and bogus CVE-2023-29469). [#50402](https://github.com/ClickHouse/ClickHouse/pull/50402) ([Robert Schulze](https://github.com/rschu1ze)). -* Bump c-ares to v1.19.1 (bogus CVE-2023-32067, bogus CVE-2023-31130, bogus CVE-2023-31147). [#50403](https://github.com/ClickHouse/ClickHouse/pull/50403) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix bogus CVE-2022-2469 in libgsasl. [#50404](https://github.com/ClickHouse/ClickHouse/pull/50404) ([Robert Schulze](https://github.com/rschu1ze)). - -#### Bug Fix (user-visible misbehavior in an official stable release) - -* ActionsDAG: fix wrong optimization [#47584](https://github.com/ClickHouse/ClickHouse/pull/47584) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Correctly handle concurrent snapshots in Keeper [#48466](https://github.com/ClickHouse/ClickHouse/pull/48466) ([Antonio Andelic](https://github.com/antonio2368)). -* MergeTreeMarksLoader holds DataPart instead of DataPartStorage [#48515](https://github.com/ClickHouse/ClickHouse/pull/48515) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Sequence state fix [#48603](https://github.com/ClickHouse/ClickHouse/pull/48603) ([Ilya Golshtein](https://github.com/ilejn)). -* Back/Restore concurrency check on previous fails [#48726](https://github.com/ClickHouse/ClickHouse/pull/48726) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix Attaching a table with non-existent ZK path does not increase the ReadonlyReplica metric [#48954](https://github.com/ClickHouse/ClickHouse/pull/48954) ([wangxiaobo](https://github.com/wzb5212)). -* Fix possible terminate called for uncaught exception in some places [#49112](https://github.com/ClickHouse/ClickHouse/pull/49112) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix key not found error for queries with multiple StorageJoin [#49137](https://github.com/ClickHouse/ClickHouse/pull/49137) ([vdimir](https://github.com/vdimir)). 
-* Fix wrong query result when using nullable primary key [#49172](https://github.com/ClickHouse/ClickHouse/pull/49172) ([Duc Canh Le](https://github.com/canhld94)). -* Fix reinterpretAs*() on big endian machines [#49198](https://github.com/ClickHouse/ClickHouse/pull/49198) ([Suzy Wang](https://github.com/SuzyWangIBMer)). -* (Experimental zero-copy replication) Lock zero copy parts more atomically [#49211](https://github.com/ClickHouse/ClickHouse/pull/49211) ([alesapin](https://github.com/alesapin)). -* Fix race on Outdated parts loading [#49223](https://github.com/ClickHouse/ClickHouse/pull/49223) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fix all key value is null and group use rollup return wrong answer [#49282](https://github.com/ClickHouse/ClickHouse/pull/49282) ([Shuai li](https://github.com/loneylee)). -* Fix calculating load_factor for HASHED dictionaries with SHARDS [#49319](https://github.com/ClickHouse/ClickHouse/pull/49319) ([Azat Khuzhin](https://github.com/azat)). -* Disallow configuring compression CODECs for alias columns [#49363](https://github.com/ClickHouse/ClickHouse/pull/49363) ([Timur Solodovnikov](https://github.com/tsolodov)). -* Fix bug in removal of existing part directory [#49365](https://github.com/ClickHouse/ClickHouse/pull/49365) ([alesapin](https://github.com/alesapin)). -* Properly fix GCS when HMAC is used [#49390](https://github.com/ClickHouse/ClickHouse/pull/49390) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix fuzz bug when subquery set is not built when reading from remote() [#49425](https://github.com/ClickHouse/ClickHouse/pull/49425) ([Alexander Gololobov](https://github.com/davenger)). -* Invert `shutdown_wait_unfinished_queries` [#49427](https://github.com/ClickHouse/ClickHouse/pull/49427) ([Konstantin Bogdanov](https://github.com/thevar1able)). -* (Experimental zero-copy replication) Fix another zero copy bug [#49473](https://github.com/ClickHouse/ClickHouse/pull/49473) ([alesapin](https://github.com/alesapin)). -* Fix postgres database setting [#49481](https://github.com/ClickHouse/ClickHouse/pull/49481) ([Mal Curtis](https://github.com/snikch)). -* Correctly handle `s3Cluster` arguments [#49490](https://github.com/ClickHouse/ClickHouse/pull/49490) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix bug in TraceCollector destructor. [#49508](https://github.com/ClickHouse/ClickHouse/pull/49508) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix AsynchronousReadIndirectBufferFromRemoteFS breaking on short seeks [#49525](https://github.com/ClickHouse/ClickHouse/pull/49525) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix dictionaries loading order [#49560](https://github.com/ClickHouse/ClickHouse/pull/49560) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Forbid the change of data type of Object('json') column [#49563](https://github.com/ClickHouse/ClickHouse/pull/49563) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix stress test (Logical error: Expected 7134 >= 11030) [#49623](https://github.com/ClickHouse/ClickHouse/pull/49623) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix bug in DISTINCT [#49628](https://github.com/ClickHouse/ClickHouse/pull/49628) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix: DISTINCT in order with zero values in non-sorted columns [#49636](https://github.com/ClickHouse/ClickHouse/pull/49636) ([Igor Nikonov](https://github.com/devcrafter)). 
-* Fix one-off error in big integers found by UBSan with fuzzer [#49645](https://github.com/ClickHouse/ClickHouse/pull/49645) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix reading from sparse columns after restart [#49660](https://github.com/ClickHouse/ClickHouse/pull/49660) ([Anton Popov](https://github.com/CurtizJ)). -* Fix assert in SpanHolder::finish() with fibers [#49673](https://github.com/ClickHouse/ClickHouse/pull/49673) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix short circuit functions and mutations with sparse arguments [#49716](https://github.com/ClickHouse/ClickHouse/pull/49716) ([Anton Popov](https://github.com/CurtizJ)). -* Fix writing appended files to incremental backups [#49725](https://github.com/ClickHouse/ClickHouse/pull/49725) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix "There is no physical column _row_exists in table" error occurring during lightweight delete mutation on a table with Object column. [#49737](https://github.com/ClickHouse/ClickHouse/pull/49737) ([Alexander Gololobov](https://github.com/davenger)). -* Fix msan issue in randomStringUTF8(uneven number) [#49750](https://github.com/ClickHouse/ClickHouse/pull/49750) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix aggregate function kolmogorovSmirnovTest [#49768](https://github.com/ClickHouse/ClickHouse/pull/49768) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). -* Fix settings aliases in native protocol [#49776](https://github.com/ClickHouse/ClickHouse/pull/49776) ([Azat Khuzhin](https://github.com/azat)). -* Fix `arrayMap` with array of tuples with single argument [#49789](https://github.com/ClickHouse/ClickHouse/pull/49789) ([Anton Popov](https://github.com/CurtizJ)). -* Fix per-query IO/BACKUPs throttling settings [#49797](https://github.com/ClickHouse/ClickHouse/pull/49797) ([Azat Khuzhin](https://github.com/azat)). -* Fix setting NULL in profile definition [#49831](https://github.com/ClickHouse/ClickHouse/pull/49831) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix a bug with projections and the aggregate_functions_null_for_empty setting (for query_plan_optimize_projection) [#49873](https://github.com/ClickHouse/ClickHouse/pull/49873) ([Amos Bird](https://github.com/amosbird)). -* Fix processing pending batch for Distributed async INSERT after restart [#49884](https://github.com/ClickHouse/ClickHouse/pull/49884) ([Azat Khuzhin](https://github.com/azat)). -* Fix assertion in CacheMetadata::doCleanup [#49914](https://github.com/ClickHouse/ClickHouse/pull/49914) ([Kseniia Sumarokova](https://github.com/kssenii)). -* fix `is_prefix` in OptimizeRegularExpression [#49919](https://github.com/ClickHouse/ClickHouse/pull/49919) ([Han Fei](https://github.com/hanfei1991)). -* Fix metrics `WriteBufferFromS3Bytes`, `WriteBufferFromS3Microseconds` and `WriteBufferFromS3RequestsErrors` [#49930](https://github.com/ClickHouse/ClickHouse/pull/49930) ([Aleksandr Musorin](https://github.com/AVMusorin)). -* Fix IPv6 encoding in protobuf [#49933](https://github.com/ClickHouse/ClickHouse/pull/49933) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix possible Logical error on bad Nullable parsing for text formats [#49960](https://github.com/ClickHouse/ClickHouse/pull/49960) ([Kruglov Pavel](https://github.com/Avogar)). -* Add setting output_format_parquet_compliant_nested_types to produce more compatible Parquet files [#50001](https://github.com/ClickHouse/ClickHouse/pull/50001) ([Michael Kolupaev](https://github.com/al13n321)). 
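A hedged illustration of the `output_format_parquet_compliant_nested_types` entry above; the query and output file name are placeholders, shown only to indicate where the setting applies:

```sql
-- Produce Parquet output with more compatible naming for nested types.
SET output_format_parquet_compliant_nested_types = 1;

SELECT number, [number, number + 1] AS nested
FROM numbers(10)
INTO OUTFILE 'compat.parquet'
FORMAT Parquet;
```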
-* Fix logical error in stress test "Not enough space to add ..." [#50021](https://github.com/ClickHouse/ClickHouse/pull/50021) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Avoid deadlock when starting table in attach thread of `ReplicatedMergeTree` [#50026](https://github.com/ClickHouse/ClickHouse/pull/50026) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix assert in SpanHolder::finish() with fibers attempt 2 [#50034](https://github.com/ClickHouse/ClickHouse/pull/50034) ([Kruglov Pavel](https://github.com/Avogar)). -* Add proper escaping for DDL OpenTelemetry context serialization [#50045](https://github.com/ClickHouse/ClickHouse/pull/50045) ([Azat Khuzhin](https://github.com/azat)). -* Fix reporting broken projection parts [#50052](https://github.com/ClickHouse/ClickHouse/pull/50052) ([Amos Bird](https://github.com/amosbird)). -* JIT compilation not equals NaN fix [#50056](https://github.com/ClickHouse/ClickHouse/pull/50056) ([Maksim Kita](https://github.com/kitaisreal)). -* Fix crashing in case of Replicated database without arguments [#50058](https://github.com/ClickHouse/ClickHouse/pull/50058) ([Azat Khuzhin](https://github.com/azat)). -* Fix crash with `multiIf` and constant condition and nullable arguments [#50123](https://github.com/ClickHouse/ClickHouse/pull/50123) ([Anton Popov](https://github.com/CurtizJ)). -* Fix invalid index analysis for date related keys [#50153](https://github.com/ClickHouse/ClickHouse/pull/50153) ([Amos Bird](https://github.com/amosbird)). -* do not allow modify order by when there are no order by cols [#50154](https://github.com/ClickHouse/ClickHouse/pull/50154) ([Han Fei](https://github.com/hanfei1991)). -* Fix broken index analysis when binary operator contains a null constant argument [#50177](https://github.com/ClickHouse/ClickHouse/pull/50177) ([Amos Bird](https://github.com/amosbird)). -* clickhouse-client: disallow usage of `--query` and `--queries-file` at the same time [#50210](https://github.com/ClickHouse/ClickHouse/pull/50210) ([Alexey Gerasimchuk](https://github.com/Demilivor)). -* Fix UB for INTO OUTFILE extensions (APPEND / AND STDOUT) and WATCH EVENTS [#50216](https://github.com/ClickHouse/ClickHouse/pull/50216) ([Azat Khuzhin](https://github.com/azat)). -* Fix skipping spaces at end of row in CustomSeparatedIgnoreSpaces format [#50224](https://github.com/ClickHouse/ClickHouse/pull/50224) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix iceberg metadata parsing [#50232](https://github.com/ClickHouse/ClickHouse/pull/50232) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix nested distributed SELECT in WITH clause [#50234](https://github.com/ClickHouse/ClickHouse/pull/50234) ([Azat Khuzhin](https://github.com/azat)). -* Fix msan issue in keyed siphash [#50245](https://github.com/ClickHouse/ClickHouse/pull/50245) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix bugs in Poco sockets in non-blocking mode, use true non-blocking sockets [#50252](https://github.com/ClickHouse/ClickHouse/pull/50252) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix checksum calculation for backup entries [#50264](https://github.com/ClickHouse/ClickHouse/pull/50264) ([Vitaly Baranov](https://github.com/vitlibar)). -* Comparison functions NaN fix [#50287](https://github.com/ClickHouse/ClickHouse/pull/50287) ([Maksim Kita](https://github.com/kitaisreal)). -* JIT aggregation nullable key fix [#50291](https://github.com/ClickHouse/ClickHouse/pull/50291) ([Maksim Kita](https://github.com/kitaisreal)). 
-* Fix clickhouse-local crashing when writing empty Arrow or Parquet output [#50328](https://github.com/ClickHouse/ClickHouse/pull/50328) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix crash when Pool::Entry::disconnect() is called [#50334](https://github.com/ClickHouse/ClickHouse/pull/50334) ([Val Doroshchuk](https://github.com/valbok)). -* Improved fetch part by holding directory lock longer [#50339](https://github.com/ClickHouse/ClickHouse/pull/50339) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix bitShift* functions with both constant arguments [#50343](https://github.com/ClickHouse/ClickHouse/pull/50343) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix Keeper deadlock on exception when preprocessing requests. [#50387](https://github.com/ClickHouse/ClickHouse/pull/50387) ([frinkr](https://github.com/frinkr)). -* Fix hashing of const integer values [#50421](https://github.com/ClickHouse/ClickHouse/pull/50421) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix merge_tree_min_rows_for_seek/merge_tree_min_bytes_for_seek for data skipping indexes [#50432](https://github.com/ClickHouse/ClickHouse/pull/50432) ([Azat Khuzhin](https://github.com/azat)). -* Limit the number of in-flight tasks for loading outdated parts [#50450](https://github.com/ClickHouse/ClickHouse/pull/50450) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Keeper fix: apply uncommitted state after snapshot install [#50483](https://github.com/ClickHouse/ClickHouse/pull/50483) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix incorrect constant folding [#50536](https://github.com/ClickHouse/ClickHouse/pull/50536) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix logical error in stress test (Not enough space to add ...) [#50583](https://github.com/ClickHouse/ClickHouse/pull/50583) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix converting Null to LowCardinality(Nullable) in values table function [#50637](https://github.com/ClickHouse/ClickHouse/pull/50637) ([Kruglov Pavel](https://github.com/Avogar)). -* Revert invalid RegExpTreeDictionary optimization [#50642](https://github.com/ClickHouse/ClickHouse/pull/50642) ([Johann Gan](https://github.com/johanngan)). - -### ClickHouse release 23.4, 2023-04-26 - -#### Backward Incompatible Change -* Formatter '%M' in function formatDateTime() now prints the month name instead of the minutes. This makes the behavior consistent with MySQL. The previous behavior can be restored using setting "formatdatetime_parsedatetime_m_is_month_name = 0". [#47246](https://github.com/ClickHouse/ClickHouse/pull/47246) ([Robert Schulze](https://github.com/rschu1ze)). -* This change makes sense only if you are using the virtual filesystem cache. If `path` in the virtual filesystem cache configuration is not empty and is not an absolute path, then it will be put in `/caches/`. [#48784](https://github.com/ClickHouse/ClickHouse/pull/48784) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Primary/secondary indices and sorting keys with identical expressions are now rejected. This behavior can be disabled using setting `allow_suspicious_indices`. [#48536](https://github.com/ClickHouse/ClickHouse/pull/48536) ([凌涛](https://github.com/lingtaolf)). - -#### New Feature -* Support new aggregate function `quantileGK`/`quantilesGK`, like [approx_percentile](https://spark.apache.org/docs/latest/api/sql/index.html#approx_percentile) in spark. 
Greenwald-Khanna algorithm refer to http://infolab.stanford.edu/~datar/courses/cs361a/papers/quantiles.pdf. [#46428](https://github.com/ClickHouse/ClickHouse/pull/46428) ([李扬](https://github.com/taiyang-li)). -* Add a statement `SHOW COLUMNS` which shows distilled information from system.columns. [#48017](https://github.com/ClickHouse/ClickHouse/pull/48017) ([Robert Schulze](https://github.com/rschu1ze)). -* Added `LIGHTWEIGHT` and `PULL` modifiers for `SYSTEM SYNC REPLICA` query. `LIGHTWEIGHT` version waits for fetches and drop-ranges only (merges and mutations are ignored). `PULL` version pulls new entries from ZooKeeper and does not wait for them. Fixes [#47794](https://github.com/ClickHouse/ClickHouse/issues/47794). [#48085](https://github.com/ClickHouse/ClickHouse/pull/48085) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Add `kafkaMurmurHash` function for compatibility with Kafka DefaultPartitioner. Closes [#47834](https://github.com/ClickHouse/ClickHouse/issues/47834). [#48185](https://github.com/ClickHouse/ClickHouse/pull/48185) ([Nikolay Degterinsky](https://github.com/evillique)). -* Allow to easily create a user with the same grants as the current user by using `GRANT CURRENT GRANTS`. [#48262](https://github.com/ClickHouse/ClickHouse/pull/48262) ([pufit](https://github.com/pufit)). -* Add statistical aggregate function `kolmogorovSmirnovTest`. Close [#48228](https://github.com/ClickHouse/ClickHouse/issues/48228). [#48325](https://github.com/ClickHouse/ClickHouse/pull/48325) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). -* Added a `lost_part_count` column to the `system.replicas` table. The column value shows the total number of lost parts in the corresponding table. Value is stored in zookeeper and can be used instead of not persistent `ReplicatedDataLoss` profile event for monitoring. [#48526](https://github.com/ClickHouse/ClickHouse/pull/48526) ([Sergei Trifonov](https://github.com/serxa)). -* Add `soundex` function for compatibility. Closes [#39880](https://github.com/ClickHouse/ClickHouse/issues/39880). [#48567](https://github.com/ClickHouse/ClickHouse/pull/48567) ([FriendLey](https://github.com/FriendLey)). -* Support `Map` type for JSONExtract. [#48629](https://github.com/ClickHouse/ClickHouse/pull/48629) ([李扬](https://github.com/taiyang-li)). -* Add `PrettyJSONEachRow` format to output pretty JSON with new line delimiters and 4 space indents. [#48898](https://github.com/ClickHouse/ClickHouse/pull/48898) ([Kruglov Pavel](https://github.com/Avogar)). -* Add `ParquetMetadata` input format to read Parquet file metadata. [#48911](https://github.com/ClickHouse/ClickHouse/pull/48911) ([Kruglov Pavel](https://github.com/Avogar)). -* Add `extractKeyValuePairs` function to extract key value pairs from strings. Input strings might contain noise (i.e. log files / do not need to be 100% formatted in key-value-pair format), the algorithm will look for key value pairs matching the arguments passed to the function. As of now, function accepts the following arguments: `data_column` (mandatory), `key_value_pair_delimiter` (defaults to `:`), `pair_delimiters` (defaults to `\space \, \;`) and `quoting_character` (defaults to double quotes). [#43606](https://github.com/ClickHouse/ClickHouse/pull/43606) ([Arthur Passos](https://github.com/arthurpassos)). -* Functions replaceOne(), replaceAll(), replaceRegexpOne() and replaceRegexpAll() can now be called with non-const pattern and replacement arguments. 
[#46589](https://github.com/ClickHouse/ClickHouse/pull/46589) ([Robert Schulze](https://github.com/rschu1ze)). -* Added functions to work with columns of type `Map`: `mapConcat`, `mapSort`, `mapExists`. [#48071](https://github.com/ClickHouse/ClickHouse/pull/48071) ([Anton Popov](https://github.com/CurtizJ)). - -#### Performance Improvement -* Reading files in `Parquet` format is now much faster. IO and decoding are parallelized (controlled by `max_threads` setting), and only required data ranges are read. [#47964](https://github.com/ClickHouse/ClickHouse/pull/47964) ([Michael Kolupaev](https://github.com/al13n321)). -* If we run a mutation with IN (subquery) like this: `ALTER TABLE t UPDATE col='new value' WHERE id IN (SELECT id FROM huge_table)` and the table `t` has multiple parts than for each part a set for subquery `SELECT id FROM huge_table` is built in memory. And if there are many parts then this might consume a lot of memory (and lead to an OOM) and CPU. The solution is to introduce a short-lived cache of sets that are currently being built by mutation tasks. If another task of the same mutation is executed concurrently it can look up the set in the cache, wait for it to be built and reuse it. [#46835](https://github.com/ClickHouse/ClickHouse/pull/46835) ([Alexander Gololobov](https://github.com/davenger)). -* Only check dependencies if necessary when applying `ALTER TABLE` queries. [#48062](https://github.com/ClickHouse/ClickHouse/pull/48062) ([Raúl Marín](https://github.com/Algunenano)). -* Optimize function `mapUpdate`. [#48118](https://github.com/ClickHouse/ClickHouse/pull/48118) ([Anton Popov](https://github.com/CurtizJ)). -* Now an internal query to local replica is sent explicitly and data from it received through loopback interface. Setting `prefer_localhost_replica` is not respected for parallel replicas. This is needed for better scheduling and makes the code cleaner: the initiator is only responsible for coordinating of the reading process and merging results, continuously answering for requests while all the secondary queries read the data. Note: Using loopback interface is not so performant, otherwise some replicas could starve for tasks which could lead to even slower query execution and not utilizing all possible resources. The initialization of the coordinator is now even more lazy. All incoming requests contain the information about the reading algorithm we initialize the coordinator with it when first request comes. If any replica decides to read with a different algorithm–an exception will be thrown and a query will be aborted. [#48246](https://github.com/ClickHouse/ClickHouse/pull/48246) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Do not build set for the right side of `IN` clause with subquery when it is used only for analysis of skip indexes, and they are disabled by setting (`use_skip_indexes=0`). Previously it might affect the performance of queries. [#48299](https://github.com/ClickHouse/ClickHouse/pull/48299) ([Anton Popov](https://github.com/CurtizJ)). -* Query processing is parallelized right after reading `FROM file(...)`. Related to [#38755](https://github.com/ClickHouse/ClickHouse/issues/38755). [#48525](https://github.com/ClickHouse/ClickHouse/pull/48525) ([Igor Nikonov](https://github.com/devcrafter)). Query processing is parallelized right after reading from any data source. Affected data sources are mostly simple or external storages like table functions `url`, `file`. 
[#48727](https://github.com/ClickHouse/ClickHouse/pull/48727) ([Igor Nikonov](https://github.com/devcrafter)). This is controlled by the setting `parallelize_output_from_storages` which is not enabled by default. -* Lowered contention of ThreadPool mutex (may increase performance for a huge amount of small jobs). [#48750](https://github.com/ClickHouse/ClickHouse/pull/48750) ([Sergei Trifonov](https://github.com/serxa)). -* Reduce memory usage for multiple `ALTER DELETE` mutations. [#48522](https://github.com/ClickHouse/ClickHouse/pull/48522) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Remove the excessive connection attempts if the `skip_unavailable_shards` setting is enabled. [#48771](https://github.com/ClickHouse/ClickHouse/pull/48771) ([Azat Khuzhin](https://github.com/azat)). - -#### Experimental Feature -* Entries in the query cache are now squashed to max_block_size and compressed. [#45912](https://github.com/ClickHouse/ClickHouse/pull/45912) ([Robert Schulze](https://github.com/rschu1ze)). -* It is now possible to define per-user quotas in the query cache. [#48284](https://github.com/ClickHouse/ClickHouse/pull/48284) ([Robert Schulze](https://github.com/rschu1ze)). -* Some fixes for parallel replicas [#48433](https://github.com/ClickHouse/ClickHouse/pull/48433) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Implement zero-copy-replication (an experimental feature) on encrypted disks. [#48741](https://github.com/ClickHouse/ClickHouse/pull/48741) ([Vitaly Baranov](https://github.com/vitlibar)). - -#### Improvement -* Increase default value for `connect_timeout_with_failover_ms` to 1000 ms (because of adding async connections in https://github.com/ClickHouse/ClickHouse/pull/47229) . Closes [#5188](https://github.com/ClickHouse/ClickHouse/issues/5188). [#49009](https://github.com/ClickHouse/ClickHouse/pull/49009) ([Kruglov Pavel](https://github.com/Avogar)). -* Several improvements around data lakes: - Make `Iceberg` work with non-partitioned data. - Support `Iceberg` format version v2 (previously only v1 was supported) - Support reading partitioned data for `DeltaLake`/`Hudi` - Faster reading of `DeltaLake` metadata by using Delta's checkpoint files - Fixed incorrect `Hudi` reads: previously it incorrectly chose which data to read and therefore was able to read correctly only small size tables - Made these engines to pickup updates of changed data (previously the state was set on table creation) - Make proper testing for `Iceberg`/`DeltaLake`/`Hudi` using spark. [#47307](https://github.com/ClickHouse/ClickHouse/pull/47307) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Add async connection to socket and async writing to socket. Make creating connections and sending query/external tables async across shards. Refactor code with fibers. Closes [#46931](https://github.com/ClickHouse/ClickHouse/issues/46931). We will be able to increase `connect_timeout_with_failover_ms` by default after this PR (https://github.com/ClickHouse/ClickHouse/issues/5188). [#47229](https://github.com/ClickHouse/ClickHouse/pull/47229) ([Kruglov Pavel](https://github.com/Avogar)). -* Support config sections `keeper`/`keeper_server` as an alternative to `zookeeper`. Close [#34766](https://github.com/ClickHouse/ClickHouse/issues/34766) , [#34767](https://github.com/ClickHouse/ClickHouse/issues/34767). [#35113](https://github.com/ClickHouse/ClickHouse/pull/35113) ([李扬](https://github.com/taiyang-li)). 
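Relating to the query-cache entries in the Experimental Feature list above, a minimal usage sketch, assuming the cache is toggled per query via the `use_query_cache` setting as in this release line (the table name is hypothetical):

```sql
-- First run computes and stores the result; a repeated run can be served
-- from the query cache (experimental in this release).
SELECT count() FROM hits SETTINGS use_query_cache = 1;
```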
-* It is now possible to set the _secure_ flag in named collections for a dictionary with a ClickHouse table source. Addresses [#38450](https://github.com/ClickHouse/ClickHouse/issues/38450). [#46323](https://github.com/ClickHouse/ClickHouse/pull/46323) ([Ilya Golshtein](https://github.com/ilejn)). -* The `bitCount` function now supports the `FixedString` and `String` data types. [#49044](https://github.com/ClickHouse/ClickHouse/pull/49044) ([flynn](https://github.com/ucasfl)). -* Added configurable retries for all operations with [Zoo]Keeper for Backup queries. [#47224](https://github.com/ClickHouse/ClickHouse/pull/47224) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Enable `use_environment_credentials` for S3 by default, so the entire provider chain is constructed by default. [#47397](https://github.com/ClickHouse/ClickHouse/pull/47397) ([Antonio Andelic](https://github.com/antonio2368)). -* The JSON_VALUE function is similar to Spark's get_json_object function, which supports getting a value from a JSON string by a path like '$.key'. There are still some differences: 1. Spark's get_json_object returns null when the path does not exist, while JSON_VALUE returns an empty string; 2. Spark's get_json_object returns a complex value, such as a JSON object/array value, while JSON_VALUE returns an empty string. [#47494](https://github.com/ClickHouse/ClickHouse/pull/47494) ([KevinyhZou](https://github.com/KevinyhZou)). -* For `use_structure_from_insertion_table_in_table_functions`, the insert table structure is now propagated to the table function more flexibly. Fixed an issue with name mapping and using virtual columns. There is no more need for the 'auto' setting. [#47962](https://github.com/ClickHouse/ClickHouse/pull/47962) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Do not continue retrying to connect to Keeper if the query is killed or over limits. [#47985](https://github.com/ClickHouse/ClickHouse/pull/47985) ([Raúl Marín](https://github.com/Algunenano)). -* Support Enum output/input in `BSONEachRow`, allow all map key types and avoid extra calculations on output. [#48122](https://github.com/ClickHouse/ClickHouse/pull/48122) ([Kruglov Pavel](https://github.com/Avogar)). -* Support more ClickHouse types in `ORC`/`Arrow`/`Parquet` formats: Enum(8|16), (U)Int(128|256), Decimal256 (for ORC), allow reading IPv4 from Int32 values (ORC outputs IPv4 as Int32, and we couldn't read it back), fix reading Nullable(IPv6) from binary data for `ORC`. [#48126](https://github.com/ClickHouse/ClickHouse/pull/48126) ([Kruglov Pavel](https://github.com/Avogar)). -* Add columns `perform_ttl_move_on_insert`, `load_balancing` to the table `system.storage_policies`, modify the column `volume_type` type to `Enum8`. [#48167](https://github.com/ClickHouse/ClickHouse/pull/48167) ([lizhuoyu5](https://github.com/lzydmxy)). -* Added support for the `BACKUP ALL` command, which backs up all tables and databases, including temporary and system ones. [#48189](https://github.com/ClickHouse/ClickHouse/pull/48189) ([Vitaly Baranov](https://github.com/vitlibar)). -* Function `mapFromArrays` now supports the `Map` type as an input. [#48207](https://github.com/ClickHouse/ClickHouse/pull/48207) ([李扬](https://github.com/taiyang-li)). -* The output of some SHOW PROCESSLIST is now sorted. [#48241](https://github.com/ClickHouse/ClickHouse/pull/48241) ([Robert Schulze](https://github.com/rschu1ze)).
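A sketch of the `BACKUP ALL` entry above; the destination disk and archive name are hypothetical and assume a preconfigured backup disk:

```sql
-- Back up all databases and tables (including temporary and system ones)
-- to a disk named 'backups' configured on the server.
BACKUP ALL TO Disk('backups', 'full_2023_04.zip');
```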
-* Per-query/per-server throttling for remote IO/local IO/BACKUPs (server settings: `max_remote_read_network_bandwidth_for_server`, `max_remote_write_network_bandwidth_for_server`, `max_local_read_bandwidth_for_server`, `max_local_write_bandwidth_for_server`, `max_backup_bandwidth_for_server`, settings: `max_remote_read_network_bandwidth`, `max_remote_write_network_bandwidth`, `max_local_read_bandwidth`, `max_local_write_bandwidth`, `max_backup_bandwidth`). [#48242](https://github.com/ClickHouse/ClickHouse/pull/48242) ([Azat Khuzhin](https://github.com/azat)). -* Support more types in `CapnProto` format: Map, (U)Int(128|256), Decimal(128|256). Allow integer conversions during input/output. [#48257](https://github.com/ClickHouse/ClickHouse/pull/48257) ([Kruglov Pavel](https://github.com/Avogar)). -* Don't throw CURRENT_WRITE_BUFFER_IS_EXHAUSTED for normal behaviour. [#48288](https://github.com/ClickHouse/ClickHouse/pull/48288) ([Raúl Marín](https://github.com/Algunenano)). -* Add new setting `keeper_map_strict_mode` which enforces extra guarantees on operations made on top of `KeeperMap` tables. [#48293](https://github.com/ClickHouse/ClickHouse/pull/48293) ([Antonio Andelic](https://github.com/antonio2368)). -* Check primary key type for simple dictionary is native unsigned integer type Add setting `check_dictionary_primary_key ` for compatibility(set `check_dictionary_primary_key =false` to disable checking). [#48335](https://github.com/ClickHouse/ClickHouse/pull/48335) ([lizhuoyu5](https://github.com/lzydmxy)). -* Don't replicate mutations for `KeeperMap` because it's unnecessary. [#48354](https://github.com/ClickHouse/ClickHouse/pull/48354) ([Antonio Andelic](https://github.com/antonio2368)). -* Allow to write/read unnamed tuple as nested Message in Protobuf format. Tuple elements and Message fields are matched by position. [#48390](https://github.com/ClickHouse/ClickHouse/pull/48390) ([Kruglov Pavel](https://github.com/Avogar)). -* Support `additional_table_filters` and `additional_result_filter` settings in the new planner. Also, add a documentation entry for `additional_result_filter`. [#48405](https://github.com/ClickHouse/ClickHouse/pull/48405) ([Dmitry Novik](https://github.com/novikd)). -* `parseDateTime` now understands format string '%f' (fractional seconds). [#48420](https://github.com/ClickHouse/ClickHouse/pull/48420) ([Robert Schulze](https://github.com/rschu1ze)). -* Format string "%f" in formatDateTime() now prints "000000" if the formatted value has no fractional seconds, the previous behavior (single zero) can be restored using setting "formatdatetime_f_prints_single_zero = 1". [#48422](https://github.com/ClickHouse/ClickHouse/pull/48422) ([Robert Schulze](https://github.com/rschu1ze)). -* Don't replicate DELETE and TRUNCATE for KeeperMap. [#48434](https://github.com/ClickHouse/ClickHouse/pull/48434) ([Antonio Andelic](https://github.com/antonio2368)). -* Generate valid Decimals and Bools in generateRandom function. [#48436](https://github.com/ClickHouse/ClickHouse/pull/48436) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow trailing commas in expression list of SELECT query, for example `SELECT a, b, c, FROM table`. Closes [#37802](https://github.com/ClickHouse/ClickHouse/issues/37802). [#48438](https://github.com/ClickHouse/ClickHouse/pull/48438) ([Nikolay Degterinsky](https://github.com/evillique)). -* Override `CLICKHOUSE_USER` and `CLICKHOUSE_PASSWORD` environment variables with `--user` and `--password` client parameters. 
Closes [#38909](https://github.com/ClickHouse/ClickHouse/issues/38909). [#48440](https://github.com/ClickHouse/ClickHouse/pull/48440) ([Nikolay Degterinsky](https://github.com/evillique)). -* Added retries to loading of data parts in `MergeTree` tables in case of retryable errors. [#48442](https://github.com/ClickHouse/ClickHouse/pull/48442) ([Anton Popov](https://github.com/CurtizJ)). -* Add support for `Date`, `Date32`, `DateTime`, `DateTime64` data types to `arrayMin`, `arrayMax`, `arrayDifference` functions. Closes [#21645](https://github.com/ClickHouse/ClickHouse/issues/21645). [#48445](https://github.com/ClickHouse/ClickHouse/pull/48445) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add support for `{server_uuid}` macro. It is useful for identifying replicas in autoscaled clusters when new replicas are constantly added and removed in runtime. This closes [#48554](https://github.com/ClickHouse/ClickHouse/issues/48554). [#48563](https://github.com/ClickHouse/ClickHouse/pull/48563) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The installation script will create a hard link instead of copying if it is possible. [#48578](https://github.com/ClickHouse/ClickHouse/pull/48578) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Support `SHOW TABLE` syntax meaning the same as `SHOW CREATE TABLE`. Closes [#48580](https://github.com/ClickHouse/ClickHouse/issues/48580). [#48591](https://github.com/ClickHouse/ClickHouse/pull/48591) ([flynn](https://github.com/ucasfl)). -* HTTP temporary buffers now support working by evicting data from the virtual filesystem cache. [#48664](https://github.com/ClickHouse/ClickHouse/pull/48664) ([Vladimir C](https://github.com/vdimir)). -* Make Schema inference works for `CREATE AS SELECT`. Closes [#47599](https://github.com/ClickHouse/ClickHouse/issues/47599). [#48679](https://github.com/ClickHouse/ClickHouse/pull/48679) ([flynn](https://github.com/ucasfl)). -* Added a `replicated_max_mutations_in_one_entry` setting for `ReplicatedMergeTree` that allows limiting the number of mutation commands per one `MUTATE_PART` entry (default is 10000). [#48731](https://github.com/ClickHouse/ClickHouse/pull/48731) ([Alexander Tokmakov](https://github.com/tavplubix)). -* In AggregateFunction types, don't count unused arena bytes as `read_bytes`. [#48745](https://github.com/ClickHouse/ClickHouse/pull/48745) ([Raúl Marín](https://github.com/Algunenano)). -* Fix some MySQL-related settings not being handled with the MySQL dictionary source + named collection. Closes [#48402](https://github.com/ClickHouse/ClickHouse/issues/48402). [#48759](https://github.com/ClickHouse/ClickHouse/pull/48759) ([Kseniia Sumarokova](https://github.com/kssenii)). -* If a user set `max_single_part_upload_size` to a very large value, it can lead to a crash due to a bug in the AWS S3 SDK. This fixes [#47679](https://github.com/ClickHouse/ClickHouse/issues/47679). [#48816](https://github.com/ClickHouse/ClickHouse/pull/48816) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix data race in `RabbitMQ` ([report](https://pastila.nl/?004f7100/de1505289ab5bb355e67ebe6c7cc8707)), refactor the code. [#48845](https://github.com/ClickHouse/ClickHouse/pull/48845) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Add aliases `name` and `part_name` form `system.parts` and `system.part_log`. Closes [#48718](https://github.com/ClickHouse/ClickHouse/issues/48718). 
[#48850](https://github.com/ClickHouse/ClickHouse/pull/48850) ([sichenzhao](https://github.com/sichenzhao)). -* Functions "arrayDifferenceSupport()", "arrayCumSum()" and "arrayCumSumNonNegative()" now support input arrays of wide integer types (U)Int128/256. [#48866](https://github.com/ClickHouse/ClickHouse/pull/48866) ([cluster](https://github.com/infdahai)). -* Multi-line history in clickhouse-client is now no longer padded. This makes pasting more natural. [#48870](https://github.com/ClickHouse/ClickHouse/pull/48870) ([Joanna Hulboj](https://github.com/jh0x)). -* Implement a slight improvement for the rare case when ClickHouse is run inside LXC and LXCFS is used. The LXCFS has an issue: sometimes it returns an error "Transport endpoint is not connected" on reading from the file inside `/proc`. This error was correctly logged into ClickHouse's server log. We have additionally workaround this issue by reopening a file. This is a minuscule change. [#48922](https://github.com/ClickHouse/ClickHouse/pull/48922) ([Real](https://github.com/RunningXie)). -* Improve memory accounting for prefetches. Randomise prefetch settings In CI. [#48973](https://github.com/ClickHouse/ClickHouse/pull/48973) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Correctly set headers for native copy operations on GCS. [#48981](https://github.com/ClickHouse/ClickHouse/pull/48981) ([Antonio Andelic](https://github.com/antonio2368)). -* Add support for specifying setting names in the command line with dashes instead of underscores, for example, `--max-threads` instead of `--max_threads`. Additionally, support Unicode dash characters like `—` instead of `--` - this is useful when you communicate with a team in another company, and a manager from that team copy-pasted code from MS Word. [#48985](https://github.com/ClickHouse/ClickHouse/pull/48985) ([alekseygolub](https://github.com/alekseygolub)). -* Add fallback to password authentication when authentication with SSL user certificate has failed. Closes [#48974](https://github.com/ClickHouse/ClickHouse/issues/48974). [#48989](https://github.com/ClickHouse/ClickHouse/pull/48989) ([Nikolay Degterinsky](https://github.com/evillique)). -* Improve the embedded dashboard. Close [#46671](https://github.com/ClickHouse/ClickHouse/issues/46671). [#49036](https://github.com/ClickHouse/ClickHouse/pull/49036) ([Kevin Zhang](https://github.com/Kinzeng)). -* Add profile events for log messages, so you can easily see the count of log messages by severity. [#49042](https://github.com/ClickHouse/ClickHouse/pull/49042) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* In previous versions, the `LineAsString` format worked inconsistently when the parallel parsing was enabled or not, in presence of DOS or macOS Classic line breaks. This closes [#49039](https://github.com/ClickHouse/ClickHouse/issues/49039). [#49052](https://github.com/ClickHouse/ClickHouse/pull/49052) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The exception message about the unparsed query parameter will also tell about the name of the parameter. Reimplement [#48878](https://github.com/ClickHouse/ClickHouse/issues/48878). Close [#48772](https://github.com/ClickHouse/ClickHouse/issues/48772). [#49061](https://github.com/ClickHouse/ClickHouse/pull/49061) ([Alexey Milovidov](https://github.com/alexey-milovidov)). - -#### Build/Testing/Packaging Improvement -* Update time zones. 
The following were updated: Africa/Cairo, Africa/Casablanca, Africa/El_Aaiun, America/Bogota, America/Cambridge_Bay, America/Ciudad_Juarez, America/Godthab, America/Inuvik, America/Iqaluit, America/Nuuk, America/Ojinaga, America/Pangnirtung, America/Rankin_Inlet, America/Resolute, America/Whitehorse, America/Yellowknife, Asia/Gaza, Asia/Hebron, Asia/Kuala_Lumpur, Asia/Singapore, Canada/Yukon, Egypt, Europe/Kirov, Europe/Volgograd, Singapore. [#48572](https://github.com/ClickHouse/ClickHouse/pull/48572) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Reduce the number of dependencies in the header files to speed up the build. [#47984](https://github.com/ClickHouse/ClickHouse/pull/47984) ([Dmitry Novik](https://github.com/novikd)). -* Randomize compression of marks and indices in tests. [#48286](https://github.com/ClickHouse/ClickHouse/pull/48286) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Bump internal ZSTD from 1.5.4 to 1.5.5. [#46797](https://github.com/ClickHouse/ClickHouse/pull/46797) ([Robert Schulze](https://github.com/rschu1ze)). -* Randomize vertical merges from compact to wide parts in tests. [#48287](https://github.com/ClickHouse/ClickHouse/pull/48287) ([Raúl Marín](https://github.com/Algunenano)). -* Support for CRC32 checksum in HDFS. Fix performance issues. [#48614](https://github.com/ClickHouse/ClickHouse/pull/48614) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Remove remainders of GCC support. [#48671](https://github.com/ClickHouse/ClickHouse/pull/48671) ([Robert Schulze](https://github.com/rschu1ze)). -* Add CI run with new analyzer infrastructure enabled. [#48719](https://github.com/ClickHouse/ClickHouse/pull/48719) ([Dmitry Novik](https://github.com/novikd)). - -#### Bug Fix (user-visible misbehavior in an official stable release) - -* Fix system.query_views_log for MVs that are pushed from background threads [#46668](https://github.com/ClickHouse/ClickHouse/pull/46668) ([Azat Khuzhin](https://github.com/azat)). -* Fix several `RENAME COLUMN` bugs [#46946](https://github.com/ClickHouse/ClickHouse/pull/46946) ([alesapin](https://github.com/alesapin)). -* Fix minor hiliting issues in clickhouse-format [#47610](https://github.com/ClickHouse/ClickHouse/pull/47610) ([Natasha Murashkina](https://github.com/murfel)). -* Fix a bug in LLVM's libc++ leading to a crash for uploading parts to S3 which size is greater than INT_MAX [#47693](https://github.com/ClickHouse/ClickHouse/pull/47693) ([Azat Khuzhin](https://github.com/azat)). -* Fix overflow in the `sparkbar` function [#48121](https://github.com/ClickHouse/ClickHouse/pull/48121) ([Vladimir C](https://github.com/vdimir)). -* Fix race in S3 [#48190](https://github.com/ClickHouse/ClickHouse/pull/48190) ([Anton Popov](https://github.com/CurtizJ)). -* Disable JIT for aggregate functions due to inconsistent behavior [#48195](https://github.com/ClickHouse/ClickHouse/pull/48195) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix alter formatting (minor) [#48289](https://github.com/ClickHouse/ClickHouse/pull/48289) ([Natasha Murashkina](https://github.com/murfel)). -* Fix CPU usage in RabbitMQ (was worsened in 23.2 after [#44404](https://github.com/ClickHouse/ClickHouse/issues/44404)) [#48311](https://github.com/ClickHouse/ClickHouse/pull/48311) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix crash in EXPLAIN PIPELINE for Merge over Distributed [#48320](https://github.com/ClickHouse/ClickHouse/pull/48320) ([Azat Khuzhin](https://github.com/azat)). 
-* Fix serializing LowCardinality as Arrow dictionary [#48361](https://github.com/ClickHouse/ClickHouse/pull/48361) ([Kruglov Pavel](https://github.com/Avogar)). -* Reset downloader for cache file segment in TemporaryFileStream [#48386](https://github.com/ClickHouse/ClickHouse/pull/48386) ([Vladimir C](https://github.com/vdimir)). -* Fix possible SYSTEM SYNC REPLICA stuck in case of DROP/REPLACE PARTITION [#48391](https://github.com/ClickHouse/ClickHouse/pull/48391) ([Azat Khuzhin](https://github.com/azat)). -* Fix a startup error when loading a distributed table that depends on a dictionary [#48419](https://github.com/ClickHouse/ClickHouse/pull/48419) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). -* Don't check dependencies when renaming system tables automatically [#48431](https://github.com/ClickHouse/ClickHouse/pull/48431) ([Raúl Marín](https://github.com/Algunenano)). -* Update only affected rows in KeeperMap storage [#48435](https://github.com/ClickHouse/ClickHouse/pull/48435) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix possible segfault in the VFS cache [#48469](https://github.com/ClickHouse/ClickHouse/pull/48469) ([Kseniia Sumarokova](https://github.com/kssenii)). -* `toTimeZone` function throws an error when no constant string is provided [#48471](https://github.com/ClickHouse/ClickHouse/pull/48471) ([Jordi Villar](https://github.com/jrdi)). -* Fix logical error with IPv4 in Protobuf, add support for Date32 [#48486](https://github.com/ClickHouse/ClickHouse/pull/48486) ([Kruglov Pavel](https://github.com/Avogar)). -* "changed" flag in system.settings was calculated incorrectly for settings with multiple values [#48516](https://github.com/ClickHouse/ClickHouse/pull/48516) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). -* Fix storage `Memory` with enabled compression [#48517](https://github.com/ClickHouse/ClickHouse/pull/48517) ([Anton Popov](https://github.com/CurtizJ)). -* Fix bracketed-paste mode messing up password input in the event of client reconnection [#48528](https://github.com/ClickHouse/ClickHouse/pull/48528) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix nested map for keys of IP and UUID types [#48556](https://github.com/ClickHouse/ClickHouse/pull/48556) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix an uncaught exception in case of parallel loader for hashed dictionaries [#48571](https://github.com/ClickHouse/ClickHouse/pull/48571) ([Azat Khuzhin](https://github.com/azat)). -* The `groupArray` aggregate function correctly works for empty result over nullable types [#48593](https://github.com/ClickHouse/ClickHouse/pull/48593) ([lgbo](https://github.com/lgbo-ustc)). -* Fix bug in Keeper when a node is not created with scheme `auth` in ACL sometimes. [#48595](https://github.com/ClickHouse/ClickHouse/pull/48595) ([Aleksei Filatov](https://github.com/aalexfvk)). -* Allow IPv4 comparison operators with UInt [#48611](https://github.com/ClickHouse/ClickHouse/pull/48611) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix possible error from cache [#48636](https://github.com/ClickHouse/ClickHouse/pull/48636) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Async inserts with empty data will no longer throw exception. [#48663](https://github.com/ClickHouse/ClickHouse/pull/48663) ([Anton Popov](https://github.com/CurtizJ)). -* Fix table dependencies in case of failed RENAME TABLE [#48683](https://github.com/ClickHouse/ClickHouse/pull/48683) ([Azat Khuzhin](https://github.com/azat)). 
-* If the primary key has duplicate columns (which is only possible for projections), in previous versions it might lead to a bug [#48838](https://github.com/ClickHouse/ClickHouse/pull/48838) ([Amos Bird](https://github.com/amosbird)). -* Fix for a race condition in ZooKeeper when joining send_thread/receive_thread [#48849](https://github.com/ClickHouse/ClickHouse/pull/48849) ([Alexander Gololobov](https://github.com/davenger)). -* Fix unexpected part name error when trying to drop a ignored detached part with zero copy replication [#48862](https://github.com/ClickHouse/ClickHouse/pull/48862) ([Michael Lex](https://github.com/mlex)). -* Fix reading `Date32` Parquet/Arrow column into not a `Date32` column [#48864](https://github.com/ClickHouse/ClickHouse/pull/48864) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix `UNKNOWN_IDENTIFIER` error while selecting from table with row policy and column with dots [#48976](https://github.com/ClickHouse/ClickHouse/pull/48976) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix aggregation by empty nullable strings [#48999](https://github.com/ClickHouse/ClickHouse/pull/48999) ([LiuNeng](https://github.com/liuneng1994)). - -### ClickHouse release 23.3 LTS, 2023-03-30 - -#### Upgrade Notes -* Lightweight DELETEs are production ready and enabled by default. The `DELETE` query for MergeTree tables is now available by default. -* The behavior of `*domain*RFC` and `netloc` functions is slightly changed: relaxed the set of symbols that are allowed in the URL authority for better conformance. [#46841](https://github.com/ClickHouse/ClickHouse/pull/46841) ([Azat Khuzhin](https://github.com/azat)). -* Prohibited creating tables based on KafkaEngine with DEFAULT/EPHEMERAL/ALIAS/MATERIALIZED statements for columns. [#47138](https://github.com/ClickHouse/ClickHouse/pull/47138) ([Aleksandr Musorin](https://github.com/AVMusorin)). -* An "asynchronous connection drain" feature is removed. Related settings and metrics are removed as well. It was an internal feature, so the removal should not affect users who had never heard about that feature. [#47486](https://github.com/ClickHouse/ClickHouse/pull/47486) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Support 256-bit Decimal data type (more than 38 digits) in `arraySum`/`Min`/`Max`/`Avg`/`Product`, `arrayCumSum`/`CumSumNonNegative`, `arrayDifference`, array construction, IN operator, query parameters, `groupArrayMovingSum`, statistical functions, `min`/`max`/`any`/`argMin`/`argMax`, PostgreSQL wire protocol, MySQL table engine and function, `sumMap`, `mapAdd`, `mapSubtract`, `arrayIntersect`. Add support for big integers in `arrayIntersect`. Statistical aggregate functions involving moments (such as `corr` or various `TTest`s) will use `Float64` as their internal representation (they were using `Decimal128` before this change, but it was pointless), and these functions can return `nan` instead of `inf` in case of infinite variance. Some functions were allowed on `Decimal256` data types but returned `Decimal128` in previous versions - now it is fixed. This closes [#47569](https://github.com/ClickHouse/ClickHouse/issues/47569). This closes [#44864](https://github.com/ClickHouse/ClickHouse/issues/44864). This closes [#28335](https://github.com/ClickHouse/ClickHouse/issues/28335). [#47594](https://github.com/ClickHouse/ClickHouse/pull/47594) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Make backup_threads/restore_threads server settings (instead of user settings). 
[#47881](https://github.com/ClickHouse/ClickHouse/pull/47881) ([Azat Khuzhin](https://github.com/azat)). -* Do not allow const and non-deterministic secondary indices [#46839](https://github.com/ClickHouse/ClickHouse/pull/46839) ([Anton Popov](https://github.com/CurtizJ)). - -#### New Feature -* Add a new mode for splitting the work on replicas using settings `parallel_replicas_custom_key` and `parallel_replicas_custom_key_filter_type`. If the cluster consists of a single shard with multiple replicas, up to `max_parallel_replicas` will be randomly picked and turned into shards. For each shard, a corresponding filter is added to the query on the initiator before being sent to the shard. If the cluster consists of multiple shards, it will behave the same as `sample_key` but with the possibility to define an arbitrary key. [#45108](https://github.com/ClickHouse/ClickHouse/pull/45108) ([Antonio Andelic](https://github.com/antonio2368)). -* An option to display partial result on cancel: Added query setting `partial_result_on_first_cancel` allowing the canceled query (e.g. due to Ctrl-C) to return a partial result. [#45689](https://github.com/ClickHouse/ClickHouse/pull/45689) ([Alexey Perevyshin](https://github.com/alexX512)). -* Added support of arbitrary tables engines for temporary tables (except for Replicated and KeeperMap engines). Close [#31497](https://github.com/ClickHouse/ClickHouse/issues/31497). [#46071](https://github.com/ClickHouse/ClickHouse/pull/46071) ([Roman Vasin](https://github.com/rvasin)). -* Add support for replication of user-defined SQL functions using centralized storage in Keeper. [#46085](https://github.com/ClickHouse/ClickHouse/pull/46085) ([Aleksei Filatov](https://github.com/aalexfvk)). -* Implement `system.server_settings` (similar to `system.settings`), which will contain server configurations. [#46550](https://github.com/ClickHouse/ClickHouse/pull/46550) ([pufit](https://github.com/pufit)). -* Support for `UNDROP TABLE` query. Closes [#46811](https://github.com/ClickHouse/ClickHouse/issues/46811). [#47241](https://github.com/ClickHouse/ClickHouse/pull/47241) ([chen](https://github.com/xiedeyantu)). -* Allow separate grants for named collections (e.g. to be able to give `SHOW/CREATE/ALTER/DROP named collection` access only to certain collections, instead of all at once). Closes [#40894](https://github.com/ClickHouse/ClickHouse/issues/40894). Add new access type `NAMED_COLLECTION_CONTROL` which is not given to user default unless explicitly added to the user config (is required to be able to do `GRANT ALL`), also `show_named_collections` is no longer obligatory to be manually specified for user default to be able to have full access rights as was in 23.2. [#46241](https://github.com/ClickHouse/ClickHouse/pull/46241) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Allow nested custom disks. Previously custom disks supported only flat disk structure. [#47106](https://github.com/ClickHouse/ClickHouse/pull/47106) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Introduce a function `widthBucket` (with a `WIDTH_BUCKET` alias for compatibility). [#42974](https://github.com/ClickHouse/ClickHouse/issues/42974). [#46790](https://github.com/ClickHouse/ClickHouse/pull/46790) ([avoiderboi](https://github.com/avoiderboi)). -* Add new function `parseDateTime`/`parseDateTimeInJodaSyntax` according to the specified format string. parseDateTime parses String to DateTime in MySQL syntax, parseDateTimeInJodaSyntax parses in Joda syntax. 
[#46815](https://github.com/ClickHouse/ClickHouse/pull/46815) ([李扬](https://github.com/taiyang-li)). -* Use `dummy UInt8` for the default structure of table function `null`. Closes [#46930](https://github.com/ClickHouse/ClickHouse/issues/46930). [#47006](https://github.com/ClickHouse/ClickHouse/pull/47006) ([flynn](https://github.com/ucasfl)). -* Support for date format with a comma, like `Dec 15, 2021` in the `parseDateTimeBestEffort` function. Closes [#46816](https://github.com/ClickHouse/ClickHouse/issues/46816). [#47071](https://github.com/ClickHouse/ClickHouse/pull/47071) ([chen](https://github.com/xiedeyantu)). -* Add settings `http_wait_end_of_query` and `http_response_buffer_size` that corresponds to URL params `wait_end_of_query` and `buffer_size` for the HTTP interface. This allows changing these settings in the profiles. [#47108](https://github.com/ClickHouse/ClickHouse/pull/47108) ([Vladimir C](https://github.com/vdimir)). -* Add `system.dropped_tables` table that shows tables that were dropped from `Atomic` databases but were not completely removed yet. [#47364](https://github.com/ClickHouse/ClickHouse/pull/47364) ([chen](https://github.com/xiedeyantu)). -* Add `INSTR` as alias of `positionCaseInsensitive` for MySQL compatibility. Closes [#47529](https://github.com/ClickHouse/ClickHouse/issues/47529). [#47535](https://github.com/ClickHouse/ClickHouse/pull/47535) ([flynn](https://github.com/ucasfl)). -* Added `toDecimalString` function allowing to convert numbers to string with fixed precision. [#47838](https://github.com/ClickHouse/ClickHouse/pull/47838) ([Andrey Zvonov](https://github.com/zvonand)). -* Add a merge tree setting `max_number_of_mutations_for_replica`. It limits the number of part mutations per replica to the specified amount. Zero means no limit on the number of mutations per replica (the execution can still be constrained by other settings). [#48047](https://github.com/ClickHouse/ClickHouse/pull/48047) ([Vladimir C](https://github.com/vdimir)). -* Add the Map-related function `mapFromArrays`, which allows the creation of a map from a pair of arrays. [#31125](https://github.com/ClickHouse/ClickHouse/pull/31125) ([李扬](https://github.com/taiyang-li)). -* Allow control of compression in Parquet/ORC/Arrow output formats, adds support for more compression input formats. This closes [#13541](https://github.com/ClickHouse/ClickHouse/issues/13541). [#47114](https://github.com/ClickHouse/ClickHouse/pull/47114) ([Kruglov Pavel](https://github.com/Avogar)). -* Add SSL User Certificate authentication to the native protocol. Closes [#47077](https://github.com/ClickHouse/ClickHouse/issues/47077). [#47596](https://github.com/ClickHouse/ClickHouse/pull/47596) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add *OrNull() and *OrZero() variants for `parseDateTime`, add alias `str_to_date` for MySQL parity. [#48000](https://github.com/ClickHouse/ClickHouse/pull/48000) ([Robert Schulze](https://github.com/rschu1ze)). -* Added operator `REGEXP` (similar to operators "LIKE", "IN", "MOD" etc.) for better compatibility with MySQL [#47869](https://github.com/ClickHouse/ClickHouse/pull/47869) ([Robert Schulze](https://github.com/rschu1ze)). - -#### Performance Improvement -* Marks in memory are now compressed, using 3-6x less memory. [#47290](https://github.com/ClickHouse/ClickHouse/pull/47290) ([Michael Kolupaev](https://github.com/al13n321)). -* Backups for large numbers of files were unbelievably slow in previous versions. Not anymore. Now they are unbelievably fast. 
[#47251](https://github.com/ClickHouse/ClickHouse/pull/47251) ([Alexey Milovidov](https://github.com/alexey-milovidov)). Introduced a separate thread pool for backup's IO operations. This will allow scaling it independently of other pools and increase performance. [#47174](https://github.com/ClickHouse/ClickHouse/pull/47174) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). Use MultiRead request and retries for collecting metadata at the final stage of backup processing. [#47243](https://github.com/ClickHouse/ClickHouse/pull/47243) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). If a backup and restoring data are both in S3 then server-side copy should be used from now on. [#47546](https://github.com/ClickHouse/ClickHouse/pull/47546) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fixed excessive reading in queries with `FINAL`. [#47801](https://github.com/ClickHouse/ClickHouse/pull/47801) ([Nikita Taranov](https://github.com/nickitat)). -* Setting `max_final_threads` would be set to the number of cores at server startup (by the same algorithm as used for `max_threads`). This improves the concurrency of `final` execution on servers with high number of CPUs. [#47915](https://github.com/ClickHouse/ClickHouse/pull/47915) ([Nikita Taranov](https://github.com/nickitat)). -* Allow executing reading pipeline for DIRECT dictionary with CLICKHOUSE source in multiple threads. To enable set `dictionary_use_async_executor=1` in `SETTINGS` section for source in `CREATE DICTIONARY` statement. [#47986](https://github.com/ClickHouse/ClickHouse/pull/47986) ([Vladimir C](https://github.com/vdimir)). -* Optimize one nullable key aggregate performance. [#45772](https://github.com/ClickHouse/ClickHouse/pull/45772) ([LiuNeng](https://github.com/liuneng1994)). -* Implemented lowercase `tokenbf_v1` index utilization for `hasTokenOrNull`, `hasTokenCaseInsensitive` and `hasTokenCaseInsensitiveOrNull`. [#46252](https://github.com/ClickHouse/ClickHouse/pull/46252) ([ltrk2](https://github.com/ltrk2)). -* Optimize functions `position` and `LIKE` by searching the first two chars using SIMD. [#46289](https://github.com/ClickHouse/ClickHouse/pull/46289) ([Jiebin Sun](https://github.com/jiebinn)). -* Optimize queries from the `system.detached_parts`, which could be significantly large. Added several sources with respect to the block size limitation; in each block, an IO thread pool is used to calculate the part size, i.e. to make syscalls in parallel. [#46624](https://github.com/ClickHouse/ClickHouse/pull/46624) ([Sema Checherinda](https://github.com/CheSema)). -* Increase the default value of `max_replicated_merges_in_queue` for ReplicatedMergeTree tables from 16 to 1000. It allows faster background merge operation on clusters with a very large number of replicas, such as clusters with shared storage in ClickHouse Cloud. [#47050](https://github.com/ClickHouse/ClickHouse/pull/47050) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Updated `clickhouse-copier` to use `GROUP BY` instead of `DISTINCT` to get the list of partitions. For large tables, this reduced the select time from over 500s to under 1s. [#47386](https://github.com/ClickHouse/ClickHouse/pull/47386) ([Clayton McClure](https://github.com/cmcclure-twilio)). -* Fix performance degradation in `ASOF JOIN`. [#47544](https://github.com/ClickHouse/ClickHouse/pull/47544) ([Ongkong](https://github.com/ongkong)). -* Even more batching in Keeper. Improve performance by avoiding breaking batches on read requests. 
[#47978](https://github.com/ClickHouse/ClickHouse/pull/47978) ([Antonio Andelic](https://github.com/antonio2368)). -* Allow PREWHERE for Merge with different DEFAULT expressions for columns. [#46831](https://github.com/ClickHouse/ClickHouse/pull/46831) ([Azat Khuzhin](https://github.com/azat)). - -#### Experimental Feature -* Parallel replicas: Improved the overall performance by better utilizing the local replica, and forbid the reading with parallel replicas from non-replicated MergeTree by default. [#47858](https://github.com/ClickHouse/ClickHouse/pull/47858) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Support filter push down to left table for JOIN with `Join`, `Dictionary` and `EmbeddedRocksDB` tables if the experimental Analyzer is enabled. [#47280](https://github.com/ClickHouse/ClickHouse/pull/47280) ([Maksim Kita](https://github.com/kitaisreal)). -* Now ReplicatedMergeTree with zero copy replication has less load to Keeper. [#47676](https://github.com/ClickHouse/ClickHouse/pull/47676) ([alesapin](https://github.com/alesapin)). -* Fix create materialized view with MaterializedPostgreSQL [#40807](https://github.com/ClickHouse/ClickHouse/pull/40807) ([Maksim Buren](https://github.com/maks-buren630501)). - -#### Improvement -* Enable `input_format_json_ignore_unknown_keys_in_named_tuple` by default. [#46742](https://github.com/ClickHouse/ClickHouse/pull/46742) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow errors to be ignored while pushing to MATERIALIZED VIEW (add new setting `materialized_views_ignore_errors`, by default to `false`, but it is set to `true` for flushing logs to `system.*_log` tables unconditionally). [#46658](https://github.com/ClickHouse/ClickHouse/pull/46658) ([Azat Khuzhin](https://github.com/azat)). -* Track the file queue of distributed sends in memory. [#45491](https://github.com/ClickHouse/ClickHouse/pull/45491) ([Azat Khuzhin](https://github.com/azat)). -* Now `X-ClickHouse-Query-Id` and `X-ClickHouse-Timezone` headers are added to responses in all queries via HTTP protocol. Previously it was done only for `SELECT` queries. [#46364](https://github.com/ClickHouse/ClickHouse/pull/46364) ([Anton Popov](https://github.com/CurtizJ)). -* External tables from `MongoDB`: support for connection to a replica set via a URI with a host:port enum and support for the readPreference option in MongoDB dictionaries. Example URI: mongodb://db0.example.com:27017,db1.example.com:27017,db2.example.com:27017/?replicaSet=myRepl&readPreference=primary. [#46524](https://github.com/ClickHouse/ClickHouse/pull/46524) ([artem-yadr](https://github.com/artem-yadr)). -* This improvement should be invisible for users. Re-implement projection analysis on top of query plan. Added setting `query_plan_optimize_projection=1` to switch between old and new version. Fixes [#44963](https://github.com/ClickHouse/ClickHouse/issues/44963). [#46537](https://github.com/ClickHouse/ClickHouse/pull/46537) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Use Parquet format v2 instead of v1 in output format by default. Add setting `output_format_parquet_version` to control parquet version, possible values `1.0`, `2.4`, `2.6`, `2.latest` (default). [#46617](https://github.com/ClickHouse/ClickHouse/pull/46617) ([Kruglov Pavel](https://github.com/Avogar)). -* It is now possible to use the new configuration syntax to configure Kafka topics with periods (`.`) in their name. 
[#46752](https://github.com/ClickHouse/ClickHouse/pull/46752) ([Robert Schulze](https://github.com/rschu1ze)). -* Fix heuristics that check hyperscan patterns for problematic repeats. [#46819](https://github.com/ClickHouse/ClickHouse/pull/46819) ([Robert Schulze](https://github.com/rschu1ze)). -* Don't report ZK node exists to system.errors when a block was created concurrently by a different replica. [#46820](https://github.com/ClickHouse/ClickHouse/pull/46820) ([Raúl Marín](https://github.com/Algunenano)). -* Increase the limit for opened files in `clickhouse-local`. It will be able to read from `web` tables on servers with a huge number of CPU cores. Do not back off reading from the URL table engine in case of too many opened files. This closes [#46852](https://github.com/ClickHouse/ClickHouse/issues/46852). [#46853](https://github.com/ClickHouse/ClickHouse/pull/46853) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Exceptions thrown when numbers cannot be parsed now have an easier-to-read exception message. [#46917](https://github.com/ClickHouse/ClickHouse/pull/46917) ([Robert Schulze](https://github.com/rschu1ze)). -* Added update `system.backups` after every processed task to track the progress of backups. [#46989](https://github.com/ClickHouse/ClickHouse/pull/46989) ([Aleksandr Musorin](https://github.com/AVMusorin)). -* Allow types conversion in Native input format. Add settings `input_format_native_allow_types_conversion` that controls it (enabled by default). [#46990](https://github.com/ClickHouse/ClickHouse/pull/46990) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow IPv4 in the `range` function to generate IP ranges. [#46995](https://github.com/ClickHouse/ClickHouse/pull/46995) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Improve exception message when it's impossible to move a part from one volume/disk to another. [#47032](https://github.com/ClickHouse/ClickHouse/pull/47032) ([alesapin](https://github.com/alesapin)). -* Support `Bool` type in `JSONType` function. Previously `Null` type was mistakenly returned for bool values. [#47046](https://github.com/ClickHouse/ClickHouse/pull/47046) ([Anton Popov](https://github.com/CurtizJ)). -* Use `_request_body` parameter to configure predefined HTTP queries. [#47086](https://github.com/ClickHouse/ClickHouse/pull/47086) ([Constantine Peresypkin](https://github.com/pkit)). -* Automatic indentation in the built-in UI SQL editor when Enter is pressed. [#47113](https://github.com/ClickHouse/ClickHouse/pull/47113) ([Alexey Korepanov](https://github.com/alexkorep)). -* Self-extraction with 'sudo' will attempt to set uid and gid of extracted files to running user. [#47116](https://github.com/ClickHouse/ClickHouse/pull/47116) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Previously, the `repeat` function's second argument only accepted an unsigned integer type, which meant it could not accept values such as -1. This behavior differed from that of the Spark function. In this update, the repeat function has been modified to match the behavior of the Spark function. It now accepts the same types of inputs, including negative integers. Extensive testing has been performed to verify the correctness of the updated implementation. [#47134](https://github.com/ClickHouse/ClickHouse/pull/47134) ([KevinyhZou](https://github.com/KevinyhZou)). Note: the changelog entry was rewritten by ChatGPT. -* Remove `::__1` part from stacktraces. 
Display `std::basic_string ClickHouse release 23.2, 2023-02-23 - -#### Backward Incompatible Change -* Extend function "toDayOfWeek()" (alias: "DAYOFWEEK") with a mode argument that encodes whether the week starts on Monday or Sunday and whether counting starts at 0 or 1. For consistency with other date time functions, the mode argument was inserted between the time and the time zone arguments. This breaks existing usage of the (previously undocumented) 2-argument syntax "toDayOfWeek(time, time_zone)". A fix is to rewrite the function into "toDayOfWeek(time, 0, time_zone)". [#45233](https://github.com/ClickHouse/ClickHouse/pull/45233) ([Robert Schulze](https://github.com/rschu1ze)). -* Rename setting `max_query_cache_size` to `filesystem_cache_max_download_size`. [#45614](https://github.com/ClickHouse/ClickHouse/pull/45614) ([Kseniia Sumarokova](https://github.com/kssenii)). -* The `default` user will not have permissions for access type `SHOW NAMED COLLECTION` by default (e.g. `default` user will no longer be able to grant ALL to other users as it was before, therefore this PR is backward incompatible). [#46010](https://github.com/ClickHouse/ClickHouse/pull/46010) ([Kseniia Sumarokova](https://github.com/kssenii)). -* If the SETTINGS clause is specified before the FORMAT clause, the settings will be applied to formatting as well. [#46003](https://github.com/ClickHouse/ClickHouse/pull/46003) ([Azat Khuzhin](https://github.com/azat)). -* Remove support for setting `materialized_postgresql_allow_automatic_update` (which was by default turned off). [#46106](https://github.com/ClickHouse/ClickHouse/pull/46106) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Slightly improve performance of `countDigits` on realistic datasets. This closed [#44518](https://github.com/ClickHouse/ClickHouse/issues/44518). In previous versions, `countDigits(0)` returned `0`; now it returns `1`, which is more correct, and follows the existing documentation. [#46187](https://github.com/ClickHouse/ClickHouse/pull/46187) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Disallow creation of new columns compressed by a combination of codecs "Delta" or "DoubleDelta" followed by codecs "Gorilla" or "FPC". This can be bypassed using setting "allow_suspicious_codecs = true". [#45652](https://github.com/ClickHouse/ClickHouse/pull/45652) ([Robert Schulze](https://github.com/rschu1ze)). - -#### New Feature -* Add `StorageIceberg` and table function `iceberg` to access iceberg table store on S3. [#45384](https://github.com/ClickHouse/ClickHouse/pull/45384) ([flynn](https://github.com/ucasfl)). -* Allow configuring storage as `SETTINGS disk = ''` (instead of `storage_policy`) and with explicit disk creation `SETTINGS disk = disk(type=s3, ...)`. [#41976](https://github.com/ClickHouse/ClickHouse/pull/41976) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Expose `ProfileEvents` counters in `system.part_log`. [#38614](https://github.com/ClickHouse/ClickHouse/pull/38614) ([Bharat Nallan](https://github.com/bharatnc)). -* Enrichment of the existing `ReplacingMergeTree` engine to allow duplicate the insertion. It leverages the power of both `ReplacingMergeTree` and `CollapsingMergeTree` in one MergeTree engine. Deleted data are not returned when queried, but not removed from disk neither. [#41005](https://github.com/ClickHouse/ClickHouse/pull/41005) ([youennL-cs](https://github.com/youennL-cs)). -* Add `generateULID` function. Closes [#36536](https://github.com/ClickHouse/ClickHouse/issues/36536). 
[#44662](https://github.com/ClickHouse/ClickHouse/pull/44662) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add `corrMatrix` aggregate function, calculating each two columns. In addition, since Aggregatefunctions `covarSamp` and `covarPop` are similar to `corr`, I add `covarSampMatrix`, `covarPopMatrix` by the way. @alexey-milovidov closes [#44587](https://github.com/ClickHouse/ClickHouse/issues/44587). [#44680](https://github.com/ClickHouse/ClickHouse/pull/44680) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). -* Introduce arrayShuffle function for random array permutations. [#45271](https://github.com/ClickHouse/ClickHouse/pull/45271) ([Joanna Hulboj](https://github.com/jh0x)). -* Support types `FIXED_SIZE_BINARY` type in Arrow, `FIXED_LENGTH_BYTE_ARRAY` in `Parquet` and match them to `FixedString`. Add settings `output_format_parquet_fixed_string_as_fixed_byte_array/output_format_arrow_fixed_string_as_fixed_byte_array` to control default output type for FixedString. Closes [#45326](https://github.com/ClickHouse/ClickHouse/issues/45326). [#45340](https://github.com/ClickHouse/ClickHouse/pull/45340) ([Kruglov Pavel](https://github.com/Avogar)). -* Add a new column `last_exception_time` to system.replication_queue. [#45457](https://github.com/ClickHouse/ClickHouse/pull/45457) ([Frank Chen](https://github.com/FrankChen021)). -* Add two new functions which allow for user-defined keys/seeds with SipHash{64,128}. [#45513](https://github.com/ClickHouse/ClickHouse/pull/45513) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Allow a three-argument version for table function `format`. close [#45808](https://github.com/ClickHouse/ClickHouse/issues/45808). [#45873](https://github.com/ClickHouse/ClickHouse/pull/45873) ([FFFFFFFHHHHHHH](https://github.com/FFFFFFFHHHHHHH)). -* Add `JodaTime` format support for 'x','w','S'. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html. [#46073](https://github.com/ClickHouse/ClickHouse/pull/46073) ([zk_kiger](https://github.com/zk-kiger)). -* Support window function `ntile`. ([lgbo](https://github.com/lgbo-ustc)). -* Add setting `final` to implicitly apply the `FINAL` modifier to every table. [#40945](https://github.com/ClickHouse/ClickHouse/pull/40945) ([Arthur Passos](https://github.com/arthurpassos)). -* Added `arrayPartialSort` and `arrayPartialReverseSort` functions. [#46296](https://github.com/ClickHouse/ClickHouse/pull/46296) ([Joanna Hulboj](https://github.com/jh0x)). -* The new http parameter `client_protocol_version` allows setting a client protocol version for HTTP responses using the Native format. [#40397](https://github.com/ClickHouse/ClickHouse/issues/40397). [#46360](https://github.com/ClickHouse/ClickHouse/pull/46360) ([Geoff Genz](https://github.com/genzgd)). -* Add new function `regexpExtract`, like spark function `REGEXP_EXTRACT` for compatibility. It is similar to the existing function `extract`. [#46469](https://github.com/ClickHouse/ClickHouse/pull/46469) ([李扬](https://github.com/taiyang-li)). -* Add new function `JSONArrayLength`, which returns the number of elements in the outermost JSON array. The function returns NULL if the input JSON string is invalid. [#46631](https://github.com/ClickHouse/ClickHouse/pull/46631) ([李扬](https://github.com/taiyang-li)). - -#### Performance Improvement -* The introduced logic works if PREWHERE condition is a conjunction of multiple conditions (cond1 AND cond2 AND ... ). 
It groups those conditions that require reading the same columns into steps. After each step the corresponding part of the full condition is computed and the result rows might be filtered. This allows to read fewer rows in the next steps thus saving IO bandwidth and doing less computation. This logic is disabled by default for now. It will be enabled by default in one of the future releases once it is known to not have any regressions, so it is highly encouraged to be used for testing. It can be controlled by 2 settings: "enable_multiple_prewhere_read_steps" and "move_all_conditions_to_prewhere". [#46140](https://github.com/ClickHouse/ClickHouse/pull/46140) ([Alexander Gololobov](https://github.com/davenger)). -* An option added to aggregate partitions independently if table partition key and group by key are compatible. Controlled by the setting `allow_aggregate_partitions_independently`. Disabled by default because of limited applicability (please refer to the docs). [#45364](https://github.com/ClickHouse/ClickHouse/pull/45364) ([Nikita Taranov](https://github.com/nickitat)). -* Allow using Vertical merge algorithm with parts in Compact format. This will allow ClickHouse server to use much less memory for background operations. This closes [#46084](https://github.com/ClickHouse/ClickHouse/issues/46084). [#45681](https://github.com/ClickHouse/ClickHouse/pull/45681) [#46282](https://github.com/ClickHouse/ClickHouse/pull/46282) ([Anton Popov](https://github.com/CurtizJ)). -* Optimize `Parquet` reader by using batch reader. [#45878](https://github.com/ClickHouse/ClickHouse/pull/45878) ([LiuNeng](https://github.com/liuneng1994)). -* Add new `local_filesystem_read_method` method `io_uring` based on the asynchronous Linux [io_uring](https://kernel.dk/io_uring.pdf) subsystem, improving read performance almost universally compared to the default `pread` method. [#38456](https://github.com/ClickHouse/ClickHouse/pull/38456) ([Saulius Valatka](https://github.com/sauliusvl)). -* Rewrite aggregate functions with `if` expression as argument when logically equivalent. For example, `avg(if(cond, col, null))` can be rewritten to avgIf(cond, col). It is helpful in performance. [#44730](https://github.com/ClickHouse/ClickHouse/pull/44730) ([李扬](https://github.com/taiyang-li)). -* Improve lower/upper function performance with avx512 instructions. [#37894](https://github.com/ClickHouse/ClickHouse/pull/37894) ([yaqi-zhao](https://github.com/yaqi-zhao)). -* Remove the limitation that on systems with >=32 cores and SMT disabled ClickHouse uses only half of the cores (the case when you disable Hyper Threading in BIOS). [#44973](https://github.com/ClickHouse/ClickHouse/pull/44973) ([Robert Schulze](https://github.com/rschu1ze)). -* Improve performance of function `multiIf` by columnar executing, speed up by 2.3x. [#45296](https://github.com/ClickHouse/ClickHouse/pull/45296) ([李扬](https://github.com/taiyang-li)). -* Add fast path for function `position` when the needle is empty. [#45382](https://github.com/ClickHouse/ClickHouse/pull/45382) ([李扬](https://github.com/taiyang-li)). -* Enable `query_plan_remove_redundant_sorting` optimization by default. Optimization implemented in [#45420](https://github.com/ClickHouse/ClickHouse/issues/45420). [#45567](https://github.com/ClickHouse/ClickHouse/pull/45567) ([Igor Nikonov](https://github.com/devcrafter)). -* Increased HTTP Transfer Encoding chunk size to improve performance of large queries using the HTTP interface. 
[#45593](https://github.com/ClickHouse/ClickHouse/pull/45593) ([Geoff Genz](https://github.com/genzgd)). -* Fixed performance of short `SELECT` queries that read from tables with large number of `Array`/`Map`/`Nested` columns. [#45630](https://github.com/ClickHouse/ClickHouse/pull/45630) ([Anton Popov](https://github.com/CurtizJ)). -* Improve performance of filtering for big integers and decimal types. [#45949](https://github.com/ClickHouse/ClickHouse/pull/45949) ([李扬](https://github.com/taiyang-li)). -* This change could effectively reduce the overhead of obtaining the filter from ColumnNullable(UInt8) and improve the overall query performance. To evaluate the impact of this change, we adopted TPC-H benchmark but revised the column types from non-nullable to nullable, and we measured the QPS of its queries as the performance indicator. [#45962](https://github.com/ClickHouse/ClickHouse/pull/45962) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Make the `_part` and `_partition_id` virtual column be `LowCardinality(String)` type. Closes [#45964](https://github.com/ClickHouse/ClickHouse/issues/45964). [#45975](https://github.com/ClickHouse/ClickHouse/pull/45975) ([flynn](https://github.com/ucasfl)). -* Improve the performance of Decimal conversion when the scale does not change. [#46095](https://github.com/ClickHouse/ClickHouse/pull/46095) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Allow to increase prefetching for read data. [#46168](https://github.com/ClickHouse/ClickHouse/pull/46168) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Rewrite `arrayExists(x -> x = 1, arr)` -> `has(arr, 1)`, which improve performance by 1.34x. [#46188](https://github.com/ClickHouse/ClickHouse/pull/46188) ([李扬](https://github.com/taiyang-li)). -* Fix too big memory usage for vertical merges on non-remote disk. Respect `max_insert_delayed_streams_for_parallel_write` for the remote disk. [#46275](https://github.com/ClickHouse/ClickHouse/pull/46275) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Update zstd to v1.5.4. It has some minor improvements in performance and compression ratio. If you run replicas with different versions of ClickHouse you may see reasonable error messages `Data after merge/mutation is not byte-identical to data on another replicas.` with explanation. These messages are Ok and you should not worry. [#46280](https://github.com/ClickHouse/ClickHouse/pull/46280) ([Raúl Marín](https://github.com/Algunenano)). -* Fix performance degradation caused by [#39737](https://github.com/ClickHouse/ClickHouse/issues/39737). [#46309](https://github.com/ClickHouse/ClickHouse/pull/46309) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The `replicas_status` handle will answer quickly even in case of a large replication queue. [#46310](https://github.com/ClickHouse/ClickHouse/pull/46310) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add avx512 support for aggregate function `sum`, function unary arithmetic, function comparison. [#37870](https://github.com/ClickHouse/ClickHouse/pull/37870) ([zhao zhou](https://github.com/zzachimed)). -* Rewrote the code around marks distribution and the overall coordination of the reading in order to achieve the maximum performance improvement. This closes [#34527](https://github.com/ClickHouse/ClickHouse/issues/34527). [#43772](https://github.com/ClickHouse/ClickHouse/pull/43772) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Remove redundant DISTINCT clauses in query (subqueries). 
Implemented on top of query plan. It does similar optimization as `optimize_duplicate_order_by_and_distinct` regarding DISTINCT clauses. Can be enabled via `query_plan_remove_redundant_distinct` setting. Related to [#42648](https://github.com/ClickHouse/ClickHouse/issues/42648). [#44176](https://github.com/ClickHouse/ClickHouse/pull/44176) ([Igor Nikonov](https://github.com/devcrafter)). -* A few query rewrite optimizations: `sumIf(123, cond) -> 123 * countIf(1, cond)`, `sum(if(cond, 123, 0)) -> 123 * countIf(cond)`, `sum(if(cond, 0, 123)) -> 123 * countIf(not(cond))` [#44728](https://github.com/ClickHouse/ClickHouse/pull/44728) ([李扬](https://github.com/taiyang-li)). -* Improved how memory bound merging and aggregation in order on top query plan interact. Previously we fell back to explicit sorting for AIO in some cases when it wasn't actually needed. [#45892](https://github.com/ClickHouse/ClickHouse/pull/45892) ([Nikita Taranov](https://github.com/nickitat)). -* Concurrent merges are scheduled using round-robin by default to ensure fair and starvation-free operation. Previously in heavily overloaded shards, big merges could possibly be starved by smaller merges due to the use of strict priority scheduling. Added `background_merges_mutations_scheduling_policy` server config option to select scheduling algorithm (`round_robin` or `shortest_task_first`). [#46247](https://github.com/ClickHouse/ClickHouse/pull/46247) ([Sergei Trifonov](https://github.com/serxa)). - -#### Improvement -* Enable retries for INSERT by default in case of ZooKeeper session loss. We already use it in production. [#46308](https://github.com/ClickHouse/ClickHouse/pull/46308) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add ability to ignore unknown keys in JSON object for named tuples (`input_format_json_ignore_unknown_keys_in_named_tuple`). [#45678](https://github.com/ClickHouse/ClickHouse/pull/45678) ([Azat Khuzhin](https://github.com/azat)). -* Support optimizing the `where` clause with sorting key expression move to `prewhere` for query with `final`. [#38893](https://github.com/ClickHouse/ClickHouse/issues/38893). [#38950](https://github.com/ClickHouse/ClickHouse/pull/38950) ([hexiaoting](https://github.com/hexiaoting)). -* Add new metrics for backups: num_processed_files and processed_files_size described actual number of processed files. [#42244](https://github.com/ClickHouse/ClickHouse/pull/42244) ([Aleksandr](https://github.com/AVMusorin)). -* Added retries on interserver DNS errors. [#43179](https://github.com/ClickHouse/ClickHouse/pull/43179) ([Anton Kozlov](https://github.com/tonickkozlov)). -* Keeper improvement: try preallocating space on the disk to avoid undefined out-of-space issues. Introduce setting `max_log_file_size` for the maximum size of Keeper's Raft log files. [#44370](https://github.com/ClickHouse/ClickHouse/pull/44370) ([Antonio Andelic](https://github.com/antonio2368)). -* Optimize behavior for a replica delay api logic in case the replica is read-only. [#45148](https://github.com/ClickHouse/ClickHouse/pull/45148) ([mateng915](https://github.com/mateng0915)). -* Ask for the password in clickhouse-client interactively in a case when the empty password is wrong. Closes [#46702](https://github.com/ClickHouse/ClickHouse/issues/46702). [#46730](https://github.com/ClickHouse/ClickHouse/pull/46730) ([Nikolay Degterinsky](https://github.com/evillique)). -* Mark `Gorilla` compression on columns of non-Float* type as suspicious. 
[#45376](https://github.com/ClickHouse/ClickHouse/pull/45376) ([Robert Schulze](https://github.com/rschu1ze)). -* Show replica name that is executing a merge in the `postpone_reason` column. [#45458](https://github.com/ClickHouse/ClickHouse/pull/45458) ([Frank Chen](https://github.com/FrankChen021)). -* Save exception stack trace in part_log. [#45459](https://github.com/ClickHouse/ClickHouse/pull/45459) ([Frank Chen](https://github.com/FrankChen021)). -* The `regexp_tree` dictionary is polished and now it is compatible with https://github.com/ua-parser/uap-core. [#45631](https://github.com/ClickHouse/ClickHouse/pull/45631) ([Han Fei](https://github.com/hanfei1991)). -* Updated checking of `SYSTEM SYNC REPLICA`, resolves [#45508](https://github.com/ClickHouse/ClickHouse/issues/45508) [#45648](https://github.com/ClickHouse/ClickHouse/pull/45648) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Rename setting `replication_alter_partitions_sync` to `alter_sync`. [#45659](https://github.com/ClickHouse/ClickHouse/pull/45659) ([Antonio Andelic](https://github.com/antonio2368)). -* The `generateRandom` table function and the engine now support `LowCardinality` data types. This is useful for testing, for example you can write `INSERT INTO table SELECT * FROM generateRandom() LIMIT 1000`. This is needed to debug [#45590](https://github.com/ClickHouse/ClickHouse/issues/45590). [#45661](https://github.com/ClickHouse/ClickHouse/pull/45661) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The experimental query result cache now provides more modular configuration settings. [#45679](https://github.com/ClickHouse/ClickHouse/pull/45679) ([Robert Schulze](https://github.com/rschu1ze)). -* Renamed "query result cache" to "query cache". [#45682](https://github.com/ClickHouse/ClickHouse/pull/45682) ([Robert Schulze](https://github.com/rschu1ze)). -* add `SYSTEM SYNC FILE CACHE` command. It will do the `sync` syscall. [#8921](https://github.com/ClickHouse/ClickHouse/issues/8921). [#45685](https://github.com/ClickHouse/ClickHouse/pull/45685) ([DR](https://github.com/freedomDR)). -* Add a new S3 setting `allow_head_object_request`. This PR makes usage of `GetObjectAttributes` request instead of `HeadObject` introduced in https://github.com/ClickHouse/ClickHouse/pull/45288 optional (and disabled by default). [#45701](https://github.com/ClickHouse/ClickHouse/pull/45701) ([Vitaly Baranov](https://github.com/vitlibar)). -* Add ability to override connection settings based on connection names (that said that now you can forget about storing password for each connection, you can simply put everything into `~/.clickhouse-client/config.xml` and even use different history files for them, which can be also useful). [#45715](https://github.com/ClickHouse/ClickHouse/pull/45715) ([Azat Khuzhin](https://github.com/azat)). -* Arrow format: support the duration type. Closes [#45669](https://github.com/ClickHouse/ClickHouse/issues/45669). [#45750](https://github.com/ClickHouse/ClickHouse/pull/45750) ([flynn](https://github.com/ucasfl)). -* Extend the logging in the Query Cache to improve investigations of the caching behavior. [#45751](https://github.com/ClickHouse/ClickHouse/pull/45751) ([Robert Schulze](https://github.com/rschu1ze)). -* The query cache's server-level settings are now reconfigurable at runtime. [#45758](https://github.com/ClickHouse/ClickHouse/pull/45758) ([Robert Schulze](https://github.com/rschu1ze)). 
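A small sketch for the `generateRandom` entry above: since this release the table function also accepts `LowCardinality` columns in its structure string, which is convenient for generating test data (the column names here are made up for the example):

```sql
-- Random test rows with a LowCardinality column; LIMIT keeps the output small.
SELECT *
FROM generateRandom('id UInt64, tag LowCardinality(String)')
LIMIT 5;
```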
-* Hide password in logs when a table function's arguments are specified with a named collection. [#45774](https://github.com/ClickHouse/ClickHouse/pull/45774) ([Vitaly Baranov](https://github.com/vitlibar)). -* Improve internal S3 client to correctly deduce regions and redirections for different types of URLs. [#45783](https://github.com/ClickHouse/ClickHouse/pull/45783) ([Antonio Andelic](https://github.com/antonio2368)). -* Add support for Map, IPv4 and IPv6 types in generateRandom. Mostly useful for testing. [#45785](https://github.com/ClickHouse/ClickHouse/pull/45785) ([Raúl Marín](https://github.com/Algunenano)). -* Support empty/notEmpty for IP types. [#45799](https://github.com/ClickHouse/ClickHouse/pull/45799) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* The column `num_processed_files` was split into two columns: `num_files` (for BACKUP) and `files_read` (for RESTORE). The column `processed_files_size` was split into two columns: `total_size` (for BACKUP) and `bytes_read` (for RESTORE). [#45800](https://github.com/ClickHouse/ClickHouse/pull/45800) ([Vitaly Baranov](https://github.com/vitlibar)). -* Add support for `SHOW ENGINES` query for MySQL compatibility. [#45859](https://github.com/ClickHouse/ClickHouse/pull/45859) ([Filatenkov Artur](https://github.com/FArthur-cmd)). -* Improved how the obfuscator deals with queries. [#45867](https://github.com/ClickHouse/ClickHouse/pull/45867) ([Raúl Marín](https://github.com/Algunenano)). -* Improve behaviour of conversion into Date for boundary value 65535 (2149-06-06). [#46042](https://github.com/ClickHouse/ClickHouse/pull/46042) [#45914](https://github.com/ClickHouse/ClickHouse/pull/45914) ([Joanna Hulboj](https://github.com/jh0x)). -* Add setting `check_referential_table_dependencies` to check referential dependencies on `DROP TABLE`. This PR solves [#38326](https://github.com/ClickHouse/ClickHouse/issues/38326). [#45936](https://github.com/ClickHouse/ClickHouse/pull/45936) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix `tupleElement` to return `Null` when having `Null` argument. Closes [#45894](https://github.com/ClickHouse/ClickHouse/issues/45894). [#45952](https://github.com/ClickHouse/ClickHouse/pull/45952) ([flynn](https://github.com/ucasfl)). -* Throw an error on no files satisfying the S3 wildcard. Closes [#45587](https://github.com/ClickHouse/ClickHouse/issues/45587). [#45957](https://github.com/ClickHouse/ClickHouse/pull/45957) ([chen](https://github.com/xiedeyantu)). -* Use cluster state data to check concurrent backup/restore. [#45982](https://github.com/ClickHouse/ClickHouse/pull/45982) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* ClickHouse Client: Use "exact" matching for fuzzy search, which has correct case ignorance and more appropriate algorithm for matching SQL queries. [#46000](https://github.com/ClickHouse/ClickHouse/pull/46000) ([Azat Khuzhin](https://github.com/azat)). -* Forbid wrong create View syntax `CREATE View X TO Y AS SELECT`. Closes [#4331](https://github.com/ClickHouse/ClickHouse/issues/4331). [#46043](https://github.com/ClickHouse/ClickHouse/pull/46043) ([flynn](https://github.com/ucasfl)). -* Storage `Log` family support setting the `storage_policy`. Closes [#43421](https://github.com/ClickHouse/ClickHouse/issues/43421). [#46044](https://github.com/ClickHouse/ClickHouse/pull/46044) ([flynn](https://github.com/ucasfl)). -* Improve `JSONColumns` format when the result is empty. Closes [#46024](https://github.com/ClickHouse/ClickHouse/issues/46024). 
[#46053](https://github.com/ClickHouse/ClickHouse/pull/46053) ([flynn](https://github.com/ucasfl)). -* Add reference implementation for SipHash128. [#46065](https://github.com/ClickHouse/ClickHouse/pull/46065) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Add a new metric to record allocations times and bytes using mmap. [#46068](https://github.com/ClickHouse/ClickHouse/pull/46068) ([李扬](https://github.com/taiyang-li)). -* Currently for functions like `leftPad`, `rightPad`, `leftPadUTF8`, `rightPadUTF8`, the second argument `length` must be UInt8|16|32|64|128|256. Which is too strict for clickhouse users, besides, it is not consistent with other similar functions like `arrayResize`, `substring` and so on. [#46103](https://github.com/ClickHouse/ClickHouse/pull/46103) ([李扬](https://github.com/taiyang-li)). -* Fix assertion in the `welchTTest` function in debug build when the resulting statistics is NaN. Unified the behavior with other similar functions. Change the behavior of `studentTTest` to return NaN instead of throwing an exception because the previous behavior was inconvenient. This closes [#41176](https://github.com/ClickHouse/ClickHouse/issues/41176) This closes [#42162](https://github.com/ClickHouse/ClickHouse/issues/42162). [#46141](https://github.com/ClickHouse/ClickHouse/pull/46141) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* More convenient usage of big integers and ORDER BY WITH FILL. Allow using plain integers for start and end points in WITH FILL when ORDER BY big (128-bit and 256-bit) integers. Fix the wrong result for big integers with negative start or end points. This closes [#16733](https://github.com/ClickHouse/ClickHouse/issues/16733). [#46152](https://github.com/ClickHouse/ClickHouse/pull/46152) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add `parts`, `active_parts` and `total_marks` columns to `system.tables` on [issue](https://github.com/ClickHouse/ClickHouse/issues/44336). [#46161](https://github.com/ClickHouse/ClickHouse/pull/46161) ([attack204](https://github.com/attack204)). -* Functions "multi[Fuzzy]Match(Any|AnyIndex|AllIndices}" now reject regexes which will likely evaluate very slowly in vectorscan. [#46167](https://github.com/ClickHouse/ClickHouse/pull/46167) ([Robert Schulze](https://github.com/rschu1ze)). -* When `insert_null_as_default` is enabled and column doesn't have defined default value, the default of column type will be used. Also this PR fixes using default values on nulls in case of LowCardinality columns. [#46171](https://github.com/ClickHouse/ClickHouse/pull/46171) ([Kruglov Pavel](https://github.com/Avogar)). -* Prefer explicitly defined access keys for S3 clients. If `use_environment_credentials` is set to `true`, and the user has provided the access key through query or config, they will be used instead of the ones from the environment variable. [#46191](https://github.com/ClickHouse/ClickHouse/pull/46191) ([Antonio Andelic](https://github.com/antonio2368)). -* Add an alias "DATE_FORMAT()" for function "formatDateTime()" to improve compatibility with MySQL's SQL dialect, extend function `formatDateTime` with substitutions "a", "b", "c", "h", "i", "k", "l" "r", "s", "W". ### Documentation entry for user-facing changes User-readable short description: `DATE_FORMAT` is an alias of `formatDateTime`. Formats a Time according to the given Format string. Format is a constant expression, so you cannot have multiple formats for a single result column. 
(Provide link to [formatDateTime](https://clickhouse.com/docs/en/sql-reference/functions/date-time-functions/#formatdatetime)). [#46302](https://github.com/ClickHouse/ClickHouse/pull/46302) ([Jake Bamrah](https://github.com/JakeBamrah)). -* Add `ProfileEvents` and `CurrentMetrics` about the callback tasks for parallel replicas (`s3Cluster` and `MergeTree` tables). [#46313](https://github.com/ClickHouse/ClickHouse/pull/46313) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add support for `DELETE` and `UPDATE` for tables using `KeeperMap` storage engine. [#46330](https://github.com/ClickHouse/ClickHouse/pull/46330) ([Antonio Andelic](https://github.com/antonio2368)). -* Allow writing RENAME queries with query parameters. Resolves [#45778](https://github.com/ClickHouse/ClickHouse/issues/45778). [#46407](https://github.com/ClickHouse/ClickHouse/pull/46407) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix parameterized SELECT queries with REPLACE transformer. Resolves [#33002](https://github.com/ClickHouse/ClickHouse/issues/33002). [#46420](https://github.com/ClickHouse/ClickHouse/pull/46420) ([Nikolay Degterinsky](https://github.com/evillique)). -* Exclude the internal database used for temporary/external tables from the calculation of asynchronous metric "NumberOfDatabases". This makes the behavior consistent with system table "system.databases". [#46435](https://github.com/ClickHouse/ClickHouse/pull/46435) ([Robert Schulze](https://github.com/rschu1ze)). -* Added `last_exception_time` column into distribution_queue table. [#46564](https://github.com/ClickHouse/ClickHouse/pull/46564) ([Aleksandr](https://github.com/AVMusorin)). -* Support for IN clause with parameter in parameterized views. [#46583](https://github.com/ClickHouse/ClickHouse/pull/46583) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Do not load named collections on server startup (load them on first access instead). [#46607](https://github.com/ClickHouse/ClickHouse/pull/46607) ([Kseniia Sumarokova](https://github.com/kssenii)). - - -#### Build/Testing/Packaging Improvement -* Introduce GWP-ASan implemented by the LLVM runtime. This closes [#27039](https://github.com/ClickHouse/ClickHouse/issues/27039). [#45226](https://github.com/ClickHouse/ClickHouse/pull/45226) ([Han Fei](https://github.com/hanfei1991)). -* We want to make our tests less stable and more flaky: add randomization for merge tree settings in tests. [#38983](https://github.com/ClickHouse/ClickHouse/pull/38983) ([Anton Popov](https://github.com/CurtizJ)). -* Enable the HDFS support in PowerPC and which helps to fixes the following functional tests 02113_hdfs_assert.sh, 02244_hdfs_cluster.sql and 02368_cancel_write_into_hdfs.sh. [#44949](https://github.com/ClickHouse/ClickHouse/pull/44949) ([MeenaRenganathan22](https://github.com/MeenaRenganathan22)). -* Add systemd.service file for clickhouse-keeper. Fixes [#44293](https://github.com/ClickHouse/ClickHouse/issues/44293). [#45568](https://github.com/ClickHouse/ClickHouse/pull/45568) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* ClickHouse's fork of poco was moved from "contrib/" to "base/poco/". [#46075](https://github.com/ClickHouse/ClickHouse/pull/46075) ([Robert Schulze](https://github.com/rschu1ze)). -* Add an option for `clickhouse-watchdog` to restart the child process. This does not make a lot of use. [#46312](https://github.com/ClickHouse/ClickHouse/pull/46312) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
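As a sketch for the `DATE_FORMAT`/`formatDateTime` entry above: `DATE_FORMAT` is an alias added for MySQL compatibility, and `%i` is assumed here to be one of the newly supported MySQL-style substitutions (two-digit minutes); the exact output depends on the current time:

```sql
-- Same formatting through the native name and through the MySQL-compatible alias.
SELECT
    formatDateTime(now(), '%Y-%m-%d') AS ch_style,
    DATE_FORMAT(now(), '%Y-%m-%d %H:%i') AS mysql_style;
```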
-* If the environment variable `CLICKHOUSE_DOCKER_RESTART_ON_EXIT` is set to 1, the Docker container will run `clickhouse-server` as a child instead of the first process, and restart it when it exited. [#46391](https://github.com/ClickHouse/ClickHouse/pull/46391) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix Systemd service file. [#46461](https://github.com/ClickHouse/ClickHouse/pull/46461) ([SuperDJY](https://github.com/cmsxbc)). -* Raised the minimum Clang version needed to build ClickHouse from 12 to 15. [#46710](https://github.com/ClickHouse/ClickHouse/pull/46710) ([Robert Schulze](https://github.com/rschu1ze)). -* Upgrade Intel QPL from v0.3.0 to v1.0.0 2. Build libaccel-config and link it statically to QPL library instead of dynamically. [#45809](https://github.com/ClickHouse/ClickHouse/pull/45809) ([jasperzhu](https://github.com/jinjunzh)). - - -#### Bug Fix (user-visible misbehavior in official stable release) - -* Flush data exactly by `rabbitmq_flush_interval_ms` or by `rabbitmq_max_block_size` in `StorageRabbitMQ`. Closes [#42389](https://github.com/ClickHouse/ClickHouse/issues/42389). Closes [#45160](https://github.com/ClickHouse/ClickHouse/issues/45160). [#44404](https://github.com/ClickHouse/ClickHouse/pull/44404) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Use PODArray to render in sparkBar function, so we can control the memory usage. Close [#44467](https://github.com/ClickHouse/ClickHouse/issues/44467). [#44489](https://github.com/ClickHouse/ClickHouse/pull/44489) ([Duc Canh Le](https://github.com/canhld94)). -* Fix functions (quantilesExactExclusive, quantilesExactInclusive) return unsorted array element. [#45379](https://github.com/ClickHouse/ClickHouse/pull/45379) ([wujunfu](https://github.com/wujunfu)). -* Fix uncaught exception in HTTPHandler when open telemetry is enabled. [#45456](https://github.com/ClickHouse/ClickHouse/pull/45456) ([Frank Chen](https://github.com/FrankChen021)). -* Don't infer Dates from 8 digit numbers. It could lead to wrong data to be read. [#45581](https://github.com/ClickHouse/ClickHouse/pull/45581) ([Kruglov Pavel](https://github.com/Avogar)). -* Fixes to correctly use `odbc_bridge_use_connection_pooling` setting. [#45591](https://github.com/ClickHouse/ClickHouse/pull/45591) ([Bharat Nallan](https://github.com/bharatnc)). -* When the callback in the cache is called, it is possible that this cache is destructed. To keep it safe, we capture members by value. It's also safe for task schedule because it will be deactivated before storage is destroyed. Resolve [#45548](https://github.com/ClickHouse/ClickHouse/issues/45548). [#45601](https://github.com/ClickHouse/ClickHouse/pull/45601) ([Han Fei](https://github.com/hanfei1991)). -* Fix data corruption when codecs Delta or DoubleDelta are combined with codec Gorilla. [#45615](https://github.com/ClickHouse/ClickHouse/pull/45615) ([Robert Schulze](https://github.com/rschu1ze)). -* Correctly check types when using N-gram bloom filter index to avoid invalid reads. [#45617](https://github.com/ClickHouse/ClickHouse/pull/45617) ([Antonio Andelic](https://github.com/antonio2368)). -* A couple of segfaults have been reported around `c-ares`. They were introduced in my previous pull requests. I have fixed them with the help of Alexander Tokmakov. [#45629](https://github.com/ClickHouse/ClickHouse/pull/45629) ([Arthur Passos](https://github.com/arthurpassos)). -* Fix key description when encountering duplicate primary keys. This can happen in projections. 
See [#45590](https://github.com/ClickHouse/ClickHouse/issues/45590) for details. [#45686](https://github.com/ClickHouse/ClickHouse/pull/45686) ([Amos Bird](https://github.com/amosbird)). -* Set compression method and level for backup Closes [#45690](https://github.com/ClickHouse/ClickHouse/issues/45690). [#45737](https://github.com/ClickHouse/ClickHouse/pull/45737) ([Pradeep Chhetri](https://github.com/chhetripradeep)). -* Should use `select_query_typed.limitByOffset` instead of `select_query_typed.limitOffset`. [#45817](https://github.com/ClickHouse/ClickHouse/pull/45817) ([刘陶峰](https://github.com/taofengliu)). -* When use experimental analyzer, queries like `SELECT number FROM numbers(100) LIMIT 10 OFFSET 10;` get wrong results (empty result for this sql). That is caused by an unnecessary offset step added by planner. [#45822](https://github.com/ClickHouse/ClickHouse/pull/45822) ([刘陶峰](https://github.com/taofengliu)). -* Backward compatibility - allow implicit narrowing conversion from UInt64 to IPv4 - required for "INSERT ... VALUES ..." expression. [#45865](https://github.com/ClickHouse/ClickHouse/pull/45865) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Bugfix IPv6 parser for mixed ip4 address with missed first octet (like `::.1.2.3`). [#45871](https://github.com/ClickHouse/ClickHouse/pull/45871) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Add the `query_kind` column to the `system.processes` table and the `SHOW PROCESSLIST` query. Remove duplicate code. It fixes a bug: the global configuration parameter `max_concurrent_select_queries` was not respected to queries with `INTERSECT` or `EXCEPT` chains. [#45872](https://github.com/ClickHouse/ClickHouse/pull/45872) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix crash in a function `stochasticLinearRegression`. Found by WingFuzz. [#45985](https://github.com/ClickHouse/ClickHouse/pull/45985) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix crash in `SELECT` queries with `INTERSECT` and `EXCEPT` modifiers that read data from tables with enabled sparse columns (controlled by setting `ratio_of_defaults_for_sparse_serialization`). [#45987](https://github.com/ClickHouse/ClickHouse/pull/45987) ([Anton Popov](https://github.com/CurtizJ)). -* Fix read in order optimization for DESC sorting with FINAL, close [#45815](https://github.com/ClickHouse/ClickHouse/issues/45815). [#46009](https://github.com/ClickHouse/ClickHouse/pull/46009) ([Vladimir C](https://github.com/vdimir)). -* Fix reading of non existing nested columns with multiple level in compact parts. [#46045](https://github.com/ClickHouse/ClickHouse/pull/46045) ([Azat Khuzhin](https://github.com/azat)). -* Fix elapsed column in system.processes (10x error). [#46047](https://github.com/ClickHouse/ClickHouse/pull/46047) ([Azat Khuzhin](https://github.com/azat)). -* Follow-up fix for Replace domain IP types (IPv4, IPv6) with native https://github.com/ClickHouse/ClickHouse/pull/43221. [#46087](https://github.com/ClickHouse/ClickHouse/pull/46087) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix environment variable substitution in the configuration when a parameter already has a value. This closes [#46131](https://github.com/ClickHouse/ClickHouse/issues/46131). This closes [#9547](https://github.com/ClickHouse/ClickHouse/issues/9547). [#46144](https://github.com/ClickHouse/ClickHouse/pull/46144) ([pufit](https://github.com/pufit)). -* Fix incorrect predicate push down with grouping sets. 
Closes [#45947](https://github.com/ClickHouse/ClickHouse/issues/45947). [#46151](https://github.com/ClickHouse/ClickHouse/pull/46151) ([flynn](https://github.com/ucasfl)). -* Fix possible pipeline stuck error on `fulls_sorting_join` with constant keys. [#46175](https://github.com/ClickHouse/ClickHouse/pull/46175) ([Vladimir C](https://github.com/vdimir)). -* Never rewrite tuple functions as literals during formatting to avoid incorrect results. [#46232](https://github.com/ClickHouse/ClickHouse/pull/46232) ([Salvatore Mesoraca](https://github.com/aiven-sal)). -* Fix possible out of bounds error while reading LowCardinality(Nullable) in Arrow format. [#46270](https://github.com/ClickHouse/ClickHouse/pull/46270) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix `SYSTEM UNFREEZE` queries failing with the exception `CANNOT_PARSE_INPUT_ASSERTION_FAILED`. [#46325](https://github.com/ClickHouse/ClickHouse/pull/46325) ([Aleksei Filatov](https://github.com/aalexfvk)). -* Fix possible crash which can be caused by an integer overflow while deserializing aggregating state of a function that stores HashTable. [#46349](https://github.com/ClickHouse/ClickHouse/pull/46349) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix possible `LOGICAL_ERROR` in asynchronous inserts with invalid data sent in format `VALUES`. [#46350](https://github.com/ClickHouse/ClickHouse/pull/46350) ([Anton Popov](https://github.com/CurtizJ)). -* Fixed a LOGICAL_ERROR on an attempt to execute `ALTER ... MOVE PART ... TO TABLE`. This type of query was never actually supported. [#46359](https://github.com/ClickHouse/ClickHouse/pull/46359) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fix s3Cluster schema inference in parallel distributed insert select when `parallel_distributed_insert_select` is enabled. [#46381](https://github.com/ClickHouse/ClickHouse/pull/46381) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix queries like `ALTER TABLE ... UPDATE nested.arr1 = nested.arr2 ...`, where `arr1` and `arr2` are fields of the same `Nested` column. [#46387](https://github.com/ClickHouse/ClickHouse/pull/46387) ([Anton Popov](https://github.com/CurtizJ)). -* Scheduler may fail to schedule a task. If it happens, the whole MulityPartUpload should be aborted and `UploadHelper` must wait for already scheduled tasks. [#46451](https://github.com/ClickHouse/ClickHouse/pull/46451) ([Dmitry Novik](https://github.com/novikd)). -* Fix PREWHERE for Merge with different default types (fixes some `NOT_FOUND_COLUMN_IN_BLOCK` when the default type for the column differs, also allow `PREWHERE` when the type of column is the same across tables, and prohibit it, only if it differs). [#46454](https://github.com/ClickHouse/ClickHouse/pull/46454) ([Azat Khuzhin](https://github.com/azat)). -* Fix a crash that could happen when constant values are used in `ORDER BY`. Fixes [#46466](https://github.com/ClickHouse/ClickHouse/issues/46466). [#46493](https://github.com/ClickHouse/ClickHouse/pull/46493) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Do not throw exception if `disk` setting was specified on query level, but `storage_policy` was specified in config merge tree settings section. `disk` will override setting from config. [#46533](https://github.com/ClickHouse/ClickHouse/pull/46533) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix an invalid processing of constant `LowCardinality` argument in function `arrayMap`. This bug could lead to a segfault in release, and logical error `Bad cast` in debug build. 
[#46569](https://github.com/ClickHouse/ClickHouse/pull/46569) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* fixes [#46557](https://github.com/ClickHouse/ClickHouse/issues/46557). [#46611](https://github.com/ClickHouse/ClickHouse/pull/46611) ([Alexander Gololobov](https://github.com/davenger)). -* Fix endless restarts of clickhouse-server systemd unit if server cannot start within 1m30sec (Disable timeout logic for starting clickhouse-server from systemd service). [#46613](https://github.com/ClickHouse/ClickHouse/pull/46613) ([Azat Khuzhin](https://github.com/azat)). -* Allocated during asynchronous inserts memory buffers were deallocated in the global context and MemoryTracker counters for corresponding user and query were not updated correctly. That led to false positive OOM exceptions. [#46622](https://github.com/ClickHouse/ClickHouse/pull/46622) ([Dmitry Novik](https://github.com/novikd)). -* Updated to not clear on_expression from table_join as its used by future analyze runs resolves [#45185](https://github.com/ClickHouse/ClickHouse/issues/45185). [#46487](https://github.com/ClickHouse/ClickHouse/pull/46487) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). - - -### ClickHouse release 23.1, 2023-01-26 - -### ClickHouse release 23.1 - -#### Upgrade Notes -* The `SYSTEM RESTART DISK` query becomes a no-op. [#44647](https://github.com/ClickHouse/ClickHouse/pull/44647) ([alesapin](https://github.com/alesapin)). -* The `PREALLOCATE` option for `HASHED`/`SPARSE_HASHED` dictionaries becomes a no-op. [#45388](https://github.com/ClickHouse/ClickHouse/pull/45388) ([Azat Khuzhin](https://github.com/azat)). It does not give significant advantages anymore. -* Disallow `Gorilla` codec on columns of non-Float32 or non-Float64 type. [#45252](https://github.com/ClickHouse/ClickHouse/pull/45252) ([Robert Schulze](https://github.com/rschu1ze)). It was pointless and led to inconsistencies. -* Parallel quorum inserts might work incorrectly with `*MergeTree` tables created with the deprecated syntax. Therefore, parallel quorum inserts support is completely disabled for such tables. It does not affect tables created with a new syntax. [#45430](https://github.com/ClickHouse/ClickHouse/pull/45430) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Use the `GetObjectAttributes` request instead of the `HeadObject` request to get the size of an object in AWS S3. This change fixes handling endpoints without explicit regions after updating the AWS SDK, for example. [#45288](https://github.com/ClickHouse/ClickHouse/pull/45288) ([Vitaly Baranov](https://github.com/vitlibar)). AWS S3 and Minio are tested, but keep in mind that various S3-compatible services (GCS, R2, B2) may have subtle incompatibilities. This change also may require you to adjust the ACL to allow the `GetObjectAttributes` request. -* Forbid paths in timezone names. For example, a timezone name like `/usr/share/zoneinfo/Asia/Aden` is not allowed; the IANA timezone database name like `Asia/Aden` should be used. [#44225](https://github.com/ClickHouse/ClickHouse/pull/44225) ([Kruglov Pavel](https://github.com/Avogar)). -* Queries combining equijoin and constant expressions (e.g., `JOIN ON t1.x = t2.x AND 1 = 1`) are forbidden due to incorrect results. [#44016](https://github.com/ClickHouse/ClickHouse/pull/44016) ([Vladimir C](https://github.com/vdimir)). - - -#### New Feature -* Dictionary source for extracting keys by traversing regular expressions tree. It can be used for User-Agent parsing. 
[#40878](https://github.com/ClickHouse/ClickHouse/pull/40878) ([Vage Ogannisian](https://github.com/nooblose)). [#43858](https://github.com/ClickHouse/ClickHouse/pull/43858) ([Han Fei](https://github.com/hanfei1991)). -* Added parametrized view functionality, now it's possible to specify query parameters for the View table engine. resolves [#40907](https://github.com/ClickHouse/ClickHouse/issues/40907). [#41687](https://github.com/ClickHouse/ClickHouse/pull/41687) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Add `quantileInterpolatedWeighted`/`quantilesInterpolatedWeighted` functions. [#38252](https://github.com/ClickHouse/ClickHouse/pull/38252) ([Bharat Nallan](https://github.com/bharatnc)). -* Array join support for the `Map` type, like the function "explode" in Spark. [#43239](https://github.com/ClickHouse/ClickHouse/pull/43239) ([李扬](https://github.com/taiyang-li)). -* Support SQL standard binary and hex string literals. [#43785](https://github.com/ClickHouse/ClickHouse/pull/43785) ([Mo Xuan](https://github.com/mo-avatar)). -* Allow formatting `DateTime` in Joda-Time style. Refer to [the Joda-Time docs](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html). [#43818](https://github.com/ClickHouse/ClickHouse/pull/43818) ([李扬](https://github.com/taiyang-li)). -* Implemented a fractional second formatter (`%f`) for `formatDateTime`. [#44060](https://github.com/ClickHouse/ClickHouse/pull/44060) ([ltrk2](https://github.com/ltrk2)). [#44497](https://github.com/ClickHouse/ClickHouse/pull/44497) ([Alexander Gololobov](https://github.com/davenger)). -* Added `age` function to calculate the difference between two dates or dates with time values expressed as the number of full units. Closes [#41115](https://github.com/ClickHouse/ClickHouse/issues/41115). [#44421](https://github.com/ClickHouse/ClickHouse/pull/44421) ([Robert Schulze](https://github.com/rschu1ze)). -* Add `Null` source for dictionaries. Closes [#44240](https://github.com/ClickHouse/ClickHouse/issues/44240). [#44502](https://github.com/ClickHouse/ClickHouse/pull/44502) ([mayamika](https://github.com/mayamika)). -* Allow configuring the S3 storage class with the `s3_storage_class` configuration option. Such as `STANDARD/INTELLIGENT_TIERING` Closes [#44443](https://github.com/ClickHouse/ClickHouse/issues/44443). [#44707](https://github.com/ClickHouse/ClickHouse/pull/44707) ([chen](https://github.com/xiedeyantu)). -* Insert default values in case of missing elements in JSON object while parsing named tuple. Add setting `input_format_json_defaults_for_missing_elements_in_named_tuple` that controls this behaviour. Closes [#45142](https://github.com/ClickHouse/ClickHouse/issues/45142)#issuecomment-1380153217. [#45231](https://github.com/ClickHouse/ClickHouse/pull/45231) ([Kruglov Pavel](https://github.com/Avogar)). -* Record server startup time in ProfileEvents (`ServerStartupMilliseconds`). Resolves [#43188](https://github.com/ClickHouse/ClickHouse/issues/43188). [#45250](https://github.com/ClickHouse/ClickHouse/pull/45250) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Refactor and Improve streaming engines Kafka/RabbitMQ/NATS and add support for all formats, also refactor formats a bit: - Fix producing messages in row-based formats with suffixes/prefixes. Now every message is formatted completely with all delimiters and can be parsed back using input format. - Support block-based formats like Native, Parquet, ORC, etc. Every block is formatted as a separate message. 
The number of rows in one message depends on the block size, so you can control it via the setting `max_block_size`. - Add new engine settings `kafka_max_rows_per_message/rabbitmq_max_rows_per_message/nats_max_rows_per_message`. They control the number of rows formatted in one message in row-based formats. Default value: 1. - Fix high memory consumption in the NATS table engine. - Support arbitrary binary data in NATS producer (previously it worked only with strings contained \0 at the end) - Add missing Kafka/RabbitMQ/NATS engine settings in the documentation. - Refactor producing and consuming in Kafka/RabbitMQ/NATS, separate it from WriteBuffers/ReadBuffers semantic. - Refactor output formats: remove callbacks on each row used in Kafka/RabbitMQ/NATS (now we don't use callbacks there), allow to use IRowOutputFormat directly, clarify row end and row between delimiters, make it possible to reset output format to start formatting again - Add proper implementation in formatRow function (bonus after formats refactoring). [#42777](https://github.com/ClickHouse/ClickHouse/pull/42777) ([Kruglov Pavel](https://github.com/Avogar)). -* Support reading/writing `Nested` tables as `List` of `Struct` in `CapnProto` format. Read/write `Decimal32/64` as `Int32/64`. Closes [#43319](https://github.com/ClickHouse/ClickHouse/issues/43319). [#43379](https://github.com/ClickHouse/ClickHouse/pull/43379) ([Kruglov Pavel](https://github.com/Avogar)). -* Added a `message_format_string` column to `system.text_log`. The column contains a pattern that was used to format the message. [#44543](https://github.com/ClickHouse/ClickHouse/pull/44543) ([Alexander Tokmakov](https://github.com/tavplubix)). This allows various analytics over the ClickHouse logs. -* Try to autodetect headers with column names (and maybe types) for CSV/TSV/CustomSeparated input formats. -Add settings input_format_tsv/csv/custom_detect_header that enable this behaviour (enabled by default). Closes [#44640](https://github.com/ClickHouse/ClickHouse/issues/44640). [#44953](https://github.com/ClickHouse/ClickHouse/pull/44953) ([Kruglov Pavel](https://github.com/Avogar)). - -#### Experimental Feature -* Add an experimental inverted index as a new secondary index type for efficient text search. [#38667](https://github.com/ClickHouse/ClickHouse/pull/38667) ([larryluogit](https://github.com/larryluogit)). -* Add experimental query result cache. [#43797](https://github.com/ClickHouse/ClickHouse/pull/43797) ([Robert Schulze](https://github.com/rschu1ze)). -* Added extendable and configurable scheduling subsystem for IO requests (not yet integrated with IO code itself). [#41840](https://github.com/ClickHouse/ClickHouse/pull/41840) ([Sergei Trifonov](https://github.com/serxa)). This feature does nothing at all, enjoy. -* Added `SYSTEM DROP DATABASE REPLICA` that removes metadata of a dead replica of a `Replicated` database. Resolves [#41794](https://github.com/ClickHouse/ClickHouse/issues/41794). [#42807](https://github.com/ClickHouse/ClickHouse/pull/42807) ([Alexander Tokmakov](https://github.com/tavplubix)). - -#### Performance Improvement -* Do not load inactive parts at startup of `MergeTree` tables. [#42181](https://github.com/ClickHouse/ClickHouse/pull/42181) ([Anton Popov](https://github.com/CurtizJ)). -* Improved latency of reading from storage `S3` and table function `s3` with large numbers of small files. Now settings `remote_filesystem_read_method` and `remote_filesystem_read_prefetch` take effect while reading from storage `S3`. 
[#43726](https://github.com/ClickHouse/ClickHouse/pull/43726) ([Anton Popov](https://github.com/CurtizJ)). -* Optimization for reading struct fields in Parquet/ORC files. Only the required fields are loaded. [#44484](https://github.com/ClickHouse/ClickHouse/pull/44484) ([lgbo](https://github.com/lgbo-ustc)). -* Two-level aggregation algorithm was mistakenly disabled for queries over the HTTP interface. It was enabled back, and it leads to a major performance improvement. [#45450](https://github.com/ClickHouse/ClickHouse/pull/45450) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Added mmap support for StorageFile, which should improve the performance of clickhouse-local. [#43927](https://github.com/ClickHouse/ClickHouse/pull/43927) ([pufit](https://github.com/pufit)). -* Added sharding support in HashedDictionary to allow parallel load (almost linear scaling based on number of shards). [#40003](https://github.com/ClickHouse/ClickHouse/pull/40003) ([Azat Khuzhin](https://github.com/azat)). -* Speed up query parsing. [#42284](https://github.com/ClickHouse/ClickHouse/pull/42284) ([Raúl Marín](https://github.com/Algunenano)). -* Always replace OR chain `expr = x1 OR ... OR expr = xN` to `expr IN (x1, ..., xN)` in the case where `expr` is a `LowCardinality` column. Setting `optimize_min_equality_disjunction_chain_length` is ignored in this case. [#42889](https://github.com/ClickHouse/ClickHouse/pull/42889) ([Guo Wangyang](https://github.com/guowangy)). -* Slightly improve performance by optimizing the code around ThreadStatus. [#43586](https://github.com/ClickHouse/ClickHouse/pull/43586) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Optimize the column-wise ternary logic evaluation by achieving auto-vectorization. In the performance test of this [microbenchmark](https://github.com/ZhiguoZh/ClickHouse/blob/20221123-ternary-logic-opt-example/src/Functions/examples/associative_applier_perf.cpp), we've observed a peak **performance gain** of **21x** on the ICX device (Intel Xeon Platinum 8380 CPU). [#43669](https://github.com/ClickHouse/ClickHouse/pull/43669) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Avoid acquiring read locks in the `system.tables` table if possible. [#43840](https://github.com/ClickHouse/ClickHouse/pull/43840) ([Raúl Marín](https://github.com/Algunenano)). -* Optimize ThreadPool. The performance experiments of SSB (Star Schema Benchmark) on the ICX device (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads) shows that this change could effectively decrease the lock contention for ThreadPoolImpl::mutex by **75%**, increasing the CPU utilization and improving the overall performance by **2.4%**. [#44308](https://github.com/ClickHouse/ClickHouse/pull/44308) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Now the optimisation for predicting the hash table size is applied only if the cached hash table size is sufficiently large (thresholds were determined empirically and hardcoded). [#44455](https://github.com/ClickHouse/ClickHouse/pull/44455) ([Nikita Taranov](https://github.com/nickitat)). -* Small performance improvement for asynchronous reading from remote filesystems. [#44868](https://github.com/ClickHouse/ClickHouse/pull/44868) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Add fast path for: - `col like '%%'`; - `col like '%'`; - `col not like '%'`; - `col not like '%'`; - `match(col, '.*')`. [#45244](https://github.com/ClickHouse/ClickHouse/pull/45244) ([李扬](https://github.com/taiyang-li)). 
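To make the fast-path entry above concrete: a `LIKE` pattern that consists only of `%` wildcards matches every string, so the comparison can be folded into a constant instead of scanning the column. A minimal standalone sketch of that check (illustrative only; the helper name is invented and this is not ClickHouse's actual implementation):

```cpp
#include <cassert>
#include <string_view>

/// Hypothetical helper: returns true if a LIKE pattern consists only of '%'
/// wildcards, i.e. it matches every string and no per-row scan is needed.
static bool likePatternMatchesEverything(std::string_view pattern)
{
    if (pattern.empty())
        return false; /// an empty pattern only matches the empty string
    for (char c : pattern)
        if (c != '%')
            return false;
    return true;
}

int main()
{
    assert(likePatternMatchesEverything("%"));       /// col LIKE '%'  -> always true
    assert(likePatternMatchesEverything("%%"));      /// col LIKE '%%' -> always true
    assert(!likePatternMatchesEverything("%abc%"));  /// needs a real substring search
}
```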
-* Slightly improve happy path optimisation in filtering (WHERE clause). [#45289](https://github.com/ClickHouse/ClickHouse/pull/45289) ([Nikita Taranov](https://github.com/nickitat)). -* Provide monotonicity info for `toUnixTimestamp64*` to enable more algebraic optimizations for index analysis. [#44116](https://github.com/ClickHouse/ClickHouse/pull/44116) ([Nikita Taranov](https://github.com/nickitat)). -* Allow the configuration of temporary data for query processing (spilling to disk) to cooperate with the filesystem cache (taking up the space from the cache disk) [#43972](https://github.com/ClickHouse/ClickHouse/pull/43972) ([Vladimir C](https://github.com/vdimir)). This mainly improves [ClickHouse Cloud](https://clickhouse.cloud/), but can be used for self-managed setups as well, if you know what to do. -* Make `system.replicas` table do parallel fetches of replicas statuses. Closes [#43918](https://github.com/ClickHouse/ClickHouse/issues/43918). [#43998](https://github.com/ClickHouse/ClickHouse/pull/43998) ([Nikolay Degterinsky](https://github.com/evillique)). -* Optimize memory consumption during backup to S3: files to S3 now will be copied directly without using `WriteBufferFromS3` (which could use a lot of memory). [#45188](https://github.com/ClickHouse/ClickHouse/pull/45188) ([Vitaly Baranov](https://github.com/vitlibar)). -* Add a cache for async block ids. This will reduce the number of requests of ZooKeeper when we enable async inserts deduplication. [#45106](https://github.com/ClickHouse/ClickHouse/pull/45106) ([Han Fei](https://github.com/hanfei1991)). - -#### Improvement - -* Use structure from insertion table in generateRandom without arguments. [#45239](https://github.com/ClickHouse/ClickHouse/pull/45239) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow to implicitly convert floats stored in string fields of JSON to integers in `JSONExtract` functions. E.g. `JSONExtract('{"a": "1000.111"}', 'a', 'UInt64')` -> `1000`, previously it returned 0. [#45432](https://github.com/ClickHouse/ClickHouse/pull/45432) ([Anton Popov](https://github.com/CurtizJ)). -* Added fields `supports_parallel_parsing` and `supports_parallel_formatting` to table `system.formats` for better introspection. [#45499](https://github.com/ClickHouse/ClickHouse/pull/45499) ([Anton Popov](https://github.com/CurtizJ)). -* Improve reading CSV field in CustomSeparated/Template format. Closes [#42352](https://github.com/ClickHouse/ClickHouse/issues/42352) Closes [#39620](https://github.com/ClickHouse/ClickHouse/issues/39620). [#43332](https://github.com/ClickHouse/ClickHouse/pull/43332) ([Kruglov Pavel](https://github.com/Avogar)). -* Unify query elapsed time measurements. [#43455](https://github.com/ClickHouse/ClickHouse/pull/43455) ([Raúl Marín](https://github.com/Algunenano)). -* Improve automatic usage of structure from insertion table in table functions file/hdfs/s3 when virtual columns are present in a select query, it fixes the possible error `Block structure mismatch` or `number of columns mismatch`. [#43695](https://github.com/ClickHouse/ClickHouse/pull/43695) ([Kruglov Pavel](https://github.com/Avogar)). -* Add support for signed arguments in the function `range`. Fixes [#43333](https://github.com/ClickHouse/ClickHouse/issues/43333). [#43733](https://github.com/ClickHouse/ClickHouse/pull/43733) ([sanyu](https://github.com/wineternity)). -* Remove redundant sorting, for example, sorting related ORDER BY clauses in subqueries. Implemented on top of query plan. 
It does similar optimization as `optimize_duplicate_order_by_and_distinct` regarding `ORDER BY` clauses, but more generic, since it's applied to any redundant sorting steps (not only caused by ORDER BY clause) and applied to subqueries of any depth. Related to [#42648](https://github.com/ClickHouse/ClickHouse/issues/42648). [#43905](https://github.com/ClickHouse/ClickHouse/pull/43905) ([Igor Nikonov](https://github.com/devcrafter)). -* Add the ability to disable deduplication of files for BACKUP (for backups without deduplication ATTACH can be used instead of full RESTORE). For example `BACKUP foo TO S3(...) SETTINGS deduplicate_files=0` (default `deduplicate_files=1`). [#43947](https://github.com/ClickHouse/ClickHouse/pull/43947) ([Azat Khuzhin](https://github.com/azat)). -* Refactor and improve schema inference for text formats. Add new setting `schema_inference_make_columns_nullable` that controls making result types `Nullable` (enabled by default);. [#44019](https://github.com/ClickHouse/ClickHouse/pull/44019) ([Kruglov Pavel](https://github.com/Avogar)). -* Better support for `PROXYv1` protocol. [#44135](https://github.com/ClickHouse/ClickHouse/pull/44135) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Add information about the latest part check by cleanup threads into `system.parts` table. [#44244](https://github.com/ClickHouse/ClickHouse/pull/44244) ([Dmitry Novik](https://github.com/novikd)). -* Disable table functions in readonly mode for inserts. [#44290](https://github.com/ClickHouse/ClickHouse/pull/44290) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Add a setting `simultaneous_parts_removal_limit` to allow limiting the number of parts being processed by one iteration of CleanupThread. [#44461](https://github.com/ClickHouse/ClickHouse/pull/44461) ([Dmitry Novik](https://github.com/novikd)). -* Do not initialize ReadBufferFromS3 when only virtual columns are needed in a query. This may be helpful to [#44246](https://github.com/ClickHouse/ClickHouse/issues/44246). [#44493](https://github.com/ClickHouse/ClickHouse/pull/44493) ([chen](https://github.com/xiedeyantu)). -* Prevent duplicate column names hints. Closes [#44130](https://github.com/ClickHouse/ClickHouse/issues/44130). [#44519](https://github.com/ClickHouse/ClickHouse/pull/44519) ([Joanna Hulboj](https://github.com/jh0x)). -* Allow macro substitution in endpoint of disks. Resolve [#40951](https://github.com/ClickHouse/ClickHouse/issues/40951). [#44533](https://github.com/ClickHouse/ClickHouse/pull/44533) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Improve schema inference when `input_format_json_read_object_as_string` is enabled. [#44546](https://github.com/ClickHouse/ClickHouse/pull/44546) ([Kruglov Pavel](https://github.com/Avogar)). -* Add a user-level setting `database_replicated_allow_replicated_engine_arguments` which allows banning the creation of `ReplicatedMergeTree` tables with arguments in `DatabaseReplicated`. [#44566](https://github.com/ClickHouse/ClickHouse/pull/44566) ([alesapin](https://github.com/alesapin)). -* Prevent users from mistakenly specifying zero (invalid) value for `index_granularity`. This closes [#44536](https://github.com/ClickHouse/ClickHouse/issues/44536). [#44578](https://github.com/ClickHouse/ClickHouse/pull/44578) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Added possibility to set path to service keytab file in `keytab` parameter in `kerberos` section of config.xml. 
[#44594](https://github.com/ClickHouse/ClickHouse/pull/44594) ([Roman Vasin](https://github.com/rvasin)). -* Use already written part of the query for fuzzy search (pass to the `skim` library, which is written in Rust and linked statically to ClickHouse). [#44600](https://github.com/ClickHouse/ClickHouse/pull/44600) ([Azat Khuzhin](https://github.com/azat)). -* Enable `input_format_json_read_objects_as_strings` by default to be able to read nested JSON objects while JSON Object type is experimental. [#44657](https://github.com/ClickHouse/ClickHouse/pull/44657) ([Kruglov Pavel](https://github.com/Avogar)). -* Improvement for deduplication of async inserts: when users do duplicate async inserts, we should deduplicate inside the memory before we query Keeper. [#44682](https://github.com/ClickHouse/ClickHouse/pull/44682) ([Han Fei](https://github.com/hanfei1991)). -* Input/output `Avro` format will parse bool type as ClickHouse bool type. [#44684](https://github.com/ClickHouse/ClickHouse/pull/44684) ([Kruglov Pavel](https://github.com/Avogar)). -* Support Bool type in Arrow/Parquet/ORC. Closes [#43970](https://github.com/ClickHouse/ClickHouse/issues/43970). [#44698](https://github.com/ClickHouse/ClickHouse/pull/44698) ([Kruglov Pavel](https://github.com/Avogar)). -* Don't greedily parse beyond the quotes when reading UUIDs - it may lead to mistakenly successful parsing of incorrect data. [#44686](https://github.com/ClickHouse/ClickHouse/pull/44686) ([Raúl Marín](https://github.com/Algunenano)). -* Infer UInt64 in case of Int64 overflow and fix some transforms in schema inference. [#44696](https://github.com/ClickHouse/ClickHouse/pull/44696) ([Kruglov Pavel](https://github.com/Avogar)). -* Previously dependency resolving inside `Replicated` database was done in a hacky way, and now it's done right using an explicit graph. [#44697](https://github.com/ClickHouse/ClickHouse/pull/44697) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Fix `output_format_pretty_row_numbers` does not preserve the counter across the blocks. Closes [#44815](https://github.com/ClickHouse/ClickHouse/issues/44815). [#44832](https://github.com/ClickHouse/ClickHouse/pull/44832) ([flynn](https://github.com/ucasfl)). -* Don't report errors in `system.errors` due to parts being merged concurrently with the background cleanup process. [#44874](https://github.com/ClickHouse/ClickHouse/pull/44874) ([Raúl Marín](https://github.com/Algunenano)). -* Optimize and fix metrics for Distributed async INSERT. [#44922](https://github.com/ClickHouse/ClickHouse/pull/44922) ([Azat Khuzhin](https://github.com/azat)). -* Added settings to disallow concurrent backups and restores resolves [#43891](https://github.com/ClickHouse/ClickHouse/issues/43891) Implementation: * Added server-level settings to disallow concurrent backups and restores, which are read and set when BackupWorker is created in Context. * Settings are set to true by default. * Before starting backup or restores, added a check to see if any other backups/restores are running. For internal requests, it checks if it is from the self node using backup_uuid. [#45072](https://github.com/ClickHouse/ClickHouse/pull/45072) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Add `` config parameter for system logs. [#45320](https://github.com/ClickHouse/ClickHouse/pull/45320) ([Stig Bakken](https://github.com/stigsb)). 
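As a rough illustration of the concurrent-backup check described a couple of entries above (not the actual `BackupWorker` code; the class name and error text are invented), the idea is simply a process-wide flag that the first backup acquires and that concurrent attempts fail on:

```cpp
#include <atomic>
#include <stdexcept>

/// Illustrative sketch of a "disallow concurrent backups" guard: the first
/// backup sets the flag, any concurrent attempt fails fast, and the flag is
/// released when the backup finishes.
class ConcurrentBackupGuard
{
public:
    ConcurrentBackupGuard()
    {
        bool expected = false;
        if (!running.compare_exchange_strong(expected, true))
            throw std::runtime_error("Concurrent backups are disallowed");
    }

    ~ConcurrentBackupGuard() { running.store(false); }

private:
    static inline std::atomic<bool> running{false};
};
```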
- -#### Build/Testing/Packaging Improvement -* Statically link with the `skim` library (it is written in Rust) for fuzzy search in clickhouse client/local history. [#44239](https://github.com/ClickHouse/ClickHouse/pull/44239) ([Azat Khuzhin](https://github.com/azat)). -* We removed support for shared linking because of Rust. Actually, Rust is only an excuse for this removal, and we wanted to remove it nevertheless. [#44828](https://github.com/ClickHouse/ClickHouse/pull/44828) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Remove the dependency on the `adduser` tool from the packages, because we don't use it. This fixes [#44934](https://github.com/ClickHouse/ClickHouse/issues/44934). [#45011](https://github.com/ClickHouse/ClickHouse/pull/45011) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* The `SQLite` library is updated to the latest. It is used for the SQLite database and table integration engines. Also, fixed a false-positive TSan report. This closes [#45027](https://github.com/ClickHouse/ClickHouse/issues/45027). [#45031](https://github.com/ClickHouse/ClickHouse/pull/45031) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* CRC-32 changes to address the WeakHash collision issue in PowerPC. [#45144](https://github.com/ClickHouse/ClickHouse/pull/45144) ([MeenaRenganathan22](https://github.com/MeenaRenganathan22)). -* Update aws-c* submodules [#43020](https://github.com/ClickHouse/ClickHouse/pull/43020) ([Vitaly Baranov](https://github.com/vitlibar)). -* Automatically merge green backport PRs and green approved PRs [#41110](https://github.com/ClickHouse/ClickHouse/pull/41110) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Introduce a [website](https://aretestsgreenyet.com/) for the status of ClickHouse CI. [Source](https://github.com/ClickHouse/aretestsgreenyet). - -#### Bug Fix - -* Replace domain IP types (IPv4, IPv6) with native. [#43221](https://github.com/ClickHouse/ClickHouse/pull/43221) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). It automatically fixes some missing implementations in the code. -* Fix the backup process if mutations get killed during the backup process. [#45351](https://github.com/ClickHouse/ClickHouse/pull/45351) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix the `Invalid number of rows in Chunk` exception message. [#41404](https://github.com/ClickHouse/ClickHouse/issues/41404). [#42126](https://github.com/ClickHouse/ClickHouse/pull/42126) ([Alexander Gololobov](https://github.com/davenger)). -* Fix possible use of an uninitialized value after executing expressions after sorting. Closes [#43386](https://github.com/ClickHouse/ClickHouse/issues/43386) [#43635](https://github.com/ClickHouse/ClickHouse/pull/43635) ([Kruglov Pavel](https://github.com/Avogar)). -* Better handling of NULL in aggregate combinators, fix possible segfault/logical error while using an obscure optimization `optimize_rewrite_sum_if_to_count_if`. Closes [#43758](https://github.com/ClickHouse/ClickHouse/issues/43758). [#43813](https://github.com/ClickHouse/ClickHouse/pull/43813) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix CREATE USER/ROLE query settings constraints. [#43993](https://github.com/ClickHouse/ClickHouse/pull/43993) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fixed bug with non-parsable default value for `EPHEMERAL` column in table metadata. [#44026](https://github.com/ClickHouse/ClickHouse/pull/44026) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). 
-* Fix parsing of bad version from compatibility setting. [#44224](https://github.com/ClickHouse/ClickHouse/pull/44224) ([Kruglov Pavel](https://github.com/Avogar)). -* Bring interval subtraction from datetime in line with addition. [#44241](https://github.com/ClickHouse/ClickHouse/pull/44241) ([ltrk2](https://github.com/ltrk2)). -* Remove limits on the maximum size of the result for view. [#44261](https://github.com/ClickHouse/ClickHouse/pull/44261) ([lizhuoyu5](https://github.com/lzydmxy)). -* Fix possible logical error in cache if `do_not_evict_index_and_mrk_files=1`. Closes [#42142](https://github.com/ClickHouse/ClickHouse/issues/42142). [#44268](https://github.com/ClickHouse/ClickHouse/pull/44268) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix possible too early cache write interruption in write-through cache (caching could be stopped due to false assumption when it shouldn't have). [#44289](https://github.com/ClickHouse/ClickHouse/pull/44289) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix possible crash in the case function `IN` with constant arguments was used as a constant argument together with `LowCardinality`. Fixes [#44221](https://github.com/ClickHouse/ClickHouse/issues/44221). [#44346](https://github.com/ClickHouse/ClickHouse/pull/44346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix support for complex parameters (like arrays) of parametric aggregate functions. This closes [#30975](https://github.com/ClickHouse/ClickHouse/issues/30975). The aggregate function `sumMapFiltered` was unusable in distributed queries before this change. [#44358](https://github.com/ClickHouse/ClickHouse/pull/44358) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix reading ObjectId in BSON schema inference. [#44382](https://github.com/ClickHouse/ClickHouse/pull/44382) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix race which can lead to premature temp parts removal before merge finishes in ReplicatedMergeTree. This issue could lead to errors like `No such file or directory: xxx`. Fixes [#43983](https://github.com/ClickHouse/ClickHouse/issues/43983). [#44383](https://github.com/ClickHouse/ClickHouse/pull/44383) ([alesapin](https://github.com/alesapin)). -* Some invalid `SYSTEM ... ON CLUSTER` queries worked in an unexpected way if a cluster name was not specified. It's fixed, now invalid queries throw `SYNTAX_ERROR` as they should. Fixes [#44264](https://github.com/ClickHouse/ClickHouse/issues/44264). [#44387](https://github.com/ClickHouse/ClickHouse/pull/44387) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Fix reading Map type in ORC format. [#44400](https://github.com/ClickHouse/ClickHouse/pull/44400) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix reading columns that are not presented in input data in Parquet/ORC formats. Previously it could lead to error `INCORRECT_NUMBER_OF_COLUMNS`. Closes [#44333](https://github.com/ClickHouse/ClickHouse/issues/44333). [#44405](https://github.com/ClickHouse/ClickHouse/pull/44405) ([Kruglov Pavel](https://github.com/Avogar)). -* Previously the `bar` function used the same '▋' (U+258B "Left five eighths block") character to display both 5/8 and 6/8 bars. This change corrects this behavior by using '▊' (U+258A "Left three quarters block") for displaying 6/8 bar. [#44410](https://github.com/ClickHouse/ClickHouse/pull/44410) ([Alexander Gololobov](https://github.com/davenger)). 
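To illustrate the `bar` fix above: the Unicode left-block characters for k eighths sit at consecutive code points, U+2590 minus k, so 5/8 is U+258B '▋' and 6/8 is U+258A '▊' (the character the fix switches to). A small self-contained sketch of picking the partial-block character (illustrative, not the actual `bar` implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>

/// Illustrative sketch: choose the "left block" character for the fractional
/// part of a bar. k eighths correspond to code point 0x2590 - k, hence
/// 5/8 -> U+258B "▋" and 6/8 -> U+258A "▊".
std::string fractionalBlock(double fraction)
{
    static const char * blocks[9] = {"", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"};
    int eighths = static_cast<int>(std::round(fraction * 8));
    eighths = std::clamp(eighths, 0, 8);
    return blocks[eighths];
}

int main()
{
    std::cout << fractionalBlock(5.0 / 8) << ' ' << fractionalBlock(6.0 / 8) << '\n'; /// prints "▋ ▊"
}
```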
-* Placing profile settings after profile settings constraints in the configuration file made constraints ineffective. [#44411](https://github.com/ClickHouse/ClickHouse/pull/44411) ([Konstantin Bogdanov](https://github.com/thevar1able)). -* Fix `SYNTAX_ERROR` while running `EXPLAIN AST INSERT` queries with data. Closes [#44207](https://github.com/ClickHouse/ClickHouse/issues/44207). [#44413](https://github.com/ClickHouse/ClickHouse/pull/44413) ([save-my-heart](https://github.com/save-my-heart)). -* Fix reading bool value with CRLF in CSV format. Closes [#44401](https://github.com/ClickHouse/ClickHouse/issues/44401). [#44442](https://github.com/ClickHouse/ClickHouse/pull/44442) ([Kruglov Pavel](https://github.com/Avogar)). -* Don't execute and/or/if/multiIf on a LowCardinality dictionary, so the result type cannot be LowCardinality. It could lead to the error `Illegal column ColumnLowCardinality` in some cases. Fixes [#43603](https://github.com/ClickHouse/ClickHouse/issues/43603). [#44469](https://github.com/ClickHouse/ClickHouse/pull/44469) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix mutations with the setting `max_streams_for_merge_tree_reading`. [#44472](https://github.com/ClickHouse/ClickHouse/pull/44472) ([Anton Popov](https://github.com/CurtizJ)). -* Fix potential null pointer dereference with GROUPING SETS in ASTSelectQuery::formatImpl ([#43049](https://github.com/ClickHouse/ClickHouse/issues/43049)). [#44479](https://github.com/ClickHouse/ClickHouse/pull/44479) ([Robert Schulze](https://github.com/rschu1ze)). -* Validate types in table function arguments, CAST function arguments, JSONAsObject schema inference according to settings. [#44501](https://github.com/ClickHouse/ClickHouse/pull/44501) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix IN function with LowCardinality and const column, close [#44503](https://github.com/ClickHouse/ClickHouse/issues/44503). [#44506](https://github.com/ClickHouse/ClickHouse/pull/44506) ([Duc Canh Le](https://github.com/canhld94)). -* Fixed a bug in the normalization of a `DEFAULT` expression in `CREATE TABLE` statement. The second argument of the function `in` (or the right argument of operator `IN`) might be replaced with the result of its evaluation during CREATE query execution. Fixes [#44496](https://github.com/ClickHouse/ClickHouse/issues/44496). [#44547](https://github.com/ClickHouse/ClickHouse/pull/44547) ([Alexander Tokmakov](https://github.com/tavplubix)). -* Projections do not work in presence of WITH ROLLUP, WITH CUBE and WITH TOTALS. In previous versions, a query produced an exception instead of skipping the usage of projections. This closes [#44614](https://github.com/ClickHouse/ClickHouse/issues/44614). This closes [#42772](https://github.com/ClickHouse/ClickHouse/issues/42772). [#44615](https://github.com/ClickHouse/ClickHouse/pull/44615) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Async blocks were not cleaned because the function `get all blocks sorted by time` didn't get async blocks. [#44651](https://github.com/ClickHouse/ClickHouse/pull/44651) ([Han Fei](https://github.com/hanfei1991)). -* Fix `LOGICAL_ERROR` `The top step of the right pipeline should be ExpressionStep` for JOIN with subquery, UNION, and TOTALS. Fixes [#43687](https://github.com/ClickHouse/ClickHouse/issues/43687). [#44673](https://github.com/ClickHouse/ClickHouse/pull/44673) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Avoid `std::out_of_range` exception in the Executable table engine. 
[#44681](https://github.com/ClickHouse/ClickHouse/pull/44681) ([Kruglov Pavel](https://github.com/Avogar)). -* Do not apply `optimize_syntax_fuse_functions` to quantiles on AST, close [#44712](https://github.com/ClickHouse/ClickHouse/issues/44712). [#44713](https://github.com/ClickHouse/ClickHouse/pull/44713) ([Vladimir C](https://github.com/vdimir)). -* Fix bug with wrong type in Merge table and PREWHERE, close [#43324](https://github.com/ClickHouse/ClickHouse/issues/43324). [#44716](https://github.com/ClickHouse/ClickHouse/pull/44716) ([Vladimir C](https://github.com/vdimir)). -* Fix a possible crash during shutdown (while destroying TraceCollector). Fixes [#44757](https://github.com/ClickHouse/ClickHouse/issues/44757). [#44758](https://github.com/ClickHouse/ClickHouse/pull/44758) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix a possible crash in distributed query processing. The crash could happen if a query with totals or extremes returned an empty result and there are mismatched types in the Distributed and the local tables. Fixes [#44738](https://github.com/ClickHouse/ClickHouse/issues/44738). [#44760](https://github.com/ClickHouse/ClickHouse/pull/44760) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix fsync for fetches (`min_compressed_bytes_to_fsync_after_fetch`)/small files (ttl.txt, columns.txt) in mutations (`min_rows_to_fsync_after_merge`/`min_compressed_bytes_to_fsync_after_merge`). [#44781](https://github.com/ClickHouse/ClickHouse/pull/44781) ([Azat Khuzhin](https://github.com/azat)). -* A rare race condition was possible when querying the `system.parts` or `system.parts_columns` tables in the presence of parts being moved between disks. Introduced in [#41145](https://github.com/ClickHouse/ClickHouse/issues/41145). [#44809](https://github.com/ClickHouse/ClickHouse/pull/44809) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix the error `Context has expired` which could appear with enabled projections optimization. Can be reproduced for queries with specific functions, like `dictHas/dictGet` which use context in runtime. Fixes [#44844](https://github.com/ClickHouse/ClickHouse/issues/44844). [#44850](https://github.com/ClickHouse/ClickHouse/pull/44850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* A fix for `Cannot read all data` error which could happen while reading `LowCardinality` dictionary from remote fs. Fixes [#44709](https://github.com/ClickHouse/ClickHouse/issues/44709). [#44875](https://github.com/ClickHouse/ClickHouse/pull/44875) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Ignore cases when hardware monitor sensors cannot be read instead of showing a full exception message in logs. [#44895](https://github.com/ClickHouse/ClickHouse/pull/44895) ([Raúl Marín](https://github.com/Algunenano)). -* Use `max_delay_to_insert` value in case the calculated time to delay INSERT exceeds the setting value. Related to [#44902](https://github.com/ClickHouse/ClickHouse/issues/44902). [#44916](https://github.com/ClickHouse/ClickHouse/pull/44916) ([Igor Nikonov](https://github.com/devcrafter)). -* Fix error `Different order of columns in UNION subquery` for queries with `UNION`. Fixes [#44866](https://github.com/ClickHouse/ClickHouse/issues/44866). [#44920](https://github.com/ClickHouse/ClickHouse/pull/44920) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). 
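The two INSERT-delay entries around here (the clamping fix above and the proportional formula in the next entry) amount to this: the delay grows linearly with how far the parts count is over the threshold and is capped at `max_delay_to_insert`. A simplified sketch with illustrative parameter names, not the actual MergeTree code:

```cpp
#include <algorithm>
#include <cstdint>

/// Illustrative only: delay = max_delay_to_insert * parts_over_threshold / max_allowed_parts_over_threshold,
/// clamped so it never exceeds max_delay_to_insert (in seconds).
double computeInsertDelaySeconds(
    uint64_t parts_count,
    uint64_t parts_to_delay_insert,   /// threshold where delaying starts (assumed name)
    uint64_t parts_to_throw_insert,   /// threshold where inserts are rejected (assumed name)
    double max_delay_to_insert)
{
    if (parts_count <= parts_to_delay_insert)
        return 0.0;

    const uint64_t parts_over_threshold = parts_count - parts_to_delay_insert;
    const uint64_t max_allowed_parts_over_threshold =
        parts_to_throw_insert > parts_to_delay_insert ? parts_to_throw_insert - parts_to_delay_insert : 1;

    const double delay = max_delay_to_insert
        * static_cast<double>(parts_over_threshold)
        / static_cast<double>(max_allowed_parts_over_threshold);

    return std::min(delay, max_delay_to_insert);
}
```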
-* The delay for INSERT could be calculated incorrectly, which could lead to always using the `max_delay_to_insert` setting as the delay instead of a correct value. Now a simple formula `max_delay_to_insert * (parts_over_threshold / max_allowed_parts_over_threshold)` is used, i.e. the delay grows proportionally to the number of parts over the threshold. Closes [#44902](https://github.com/ClickHouse/ClickHouse/issues/44902). [#44954](https://github.com/ClickHouse/ClickHouse/pull/44954) ([Igor Nikonov](https://github.com/devcrafter)).
-* Fix an `ALTER TABLE` TTL error when a wide part has the lightweight delete mask. [#44959](https://github.com/ClickHouse/ClickHouse/pull/44959) ([Mingliang Pan](https://github.com/liangliangpan)).
-* Follow-up fix for Replace domain IP types (IPv4, IPv6) with native [#43221](https://github.com/ClickHouse/ClickHouse/issues/43221). [#45024](https://github.com/ClickHouse/ClickHouse/pull/45024) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
-* Follow-up fix for Replace domain IP types (IPv4, IPv6) with native https://github.com/ClickHouse/ClickHouse/pull/43221. [#45043](https://github.com/ClickHouse/ClickHouse/pull/45043) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
-* A buffer overflow was possible in the parser. Found by fuzzer. [#45047](https://github.com/ClickHouse/ClickHouse/pull/45047) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
-* Fix a possible cannot-read-all-data error in storage FileLog. Closes [#45051](https://github.com/ClickHouse/ClickHouse/issues/45051), [#38257](https://github.com/ClickHouse/ClickHouse/issues/38257). [#45057](https://github.com/ClickHouse/ClickHouse/pull/45057) ([Kseniia Sumarokova](https://github.com/kssenii)).
-* Memory-efficient aggregation (the `distributed_aggregation_memory_efficient` setting) is disabled when grouping sets are present in the query. [#45058](https://github.com/ClickHouse/ClickHouse/pull/45058) ([Nikita Taranov](https://github.com/nickitat)).
-* Fix the `RANGE_HASHED` dictionary to count range columns as part of the primary key during updates when `update_field` is specified. Closes [#44588](https://github.com/ClickHouse/ClickHouse/issues/44588). [#45061](https://github.com/ClickHouse/ClickHouse/pull/45061) ([Maksim Kita](https://github.com/kitaisreal)).
-* Fix the error `Cannot capture column` for a `LowCardinality` captured argument of a nested lambda. Fixes [#45028](https://github.com/ClickHouse/ClickHouse/issues/45028). [#45065](https://github.com/ClickHouse/ClickHouse/pull/45065) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
-* Fix the wrong query result of `additional_table_filters` (the additional filter was not applied) in case the minmax/count projection is used. [#45133](https://github.com/ClickHouse/ClickHouse/pull/45133) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
-* Fixed a bug in the `histogram` function accepting negative values. [#45147](https://github.com/ClickHouse/ClickHouse/pull/45147) ([simpleton](https://github.com/rgzntrade)).
-* Fix wrong column nullability in StorageJoin, closes [#44940](https://github.com/ClickHouse/ClickHouse/issues/44940). [#45184](https://github.com/ClickHouse/ClickHouse/pull/45184) ([Vladimir C](https://github.com/vdimir)).
-* Fix `background_fetches_pool_size` setting reload (increase at runtime). [#45189](https://github.com/ClickHouse/ClickHouse/pull/45189) ([Raúl Marín](https://github.com/Algunenano)).
-* Correctly process `SELECT` queries on KV engines (e.g. KeeperMap, EmbeddedRocksDB) using `IN` on the key with a subquery producing a different type.
[#45215](https://github.com/ClickHouse/ClickHouse/pull/45215) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix logical error in SEMI JOIN & join_use_nulls in some cases, close [#45163](https://github.com/ClickHouse/ClickHouse/issues/45163), close [#45209](https://github.com/ClickHouse/ClickHouse/issues/45209). [#45230](https://github.com/ClickHouse/ClickHouse/pull/45230) ([Vladimir C](https://github.com/vdimir)). -* Fix heap-use-after-free in reading from s3. [#45253](https://github.com/ClickHouse/ClickHouse/pull/45253) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix bug when the Avro Union type is ['null', Nested type], closes [#45275](https://github.com/ClickHouse/ClickHouse/issues/45275). Fix bug that incorrectly infers `bytes` type to `Float`. [#45276](https://github.com/ClickHouse/ClickHouse/pull/45276) ([flynn](https://github.com/ucasfl)). -* Throw a correct exception when explicit PREWHERE cannot be used with a table using the storage engine `Merge`. [#45319](https://github.com/ClickHouse/ClickHouse/pull/45319) ([Antonio Andelic](https://github.com/antonio2368)). -* Under WSL1 Ubuntu self-extracting ClickHouse fails to decompress due to inconsistency - /proc/self/maps reporting 32bit file's inode, while stat reporting 64bit inode. [#45339](https://github.com/ClickHouse/ClickHouse/pull/45339) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Fix race in Distributed table startup (that could lead to processing file of async INSERT multiple times). [#45360](https://github.com/ClickHouse/ClickHouse/pull/45360) ([Azat Khuzhin](https://github.com/azat)). -* Fix a possible crash while reading from storage `S3` and table function `s3` in the case when `ListObject` request has failed. [#45371](https://github.com/ClickHouse/ClickHouse/pull/45371) ([Anton Popov](https://github.com/CurtizJ)). -* Fix `SELECT ... FROM system.dictionaries` exception when there is a dictionary with a bad structure (e.g. incorrect type in XML config). [#45399](https://github.com/ClickHouse/ClickHouse/pull/45399) ([Aleksei Filatov](https://github.com/aalexfvk)). -* Fix s3Cluster schema inference when structure from insertion table is used in `INSERT INTO ... SELECT * FROM s3Cluster` queries. [#45422](https://github.com/ClickHouse/ClickHouse/pull/45422) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix bug in JSON/BSONEachRow parsing with HTTP that could lead to using default values for some columns instead of values from data. [#45424](https://github.com/ClickHouse/ClickHouse/pull/45424) ([Kruglov Pavel](https://github.com/Avogar)). -* Fixed bug (Code: 632. DB::Exception: Unexpected data ... after parsed IPv6 value ...) with typed parsing of IP types from text source. [#45425](https://github.com/ClickHouse/ClickHouse/pull/45425) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). -* close [#45297](https://github.com/ClickHouse/ClickHouse/issues/45297) Add check for empty regular expressions. [#45428](https://github.com/ClickHouse/ClickHouse/pull/45428) ([Han Fei](https://github.com/hanfei1991)). -* Fix possible (likely distributed) query hung. [#45448](https://github.com/ClickHouse/ClickHouse/pull/45448) ([Azat Khuzhin](https://github.com/azat)). -* Fix possible deadlock with `allow_asynchronous_read_from_io_pool_for_merge_tree` enabled in case of exception from `ThreadPool::schedule`. [#45481](https://github.com/ClickHouse/ClickHouse/pull/45481) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix possible in-use table after DETACH. 
[#45493](https://github.com/ClickHouse/ClickHouse/pull/45493) ([Azat Khuzhin](https://github.com/azat)). -* Fix rare abort in the case when a query is canceled and parallel parsing was used during its execution. [#45498](https://github.com/ClickHouse/ClickHouse/pull/45498) ([Anton Popov](https://github.com/CurtizJ)). -* Fix a race between Distributed table creation and INSERT into it (could lead to CANNOT_LINK during INSERT into the table). [#45502](https://github.com/ClickHouse/ClickHouse/pull/45502) ([Azat Khuzhin](https://github.com/azat)). -* Add proper default (SLRU) to cache policy getter. Closes [#45514](https://github.com/ClickHouse/ClickHouse/issues/45514). [#45524](https://github.com/ClickHouse/ClickHouse/pull/45524) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Disallow array join in mutations closes [#42637](https://github.com/ClickHouse/ClickHouse/issues/42637) [#44447](https://github.com/ClickHouse/ClickHouse/pull/44447) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix for qualified asterisks with alias table name and column transformer. Resolves [#44736](https://github.com/ClickHouse/ClickHouse/issues/44736). [#44755](https://github.com/ClickHouse/ClickHouse/pull/44755) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). - -## [Changelog for 2022](https://clickhouse.com/docs/en/whats-new/changelog/2022) +## [Changelog for 2023](https://clickhouse.com/docs/en/whats-new/changelog/2023) diff --git a/CMakeLists.txt b/CMakeLists.txt index 063cfc77302..3bd179a799c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -254,10 +254,17 @@ endif() include(cmake/cpu_features.cmake) -# Asynchronous unwind tables are needed for Query Profiler. -# They are already by default on some platforms but possibly not on all platforms. -# Enable it explicitly. -set (COMPILER_FLAGS "${COMPILER_FLAGS} -fasynchronous-unwind-tables") + +# Query Profiler doesn't work on MacOS for several reasons +# - PHDR cache is not available +# - We use native functionality to get stacktraces which is not async signal safe +# and thus we don't need to generate asynchronous unwind tables +if (NOT OS_DARWIN) + # Asynchronous unwind tables are needed for Query Profiler. + # They are already by default on some platforms but possibly not on all platforms. + # Enable it explicitly. + set (COMPILER_FLAGS "${COMPILER_FLAGS} -fasynchronous-unwind-tables") +endif() # Reproducible builds. if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") @@ -348,7 +355,7 @@ if (COMPILER_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-absolute-paths") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdiagnostics-absolute-paths") - if (NOT ENABLE_TESTS AND NOT SANITIZE AND OS_LINUX) + if (NOT ENABLE_TESTS AND NOT SANITIZE AND NOT SANITIZE_COVERAGE AND OS_LINUX) # https://clang.llvm.org/docs/ThinLTO.html # Applies to clang and linux only. # Disabled when building with tests or sanitizers. @@ -546,7 +553,7 @@ if (ENABLE_RUST) endif() endif() -if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND NOT SANITIZE AND OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64)) +if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND NOT SANITIZE AND NOT SANITIZE_COVERAGE AND OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64)) set(CHECK_LARGE_OBJECT_SIZES_DEFAULT ON) else () set(CHECK_LARGE_OBJECT_SIZES_DEFAULT OFF) diff --git a/README.md b/README.md index d356e429892..9ada350d173 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Keep an eye out for upcoming meetups around the world. 
Somewhere else you want u ## Recent Recordings * **Recent Meetup Videos**: [Meetup Playlist](https://www.youtube.com/playlist?list=PL0Z2YDlm0b3iNDUzpY1S3L_iV4nARda_U) Whenever possible recordings of the ClickHouse Community Meetups are edited and presented as individual talks. Current featuring "Modern SQL in 2023", "Fast, Concurrent, and Consistent Asynchronous INSERTS in ClickHouse", and "Full-Text Indices: Design and Experiments" -* **Recording available**: [**v23.10 Release Webinar**](https://www.youtube.com/watch?v=PGQS6uPb970) All the features of 23.10, one convenient video! Watch it now! +* **Recording available**: [**v24.1 Release Webinar**](https://www.youtube.com/watch?v=pBF9g0wGAGs) All the features of 24.1, one convenient video! Watch it now! * **All release webinar recordings**: [YouTube playlist](https://www.youtube.com/playlist?list=PL0Z2YDlm0b3jAlSy1JxyP8zluvXaN3nxU) diff --git a/SECURITY.md b/SECURITY.md index a200e172a3b..79ca0269838 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -13,9 +13,10 @@ The following versions of ClickHouse server are currently being supported with s | Version | Supported | |:-|:-| +| 24.1 | ✔️ | | 23.12 | ✔️ | | 23.11 | ✔️ | -| 23.10 | ✔️ | +| 23.10 | ❌ | | 23.9 | ❌ | | 23.8 | ✔️ | | 23.7 | ❌ | diff --git a/base/base/CMakeLists.txt b/base/base/CMakeLists.txt index 3886932d198..025687d2c59 100644 --- a/base/base/CMakeLists.txt +++ b/base/base/CMakeLists.txt @@ -17,6 +17,7 @@ set (SRCS getMemoryAmount.cpp getPageSize.cpp getThreadId.cpp + int8_to_string.cpp JSON.cpp mremap.cpp phdr_cache.cpp diff --git a/base/base/Decimal_fwd.h b/base/base/Decimal_fwd.h index 589d6224917..beb228cea3c 100644 --- a/base/base/Decimal_fwd.h +++ b/base/base/Decimal_fwd.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace wide { @@ -44,3 +45,8 @@ concept is_over_big_int = || std::is_same_v || std::is_same_v; } + +template <> struct is_signed { static constexpr bool value = true; }; +template <> struct is_signed { static constexpr bool value = true; }; +template <> struct is_signed { static constexpr bool value = true; }; +template <> struct is_signed { static constexpr bool value = true; }; diff --git a/base/base/bit_cast.h b/base/base/bit_cast.h index 4783a84586b..9a92b7660f1 100644 --- a/base/base/bit_cast.h +++ b/base/base/bit_cast.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index d70c3bcd82b..b85f1a16d32 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -1,4 +1,5 @@ #include "coverage.h" +#include #pragma GCC diagnostic ignored "-Wreserved-identifier" @@ -52,11 +53,21 @@ namespace uint32_t * guards_start = nullptr; uint32_t * guards_end = nullptr; - uintptr_t * coverage_array = nullptr; + uintptr_t * current_coverage_array = nullptr; + uintptr_t * cumulative_coverage_array = nullptr; size_t coverage_array_size = 0; uintptr_t * all_addresses_array = nullptr; size_t all_addresses_array_size = 0; + + uintptr_t * allocate(size_t size) + { + /// Note: mmap return zero-initialized memory, and we count on that. + void * map = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (MAP_FAILED == map) + return nullptr; + return static_cast(map); + } } extern "C" @@ -79,7 +90,8 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) coverage_array_size = stop - start; /// Note: we will leak this. 
- coverage_array = static_cast(malloc(sizeof(uintptr_t) * coverage_array_size)); + current_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); + cumulative_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); resetCoverage(); } @@ -92,8 +104,8 @@ void __sanitizer_cov_pcs_init(const uintptr_t * pcs_begin, const uintptr_t * pcs return; pc_table_initialized = true; - all_addresses_array = static_cast(malloc(sizeof(uintptr_t) * coverage_array_size)); all_addresses_array_size = pcs_end - pcs_begin; + all_addresses_array = allocate(sizeof(uintptr_t) * all_addresses_array_size); /// They are not a real pointers, but also contain a flag in the most significant bit, /// in which we are not interested for now. Reset it. @@ -115,17 +127,24 @@ void __sanitizer_cov_trace_pc_guard(uint32_t * guard) /// The values of `*guard` are as you set them in /// __sanitizer_cov_trace_pc_guard_init and so you can make them consecutive /// and use them to dereference an array or a bit vector. - void * pc = __builtin_return_address(0); + intptr_t pc = reinterpret_cast(__builtin_return_address(0)); - coverage_array[guard - guards_start] = reinterpret_cast(pc); + current_coverage_array[guard - guards_start] = pc; + cumulative_coverage_array[guard - guards_start] = pc; } } __attribute__((no_sanitize("coverage"))) -std::span getCoverage() +std::span getCurrentCoverage() { - return {coverage_array, coverage_array_size}; + return {current_coverage_array, coverage_array_size}; +} + +__attribute__((no_sanitize("coverage"))) +std::span getCumulativeCoverage() +{ + return {cumulative_coverage_array, coverage_array_size}; } __attribute__((no_sanitize("coverage"))) @@ -137,7 +156,7 @@ std::span getAllInstrumentedAddresses() __attribute__((no_sanitize("coverage"))) void resetCoverage() { - memset(coverage_array, 0, coverage_array_size * sizeof(*coverage_array)); + memset(current_coverage_array, 0, coverage_array_size * sizeof(*current_coverage_array)); /// The guard defines whether the __sanitizer_cov_trace_pc_guard should be called. /// For example, you can unset it after first invocation to prevent excessive work. diff --git a/base/base/coverage.h b/base/base/coverage.h index f75ed2d3553..a6e5a6848d7 100644 --- a/base/base/coverage.h +++ b/base/base/coverage.h @@ -15,7 +15,10 @@ void dumpCoverageReportIfPossible(); /// Get accumulated unique program addresses of the instrumented parts of the code, /// seen so far after program startup or after previous reset. /// The returned span will be represented as a sparse map, containing mostly zeros, which you should filter away. -std::span getCoverage(); +std::span getCurrentCoverage(); + +/// Similar but not being reset. +std::span getCumulativeCoverage(); /// Get all instrumented addresses that could be in the coverage. std::span getAllInstrumentedAddresses(); diff --git a/base/base/getMemoryAmount.cpp b/base/base/getMemoryAmount.cpp index a46e964c5a3..ccdc0f0f976 100644 --- a/base/base/getMemoryAmount.cpp +++ b/base/base/getMemoryAmount.cpp @@ -1,8 +1,11 @@ -#include -#include #include + #include +#include +#include +#include + #include #include #include @@ -11,6 +14,80 @@ #endif +namespace +{ + +std::optional getCgroupsV2MemoryLimit() +{ +#if defined(OS_LINUX) + const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup"; + + /// This file exists iff the host has cgroups v2 enabled. 
+ std::ifstream controllers_file(default_cgroups_mount / "cgroup.controllers"); + if (!controllers_file.is_open()) + return {}; + + /// Make sure that the memory controller is enabled. + /// - cgroup.controllers defines which controllers *can* be enabled. + /// - cgroup.subtree_control defines which controllers *are* enabled. + /// (see https://docs.kernel.org/admin-guide/cgroup-v2.html) + /// Caveat: nested groups may disable controllers. For simplicity, check only the top-level group. + /// ReadBufferFromFile subtree_control_file(default_cgroups_mount / "cgroup.subtree_control"); + /// std::string subtree_control; + /// readString(subtree_control, subtree_control_file); + /// if (subtree_control.find("memory") == std::string::npos) + /// return {}; + std::ifstream subtree_control_file(default_cgroups_mount / "cgroup.subtree_control"); + std::stringstream subtree_control_buf; + subtree_control_buf << subtree_control_file.rdbuf(); + std::string subtree_control = subtree_control_buf.str(); + if (subtree_control.find("memory") == std::string::npos) + return {}; + + /// Identify the cgroup the process belongs to + /// All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs + /// A simpler way to get the membership is: + std::ifstream cgroup_name_file("/proc/self/cgroup"); + if (!cgroup_name_file.is_open()) + return {}; + + std::stringstream cgroup_name_buf; + cgroup_name_buf << cgroup_name_file.rdbuf(); + std::string cgroup_name = cgroup_name_buf.str(); + if (!cgroup_name.empty() && cgroup_name.back() == '\n') + cgroup_name.pop_back(); /// remove trailing newline, if any + /// With cgroups v2, there will be a *single* line with prefix "0::/" + const std::string v2_prefix = "0::/"; + if (!cgroup_name.starts_with(v2_prefix)) + return {}; + cgroup_name = cgroup_name.substr(v2_prefix.length()); + + std::filesystem::path current_cgroup = cgroup_name.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup_name); + + /// Open the bottom-most nested memory limit setting file. If there is no such file at the current + /// level, try again at the parent level as memory settings are inherited. + while (current_cgroup != default_cgroups_mount.parent_path()) + { + std::ifstream setting_file(current_cgroup / "memory.max"); + if (setting_file.is_open()) + { + uint64_t value; + if (setting_file >> value) + return {value}; + else + return {}; /// e.g. the cgroups default "max" + } + current_cgroup = current_cgroup.parent_path(); + } + + return {}; +#else + return {}; +#endif +} + +} + /** Returns the size of physical memory (RAM) in bytes. * Returns 0 on unsupported platform */ @@ -26,34 +103,27 @@ uint64_t getMemoryAmountOrZero() uint64_t memory_amount = num_pages * page_size; -#if defined(OS_LINUX) - // Try to lookup at the Cgroup limit - - // CGroups v2 - std::ifstream cgroupv2_limit("/sys/fs/cgroup/memory.max"); - if (cgroupv2_limit.is_open()) - { - uint64_t memory_limit = 0; - cgroupv2_limit >> memory_limit; - if (memory_limit > 0 && memory_limit < memory_amount) - memory_amount = memory_limit; - } + /// Respect the memory limit set by cgroups v2. + auto limit_v2 = getCgroupsV2MemoryLimit(); + if (limit_v2.has_value() && *limit_v2 < memory_amount) + memory_amount = *limit_v2; else { - // CGroups v1 - std::ifstream cgroup_limit("/sys/fs/cgroup/memory/memory.limit_in_bytes"); - if (cgroup_limit.is_open()) + /// Cgroups v1 were replaced by v2 in 2015. The only reason we keep supporting v1 is that the transition to v2 + /// has been slow. 
Caveat : Hierarchical groups as in v2 are not supported for v1, the location of the memory + /// limit (virtual) file is hard-coded. + /// TODO: check at the end of 2024 if we can get rid of v1. + std::ifstream limit_file_v1("/sys/fs/cgroup/memory/memory.limit_in_bytes"); + if (limit_file_v1.is_open()) { - uint64_t memory_limit = 0; // in case of read error - cgroup_limit >> memory_limit; - if (memory_limit > 0 && memory_limit < memory_amount) - memory_amount = memory_limit; + uint64_t limit_v1; + if (limit_file_v1 >> limit_v1) + if (limit_v1 < memory_amount) + memory_amount = limit_v1; } } -#endif return memory_amount; - } diff --git a/base/base/int8_to_string.cpp b/base/base/int8_to_string.cpp new file mode 100644 index 00000000000..f74a6b8077e --- /dev/null +++ b/base/base/int8_to_string.cpp @@ -0,0 +1,9 @@ +#include + +namespace std +{ +std::string to_string(Int8 v) /// NOLINT (cert-dcl58-cpp) +{ + return to_string(int8_t{v}); +} +} diff --git a/base/base/int8_to_string.h b/base/base/int8_to_string.h new file mode 100644 index 00000000000..af0914f4312 --- /dev/null +++ b/base/base/int8_to_string.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +#include + +template <> +struct fmt::formatter : fmt::formatter +{ +}; + + +namespace std +{ +std::string to_string(Int8 v); /// NOLINT (cert-dcl58-cpp) +} diff --git a/base/base/sort.h b/base/base/sort.h index 1a814587763..99bf8a0830e 100644 --- a/base/base/sort.h +++ b/base/base/sort.h @@ -64,19 +64,14 @@ using ComparatorWrapper = Comparator; #include -template -void nth_element(RandomIt first, RandomIt nth, RandomIt last) +template +void nth_element(RandomIt first, RandomIt nth, RandomIt last, Compare compare) { - using value_type = typename std::iterator_traits::value_type; - using comparator = std::less; - - comparator compare; - ComparatorWrapper compare_wrapper = compare; - #ifndef NDEBUG ::shuffle(first, last); #endif + ComparatorWrapper compare_wrapper = compare; ::miniselect::floyd_rivest_select(first, nth, last, compare_wrapper); #ifndef NDEBUG @@ -87,6 +82,15 @@ void nth_element(RandomIt first, RandomIt nth, RandomIt last) #endif } +template +void nth_element(RandomIt first, RandomIt nth, RandomIt last) +{ + using value_type = typename std::iterator_traits::value_type; + using comparator = std::less; + + ::nth_element(first, nth, last, comparator()); +} + template void partial_sort(RandomIt first, RandomIt middle, RandomIt last, Compare compare) { diff --git a/base/base/types.h b/base/base/types.h index 3a7760eae91..a4874860514 100644 --- a/base/base/types.h +++ b/base/base/types.h @@ -3,14 +3,29 @@ #include #include -/// This is needed for more strict aliasing. 
https://godbolt.org/z/xpJBSb https://stackoverflow.com/a/57453713 +/// Using char8_t more strict aliasing (https://stackoverflow.com/a/57453713) using UInt8 = char8_t; +/// Same for using signed _BitInt(8) (there isn't a signed char8_t, which would be more convenient) +/// See https://godbolt.org/z/fafnWEnnf +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wbit-int-extension" +using Int8 = signed _BitInt(8); +#pragma clang diagnostic pop + +namespace std +{ +template <> +struct hash /// NOLINT (cert-dcl58-cpp) +{ + size_t operator()(const Int8 x) const { return std::hash()(int8_t{x}); } +}; +} + using UInt16 = uint16_t; using UInt32 = uint32_t; using UInt64 = uint64_t; -using Int8 = int8_t; using Int16 = int16_t; using Int32 = int32_t; using Int64 = int64_t; diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index c1fd7b69b7f..17b1fa7cd6a 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -6,6 +6,7 @@ #include "throwError.h" +#include #include #include #include diff --git a/base/poco/Foundation/include/Poco/Logger.h b/base/poco/Foundation/include/Poco/Logger.h index f91d836f190..2a1cb33b407 100644 --- a/base/poco/Foundation/include/Poco/Logger.h +++ b/base/poco/Foundation/include/Poco/Logger.h @@ -22,6 +22,7 @@ #include #include #include + #include "Poco/Channel.h" #include "Poco/Format.h" #include "Poco/Foundation.h" @@ -871,21 +872,11 @@ public: /// If the Logger does not yet exist, it is created, based /// on its parent logger. - static LoggerPtr getShared(const std::string & name); + static LoggerPtr getShared(const std::string & name, bool should_be_owned_by_shared_ptr_if_created = true); /// Returns a shared pointer to the Logger with the given name. /// If the Logger does not yet exist, it is created, based /// on its parent logger. - static Logger & unsafeGet(const std::string & name); - /// Returns a reference to the Logger with the given name. - /// If the Logger does not yet exist, it is created, based - /// on its parent logger. - /// - /// WARNING: This method is not thread safe. You should - /// probably use get() instead. - /// The only time this method should be used is during - /// program initialization, when only one thread is running. - static Logger & create(const std::string & name, Channel * pChannel, int level = Message::PRIO_INFORMATION); /// Creates and returns a reference to a Logger with the /// given name. The Logger's Channel and log level as set as @@ -904,13 +895,6 @@ public: /// Returns a pointer to the Logger with the given name if it /// exists, or a null pointer otherwise. - static bool destroy(const std::string & name); - /// Destroys the logger with the specified name. Does nothing - /// if the logger is not found. - /// - /// After a logger has been destroyed, all references to it - /// become invalid. - static void shutdown(); /// Shuts down the logging framework and releases all /// Loggers. @@ -939,9 +923,17 @@ public: static const std::string ROOT; /// The name of the root logger (""). 
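To make the new ownership model concrete, here is an illustrative usage sketch, assuming LoggerPtr is the std::shared_ptr<Logger> alias this header declares; the logger names below are invented for the example:

    #include <Poco/Logger.h>

    void loggerOwnershipExample()
    {
        /// Raw reference: the registry keeps this logger alive; it is only
        /// released during Logger::shutdown() at process teardown.
        Poco::Logger & log = Poco::Logger::get("OwnershipExample");
        log.information("obtained via Logger::get");

        {
            /// Shared pointer: if "OwnershipExample.child" did not exist yet,
            /// the returned LoggerPtr owns it (owned_by_shared_ptr = true) and
            /// the map entry is erased once the last such pointer is destroyed.
            Poco::LoggerPtr child = Poco::Logger::getShared("OwnershipExample.child");
            child->information("obtained via Logger::getShared");
        } /// the child logger created above may be destroyed here; "OwnershipExample" stays registered
    }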
-protected: - typedef std::map LoggerMap; +public: + struct LoggerEntry + { + Poco::Logger * logger; + bool owned_by_shared_ptr = false; + }; + using LoggerMap = std::unordered_map; + using LoggerMapIterator = LoggerMap::iterator; + +protected: Logger(const std::string & name, Channel * pChannel, int level); ~Logger(); @@ -949,12 +941,16 @@ protected: void log(const std::string & text, Message::Priority prio, const char * file, int line); static std::string format(const std::string & fmt, int argc, std::string argv[]); - static Logger & unsafeCreate(const std::string & name, Channel * pChannel, int level = Message::PRIO_INFORMATION); - static Logger & parent(const std::string & name); - static void add(Logger * pLogger); - static Logger * find(const std::string & name); private: + static std::pair unsafeGet(const std::string & name, bool get_shared); + static Logger * unsafeGetRawPtr(const std::string & name); + static std::pair unsafeCreate(const std::string & name, Channel * pChannel, int level = Message::PRIO_INFORMATION); + static Logger & parent(const std::string & name); + static std::pair add(Logger * pLogger); + static std::optional find(const std::string & name); + static Logger * findRawPtr(const std::string & name); + Logger(); Logger(const Logger &); Logger & operator=(const Logger &); @@ -962,8 +958,6 @@ private: std::string _name; Channel * _pChannel; std::atomic_int _level; - - static LoggerMap * _pLoggerMap; }; diff --git a/base/poco/Foundation/src/Logger.cpp b/base/poco/Foundation/src/Logger.cpp index 7c54116aaa4..779af384b0b 100644 --- a/base/poco/Foundation/src/Logger.cpp +++ b/base/poco/Foundation/src/Logger.cpp @@ -20,6 +20,7 @@ #include "Poco/NumberParser.h" #include "Poco/String.h" +#include #include namespace @@ -37,12 +38,13 @@ std::mutex & getLoggerMutex() return *logger_mutex; } +Poco::Logger::LoggerMap * _pLoggerMap = nullptr; + } namespace Poco { -Logger::LoggerMap* Logger::_pLoggerMap = 0; const std::string Logger::ROOT; @@ -134,12 +136,12 @@ void Logger::setLevel(const std::string& name, int level) if (_pLoggerMap) { std::string::size_type len = name.length(); - for (LoggerMap::iterator it = _pLoggerMap->begin(); it != _pLoggerMap->end(); ++it) + for (auto & it : *_pLoggerMap) { if (len == 0 || - (it->first.compare(0, len, name) == 0 && (it->first.length() == len || it->first[len] == '.'))) + (it.first.compare(0, len, name) == 0 && (it.first.length() == len || it.first[len] == '.'))) { - it->second->setLevel(level); + it.second.logger->setLevel(level); } } } @@ -153,12 +155,12 @@ void Logger::setChannel(const std::string& name, Channel* pChannel) if (_pLoggerMap) { std::string::size_type len = name.length(); - for (LoggerMap::iterator it = _pLoggerMap->begin(); it != _pLoggerMap->end(); ++it) + for (auto & it : *_pLoggerMap) { if (len == 0 || - (it->first.compare(0, len, name) == 0 && (it->first.length() == len || it->first[len] == '.'))) + (it.first.compare(0, len, name) == 0 && (it.first.length() == len || it.first[len] == '.'))) { - it->second->setChannel(pChannel); + it.second.logger->setChannel(pChannel); } } } @@ -172,12 +174,12 @@ void Logger::setProperty(const std::string& loggerName, const std::string& prope if (_pLoggerMap) { std::string::size_type len = loggerName.length(); - for (LoggerMap::iterator it = _pLoggerMap->begin(); it != _pLoggerMap->end(); ++it) + for (auto & it : *_pLoggerMap) { if (len == 0 || - (it->first.compare(0, len, loggerName) == 0 && (it->first.length() == len || it->first[len] == '.'))) + (it.first.compare(0, len, 
loggerName) == 0 && (it.first.length() == len || it.first[len] == '.'))) { - it->second->setProperty(propertyName, value); + it.second.logger->setProperty(propertyName, value); } } } @@ -304,52 +306,106 @@ struct LoggerDeleter { void operator()(Poco::Logger * logger) { - if (Logger::destroy(logger->name())) - return; + std::lock_guard lock(getLoggerMutex()); - logger->release(); + /// If logger infrastructure is destroyed just decrement logger reference count + if (!_pLoggerMap) + { + logger->release(); + return; + } + + auto it = _pLoggerMap->find(logger->name()); + assert(it != _pLoggerMap->end()); + + /** If reference count is 1, this means this shared pointer owns logger + * and need destroy it. + */ + size_t reference_count_before_release = logger->release(); + if (reference_count_before_release == 1) + { + assert(it->second.owned_by_shared_ptr); + _pLoggerMap->erase(it); + } } }; -inline LoggerPtr makeLoggerPtr(Logger & logger) +inline LoggerPtr makeLoggerPtr(Logger & logger, bool owned_by_shared_ptr) { - logger.duplicate(); - return std::shared_ptr(&logger, LoggerDeleter()); + if (owned_by_shared_ptr) + return LoggerPtr(&logger, LoggerDeleter()); + + return LoggerPtr(std::shared_ptr{}, &logger); } } + Logger& Logger::get(const std::string& name) { std::lock_guard lock(getLoggerMutex()); - return unsafeGet(name); + auto [it, inserted] = unsafeGet(name, false /*get_shared*/); + return *it->second.logger; } -LoggerPtr Logger::getShared(const std::string & name) + +LoggerPtr Logger::getShared(const std::string & name, bool should_be_owned_by_shared_ptr_if_created) { std::lock_guard lock(getLoggerMutex()); + auto [it, inserted] = unsafeGet(name, true /*get_shared*/); - return makeLoggerPtr(unsafeGet(name)); + /** If during `unsafeGet` logger was created, then this shared pointer owns it. + * If logger was already created, then this shared pointer does not own it. 
+ */ + if (inserted && should_be_owned_by_shared_ptr_if_created) + it->second.owned_by_shared_ptr = true; + + return makeLoggerPtr(*it->second.logger, it->second.owned_by_shared_ptr); } -Logger& Logger::unsafeGet(const std::string& name) + +std::pair Logger::unsafeGet(const std::string& name, bool get_shared) { - Logger* pLogger = find(name); - if (!pLogger) + std::optional optional_logger_it = find(name); + + if (optional_logger_it) { + auto & logger_it = *optional_logger_it; + + if (logger_it->second.owned_by_shared_ptr) + { + logger_it->second.logger->duplicate(); + + if (!get_shared) + logger_it->second.owned_by_shared_ptr = false; + } + } + + if (!optional_logger_it) + { + Logger * logger = nullptr; + if (name == ROOT) { - pLogger = new Logger(name, 0, Message::PRIO_INFORMATION); + logger = new Logger(name, nullptr, Message::PRIO_INFORMATION); } else { Logger& par = parent(name); - pLogger = new Logger(name, par.getChannel(), par.getLevel()); + logger = new Logger(name, par.getChannel(), par.getLevel()); } - add(pLogger); + + return add(logger); } - return *pLogger; + + return std::make_pair(*optional_logger_it, false); +} + + +Logger * Logger::unsafeGetRawPtr(const std::string & name) +{ + return unsafeGet(name, false /*get_shared*/).first->second.logger; } @@ -357,21 +413,24 @@ Logger& Logger::create(const std::string& name, Channel* pChannel, int level) { std::lock_guard lock(getLoggerMutex()); - return unsafeCreate(name, pChannel, level); + return *unsafeCreate(name, pChannel, level).first->second.logger; } LoggerPtr Logger::createShared(const std::string & name, Channel * pChannel, int level) { std::lock_guard lock(getLoggerMutex()); - return makeLoggerPtr(unsafeCreate(name, pChannel, level)); + auto [it, inserted] = unsafeCreate(name, pChannel, level); + it->second.owned_by_shared_ptr = true; + + return makeLoggerPtr(*it->second.logger, it->second.owned_by_shared_ptr); } Logger& Logger::root() { std::lock_guard lock(getLoggerMutex()); - return unsafeGet(ROOT); + return *unsafeGetRawPtr(ROOT); } @@ -379,7 +438,11 @@ Logger* Logger::has(const std::string& name) { std::lock_guard lock(getLoggerMutex()); - return find(name); + auto optional_it = find(name); + if (!optional_it) + return nullptr; + + return (*optional_it)->second.logger; } @@ -389,45 +452,41 @@ void Logger::shutdown() if (_pLoggerMap) { - for (LoggerMap::iterator it = _pLoggerMap->begin(); it != _pLoggerMap->end(); ++it) + for (auto & it : *_pLoggerMap) { - it->second->release(); + if (it.second.owned_by_shared_ptr) + continue; + + it.second.logger->release(); } + delete _pLoggerMap; - _pLoggerMap = 0; + _pLoggerMap = nullptr; } } -Logger* Logger::find(const std::string& name) +std::optional Logger::find(const std::string& name) { if (_pLoggerMap) { LoggerMap::iterator it = _pLoggerMap->find(name); if (it != _pLoggerMap->end()) - return it->second; + return it; + + return {}; } - return 0; + + return {}; } - -bool Logger::destroy(const std::string& name) +Logger * Logger::findRawPtr(const std::string & name) { - std::lock_guard lock(getLoggerMutex()); + auto optional_it = find(name); + if (!optional_it) + return nullptr; - if (_pLoggerMap) - { - LoggerMap::iterator it = _pLoggerMap->find(name); - if (it != _pLoggerMap->end()) - { - if (it->second->release() == 1) - _pLoggerMap->erase(it); - - return true; - } - } - - return false; + return (*optional_it)->second.logger; } @@ -445,28 +504,28 @@ void Logger::names(std::vector& names) } } -Logger& Logger::unsafeCreate(const std::string & name, Channel * pChannel, int 
level) + +std::pair Logger::unsafeCreate(const std::string & name, Channel * pChannel, int level) { if (find(name)) throw ExistsException(); Logger* pLogger = new Logger(name, pChannel, level); - add(pLogger); - - return *pLogger; + return add(pLogger); } + Logger& Logger::parent(const std::string& name) { std::string::size_type pos = name.rfind('.'); if (pos != std::string::npos) { std::string pname = name.substr(0, pos); - Logger* pParent = find(pname); + Logger* pParent = findRawPtr(pname); if (pParent) return *pParent; else return parent(pname); } - else return unsafeGet(ROOT); + else return *unsafeGetRawPtr(ROOT); } @@ -534,11 +593,14 @@ namespace } -void Logger::add(Logger* pLogger) +std::pair Logger::add(Logger* pLogger) { if (!_pLoggerMap) - _pLoggerMap = new LoggerMap; - _pLoggerMap->insert(LoggerMap::value_type(pLogger->name(), pLogger)); + _pLoggerMap = new Logger::LoggerMap; + + auto result = _pLoggerMap->emplace(pLogger->name(), LoggerEntry{pLogger, false /*owned_by_shared_ptr*/}); + assert(result.second); + return result; } diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index e5a8c064808..885080a3e38 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 54482) +SET(VERSION_REVISION 54483) SET(VERSION_MAJOR 24) -SET(VERSION_MINOR 1) +SET(VERSION_MINOR 2) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH a2faa65b080a587026c86844f3a20c74d23a86f8) -SET(VERSION_DESCRIBE v24.1.1.1-testing) -SET(VERSION_STRING 24.1.1.1) +SET(VERSION_GITHASH 5a024dfc0936e062770d0cfaad0805b57c1fba17) +SET(VERSION_DESCRIBE v24.2.1.1-testing) +SET(VERSION_STRING 24.2.1.1) # end of autochange diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 3882b51227e..88dea294bf5 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -63,14 +63,14 @@ endif() option(WITH_COVERAGE "Instrumentation for code coverage with default implementation" OFF) if (WITH_COVERAGE) - message (INFORMATION "Enabled instrumentation for code coverage") + message (STATUS "Enabled instrumentation for code coverage") set(COVERAGE_FLAGS "-fprofile-instr-generate -fcoverage-mapping") endif() option (SANITIZE_COVERAGE "Instrumentation for code coverage with custom callbacks" OFF) if (SANITIZE_COVERAGE) - message (INFORMATION "Enabled instrumentation for code coverage") + message (STATUS "Enabled instrumentation for code coverage") # We set this define for whole build to indicate that at least some parts are compiled with coverage. # And to expose it in system.build_options. @@ -79,7 +79,10 @@ if (SANITIZE_COVERAGE) # But the actual coverage will be enabled on per-library basis: for ClickHouse code, but not for 3rd-party. 
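As a rough sketch of what this instrumentation enables, assuming a binary configured with SANITIZE_COVERAGE and the free functions declared in base/base/coverage.h earlier in this patch (the include path and the function name below are illustrative):

    #include <base/coverage.h>   /// assumed include path for the helpers above
    #include <iostream>

    void reportCoverageExample()
    {
        resetCoverage();          /// clear the "current" (per-test) coverage array
        /// ... run the code under test here ...

        size_t edges_hit = 0;
        for (auto addr : getCurrentCoverage())   /// sparse span, mostly zeros
            if (addr != 0)
                ++edges_hit;

        std::cout << edges_hit << " instrumented addresses hit out of "
                  << getAllInstrumentedAddresses().size() << " total\n";
    }

Elsewhere in this patch, the packager script exposes this as a --coverage option that translates into -DSANITIZE_COVERAGE=1 for the build, and the CI scripts export per-test results into the new system.coverage_log table.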
set (COVERAGE_FLAGS "-fsanitize-coverage=trace-pc-guard,pc-table") -endif() -set (WITHOUT_COVERAGE_FLAGS "-fno-profile-instr-generate -fno-coverage-mapping -fno-sanitize-coverage=trace-pc-guard,pc-table") -set (WITHOUT_COVERAGE_FLAGS_LIST -fno-profile-instr-generate -fno-coverage-mapping -fno-sanitize-coverage=trace-pc-guard,pc-table) + set (WITHOUT_COVERAGE_FLAGS "-fno-profile-instr-generate -fno-coverage-mapping -fno-sanitize-coverage=trace-pc-guard,pc-table") + set (WITHOUT_COVERAGE_FLAGS_LIST -fno-profile-instr-generate -fno-coverage-mapping -fno-sanitize-coverage=trace-pc-guard,pc-table) +else() + set (WITHOUT_COVERAGE_FLAGS "") + set (WITHOUT_COVERAGE_FLAGS_LIST "") +endif() diff --git a/contrib/aws b/contrib/aws index ca02358dcc7..9eb5097a0ab 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit ca02358dcc7ce3ab733dd4cbcc32734eecfa4ee3 +Subproject commit 9eb5097a0abfa837722cca7a5114a25837817bf2 diff --git a/contrib/aws-c-auth b/contrib/aws-c-auth index 97133a2b5db..baeffa791d9 160000 --- a/contrib/aws-c-auth +++ b/contrib/aws-c-auth @@ -1 +1 @@ -Subproject commit 97133a2b5dbca1ccdf88cd6f44f39d0531d27d12 +Subproject commit baeffa791d9d1cf61460662a6d9ac2186aaf05df diff --git a/contrib/aws-c-cal b/contrib/aws-c-cal index 85dd7664b78..9453687ff54 160000 --- a/contrib/aws-c-cal +++ b/contrib/aws-c-cal @@ -1 +1 @@ -Subproject commit 85dd7664b786a389c6fb1a6f031ab4bb2282133d +Subproject commit 9453687ff5493ba94eaccf8851200565c4364c77 diff --git a/contrib/aws-c-common b/contrib/aws-c-common index 45dcb2849c8..80f21b3cac5 160000 --- a/contrib/aws-c-common +++ b/contrib/aws-c-common @@ -1 +1 @@ -Subproject commit 45dcb2849c891dba2100b270b4676765c92949ff +Subproject commit 80f21b3cac5ac51c6b8a62c7d2a5ef58a75195ee diff --git a/contrib/aws-c-compression b/contrib/aws-c-compression index b517b7decd0..99ec79ee297 160000 --- a/contrib/aws-c-compression +++ b/contrib/aws-c-compression @@ -1 +1 @@ -Subproject commit b517b7decd0dac30be2162f5186c250221c53aff +Subproject commit 99ec79ee2970f1a045d4ced1501b97ee521f2f85 diff --git a/contrib/aws-c-event-stream b/contrib/aws-c-event-stream index 2f9b60c42f9..08f24e384e5 160000 --- a/contrib/aws-c-event-stream +++ b/contrib/aws-c-event-stream @@ -1 +1 @@ -Subproject commit 2f9b60c42f90840ec11822acda3d8cdfa97a773d +Subproject commit 08f24e384e5be20bcffa42b49213d24dad7881ae diff --git a/contrib/aws-c-http b/contrib/aws-c-http index dd344619879..a082f8a2067 160000 --- a/contrib/aws-c-http +++ b/contrib/aws-c-http @@ -1 +1 @@ -Subproject commit dd34461987947672444d0bc872c5a733dfdb9711 +Subproject commit a082f8a2067e4a31db73f1d4ffd702a8dc0f7089 diff --git a/contrib/aws-c-io b/contrib/aws-c-io index d58ed4f272b..11ce3c750a1 160000 --- a/contrib/aws-c-io +++ b/contrib/aws-c-io @@ -1 +1 @@ -Subproject commit d58ed4f272b1cb4f89ac9196526ceebe5f2b0d89 +Subproject commit 11ce3c750a1dac7b04069fc5bff89e97e91bad4d diff --git a/contrib/aws-c-mqtt b/contrib/aws-c-mqtt index 33c3455cec8..6d36cd37262 160000 --- a/contrib/aws-c-mqtt +++ b/contrib/aws-c-mqtt @@ -1 +1 @@ -Subproject commit 33c3455cec82b16feb940e12006cefd7b3ef4194 +Subproject commit 6d36cd3726233cb757468d0ea26f6cd8dad151ec diff --git a/contrib/aws-c-s3 b/contrib/aws-c-s3 index d7bfe602d69..de36fee8fe7 160000 --- a/contrib/aws-c-s3 +++ b/contrib/aws-c-s3 @@ -1 +1 @@ -Subproject commit d7bfe602d6925948f1fff95784e3613cca6a3900 +Subproject commit de36fee8fe7ab02f10987877ae94a805bf440c1f diff --git a/contrib/aws-c-sdkutils b/contrib/aws-c-sdkutils index 208a701fa01..fd8c0ba2e23 160000 --- 
a/contrib/aws-c-sdkutils +++ b/contrib/aws-c-sdkutils @@ -1 +1 @@ -Subproject commit 208a701fa01e99c7c8cc3dcebc8317da71362972 +Subproject commit fd8c0ba2e233997eaaefe82fb818b8b444b956d3 diff --git a/contrib/aws-checksums b/contrib/aws-checksums index ad53be196a2..321b805559c 160000 --- a/contrib/aws-checksums +++ b/contrib/aws-checksums @@ -1 +1 @@ -Subproject commit ad53be196a25bbefa3700a01187fdce573a7d2d0 +Subproject commit 321b805559c8e911be5bddba13fcbd222a3e2d3a diff --git a/contrib/aws-cmake/CMakeLists.txt b/contrib/aws-cmake/CMakeLists.txt index 950a0e06cd0..abde20addaf 100644 --- a/contrib/aws-cmake/CMakeLists.txt +++ b/contrib/aws-cmake/CMakeLists.txt @@ -25,6 +25,7 @@ include("${ClickHouse_SOURCE_DIR}/contrib/aws-cmake/AwsFeatureTests.cmake") include("${ClickHouse_SOURCE_DIR}/contrib/aws-cmake/AwsThreadAffinity.cmake") include("${ClickHouse_SOURCE_DIR}/contrib/aws-cmake/AwsThreadName.cmake") include("${ClickHouse_SOURCE_DIR}/contrib/aws-cmake/AwsSIMD.cmake") +include("${ClickHouse_SOURCE_DIR}/contrib/aws-crt-cpp/cmake/AwsGetVersion.cmake") # Gather sources and options. @@ -35,6 +36,8 @@ set(AWS_PUBLIC_COMPILE_DEFS) set(AWS_PRIVATE_COMPILE_DEFS) set(AWS_PRIVATE_LIBS) +list(APPEND AWS_PRIVATE_COMPILE_DEFS "-DINTEL_NO_ITTNOTIFY_API") + if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") list(APPEND AWS_PRIVATE_COMPILE_DEFS "-DDEBUG_BUILD") endif() @@ -85,14 +88,20 @@ file(GLOB AWS_SDK_CORE_SRC "${AWS_SDK_CORE_DIR}/source/external/cjson/*.cpp" "${AWS_SDK_CORE_DIR}/source/external/tinyxml2/*.cpp" "${AWS_SDK_CORE_DIR}/source/http/*.cpp" + "${AWS_SDK_CORE_DIR}/source/http/crt/*.cpp" "${AWS_SDK_CORE_DIR}/source/http/standard/*.cpp" "${AWS_SDK_CORE_DIR}/source/internal/*.cpp" "${AWS_SDK_CORE_DIR}/source/monitoring/*.cpp" + "${AWS_SDK_CORE_DIR}/source/net/*.cpp" + "${AWS_SDK_CORE_DIR}/source/net/linux-shared/*.cpp" + "${AWS_SDK_CORE_DIR}/source/platform/linux-shared/*.cpp" + "${AWS_SDK_CORE_DIR}/source/smithy/tracing/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/base64/*.cpp" + "${AWS_SDK_CORE_DIR}/source/utils/component-registry/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/crypto/*.cpp" - "${AWS_SDK_CORE_DIR}/source/utils/crypto/openssl/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/crypto/factory/*.cpp" + "${AWS_SDK_CORE_DIR}/source/utils/crypto/openssl/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/event/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/json/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/logging/*.cpp" @@ -115,9 +124,8 @@ OPTION(USE_AWS_MEMORY_MANAGEMENT "Aws memory management" OFF) configure_file("${AWS_SDK_CORE_DIR}/include/aws/core/SDKConfig.h.in" "${CMAKE_CURRENT_BINARY_DIR}/include/aws/core/SDKConfig.h" @ONLY) -list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_MAJOR=1") -list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_MINOR=10") -list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_PATCH=36") +aws_get_version(AWS_CRT_CPP_VERSION_MAJOR AWS_CRT_CPP_VERSION_MINOR AWS_CRT_CPP_VERSION_PATCH FULL_VERSION GIT_HASH) +configure_file("${AWS_CRT_DIR}/include/aws/crt/Config.h.in" "${AWS_CRT_DIR}/include/aws/crt/Config.h" @ONLY) list(APPEND AWS_SOURCES ${AWS_SDK_CORE_SRC} ${AWS_SDK_CORE_NET_SRC} ${AWS_SDK_CORE_PLATFORM_SRC}) @@ -176,6 +184,7 @@ file(GLOB AWS_COMMON_SRC "${AWS_COMMON_DIR}/source/*.c" "${AWS_COMMON_DIR}/source/external/*.c" "${AWS_COMMON_DIR}/source/posix/*.c" + "${AWS_COMMON_DIR}/source/linux/*.c" ) file(GLOB AWS_COMMON_ARCH_SRC diff --git a/contrib/aws-crt-cpp b/contrib/aws-crt-cpp index 8a301b7e842..f532d6abc0d 160000 --- a/contrib/aws-crt-cpp +++ 
b/contrib/aws-crt-cpp @@ -1 +1 @@ -Subproject commit 8a301b7e842f1daed478090c869207300972379f +Subproject commit f532d6abc0d2b0d8b5d6fe9e7c51eaedbe4afbd0 diff --git a/contrib/aws-s2n-tls b/contrib/aws-s2n-tls index 71f4794b758..9a1e7545402 160000 --- a/contrib/aws-s2n-tls +++ b/contrib/aws-s2n-tls @@ -1 +1 @@ -Subproject commit 71f4794b7580cf780eb4aca77d69eded5d3c7bb4 +Subproject commit 9a1e75454023e952b366ce1eab9c54007250119f diff --git a/contrib/corrosion-cmake/CMakeLists.txt b/contrib/corrosion-cmake/CMakeLists.txt index 04871c761ab..4f60304d74d 100644 --- a/contrib/corrosion-cmake/CMakeLists.txt +++ b/contrib/corrosion-cmake/CMakeLists.txt @@ -16,29 +16,30 @@ message(STATUS "Checking Rust toolchain for current target") # See https://doc.rust-lang.org/nightly/rustc/platform-support.html -if(CMAKE_TOOLCHAIN_FILE MATCHES "ppc64le") - set(Rust_CARGO_TARGET "powerpc64le-unknown-linux-gnu") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl")) - set(Rust_CARGO_TARGET "x86_64-unknown-linux-musl") -elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") - set(Rust_CARGO_TARGET "x86_64-unknown-linux-gnu") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl")) - set(Rust_CARGO_TARGET "aarch64-unknown-linux-musl") -elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") - set(Rust_CARGO_TARGET "aarch64-unknown-linux-gnu") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64")) - set(Rust_CARGO_TARGET "x86_64-apple-darwin") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "darwin")) - set(Rust_CARGO_TARGET "aarch64-apple-darwin") -elseif((CMAKE_TOOLCHAIN_FILE MATCHES "freebsd") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64")) - set(Rust_CARGO_TARGET "x86_64-unknown-freebsd") -elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-riscv64") - set(Rust_CARGO_TARGET "riscv64gc-unknown-linux-gnu") -else() - message(FATAL_ERROR "Unsupported rust target") -endif() - -message(STATUS "Switched Rust target to ${Rust_CARGO_TARGET}") +if(DEFINED CMAKE_TOOLCHAIN_FILE) + if(CMAKE_TOOLCHAIN_FILE MATCHES "ppc64le") + set(Rust_CARGO_TARGET "powerpc64le-unknown-linux-gnu") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl")) + set(Rust_CARGO_TARGET "x86_64-unknown-linux-musl") + elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64") + set(Rust_CARGO_TARGET "x86_64-unknown-linux-gnu") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") AND (CMAKE_TOOLCHAIN_FILE MATCHES "musl")) + set(Rust_CARGO_TARGET "aarch64-unknown-linux-musl") + elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64") + set(Rust_CARGO_TARGET "aarch64-unknown-linux-gnu") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64")) + set(Rust_CARGO_TARGET "x86_64-apple-darwin") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "aarch64")) + set(Rust_CARGO_TARGET "aarch64-apple-darwin") + elseif((CMAKE_TOOLCHAIN_FILE MATCHES "freebsd") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64")) + set(Rust_CARGO_TARGET "x86_64-unknown-freebsd") + elseif(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-riscv64") + set(Rust_CARGO_TARGET "riscv64gc-unknown-linux-gnu") + else() + message(FATAL_ERROR "Unsupported rust target") + endif() + message(STATUS "Switched Rust target to ${Rust_CARGO_TARGET}") +endif () # FindRust.cmake list(APPEND 
CMAKE_MODULE_PATH "${ClickHouse_SOURCE_DIR}/contrib/corrosion/cmake") diff --git a/contrib/curl b/contrib/curl index d755a5f7c00..5ce164e0e92 160000 --- a/contrib/curl +++ b/contrib/curl @@ -1 +1 @@ -Subproject commit d755a5f7c009dd63a61b2c745180d8ba937cbfeb +Subproject commit 5ce164e0e9290c96eb7d502173426c0a135ec008 diff --git a/contrib/libssh b/contrib/libssh index 2c76332ef56..ed4011b9187 160000 --- a/contrib/libssh +++ b/contrib/libssh @@ -1 +1 @@ -Subproject commit 2c76332ef56d90f55965ab24da6b6dbcbef29c4c +Subproject commit ed4011b91873836713576475a98cd132cd834539 diff --git a/contrib/libssh-cmake/CMakeLists.txt b/contrib/libssh-cmake/CMakeLists.txt index 7a3816d4dce..7b589718140 100644 --- a/contrib/libssh-cmake/CMakeLists.txt +++ b/contrib/libssh-cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -option (ENABLE_SSH "Enable support for SSH keys and protocol" ON) +option (ENABLE_SSH "Enable support for SSH keys and protocol" ${ENABLE_LIBRARIES}) if (NOT ENABLE_SSH) message(STATUS "Not using SSH") @@ -8,24 +8,12 @@ endif() set(LIB_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libssh") set(LIB_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/libssh") -project(libssh VERSION 0.9.7 LANGUAGES C) +# Set CMake variables which are used in libssh_version.h.cmake +project(libssh VERSION 0.9.8 LANGUAGES C) -# global needed variable -set(APPLICATION_NAME ${PROJECT_NAME}) - -# SOVERSION scheme: CURRENT.AGE.REVISION -# If there was an incompatible interface change: -# Increment CURRENT. Set AGE and REVISION to 0 -# If there was a compatible interface change: -# Increment AGE. Set REVISION to 0 -# If the source code was changed, but there were no interface changes: -# Increment REVISION. -set(LIBRARY_VERSION "4.8.7") +set(LIBRARY_VERSION "4.8.8") set(LIBRARY_SOVERSION "4") -# Copy library files to a lib sub-directory -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${LIB_BINARY_DIR}/lib") - set(CMAKE_THREAD_PREFER_PTHREADS ON) set(THREADS_PREFER_PTHREAD_FLAG ON) @@ -33,7 +21,87 @@ set(WITH_ZLIB OFF) set(WITH_SYMBOL_VERSIONING OFF) set(WITH_SERVER ON) -include(IncludeSources.cmake) +set(libssh_SRCS + ${LIB_SOURCE_DIR}/src/agent.c + ${LIB_SOURCE_DIR}/src/auth.c + ${LIB_SOURCE_DIR}/src/base64.c + ${LIB_SOURCE_DIR}/src/bignum.c + ${LIB_SOURCE_DIR}/src/buffer.c + ${LIB_SOURCE_DIR}/src/callbacks.c + ${LIB_SOURCE_DIR}/src/channels.c + ${LIB_SOURCE_DIR}/src/client.c + ${LIB_SOURCE_DIR}/src/config.c + ${LIB_SOURCE_DIR}/src/connect.c + ${LIB_SOURCE_DIR}/src/connector.c + ${LIB_SOURCE_DIR}/src/curve25519.c + ${LIB_SOURCE_DIR}/src/dh.c + ${LIB_SOURCE_DIR}/src/ecdh.c + ${LIB_SOURCE_DIR}/src/error.c + ${LIB_SOURCE_DIR}/src/getpass.c + ${LIB_SOURCE_DIR}/src/init.c + ${LIB_SOURCE_DIR}/src/kdf.c + ${LIB_SOURCE_DIR}/src/kex.c + ${LIB_SOURCE_DIR}/src/known_hosts.c + ${LIB_SOURCE_DIR}/src/knownhosts.c + ${LIB_SOURCE_DIR}/src/legacy.c + ${LIB_SOURCE_DIR}/src/log.c + ${LIB_SOURCE_DIR}/src/match.c + ${LIB_SOURCE_DIR}/src/messages.c + ${LIB_SOURCE_DIR}/src/misc.c + ${LIB_SOURCE_DIR}/src/options.c + ${LIB_SOURCE_DIR}/src/packet.c + ${LIB_SOURCE_DIR}/src/packet_cb.c + ${LIB_SOURCE_DIR}/src/packet_crypt.c + ${LIB_SOURCE_DIR}/src/pcap.c + ${LIB_SOURCE_DIR}/src/pki.c + ${LIB_SOURCE_DIR}/src/pki_container_openssh.c + ${LIB_SOURCE_DIR}/src/poll.c + ${LIB_SOURCE_DIR}/src/session.c + ${LIB_SOURCE_DIR}/src/scp.c + ${LIB_SOURCE_DIR}/src/socket.c + ${LIB_SOURCE_DIR}/src/string.c + ${LIB_SOURCE_DIR}/src/threads.c + ${LIB_SOURCE_DIR}/src/wrapper.c + ${LIB_SOURCE_DIR}/src/external/bcrypt_pbkdf.c + ${LIB_SOURCE_DIR}/src/external/blowfish.c + 
${LIB_SOURCE_DIR}/src/external/chacha.c + ${LIB_SOURCE_DIR}/src/external/poly1305.c + ${LIB_SOURCE_DIR}/src/chachapoly.c + ${LIB_SOURCE_DIR}/src/config_parser.c + ${LIB_SOURCE_DIR}/src/token.c + ${LIB_SOURCE_DIR}/src/pki_ed25519_common.c + + ${LIB_SOURCE_DIR}/src/threads/noop.c + ${LIB_SOURCE_DIR}/src/threads/pthread.c + + # LIBCRYPT specific + ${libssh_SRCS} + ${LIB_SOURCE_DIR}/src/threads/libcrypto.c + ${LIB_SOURCE_DIR}/src/pki_crypto.c + ${LIB_SOURCE_DIR}/src/ecdh_crypto.c + ${LIB_SOURCE_DIR}/src/libcrypto.c + ${LIB_SOURCE_DIR}/src/dh_crypto.c + + ${LIB_SOURCE_DIR}/src/options.c + ${LIB_SOURCE_DIR}/src/server.c + ${LIB_SOURCE_DIR}/src/bind.c + ${LIB_SOURCE_DIR}/src/bind_config.c +) + +if (NOT (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC)) + add_compile_definitions(USE_BORINGSSL=1) +endif() + +configure_file(${LIB_SOURCE_DIR}/include/libssh/libssh_version.h.cmake ${LIB_BINARY_DIR}/include/libssh/libssh_version.h @ONLY) + +add_library(_ssh STATIC ${libssh_SRCS}) +add_library(ch_contrib::ssh ALIAS _ssh) + +target_link_libraries(_ssh PRIVATE OpenSSL::Crypto) + +target_include_directories(_ssh PUBLIC "${LIB_SOURCE_DIR}/include" "${LIB_BINARY_DIR}/include") + +# These headers need to be generated using the native build system on each platform. if (OS_LINUX) if (ARCH_AMD64) if (USE_MUSL) @@ -63,7 +131,3 @@ elseif (OS_FREEBSD) else () message(FATAL_ERROR "Platform is not supported") endif() - -configure_file(${LIB_SOURCE_DIR}/include/libssh/libssh_version.h.cmake - ${LIB_BINARY_DIR}/include/libssh/libssh_version.h - @ONLY) diff --git a/contrib/libssh-cmake/IncludeSources.cmake b/contrib/libssh-cmake/IncludeSources.cmake deleted file mode 100644 index 30348d5d7dd..00000000000 --- a/contrib/libssh-cmake/IncludeSources.cmake +++ /dev/null @@ -1,98 +0,0 @@ -set(LIBSSH_LINK_LIBRARIES - ${LIBSSH_LINK_LIBRARIES} - OpenSSL::Crypto -) - -set(libssh_SRCS - ${LIB_SOURCE_DIR}/src/agent.c - ${LIB_SOURCE_DIR}/src/auth.c - ${LIB_SOURCE_DIR}/src/base64.c - ${LIB_SOURCE_DIR}/src/bignum.c - ${LIB_SOURCE_DIR}/src/buffer.c - ${LIB_SOURCE_DIR}/src/callbacks.c - ${LIB_SOURCE_DIR}/src/channels.c - ${LIB_SOURCE_DIR}/src/client.c - ${LIB_SOURCE_DIR}/src/config.c - ${LIB_SOURCE_DIR}/src/connect.c - ${LIB_SOURCE_DIR}/src/connector.c - ${LIB_SOURCE_DIR}/src/curve25519.c - ${LIB_SOURCE_DIR}/src/dh.c - ${LIB_SOURCE_DIR}/src/ecdh.c - ${LIB_SOURCE_DIR}/src/error.c - ${LIB_SOURCE_DIR}/src/getpass.c - ${LIB_SOURCE_DIR}/src/init.c - ${LIB_SOURCE_DIR}/src/kdf.c - ${LIB_SOURCE_DIR}/src/kex.c - ${LIB_SOURCE_DIR}/src/known_hosts.c - ${LIB_SOURCE_DIR}/src/knownhosts.c - ${LIB_SOURCE_DIR}/src/legacy.c - ${LIB_SOURCE_DIR}/src/log.c - ${LIB_SOURCE_DIR}/src/match.c - ${LIB_SOURCE_DIR}/src/messages.c - ${LIB_SOURCE_DIR}/src/misc.c - ${LIB_SOURCE_DIR}/src/options.c - ${LIB_SOURCE_DIR}/src/packet.c - ${LIB_SOURCE_DIR}/src/packet_cb.c - ${LIB_SOURCE_DIR}/src/packet_crypt.c - ${LIB_SOURCE_DIR}/src/pcap.c - ${LIB_SOURCE_DIR}/src/pki.c - ${LIB_SOURCE_DIR}/src/pki_container_openssh.c - ${LIB_SOURCE_DIR}/src/poll.c - ${LIB_SOURCE_DIR}/src/session.c - ${LIB_SOURCE_DIR}/src/scp.c - ${LIB_SOURCE_DIR}/src/socket.c - ${LIB_SOURCE_DIR}/src/string.c - ${LIB_SOURCE_DIR}/src/threads.c - ${LIB_SOURCE_DIR}/src/wrapper.c - ${LIB_SOURCE_DIR}/src/external/bcrypt_pbkdf.c - ${LIB_SOURCE_DIR}/src/external/blowfish.c - ${LIB_SOURCE_DIR}/src/external/chacha.c - ${LIB_SOURCE_DIR}/src/external/poly1305.c - ${LIB_SOURCE_DIR}/src/chachapoly.c - ${LIB_SOURCE_DIR}/src/config_parser.c - ${LIB_SOURCE_DIR}/src/token.c - ${LIB_SOURCE_DIR}/src/pki_ed25519_common.c -) - 
-set(libssh_SRCS - ${libssh_SRCS} - ${LIB_SOURCE_DIR}/src/threads/noop.c - ${LIB_SOURCE_DIR}/src/threads/pthread.c -) - -# LIBCRYPT specific -set(libssh_SRCS - ${libssh_SRCS} - ${LIB_SOURCE_DIR}/src/threads/libcrypto.c - ${LIB_SOURCE_DIR}/src/pki_crypto.c - ${LIB_SOURCE_DIR}/src/ecdh_crypto.c - ${LIB_SOURCE_DIR}/src/libcrypto.c - ${LIB_SOURCE_DIR}/src/dh_crypto.c -) - -if (NOT (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC)) - add_compile_definitions(USE_BORINGSSL=1) -endif() - -set(libssh_SRCS -${libssh_SRCS} -${LIB_SOURCE_DIR}/src/options.c -${LIB_SOURCE_DIR}/src/server.c -${LIB_SOURCE_DIR}/src/bind.c -${LIB_SOURCE_DIR}/src/bind_config.c -) - - -add_library(_ssh STATIC ${libssh_SRCS}) - -target_include_directories(_ssh PRIVATE ${LIB_BINARY_DIR}) -target_include_directories(_ssh PUBLIC "${LIB_SOURCE_DIR}/include" "${LIB_BINARY_DIR}/include") -target_link_libraries(_ssh - PRIVATE ${LIBSSH_LINK_LIBRARIES}) - -add_library(ch_contrib::ssh ALIAS _ssh) - -target_compile_options(_ssh - PRIVATE - ${DEFAULT_C_COMPILE_FLAGS} - -D_GNU_SOURCE) diff --git a/contrib/libunwind-cmake/unwind-override.c b/contrib/libunwind-cmake/unwind-override.c index 616bab6ae4b..57928d817eb 100644 --- a/contrib/libunwind-cmake/unwind-override.c +++ b/contrib/libunwind-cmake/unwind-override.c @@ -1,6 +1,10 @@ #include +/// On MacOS this function will be replaced with a dynamic symbol +/// from the system library. +#if !defined(OS_DARWIN) int backtrace(void ** buffer, int size) { return unw_backtrace(buffer, size); } +#endif diff --git a/contrib/libuv b/contrib/libuv index 3a85b2eb3d8..4482964660c 160000 --- a/contrib/libuv +++ b/contrib/libuv @@ -1 +1 @@ -Subproject commit 3a85b2eb3d83f369b8a8cafd329d7e9dc28f60cf +Subproject commit 4482964660c77eec1166cd7d14fb915e3dbd774a diff --git a/contrib/llvm-project b/contrib/llvm-project index 2568a7cd129..d2142eed980 160000 --- a/contrib/llvm-project +++ b/contrib/llvm-project @@ -1 +1 @@ -Subproject commit 2568a7cd1297c7c3044b0f3cc0c23a6f6444d856 +Subproject commit d2142eed98046a47ff7112e3cc1e197c8a5cd80f diff --git a/contrib/llvm-project-cmake/CMakeLists.txt b/contrib/llvm-project-cmake/CMakeLists.txt index d09060912d8..76e620314a2 100644 --- a/contrib/llvm-project-cmake/CMakeLists.txt +++ b/contrib/llvm-project-cmake/CMakeLists.txt @@ -1,5 +1,6 @@ -if (APPLE OR SANITIZE STREQUAL "undefined" OR SANITIZE STREQUAL "memory") - # llvm-tblgen, that is used during LLVM build, doesn't work with UBSan. 
+if (APPLE OR SANITIZE STREQUAL "memory") + # llvm-tblgen, that is used during LLVM build, will throw MSAN errors when running (breaking the build) + # TODO: Retest when upgrading LLVM or build only llvm-tblgen without sanitizers set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF) set (ENABLE_DWARF_PARSER_DEFAULT OFF) else() diff --git a/contrib/lz4 b/contrib/lz4 index 92ebf1870b9..ce45a9dbdb0 160000 --- a/contrib/lz4 +++ b/contrib/lz4 @@ -1 +1 @@ -Subproject commit 92ebf1870b9acbefc0e7970409a181954a10ff40 +Subproject commit ce45a9dbdb059511a3e9576b19db3e7f1a4f172e diff --git a/contrib/simdjson b/contrib/simdjson index 1075e8609c4..6060be2fdf6 160000 --- a/contrib/simdjson +++ b/contrib/simdjson @@ -1 +1 @@ -Subproject commit 1075e8609c4afa253162d441437af929c29e31bb +Subproject commit 6060be2fdf62edf4a8f51a8b0883d57d09397b30 diff --git a/contrib/update-submodules.sh b/contrib/update-submodules.sh index 7195de020bd..072d7a5dc2f 100755 --- a/contrib/update-submodules.sh +++ b/contrib/update-submodules.sh @@ -24,7 +24,7 @@ git config --file .gitmodules --get-regexp '.*path' | sed 's/[^ ]* //' | xargs - # We don't want to depend on any third-party CMake files. # To check it, find and delete them. grep -o -P '"contrib/[^"]+"' .gitmodules | - grep -v -P 'contrib/(llvm-project|google-protobuf|grpc|abseil-cpp|corrosion)' | + grep -v -P 'contrib/(llvm-project|google-protobuf|grpc|abseil-cpp|corrosion|aws-crt-cpp)' | xargs -I@ find @ \ -'(' -name 'CMakeLists.txt' -or -name '*.cmake' -')' -and -not -name '*.h.cmake' \ -delete diff --git a/docker/images.json b/docker/images.json index d2f098f53d7..2bf1efe005f 100644 --- a/docker/images.json +++ b/docker/images.json @@ -62,7 +62,6 @@ "dependent": [] }, "docker/test/integration/runner": { - "only_amd64": true, "name": "clickhouse/integration-tests-runner", "dependent": [] }, diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 4b5e8cd3970..d39ca312454 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.12.2.59" +ARG VERSION="24.1.5.6" ARG PACKAGES="clickhouse-keeper" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 1a99ab0d0b6..e20cbe9781c 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -72,7 +72,7 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \ zstd \ zip \ && apt-get clean \ - && rm -rf /var/lib/apt/lists + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* # Download toolchain and SDK for Darwin RUN curl -sL -O https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacOSX11.0.sdk.tar.xz diff --git a/docker/packager/packager b/docker/packager/packager index ade36a55591..ca0ae8358f3 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -115,12 +115,17 @@ def run_docker_image_with_env( subprocess.check_call(cmd, shell=True) -def is_release_build(debug_build: bool, package_type: str, sanitizer: str) -> bool: - return not debug_build and package_type == "deb" and sanitizer == "" +def is_release_build( + debug_build: bool, package_type: str, sanitizer: str, coverage: bool +) -> bool: + return ( + not debug_build and package_type == "deb" and sanitizer == "" and not coverage + ) def parse_env_variables( debug_build: bool, + coverage: bool, compiler: str, sanitizer: str, package_type: 
str, @@ -261,7 +266,7 @@ def parse_env_variables( build_target = ( f"{build_target} clickhouse-odbc-bridge clickhouse-library-bridge" ) - if is_release_build(debug_build, package_type, sanitizer): + if is_release_build(debug_build, package_type, sanitizer, coverage): cmake_flags.append("-DSPLIT_DEBUG_SYMBOLS=ON") result.append("WITH_PERFORMANCE=1") if is_cross_arm: @@ -287,6 +292,9 @@ def parse_env_variables( else: result.append("BUILD_TYPE=None") + if coverage: + cmake_flags.append("-DSANITIZE_COVERAGE=1 -DBUILD_STANDALONE_KEEPER=0") + if not cache: cmake_flags.append("-DCOMPILER_CACHE=disabled") @@ -415,6 +423,11 @@ def parse_args() -> argparse.Namespace: choices=("address", "thread", "memory", "undefined", ""), default="", ) + parser.add_argument( + "--coverage", + action="store_true", + help="enable granular coverage with introspection", + ) parser.add_argument("--clang-tidy", action="store_true") parser.add_argument( @@ -507,6 +520,7 @@ def main() -> None: env_prepared = parse_env_variables( args.debug_build, + args.coverage, args.compiler, args.sanitizer, args.package_type, diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 452d8539a48..2d07937ad79 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.12.2.59" +ARG VERSION="24.1.5.6" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 0cefa3c14cb..d4775b17319 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -23,14 +23,11 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list tzdata \ wget \ && apt-get clean \ - && rm -rf \ - /var/lib/apt/lists/* \ - /var/cache/debconf \ - /tmp/* + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="23.12.2.59" +ARG VERSION="24.1.5.6" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index 78f18f376f4..79e809ea7f1 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -118,13 +118,19 @@ if [ -n "$CLICKHOUSE_USER" ] && [ "$CLICKHOUSE_USER" != "default" ] || [ -n "$CL EOT fi +CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS="${CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS:-}" + # checking $DATA_DIR for initialization if [ -d "${DATA_DIR%/}/data" ]; then DATABASE_ALREADY_EXISTS='true' fi -# only run initialization on an empty data directory -if [ -z "${DATABASE_ALREADY_EXISTS}" ]; then +# run initialization if flag CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS is not empty or data directory is empty +if [[ -n "${CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS}" || -z "${DATABASE_ALREADY_EXISTS}" ]]; then + RUN_INITDB_SCRIPTS='true' +fi + +if [ -n "${RUN_INITDB_SCRIPTS}" ]; then if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then # port is needed to check if clickhouse-server is ready for connections HTTP_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=http_port --try)" diff --git 
a/docker/test/base/Dockerfile b/docker/test/base/Dockerfile index b48017fdacc..55229e893de 100644 --- a/docker/test/base/Dockerfile +++ b/docker/test/base/Dockerfile @@ -13,7 +13,10 @@ RUN apt-get update \ zstd \ locales \ sudo \ - --yes --no-install-recommends + --yes --no-install-recommends \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* + # Sanitizer options for services (clickhouse-server) # Set resident memory limit for TSAN to 45GiB (46080MiB) to avoid OOMs in Stress tests diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index ea82e071112..8858e12c50e 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -17,16 +17,20 @@ CLICKHOUSE_CI_LOGS_CLUSTER=${CLICKHOUSE_CI_LOGS_CLUSTER:-system_logs_export} EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime('UTC'), check_name LowCardinality(String), instance_type LowCardinality(String), instance_id String, INDEX ix_pr (pull_request_number) TYPE set(100), INDEX ix_commit (commit_sha) TYPE set(100), INDEX ix_check_time (check_start_time) TYPE minmax, "} EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"CAST(0 AS UInt32) AS pull_request_number, '' AS commit_sha, now() AS check_start_time, toLowCardinality('') AS check_name, toLowCardinality('') AS instance_type, '' AS instance_id"} -EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "} +EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name"} # trace_log needs more columns for symbolization EXTRA_COLUMNS_TRACE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), lines Array(LowCardinality(String)), " EXTRA_COLUMNS_EXPRESSION_TRACE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> demangle(addressToSymbol(x)), trace)::Array(LowCardinality(String)) AS symbols, arrayMap(x -> addressToLine(x), trace)::Array(LowCardinality(String)) AS lines" +# coverage_log needs more columns for symbolization, but only symbol names (the line numbers are too heavy to calculate) +EXTRA_COLUMNS_COVERAGE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), " +EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> demangle(addressToSymbol(x)), coverage)::Array(LowCardinality(String)) AS symbols" + function __set_connection_args { - # It's impossible to use generous $CONNECTION_ARGS string, it's unsafe from word splitting perspective. + # It's impossible to use a generic $CONNECTION_ARGS string, it's unsafe from word splitting perspective. # That's why we must stick to the generated option CONNECTION_ARGS=( --receive_timeout=45 --send_timeout=45 --secure @@ -129,6 +133,19 @@ function setup_logs_replication debug_or_sanitizer_build=$(clickhouse-client -q "WITH ((SELECT value FROM system.build_options WHERE name='BUILD_TYPE') AS build, (SELECT value FROM system.build_options WHERE name='CXX_FLAGS') as flags) SELECT build='Debug' OR flags LIKE '%fsanitize%'") echo "Build is debug or sanitizer: $debug_or_sanitizer_build" + # We will pre-create a table system.coverage_log. 
+ # It is normally created by clickhouse-test rather than the server, + # so we will create it in advance to make it be picked up by the next commands: + + clickhouse-client --query " + CREATE TABLE IF NOT EXISTS system.coverage_log + ( + time DateTime COMMENT 'The time of test run', + test_name String COMMENT 'The name of the test', + coverage Array(UInt64) COMMENT 'An array of addresses of the code (a subset of addresses instrumented for coverage) that were encountered during the test run' + ) ENGINE = Null COMMENT 'Contains information about per-test coverage from the CI, but used only for exporting to the CI cluster' + " + # For each system log table: echo 'Create %_log tables' clickhouse-client --query "SHOW TABLES FROM system LIKE '%\\_log'" | while read -r table @@ -139,11 +156,16 @@ function setup_logs_replication # Do not try to resolve stack traces in case of debug/sanitizers # build, since it is too slow (flushing of trace_log can take ~1min # with such MV attached) - if [[ "$debug_or_sanitizer_build" = 1 ]]; then + if [[ "$debug_or_sanitizer_build" = 1 ]] + then EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION}" else EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_TRACE_LOG}" fi + elif [[ "$table" = "coverage_log" ]] + then + EXTRA_COLUMNS_FOR_TABLE="${EXTRA_COLUMNS_COVERAGE_LOG}" + EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG}" else EXTRA_COLUMNS_FOR_TABLE="${EXTRA_COLUMNS}" EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION}" @@ -160,7 +182,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/ORDER BY \(/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"'/; + s/^ORDER BY (([^\(].+?)|\((.+?)\))$/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \2\3)/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') @@ -168,7 +190,7 @@ function setup_logs_replication echo -e "Creating remote destination table ${table}_${hash} with statement:\n${statement}" >&2 echo "$statement" | clickhouse-client --database_replicated_initial_query_timeout_sec=10 \ - --distributed_ddl_task_timeout=30 \ + --distributed_ddl_task_timeout=30 --distributed_ddl_output_mode=throw_only_active \ "${CONNECTION_ARGS[@]}" || continue echo "Creating table system.${table}_sender" >&2 diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 56ec0199849..e10555d4d4a 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -20,7 +20,9 @@ RUN apt-get update \ pv \ jq \ zstd \ - --yes --no-install-recommends + --yes --no-install-recommends \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* RUN pip3 install numpy==1.26.3 scipy==1.12.0 pandas==1.5.3 Jinja2==3.1.3 @@ -31,12 +33,14 @@ RUN mkdir -p /tmp/clickhouse-odbc-tmp \ && cp /tmp/clickhouse-odbc-tmp/lib64/*.so /usr/local/lib/ \ && odbcinst -i -d -f /tmp/clickhouse-odbc-tmp/share/doc/clickhouse-odbc/config/odbcinst.ini.sample \ && odbcinst -i -s -l -f /tmp/clickhouse-odbc-tmp/share/doc/clickhouse-odbc/config/odbc.ini.sample \ - && rm -rf /tmp/clickhouse-odbc-tmp \ + && rm -rf /tmp/clickhouse-odbc-tmp + +# Give suid to gdb to grant it attach permissions +# chmod 777 to make the container user independent +RUN chmod u+s /usr/bin/gdb \ && mkdir -p /var/lib/clickhouse \ && chmod 777 /var/lib/clickhouse -# chmod 
777 to make the container user independent - ENV TZ=Europe/Amsterdam RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 5af05034415..d78c52f1fe6 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -211,6 +211,17 @@ function build echo "build_clickhouse_fasttest_binary: [ OK ] $BUILD_SECONDS_ELAPSED sec." \ | ts '%Y-%m-%d %H:%M:%S' \ | tee "$FASTTEST_OUTPUT/test_result.txt" + + ( + # This query should fail, and print stacktrace with proper symbol names (even on a stripped binary) + clickhouse_output=$(programs/clickhouse-stripped --stacktrace -q 'select' 2>&1 || :) + if [[ $clickhouse_output =~ DB::LocalServer::main ]]; then + echo "stripped_clickhouse_shows_symbols_names: [ OK ] 0 sec." + else + echo -e "stripped_clickhouse_shows_symbols_names: [ FAIL ] 0 sec. - clickhouse output:\n\n$clickhouse_output\n" + fi + ) | ts '%Y-%m-%d %H:%M:%S' | tee -a "$FASTTEST_OUTPUT/test_result.txt" + if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then mkdir -p "$FASTTEST_OUTPUT/binaries/" cp programs/clickhouse "$FASTTEST_OUTPUT/binaries/clickhouse" diff --git a/docker/test/fuzzer/Dockerfile b/docker/test/fuzzer/Dockerfile index 0bc0fb06633..d3f78ac1d95 100644 --- a/docker/test/fuzzer/Dockerfile +++ b/docker/test/fuzzer/Dockerfile @@ -29,7 +29,7 @@ RUN apt-get update \ wget \ && apt-get autoremove --yes \ && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* RUN pip3 install Jinja2 diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 050d4b68628..ca6bff9c6be 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -389,8 +389,8 @@ fi rg --text -F '' server.log > fatal.log ||: dmesg -T > dmesg.log ||: -zstd --threads=0 server.log -zstd --threads=0 fuzzer.log +zstd --threads=0 --rm server.log +zstd --threads=0 --rm fuzzer.log cat > report.html < diff --git a/docker/test/install/deb/Dockerfile b/docker/test/install/deb/Dockerfile index e9c928b1fe7..71daffa6f2a 100644 --- a/docker/test/install/deb/Dockerfile +++ b/docker/test/install/deb/Dockerfile @@ -10,13 +10,13 @@ ENV \ init=/lib/systemd/systemd # install systemd packages -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ sudo \ systemd \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists + \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* # configure systemd # remove systemd 'wants' triggers diff --git a/docker/test/integration/hive_server/Dockerfile b/docker/test/integration/hive_server/Dockerfile index e37e2800557..e34899e3329 100644 --- a/docker/test/integration/hive_server/Dockerfile +++ b/docker/test/integration/hive_server/Dockerfile @@ -1,31 +1,27 @@ FROM ubuntu:20.04 MAINTAINER lgbo-ustc -RUN apt-get update -RUN apt-get install -y wget openjdk-8-jre - -RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz && \ - tar -xf hadoop-3.1.0.tar.gz && rm -rf hadoop-3.1.0.tar.gz -RUN wget https://apache.apache.org/dist/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz && \ - tar -xf apache-hive-2.3.9-bin.tar.gz && rm -rf apache-hive-2.3.9-bin.tar.gz -RUN apt install -y vim - -RUN apt install -y openssh-server openssh-client - -RUN apt install -y mysql-server - -RUN mkdir -p /root/.ssh && \ - ssh-keygen -t rsa -b 2048 -P '' -f /root/.ssh/id_rsa && \ 
- cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys && \ - cp /root/.ssh/id_rsa /etc/ssh/ssh_host_rsa_key && \ - cp /root/.ssh/id_rsa.pub /etc/ssh/ssh_host_rsa_key.pub - -RUN wget https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-8.0.27.tar.gz &&\ - tar -xf mysql-connector-java-8.0.27.tar.gz && \ - mv mysql-connector-java-8.0.27/mysql-connector-java-8.0.27.jar /apache-hive-2.3.9-bin/lib/ && \ - rm -rf mysql-connector-java-8.0.27.tar.gz mysql-connector-java-8.0.27 - -RUN apt install -y iputils-ping net-tools +RUN apt-get update \ + && apt-get install -y wget openjdk-8-jre \ + && wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz \ + && tar -xf hadoop-3.1.0.tar.gz && rm -rf hadoop-3.1.0.tar.gz \ + && wget https://apache.apache.org/dist/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz \ + && tar -xf apache-hive-2.3.9-bin.tar.gz && rm -rf apache-hive-2.3.9-bin.tar.gz \ + && apt install -y vim \ + && apt install -y openssh-server openssh-client \ + && apt install -y mysql-server \ + && mkdir -p /root/.ssh \ + && ssh-keygen -t rsa -b 2048 -P '' -f /root/.ssh/id_rsa \ + && cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys \ + && cp /root/.ssh/id_rsa /etc/ssh/ssh_host_rsa_key \ + && cp /root/.ssh/id_rsa.pub /etc/ssh/ssh_host_rsa_key.pub \ + && wget https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-8.0.27.tar.gz \ + && tar -xf mysql-connector-java-8.0.27.tar.gz \ + && mv mysql-connector-java-8.0.27/mysql-connector-java-8.0.27.jar /apache-hive-2.3.9-bin/lib/ \ + && rm -rf mysql-connector-java-8.0.27.tar.gz mysql-connector-java-8.0.27 \ + && apt install -y iputils-ping net-tools \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* ENV JAVA_HOME=/usr ENV HADOOP_HOME=/hadoop-3.1.0 @@ -44,4 +40,3 @@ COPY demo_data.txt / ENV PATH=/apache-hive-2.3.9-bin/bin:/hadoop-3.1.0/bin:/hadoop-3.1.0/sbin:$PATH RUN service ssh start && sed s/HOSTNAME/$HOSTNAME/ /hadoop-3.1.0/etc/hadoop/core-site.xml.template > /hadoop-3.1.0/etc/hadoop/core-site.xml && hdfs namenode -format COPY start.sh / - diff --git a/docker/test/integration/postgresql_java_client/Dockerfile b/docker/test/integration/postgresql_java_client/Dockerfile index f5484028ec9..c5583085ef3 100644 --- a/docker/test/integration/postgresql_java_client/Dockerfile +++ b/docker/test/integration/postgresql_java_client/Dockerfile @@ -3,14 +3,10 @@ FROM ubuntu:18.04 -RUN apt-get update && \ - apt-get install -y software-properties-common build-essential openjdk-8-jdk curl - -RUN rm -rf \ - /var/lib/apt/lists/* \ - /var/cache/debconf \ - /tmp/* \ -RUN apt-get clean +RUN apt-get update \ + && apt-get install -y software-properties-common build-essential openjdk-8-jdk curl \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* ARG ver=42.2.12 RUN curl -L -o /postgresql-java-${ver}.jar https://repo1.maven.org/maven2/org/postgresql/postgresql/${ver}/postgresql-${ver}.jar diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index c795fbf0672..b876f7b9635 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -37,11 +37,8 @@ RUN apt-get update \ libkrb5-dev \ krb5-user \ g++ \ - && rm -rf \ - /var/lib/apt/lists/* \ - /var/cache/debconf \ - /tmp/* \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* ENV TZ=Etc/UTC RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone @@ -62,47 
+59,49 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \ && dockerd --version; docker --version +# kazoo 2.10.0 is broken +# https://s3.amazonaws.com/clickhouse-test-reports/59337/524625a1d2f4cc608a3f1059e3df2c30f353a649/integration_tests__asan__analyzer__[5_6].html RUN python3 -m pip install --no-cache-dir \ - PyMySQL \ - aerospike==11.1.0 \ - asyncio \ + PyMySQL==1.1.0 \ + asyncio==3.4.3 \ avro==1.10.2 \ - azure-storage-blob \ - boto3 \ - cassandra-driver \ - confluent-kafka==1.9.2 \ + azure-storage-blob==12.19.0 \ + boto3==1.34.24 \ + cassandra-driver==3.29.0 \ + confluent-kafka==2.3.0 \ delta-spark==2.3.0 \ - dict2xml \ - dicttoxml \ + dict2xml==1.7.4 \ + dicttoxml==1.7.16 \ docker==6.1.3 \ docker-compose==1.29.2 \ - grpcio \ - grpcio-tools \ - kafka-python \ - kazoo \ - lz4 \ - minio \ - nats-py \ - protobuf \ + grpcio==1.60.0 \ + grpcio-tools==1.60.0 \ + kafka-python==2.0.2 \ + lz4==4.3.3 \ + minio==7.2.3 \ + nats-py==2.6.0 \ + protobuf==4.25.2 \ + kazoo==2.9.0 \ psycopg2-binary==2.9.6 \ - pyhdfs \ + pyhdfs==0.3.1 \ pymongo==3.11.0 \ pyspark==3.3.2 \ - pytest \ + pytest==7.4.4 \ pytest-order==1.0.0 \ - pytest-random \ - pytest-repeat \ - pytest-timeout \ - pytest-xdist \ - pytz \ + pytest-random==0.2 \ + pytest-repeat==0.9.3 \ + pytest-timeout==2.2.0 \ + pytest-xdist==3.5.0 \ + pytest-reportlog==0.4.0 \ + pytz==2023.3.post1 \ pyyaml==5.3.1 \ - redis \ - requests-kerberos \ + redis==5.0.1 \ + requests-kerberos==0.14.0 \ tzlocal==2.1 \ - retry \ - bs4 \ - lxml \ - urllib3 + retry==0.9.2 \ + bs4==0.0.2 \ + lxml==5.1.0 \ + urllib3==2.0.7 # bs4, lxml are for cloud tests, do not delete # Hudi supports only spark 3.3.*, not 3.4 diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh index b05aef76faf..8882daa38ea 100755 --- a/docker/test/integration/runner/dockerd-entrypoint.sh +++ b/docker/test/integration/runner/dockerd-entrypoint.sh @@ -23,13 +23,15 @@ if [ -f /sys/fs/cgroup/cgroup.controllers ]; then > /sys/fs/cgroup/cgroup.subtree_control fi -# In case of test hung it is convenient to use pytest --pdb to debug it, -# and on hung you can simply press Ctrl-C and it will spawn a python pdb, -# but on SIGINT dockerd will exit, so ignore it to preserve the daemon. -trap '' INT # Binding to an IP address without --tlsverify is deprecated. Startup is intentionally being slowed # unless --tls=false or --tlsverify=false is set -dockerd --host=unix:///var/run/docker.sock --tls=false --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log & +# +# In case of test hung it is convenient to use pytest --pdb to debug it, +# and on hung you can simply press Ctrl-C and it will spawn a python pdb, +# but on SIGINT dockerd will exit, so we spawn new session to ignore SIGINT by +# docker. +# Note, that if you will run it via runner, it will send SIGINT to docker anyway. 
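The `setsid` call below works because a terminal delivers the SIGINT generated by Ctrl-C only to its foreground process group, and a process running in its own session is never a member of that group. A minimal sketch of the effect (not part of the patch; `sleep 12345` is just a placeholder workload, and a Linux `ps`/`pkill` from procps is assumed):

```bash
#!/usr/bin/env bash
# Compare a plain background child with one started via setsid:
# the setsid'ed child reports PID == PGID == SID, i.e. it leads its own
# session and therefore cannot be reached by the terminal's Ctrl-C.
sleep 12345 &          # stays in this script's process group and session
setsid sleep 12345 &   # detaches into a brand-new session
sleep 1                # give setsid a moment to start the child
ps -C sleep -o pid,ppid,pgid,sid,comm
pkill -f 'sleep 12345' # clean up both children
```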
+setsid dockerd --host=unix:///var/run/docker.sock --tls=false --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log & set +e reties=0 diff --git a/docker/test/keeper-jepsen/Dockerfile b/docker/test/keeper-jepsen/Dockerfile index a794e076ec0..3c5d0a6ecb4 100644 --- a/docker/test/keeper-jepsen/Dockerfile +++ b/docker/test/keeper-jepsen/Dockerfile @@ -24,7 +24,10 @@ RUN mkdir "/root/.ssh" RUN touch "/root/.ssh/known_hosts" # install java -RUN apt-get update && apt-get install default-jre default-jdk libjna-java libjna-jni ssh gnuplot graphviz --yes --no-install-recommends +RUN apt-get update && \ + apt-get install default-jre default-jdk libjna-java libjna-jni ssh gnuplot graphviz --yes --no-install-recommends \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* # install clojure RUN curl -O "https://download.clojure.org/install/linux-install-${CLOJURE_VERSION}.sh" && \ diff --git a/docker/test/libfuzzer/Dockerfile b/docker/test/libfuzzer/Dockerfile index 081cf5473f8..c9802a0e44e 100644 --- a/docker/test/libfuzzer/Dockerfile +++ b/docker/test/libfuzzer/Dockerfile @@ -27,7 +27,7 @@ RUN apt-get update \ wget \ && apt-get autoremove --yes \ && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* RUN pip3 install Jinja2 diff --git a/docker/test/performance-comparison/Dockerfile b/docker/test/performance-comparison/Dockerfile index e4ced104445..1835900b316 100644 --- a/docker/test/performance-comparison/Dockerfile +++ b/docker/test/performance-comparison/Dockerfile @@ -37,7 +37,7 @@ RUN apt-get update \ && apt-get purge --yes python3-dev g++ \ && apt-get autoremove --yes \ && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* COPY run.sh / diff --git a/docker/test/server-jepsen/Dockerfile b/docker/test/server-jepsen/Dockerfile index a212427b2a1..fd70fc45702 100644 --- a/docker/test/server-jepsen/Dockerfile +++ b/docker/test/server-jepsen/Dockerfile @@ -31,7 +31,9 @@ RUN mkdir "/root/.ssh" RUN touch "/root/.ssh/known_hosts" # install java -RUN apt-get update && apt-get install default-jre default-jdk libjna-java libjna-jni ssh gnuplot graphviz --yes --no-install-recommends +RUN apt-get update && apt-get install default-jre default-jdk libjna-java libjna-jni ssh gnuplot graphviz --yes --no-install-recommends \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* # install clojure RUN curl -O "https://download.clojure.org/install/linux-install-${CLOJURE_VERSION}.sh" && \ diff --git a/docker/test/sqlancer/Dockerfile b/docker/test/sqlancer/Dockerfile index 5977044345e..82fc2598397 100644 --- a/docker/test/sqlancer/Dockerfile +++ b/docker/test/sqlancer/Dockerfile @@ -5,9 +5,10 @@ FROM ubuntu:22.04 ARG apt_archive="http://archive.ubuntu.com" RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list -RUN apt-get update --yes && \ - env DEBIAN_FRONTEND=noninteractive apt-get install wget git default-jdk maven python3 --yes --no-install-recommends && \ - apt-get clean +RUN apt-get update --yes \ + && env DEBIAN_FRONTEND=noninteractive apt-get install wget git default-jdk maven python3 --yes --no-install-recommends \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* # We need to get the repository's HEAD each time despite, so we invalidate layers' cache ARG CACHE_INVALIDATOR=0 diff --git a/docker/test/sqllogic/Dockerfile 
b/docker/test/sqllogic/Dockerfile index 48457a99de3..05130044c45 100644 --- a/docker/test/sqllogic/Dockerfile +++ b/docker/test/sqllogic/Dockerfile @@ -15,7 +15,8 @@ RUN apt-get update --yes \ unixodbc-dev \ odbcinst \ sudo \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* RUN pip3 install \ numpy \ diff --git a/docker/test/sqltest/Dockerfile b/docker/test/sqltest/Dockerfile index 437677f4fd1..7f59f65761f 100644 --- a/docker/test/sqltest/Dockerfile +++ b/docker/test/sqltest/Dockerfile @@ -11,7 +11,8 @@ RUN apt-get update --yes \ python3-dev \ python3-pip \ sudo \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* RUN pip3 install \ pyyaml \ diff --git a/docker/test/stateful/Dockerfile b/docker/test/stateful/Dockerfile index 3509998e1d4..355e70f180e 100644 --- a/docker/test/stateful/Dockerfile +++ b/docker/test/stateful/Dockerfile @@ -9,7 +9,8 @@ RUN apt-get update -y \ python3-requests \ nodejs \ npm \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* COPY create.sql / COPY run.sh / diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index bc26234db24..7f4bad3d4e6 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -44,9 +44,10 @@ RUN apt-get update -y \ pv \ zip \ p7zip-full \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* -RUN pip3 install numpy scipy pandas Jinja2 pyarrow +RUN pip3 install numpy==1.26.3 scipy==1.12.0 pandas==1.5.3 Jinja2==3.1.3 pyarrow==15.0.0 RUN mkdir -p /tmp/clickhouse-odbc-tmp \ && wget -nv -O - ${odbc_driver_url} | tar --strip-components=1 -xz -C /tmp/clickhouse-odbc-tmp \ @@ -73,7 +74,6 @@ RUN arch=${TARGETARCH:-amd64} \ && wget "https://dl.min.io/client/mc/release/linux-${arch}/archive/mc.RELEASE.${MINIO_CLIENT_VERSION}" -O ./mc \ && chmod +x ./mc ./minio - RUN wget --no-verbose 'https://archive.apache.org/dist/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz' \ && tar -xvf hadoop-3.3.1.tar.gz \ && rm -rf hadoop-3.3.1.tar.gz diff --git a/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile b/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile index dc83e8b8d2e..a9802f6f1da 100644 --- a/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile +++ b/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile @@ -9,6 +9,8 @@ FROM ubuntu:20.04 as clickhouse-test-runner-base VOLUME /packages CMD apt-get update ;\ - DEBIAN_FRONTEND=noninteractive \ - apt install -y /packages/clickhouse-common-static_*.deb \ - /packages/clickhouse-client_*.deb + DEBIAN_FRONTEND=noninteractive \ + apt install -y /packages/clickhouse-common-static_*.deb \ + /packages/clickhouse-client_*.deb \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index b7c04a6fabd..aec2add2857 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -185,11 +185,15 @@ function run_tests() if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then ADDITIONAL_OPTIONS+=('--replicated-database') + # Too many tests fail for DatabaseReplicated in parallel. 
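For reference, the coverage branch added above keys off the running server itself rather than an environment variable: it asks `system.build_options` whether the binary was compiled with SANITIZE_COVERAGE. A stripped-down sketch of the same check, outside the surrounding run_tests() plumbing (it assumes `clickhouse-client` is on PATH and a server is reachable):

```bash
#!/usr/bin/env bash
# Detect a coverage-instrumented build the same way the script above does.
# The query returns 1 when the recorded compiler flags mention SANITIZE_COVERAGE;
# such builds collect per-test coverage and must run the suite sequentially.
is_coverage_build=$(clickhouse-client --query \
    "SELECT value LIKE '%SANITIZE_COVERAGE%' FROM system.build_options WHERE name = 'CXX_FLAGS'")

if [[ "$is_coverage_build" == "1" ]]; then
    echo "coverage build detected: running tests sequentially (no --jobs)"
else
    echo "regular build: tests may run in parallel with --jobs"
fi
```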
ADDITIONAL_OPTIONS+=('--jobs') ADDITIONAL_OPTIONS+=('2') + elif [[ 1 == $(clickhouse-client --query "SELECT value LIKE '%SANITIZE_COVERAGE%' FROM system.build_options WHERE name = 'CXX_FLAGS'") ]]; then + # Coverage on a per-test basis could only be collected sequentially. + # Do not set the --jobs parameter. + echo "Running tests with coverage collection." else - # Too many tests fail for DatabaseReplicated in parallel. All other - # configurations are OK. + # All other configurations are OK. ADDITIONAL_OPTIONS+=('--jobs') ADDITIONAL_OPTIONS+=('8') fi @@ -246,16 +250,19 @@ clickhouse-client -q "system flush logs" ||: stop_logs_replication # Try to get logs while server is running -successfuly_saved=0 +failed_to_save_logs=0 for table in query_log zookeeper_log trace_log transactions_info_log metric_log do - clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst - successfuly_saved=$? + err=$( { clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst; } 2>&1 ) + echo "$err" + [[ "0" != "${#err}" ]] && failed_to_save_logs=1 if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then - clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst - successfuly_saved=$((successfuly_saved | $?)) - clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst - successfuly_saved=$((successfuly_saved | $?)) + err=$( { clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst; } 2>&1 ) + echo "$err" + [[ "0" != "${#err}" ]] && failed_to_save_logs=1 + err=$( { clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst; } 2>&1 ) + echo "$err" + [[ "0" != "${#err}" ]] && failed_to_save_logs=1 fi done @@ -280,7 +287,7 @@ fi # If server crashed dump system logs with clickhouse-local -if [ $successfuly_saved -ne 0 ]; then +if [ $failed_to_save_logs -ne 0 ]; then # Compress tables. 
# # NOTE: @@ -290,10 +297,10 @@ if [ $successfuly_saved -ne 0 ]; then # for files >64MB, we want this files to be compressed explicitly for table in query_log zookeeper_log trace_log transactions_info_log metric_log do - clickhouse-local "$data_path_config" --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||: + clickhouse-local "$data_path_config" --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||: if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then - clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||: - clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||: + clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||: + clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||: fi done fi diff --git a/docker/test/stateless/stress_tests.lib b/docker/test/stateless/stress_tests.lib index 6f0dabb5207..6e1834d6cde 100644 --- a/docker/test/stateless/stress_tests.lib +++ b/docker/test/stateless/stress_tests.lib @@ -78,6 +78,8 @@ function configure() randomize_config_boolean_value use_compression zookeeper fi + randomize_config_boolean_value allow_experimental_block_number_column block_number + # for clickhouse-server (via service) echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment # for clickhouse-client diff --git a/docker/test/stress/Dockerfile b/docker/test/stress/Dockerfile index 5a9625d8109..0f81a1cd07f 100644 --- a/docker/test/stress/Dockerfile +++ b/docker/test/stress/Dockerfile @@ -19,7 +19,8 @@ RUN apt-get update -y \ openssl \ netcat-openbsd \ brotli \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* COPY run.sh / diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index a4feae27c67..f2bac2f5da4 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -21,7 +21,8 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ locales \ && pip3 install black==23.1.0 boto3 codespell==2.2.1 mypy==1.3.0 PyGithub unidiff pylint==2.6.2 \ && apt-get clean \ - && rm -rf /root/.cache/pip + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* \ + && rm -rf /root/.cache/pip RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8 ENV LC_ALL en_US.UTF-8 diff --git a/docker/test/upgrade/Dockerfile b/docker/test/upgrade/Dockerfile index 9152230af1c..78d912fd031 100644 --- a/docker/test/upgrade/Dockerfile +++ b/docker/test/upgrade/Dockerfile @@ -19,7 +19,8 @@ RUN apt-get update -y \ openssl \ netcat-openbsd \ brotli \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* COPY run.sh / diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh index 9c008209316..aaba5cc6a8c 100644 --- a/docker/test/upgrade/run.sh +++ 
b/docker/test/upgrade/run.sh @@ -122,6 +122,7 @@ rm /etc/clickhouse-server/config.d/merge_tree.xml rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml rm /etc/clickhouse-server/config.d/zero_copy_destructive_operations.xml rm /etc/clickhouse-server/config.d/storage_conf_02963.xml +rm /etc/clickhouse-server/config.d/block_number.xml rm /etc/clickhouse-server/users.d/nonconst_timezone.xml rm /etc/clickhouse-server/users.d/s3_cache_new.xml rm /etc/clickhouse-server/users.d/replicated_ddl_entry.xml diff --git a/docker/test/util/Dockerfile b/docker/test/util/Dockerfile index eb5abce280a..396d5801be9 100644 --- a/docker/test/util/Dockerfile +++ b/docker/test/util/Dockerfile @@ -5,7 +5,6 @@ FROM ubuntu:22.04 ARG apt_archive="http://archive.ubuntu.com" RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list -# 15.0.2 ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=17 RUN apt-get update \ @@ -27,9 +26,10 @@ RUN apt-get update \ && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ && echo "deb https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ /etc/apt/sources.list \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* -# Install cmake 3.20+ for rust support +# Install cmake 3.20+ for Rust support # Used https://askubuntu.com/a/1157132 as reference RUN curl -s https://apt.kitware.com/keys/kitware-archive-latest.asc | \ gpg --dearmor - > /etc/apt/trusted.gpg.d/kitware.gpg && \ @@ -60,9 +60,10 @@ RUN apt-get update \ software-properties-common \ tzdata \ --yes --no-install-recommends \ - && apt-get clean + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* -# This symlink required by gcc to find lld compiler +# This symlink is required by gcc to find the lld linker RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld # for external_symbolizer_path RUN ln -s /usr/bin/llvm-symbolizer-${LLVM_VERSION} /usr/bin/llvm-symbolizer @@ -107,5 +108,4 @@ RUN arch=${TARGETARCH:-amd64} \ && mv "/tmp/sccache-$SCCACHE_VERSION-$rarch-unknown-linux-musl/sccache" /usr/bin \ && rm "/tmp/sccache-$SCCACHE_VERSION-$rarch-unknown-linux-musl" -r - COPY process_functional_tests_result.py / diff --git a/docs/changelogs/v23.11.5.29-stable.md b/docs/changelogs/v23.11.5.29-stable.md new file mode 100644 index 00000000000..f73a21c2095 --- /dev/null +++ b/docs/changelogs/v23.11.5.29-stable.md @@ -0,0 +1,31 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v23.11.5.29-stable (d83b108deca) FIXME as compared to v23.11.4.24-stable (e79d840d7fe) + +#### Improvement +* Backported in [#58815](https://github.com/ClickHouse/ClickHouse/issues/58815): Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages, `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling jemalloc profile if the profiler is enabled. Add jemalloc-related 4LW command in Keeper: `jmst` for dumping jemalloc stats, `jmfp`, `jmep`, `jmdp` for controlling jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#59234](https://github.com/ClickHouse/ClickHouse/issues/59234): Allow to ignore schema evolution in Iceberg table engine and read all data using schema specified by the user on table creation or latest schema parsed from metadata on table creation. 
This is done under a setting `iceberg_engine_ignore_schema_evolution` that is disabled by default. Note that enabling this setting can lead to incorrect result as in case of evolved schema all data files will be read using the same schema. [#59133](https://github.com/ClickHouse/ClickHouse/pull/59133) ([Kruglov Pavel](https://github.com/Avogar)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix a stupid case of intersecting parts [#58482](https://github.com/ClickHouse/ClickHouse/pull/58482) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix stream partitioning in parallel window functions [#58739](https://github.com/ClickHouse/ClickHouse/pull/58739) ([Dmitry Novik](https://github.com/novikd)). +* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)). +* Fix JSONExtract function for LowCardinality(Nullable) columns [#58808](https://github.com/ClickHouse/ClickHouse/pull/58808) ([vdimir](https://github.com/vdimir)). +* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix not-ready set for system.tables [#59351](https://github.com/ClickHouse/ClickHouse/pull/59351) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* refine error message [#57991](https://github.com/ClickHouse/ClickHouse/pull/57991) ([Han Fei](https://github.com/hanfei1991)). +* Fix rare race in external sort/aggregation with temporary data in cache [#58013](https://github.com/ClickHouse/ClickHouse/pull/58013) ([Anton Popov](https://github.com/CurtizJ)). +* Follow-up to [#58482](https://github.com/ClickHouse/ClickHouse/issues/58482) [#58574](https://github.com/ClickHouse/ClickHouse/pull/58574) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Decrease log level for one log message [#59168](https://github.com/ClickHouse/ClickHouse/pull/59168) ([Kseniia Sumarokova](https://github.com/kssenii)). + diff --git a/docs/changelogs/v23.12.3.40-stable.md b/docs/changelogs/v23.12.3.40-stable.md new file mode 100644 index 00000000000..e2a9e3af407 --- /dev/null +++ b/docs/changelogs/v23.12.3.40-stable.md @@ -0,0 +1,36 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v23.12.3.40-stable (a594704ae75) FIXME as compared to v23.12.2.59-stable (17ab210e761) + +#### Improvement +* Backported in [#58660](https://github.com/ClickHouse/ClickHouse/issues/58660): When executing some queries, which require a lot of streams for reading data, the error `"Paste JOIN requires sorted tables only"` was previously thrown. Now the numbers of streams resize to 1 in that case. [#58608](https://github.com/ClickHouse/ClickHouse/pull/58608) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Backported in [#58817](https://github.com/ClickHouse/ClickHouse/issues/58817): Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages, `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling jemalloc profile if the profiler is enabled. 
Add jemalloc-related 4LW command in Keeper: `jmst` for dumping jemalloc stats, `jmfp`, `jmep`, `jmdp` for controlling jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#59235](https://github.com/ClickHouse/ClickHouse/issues/59235): Allow to ignore schema evolution in Iceberg table engine and read all data using schema specified by the user on table creation or latest schema parsed from metadata on table creation. This is done under a setting `iceberg_engine_ignore_schema_evolution` that is disabled by default. Note that enabling this setting can lead to incorrect result as in case of evolved schema all data files will be read using the same schema. [#59133](https://github.com/ClickHouse/ClickHouse/pull/59133) ([Kruglov Pavel](https://github.com/Avogar)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Delay reading from StorageKafka to allow multiple reads in materialized views [#58477](https://github.com/ClickHouse/ClickHouse/pull/58477) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix a stupid case of intersecting parts [#58482](https://github.com/ClickHouse/ClickHouse/pull/58482) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Disable max_joined_block_rows in ConcurrentHashJoin [#58595](https://github.com/ClickHouse/ClickHouse/pull/58595) ([vdimir](https://github.com/vdimir)). +* Fix stream partitioning in parallel window functions [#58739](https://github.com/ClickHouse/ClickHouse/pull/58739) ([Dmitry Novik](https://github.com/novikd)). +* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)). +* Fix JSONExtract function for LowCardinality(Nullable) columns [#58808](https://github.com/ClickHouse/ClickHouse/pull/58808) ([vdimir](https://github.com/vdimir)). +* Multiple read file log storage in mv [#58877](https://github.com/ClickHouse/ClickHouse/pull/58877) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix not-ready set for system.tables [#59351](https://github.com/ClickHouse/ClickHouse/pull/59351) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Follow-up to [#58482](https://github.com/ClickHouse/ClickHouse/issues/58482) [#58574](https://github.com/ClickHouse/ClickHouse/pull/58574) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Change log level for super imporant message in Keeper [#59010](https://github.com/ClickHouse/ClickHouse/pull/59010) ([alesapin](https://github.com/alesapin)). +* Decrease log level for one log message [#59168](https://github.com/ClickHouse/ClickHouse/pull/59168) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix fasttest by pinning pip dependencies [#59256](https://github.com/ClickHouse/ClickHouse/pull/59256) ([Azat Khuzhin](https://github.com/azat)). 
+* No debug symbols in Rust [#59306](https://github.com/ClickHouse/ClickHouse/pull/59306) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + diff --git a/docs/changelogs/v23.12.4.15-stable.md b/docs/changelogs/v23.12.4.15-stable.md new file mode 100644 index 00000000000..a67b5aee312 --- /dev/null +++ b/docs/changelogs/v23.12.4.15-stable.md @@ -0,0 +1,21 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v23.12.4.15-stable (4233d111d20) FIXME as compared to v23.12.3.40-stable (a594704ae75) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix incorrect result of arrayElement / map[] on empty value [#59594](https://github.com/ClickHouse/ClickHouse/pull/59594) ([Raúl Marín](https://github.com/Algunenano)). +* Fix crash in topK when merging empty states [#59603](https://github.com/ClickHouse/ClickHouse/pull/59603) ([Raúl Marín](https://github.com/Algunenano)). +* Fix distributed table with a constant sharding key [#59606](https://github.com/ClickHouse/ClickHouse/pull/59606) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix leftPad / rightPad function with FixedString input [#59739](https://github.com/ClickHouse/ClickHouse/pull/59739) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix 02720_row_policy_column_with_dots [#59453](https://github.com/ClickHouse/ClickHouse/pull/59453) ([Duc Canh Le](https://github.com/canhld94)). +* Pin python dependencies in stateless tests [#59663](https://github.com/ClickHouse/ClickHouse/pull/59663) ([Raúl Marín](https://github.com/Algunenano)). + diff --git a/docs/changelogs/v24.1.1.2048-stable.md b/docs/changelogs/v24.1.1.2048-stable.md new file mode 100644 index 00000000000..8e4647da86e --- /dev/null +++ b/docs/changelogs/v24.1.1.2048-stable.md @@ -0,0 +1,438 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.1.2048-stable (5a024dfc093) FIXME as compared to v23.12.1.1368-stable (a2faa65b080) + +#### Backward Incompatible Change +* The setting `print_pretty_type_names` is turned on by default. You can turn it off to keep the old behavior or `SET compatibility = '23.12'`. [#57726](https://github.com/ClickHouse/ClickHouse/pull/57726) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The MergeTree setting `clean_deleted_rows` is deprecated, it has no effect anymore. The `CLEANUP` keyword for `OPTIMIZE` is not allowed by default (unless `allow_experimental_replacing_merge_with_cleanup` is enabled). [#58316](https://github.com/ClickHouse/ClickHouse/pull/58316) ([Alexander Tokmakov](https://github.com/tavplubix)). +* The function `reverseDNSQuery` is no longer available. This closes [#58368](https://github.com/ClickHouse/ClickHouse/issues/58368). [#58369](https://github.com/ClickHouse/ClickHouse/pull/58369) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Enable various changes to improve the access control in the configuration file. These changes affect the behavior, and you check the `config.xml` in the `access_control_improvements` section. In case you are not confident, keep the values in the configuration file as they were in the previous version. [#58584](https://github.com/ClickHouse/ClickHouse/pull/58584) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Allow queries without aliases for subqueries for `PASTE JOIN`. [#58654](https://github.com/ClickHouse/ClickHouse/pull/58654) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). 
+* Fix sumMapFiltered with NaN values. NaN values are now placed at the end (instead of randomly) and considered different from any values. `-0` is now also treated as equal to `0`; since 0 values are discarded, `-0` values are discarded too. [#58959](https://github.com/ClickHouse/ClickHouse/pull/58959) ([Raúl Marín](https://github.com/Algunenano)). +* The function `visibleWidth` will behave according to the docs. In previous versions, it simply counted code points after string serialization, like the `lengthUTF8` function, but didn't consider zero-width and combining characters, full-width characters, tabs, and deletes. Now the behavior is changed accordingly. If you want to keep the old behavior, set `function_visible_width_behavior` to `0`, or set `compatibility` to `23.12` or lower. [#59022](https://github.com/ClickHouse/ClickHouse/pull/59022) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Kusto dialect is disabled until these two bugs will be fixed: [#59037](https://github.com/ClickHouse/ClickHouse/issues/59037) and [#59036](https://github.com/ClickHouse/ClickHouse/issues/59036). [#59305](https://github.com/ClickHouse/ClickHouse/pull/59305) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### New Feature +* Allow partitions from tables with different partition expressions to be attached when the destination table partition expression doesn't re-partition/ split the part. [#39507](https://github.com/ClickHouse/ClickHouse/pull/39507) ([Arthur Passos](https://github.com/arthurpassos)). +* Added statement `SYSTEM RELOAD ASYNCHRONOUS METRICS` which updates the asynchronous metrics. Mostly useful for testing and development. [#53710](https://github.com/ClickHouse/ClickHouse/pull/53710) ([Robert Schulze](https://github.com/rschu1ze)). +* Certain settings (currently `min_compress_block_size` and `max_compress_block_size`) can now be specified at column-level where they take precedence over the corresponding table-level setting. Example: `CREATE TABLE tab (col String SETTINGS (min_compress_block_size = 81920, max_compress_block_size = 163840)) ENGINE = MergeTree ORDER BY tuple();`. [#55201](https://github.com/ClickHouse/ClickHouse/pull/55201) ([Duc Canh Le](https://github.com/canhld94)). +* Add `quantileDDSketch` aggregate function as well as the corresponding `quantilesDDSketch` and `medianDDSketch`. It is based on the DDSketch https://www.vldb.org/pvldb/vol12/p2195-masson.pdf. ### Documentation entry for user-facing changes. [#56342](https://github.com/ClickHouse/ClickHouse/pull/56342) ([Srikanth Chekuri](https://github.com/srikanthccv)). +* Added function `seriesDecomposeSTL()` which decomposes a time series into a season, a trend and a residual component. [#57078](https://github.com/ClickHouse/ClickHouse/pull/57078) ([Bhavna Jindal](https://github.com/bhavnajindal)). +* Introduced MySQL Binlog Client for MaterializedMySQL: One binlog connection for many databases. [#57323](https://github.com/ClickHouse/ClickHouse/pull/57323) ([Val Doroshchuk](https://github.com/valbok)). +* Intel QuickAssist Technology (QAT) provides hardware-accelerated compression and cryptograpy. ClickHouse got a new compression codec `ZSTD_QAT` which utilizes QAT for zstd compression. The codec uses [Intel's QATlib](https://github.com/intel/qatlib) and [Inte's QAT ZSTD Plugin](https://github.com/intel/QAT-ZSTD-Plugin). Right now, only compression can be accelerated in hardware (a software fallback kicks in in case QAT could not be initialized), decompression always runs in software. 
[#57509](https://github.com/ClickHouse/ClickHouse/pull/57509) ([jasperzhu](https://github.com/jinjunzh)). +* Implementing the new way how object storage keys are generated for s3 disks. Now the format could be defined in terms of `re2` regex syntax with `key_template` option in disc description. [#57663](https://github.com/ClickHouse/ClickHouse/pull/57663) ([Sema Checherinda](https://github.com/CheSema)). +* Table system.dropped_tables_parts contains parts of system.dropped_tables tables (dropped but not yet removed tables). [#58038](https://github.com/ClickHouse/ClickHouse/pull/58038) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Implement Variant data type that represents a union of other data types. Type `Variant(T1, T2, ..., TN)` means that each row of this type has a value of either type `T1` or `T2` or ... or `TN` or none of them (`NULL` value). Variant type is available under a setting `allow_experimental_variant_type`. Reference: [#54864](https://github.com/ClickHouse/ClickHouse/issues/54864). [#58047](https://github.com/ClickHouse/ClickHouse/pull/58047) ([Kruglov Pavel](https://github.com/Avogar)). +* Add settings `max_materialized_views_size_for_table` to limit the number of materialized views attached to a table. [#58068](https://github.com/ClickHouse/ClickHouse/pull/58068) ([zhongyuankai](https://github.com/zhongyuankai)). +* `clickhouse-format` improvements: * support INSERT queries with `VALUES` * support comments (use `--comments` to output them) * support `--max_line_length` option to format only long queries in multiline. [#58246](https://github.com/ClickHouse/ClickHouse/pull/58246) ([vdimir](https://github.com/vdimir)). +* Added `null_status_on_timeout_only_active` and `throw_only_active` modes for `distributed_ddl_output_mode` that allow to avoid waiting for inactive replicas. [#58350](https://github.com/ClickHouse/ClickHouse/pull/58350) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Add table `system.database_engines`. [#58390](https://github.com/ClickHouse/ClickHouse/pull/58390) ([Bharat Nallan](https://github.com/bharatnc)). +* Added FROM modifier for SYSTEM SYNC REPLICA LIGHTWEIGHT query. The FROM modifier ensures we wait for for fetches and drop-ranges only for the specified source replicas, as well as any replica not in zookeeper or with an empty source_replica. [#58393](https://github.com/ClickHouse/ClickHouse/pull/58393) ([Jayme Bird](https://github.com/jaymebrd)). +* Add function `arrayShingles()` to compute subarrays, e.g. `arrayShingles([1, 2, 3, 4, 5], 3)` returns `[[1,2,3],[2,3,4],[3,4,5]]`. [#58396](https://github.com/ClickHouse/ClickHouse/pull/58396) ([Zheng Miao](https://github.com/zenmiao7)). +* Added functions `punycodeEncode()`, `punycodeDecode()`, `idnaEncode()` and `idnaDecode()` which are useful for translating international domain names to an ASCII representation according to the IDNA standard. [#58454](https://github.com/ClickHouse/ClickHouse/pull/58454) ([Robert Schulze](https://github.com/rschu1ze)). +* Added string similarity functions `dramerauLevenshteinDistance()`, `jaroSimilarity()` and `jaroWinklerSimilarity()`. [#58531](https://github.com/ClickHouse/ClickHouse/pull/58531) ([Robert Schulze](https://github.com/rschu1ze)). +* Add two settings `output_format_compression_level` to change output compression level and `output_format_compression_zstd_window_log` to explicitly set compression window size and enable long-range mode for zstd compression if output compression method is `zstd`. 
Applied for `INTO OUTFILE` and when writing to table functions `file`, `url`, `hdfs`, `s3`, and `azureBlobStorage`. [#58539](https://github.com/ClickHouse/ClickHouse/pull/58539) ([Duc Canh Le](https://github.com/canhld94)). +* Automatically disable ANSI escape sequences in Pretty formats if the output is not a terminal. Add new `auto` mode to setting `output_format_pretty_color`. [#58614](https://github.com/ClickHouse/ClickHouse/pull/58614) ([Shaun Struwig](https://github.com/Blargian)). +* Added setting `update_insert_deduplication_token_in_dependent_materialized_views`. This setting allows to update insert deduplication token with table identifier during insert in dependent materialized views. Closes [#59165](https://github.com/ClickHouse/ClickHouse/issues/59165). [#59238](https://github.com/ClickHouse/ClickHouse/pull/59238) ([Maksim Kita](https://github.com/kitaisreal)). + +#### Performance Improvement +* More cache-friendly final implementation. Note on the behaviour change: previously queries with `FINAL` modifier that read with a single stream (e.g. `max_threads=1`) produced sorted output without explicitly provided `ORDER BY` clause. This behaviour no longer exists when `enable_vertical_final = true` (and it is so by default). [#54366](https://github.com/ClickHouse/ClickHouse/pull/54366) ([Duc Canh Le](https://github.com/canhld94)). +* Optimize array element function when input is array(map)/array(array(num)/array(array(string))/array(bigint)/array(decimal). Current implementation causes too many reallocs. The optimization speed up by ~6x especially when input type is array(map). [#56403](https://github.com/ClickHouse/ClickHouse/pull/56403) ([李扬](https://github.com/taiyang-li)). +* Bypass `Poco::BasicBufferedStreamBuf` abstraction when reading from S3 (namely `ReadBufferFromIStream`) to avoid extra copying of data. [#56961](https://github.com/ClickHouse/ClickHouse/pull/56961) ([Nikita Taranov](https://github.com/nickitat)). +* Read column once while reading more that one subcolumn from it in Compact parts. [#57631](https://github.com/ClickHouse/ClickHouse/pull/57631) ([Kruglov Pavel](https://github.com/Avogar)). +* Rewrite the AST of sum(column + literal) function. [#57853](https://github.com/ClickHouse/ClickHouse/pull/57853) ([Jiebin Sun](https://github.com/jiebinn)). +* The evaluation of function `match()` now utilizes skipping indices `ngrambf_v1` and `tokenbf_v1`. [#57882](https://github.com/ClickHouse/ClickHouse/pull/57882) ([凌涛](https://github.com/lingtaolf)). +* Default coordinator for parallel replicas is rewritten for better cache locality (same mark ranges are almost always assigned to the same replicas). Consistent hashing is used also during work stealing, so better tail latency is expected. It has been tested for linear scalability on a hundred of replicas. [#57968](https://github.com/ClickHouse/ClickHouse/pull/57968) ([Nikita Taranov](https://github.com/nickitat)). +* MergeTree FINAL to not compare rows from same non-L0 part. [#58142](https://github.com/ClickHouse/ClickHouse/pull/58142) ([Duc Canh Le](https://github.com/canhld94)). +* Speed up iota calls (filling array with consecutive numbers). [#58271](https://github.com/ClickHouse/ClickHouse/pull/58271) ([Raúl Marín](https://github.com/Algunenano)). +* The evaluation of function `match()` now utilizes inverted indices. [#58284](https://github.com/ClickHouse/ClickHouse/pull/58284) ([凌涛](https://github.com/lingtaolf)). +* Speedup MIN/MAX for non numeric types. 
[#58334](https://github.com/ClickHouse/ClickHouse/pull/58334) ([Raúl Marín](https://github.com/Algunenano)). +* Enable JIT compilation for aggregation without a key. Closes [#41461](https://github.com/ClickHouse/ClickHouse/issues/41461). Originally [#53757](https://github.com/ClickHouse/ClickHouse/issues/53757). [#58440](https://github.com/ClickHouse/ClickHouse/pull/58440) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The performance experiments of **OnTime** on the Intel server with up to AVX2 (and BMI2) support show that this change could effectively improve the QPS of **Q2** and **Q3** by **5.0%** and **3.7%** through reducing the cycle ratio of the hotspot, **_DB::MergeTreeRangeReader::ReadResult::optimize_**, **from 11.48% to 1.09%** and **from 8.09% to 0.67%** respectively while having no impact on others. [#58800](https://github.com/ClickHouse/ClickHouse/pull/58800) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Use one thread less in `clickhouse-local`. [#58968](https://github.com/ClickHouse/ClickHouse/pull/58968) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Large aggregation states of `uniqExact` will be merged in parallel in distrubuted queries. [#59009](https://github.com/ClickHouse/ClickHouse/pull/59009) ([Nikita Taranov](https://github.com/nickitat)). +* Lower memory usage after reading from `MergeTree` tables. [#59290](https://github.com/ClickHouse/ClickHouse/pull/59290) ([Anton Popov](https://github.com/CurtizJ)). +* Lower memory usage in vertical merges. [#59340](https://github.com/ClickHouse/ClickHouse/pull/59340) ([Anton Popov](https://github.com/CurtizJ)). + +#### Improvement +* Enable MySQL/MariaDB on macOS. This closes [#21191](https://github.com/ClickHouse/ClickHouse/issues/21191). [#46316](https://github.com/ClickHouse/ClickHouse/pull/46316) ([Robert Schulze](https://github.com/rschu1ze)). +* Do not interpret numbers with leading zeroes as octals. [#55575](https://github.com/ClickHouse/ClickHouse/pull/55575) ([Joanna Hulboj](https://github.com/jh0x)). +* Replace HTTP outgoing buffering based on std ostream with CH Buffer. Add bytes counting metrics for interfaces. [#56064](https://github.com/ClickHouse/ClickHouse/pull/56064) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Disable `max_rows_in_set_to_optimize_join` by default. [#56396](https://github.com/ClickHouse/ClickHouse/pull/56396) ([vdimir](https://github.com/vdimir)). +* Add `` config parameter that allows avoiding resolving hostnames in DDLWorker. This mitigates the possibility of the queue being stuck in case of a change in cluster definition. Closes [#57573](https://github.com/ClickHouse/ClickHouse/issues/57573). [#57603](https://github.com/ClickHouse/ClickHouse/pull/57603) ([Nikolay Degterinsky](https://github.com/evillique)). +* Increase `load_metadata_threads` to 16 for the filesystem cache. It will make the server start up faster. [#57732](https://github.com/ClickHouse/ClickHouse/pull/57732) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Improve the `multiIf` function performance when the type is Nullable. [#57745](https://github.com/ClickHouse/ClickHouse/pull/57745) ([KevinyhZou](https://github.com/KevinyhZou)). +* Add ability to throttle merges/mutations (`max_mutations_bandwidth_for_server`/`max_merges_bandwidth_for_server`). [#57877](https://github.com/ClickHouse/ClickHouse/pull/57877) ([Azat Khuzhin](https://github.com/azat)). 
+* Replaced undocumented (boolean) column `is_hot_reloadable` in system table `system.server_settings` by (Enum8) column `changeable_without_restart` with possible values `No`, `Yes`, `IncreaseOnly` and `DecreaseOnly`. Also documented the column. [#58029](https://github.com/ClickHouse/ClickHouse/pull/58029) ([skyoct](https://github.com/skyoct)). +* ClusterDiscovery supports setting username and password, close [#58063](https://github.com/ClickHouse/ClickHouse/issues/58063). [#58123](https://github.com/ClickHouse/ClickHouse/pull/58123) ([vdimir](https://github.com/vdimir)). +* Support query parameters in ALTER TABLE ... PART. [#58297](https://github.com/ClickHouse/ClickHouse/pull/58297) ([Azat Khuzhin](https://github.com/azat)). +* Create consumers for Kafka tables on fly (but keep them for some period - `kafka_consumers_pool_ttl_ms`, since last used), this should fix problem with statistics for `system.kafka_consumers` (that does not consumed when nobody reads from Kafka table, which leads to live memory leak and slow table detach) and also this PR enables stats for `system.kafka_consumers` by default again. [#58310](https://github.com/ClickHouse/ClickHouse/pull/58310) ([Azat Khuzhin](https://github.com/azat)). +* Sparkbar as an alias to sparkbar. [#58335](https://github.com/ClickHouse/ClickHouse/pull/58335) ([凌涛](https://github.com/lingtaolf)). +* Avoid sending ComposeObject requests after upload to GCS. [#58343](https://github.com/ClickHouse/ClickHouse/pull/58343) ([Azat Khuzhin](https://github.com/azat)). +* Correctly handle keys with dot in the name in configurations XMLs. [#58354](https://github.com/ClickHouse/ClickHouse/pull/58354) ([Azat Khuzhin](https://github.com/azat)). +* Added comments (brief descriptions) to all columns of system tables. The are several reasons fro this: - We use system tables a lot and sometimes is could be very difficult for developer to understand the purpose and the meaning of a particular column. - We change (add new ones or modify existing) system tables a lot and the documentation for them is always outdated. For example take a look at the documentation page for [`system.parts`](https://clickhouse.com/docs/en/operations/system-tables/parts). It misses a lot of columns - We would like to eventually generate documentation directly from ClickHouse. [#58356](https://github.com/ClickHouse/ClickHouse/pull/58356) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Allow to configure any kind of object storage with any kind of metadata type. [#58357](https://github.com/ClickHouse/ClickHouse/pull/58357) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Make function `format` return constant on constant arguments. This closes [#58355](https://github.com/ClickHouse/ClickHouse/issues/58355). [#58358](https://github.com/ClickHouse/ClickHouse/pull/58358) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Attach all system tables in `clickhouse-local`, including `system.parts`. This closes [#58312](https://github.com/ClickHouse/ClickHouse/issues/58312). [#58359](https://github.com/ClickHouse/ClickHouse/pull/58359) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support for `Enum` data types in function `transform`. This closes [#58241](https://github.com/ClickHouse/ClickHouse/issues/58241). [#58360](https://github.com/ClickHouse/ClickHouse/pull/58360) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Allow registering database engines independently. 
[#58365](https://github.com/ClickHouse/ClickHouse/pull/58365) ([Bharat Nallan](https://github.com/bharatnc)). +* Adding a setting `max_estimated_execution_time` to separate `max_execution_time` and `max_estimated_execution_time`. [#58402](https://github.com/ClickHouse/ClickHouse/pull/58402) ([Zhang Yifan](https://github.com/zhangyifan27)). +* Allow registering interpreters independently. [#58443](https://github.com/ClickHouse/ClickHouse/pull/58443) ([Bharat Nallan](https://github.com/bharatnc)). +* Provide hint when an invalid database engine name is used. [#58444](https://github.com/ClickHouse/ClickHouse/pull/58444) ([Bharat Nallan](https://github.com/bharatnc)). +* Avoid huge memory consumption during Keeper startup for more cases. [#58455](https://github.com/ClickHouse/ClickHouse/pull/58455) ([Antonio Andelic](https://github.com/antonio2368)). +* Add settings for better control of indexes type in Arrow dictionary. Use signed integer type for indexes by default as Arrow recommends. Closes [#57401](https://github.com/ClickHouse/ClickHouse/issues/57401). [#58519](https://github.com/ClickHouse/ClickHouse/pull/58519) ([Kruglov Pavel](https://github.com/Avogar)). +* Added function `sqidDecode()` which decodes [Sqids](https://sqids.org/). [#58544](https://github.com/ClickHouse/ClickHouse/pull/58544) ([Robert Schulze](https://github.com/rschu1ze)). +* Allow to read Bool values into String in JSON input formats. It's done under a setting `input_format_json_read_bools_as_strings` that is enabled by default. [#58561](https://github.com/ClickHouse/ClickHouse/pull/58561) ([Kruglov Pavel](https://github.com/Avogar)). +* Implement [#58575](https://github.com/ClickHouse/ClickHouse/issues/58575) Support `CLICKHOUSE_PASSWORD_FILE ` environment variable when running the docker image. [#58583](https://github.com/ClickHouse/ClickHouse/pull/58583) ([Eyal Halpern Shalev](https://github.com/Eyal-Shalev)). +* When executing some queries, which require a lot of streams for reading data, the error `"Paste JOIN requires sorted tables only"` was previously thrown. Now the numbers of streams resize to 1 in that case. [#58608](https://github.com/ClickHouse/ClickHouse/pull/58608) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages, `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling jemalloc profile if the profiler is enabled. Add jemalloc-related 4LW command in Keeper: `jmst` for dumping jemalloc stats, `jmfp`, `jmep`, `jmdp` for controlling jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)). +* Better message for INVALID_IDENTIFIER error. [#58703](https://github.com/ClickHouse/ClickHouse/pull/58703) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Improved handling of signed numeric literals in normalizeQuery. [#58710](https://github.com/ClickHouse/ClickHouse/pull/58710) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Support Point data type for MySQL. [#58721](https://github.com/ClickHouse/ClickHouse/pull/58721) ([Kseniia Sumarokova](https://github.com/kssenii)). +* When comparing a Float32 column and a const string, read the string as Float32 (instead of Float64). [#58724](https://github.com/ClickHouse/ClickHouse/pull/58724) ([Raúl Marín](https://github.com/Algunenano)). +* Improve S3 compatible, add Ecloud EOS storage support. 
[#58786](https://github.com/ClickHouse/ClickHouse/pull/58786) ([xleoken](https://github.com/xleoken)). +* Allow `KILL QUERY` to cancel backups / restores. This PR also makes running backups and restores visible in `system.processes`. Also there is a new setting in the server configuration now - `shutdown_wait_backups_and_restores` (default=true) which makes the server either wait on shutdown for all running backups and restores to finish or just cancel them. [#58804](https://github.com/ClickHouse/ClickHouse/pull/58804) ([Vitaly Baranov](https://github.com/vitlibar)). +* Avro format support Zstd codec. Closes [#58735](https://github.com/ClickHouse/ClickHouse/issues/58735). [#58805](https://github.com/ClickHouse/ClickHouse/pull/58805) ([flynn](https://github.com/ucasfl)). +* MySQL interface gained support for `net_write_timeout` and `net_read_timeout` settings. `net_write_timeout` is translated into the native `send_timeout` ClickHouse setting and, similarly, `net_read_timeout` into `receive_timeout`. Fixed an issue where it was possible to set MySQL `sql_select_limit` setting only if the entire statement was in upper case. [#58835](https://github.com/ClickHouse/ClickHouse/pull/58835) ([Serge Klochkov](https://github.com/slvrtrn)). +* Fixing a problem described in [#58719](https://github.com/ClickHouse/ClickHouse/issues/58719). [#58841](https://github.com/ClickHouse/ClickHouse/pull/58841) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Make sure that for custom (created from SQL) disks ether `filesystem_caches_path` (a common directory prefix for all filesystem caches) or `custom_cached_disks_base_directory` (a common directory prefix for only filesystem caches created from custom disks) is specified in server config. `custom_cached_disks_base_directory` has higher priority for custom disks over `filesystem_caches_path`, which is used if the former one is absent. Filesystem cache setting `path` must lie inside that directory, otherwise exception will be thrown preventing disk to be created. This will not affect disks created on an older version and server was upgraded - then the exception will not be thrown to allow the server to successfully start). `custom_cached_disks_base_directory` is added to default server config as `/var/lib/clickhouse/caches/`. Closes [#57825](https://github.com/ClickHouse/ClickHouse/issues/57825). [#58869](https://github.com/ClickHouse/ClickHouse/pull/58869) ([Kseniia Sumarokova](https://github.com/kssenii)). +* MySQL interface gained compatibility with `SHOW WARNINGS`/`SHOW COUNT(*) WARNINGS` queries, though the returned result is always an empty set. [#58929](https://github.com/ClickHouse/ClickHouse/pull/58929) ([Serge Klochkov](https://github.com/slvrtrn)). +* Skip unavailable replicas when executing parallel distributed `INSERT SELECT`. [#58931](https://github.com/ClickHouse/ClickHouse/pull/58931) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Display word-descriptive log level while enabling structured log formatting in json. [#58936](https://github.com/ClickHouse/ClickHouse/pull/58936) ([Tim Liou](https://github.com/wheatdog)). +* MySQL interface gained support for `CAST(x AS SIGNED)` and `CAST(x AS UNSIGNED)` statements via data type aliases: `SIGNED` for Int64, and `UNSIGNED` for UInt64. This improves compatibility with BI tools such as Looker Studio. [#58954](https://github.com/ClickHouse/ClickHouse/pull/58954) ([Serge Klochkov](https://github.com/slvrtrn)). 
+* Function `seriesDecomposeSTL()` now returns a baseline component as season + trend components. [#58961](https://github.com/ClickHouse/ClickHouse/pull/58961) ([Bhavna Jindal](https://github.com/bhavnajindal)). +* Fix memory management in copyDataToS3File. [#58962](https://github.com/ClickHouse/ClickHouse/pull/58962) ([Vitaly Baranov](https://github.com/vitlibar)). +* Change the working directory to the data path in the docker container. [#58975](https://github.com/ClickHouse/ClickHouse/pull/58975) ([cangyin](https://github.com/cangyin)). +* Added the setting `azure_max_unexpected_write_error_retries` for Azure Blob Storage; it can also be set in the config under the azure section. [#59001](https://github.com/ClickHouse/ClickHouse/pull/59001) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Keeper improvement: reduce Keeper's memory usage for stored nodes. [#59002](https://github.com/ClickHouse/ClickHouse/pull/59002) ([Antonio Andelic](https://github.com/antonio2368)). +* Allow the server to start with a broken data lake table. Closes [#58625](https://github.com/ClickHouse/ClickHouse/issues/58625). [#59080](https://github.com/ClickHouse/ClickHouse/pull/59080) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fixes https://github.com/ClickHouse/ClickHouse/pull/59120#issuecomment-1906177350. [#59122](https://github.com/ClickHouse/ClickHouse/pull/59122) ([Arthur Passos](https://github.com/arthurpassos)). +* The state of the URL's #hash in the dashboard is now compressed using [lz-string](https://github.com/pieroxy/lz-string). The default size of the state string is compressed from 6856B to 2823B. [#59124](https://github.com/ClickHouse/ClickHouse/pull/59124) ([Amos Bird](https://github.com/amosbird)). +* Allow ignoring schema evolution in the Iceberg table engine and reading all data using the schema specified by the user on table creation or the latest schema parsed from metadata on table creation. This is done under a setting `iceberg_engine_ignore_schema_evolution` that is disabled by default. Note that enabling this setting can lead to incorrect results because, with an evolved schema, all data files will be read using the same schema. [#59133](https://github.com/ClickHouse/ClickHouse/pull/59133) ([Kruglov Pavel](https://github.com/Avogar)). +* Prohibit mutable operations (`INSERT`/`ALTER`/`OPTIMIZE`/...) on read-only/write-once storages with a proper `TABLE_IS_READ_ONLY` error (to avoid leftovers). Avoid leaving leftovers on write-once disks (`format_version.txt`) on `CREATE`/`ATTACH`. Ignore `DROP` for `ReplicatedMergeTree` (as for `MergeTree`). Fix iterating over `s3_plain` (`MetadataStorageFromPlainObjectStorage::iterateDirectory`). Note: the read-only disk is `web`, and the write-once disk is `s3_plain`. [#59170](https://github.com/ClickHouse/ClickHouse/pull/59170) ([Azat Khuzhin](https://github.com/azat)). +* MySQL interface gained support for `net_write_timeout` and `net_read_timeout` settings. `net_write_timeout` is translated into the native `send_timeout` ClickHouse setting and, similarly, `net_read_timeout` into `receive_timeout`. Fixed an issue where it was possible to set the MySQL `sql_select_limit` setting only if the entire statement was in upper case. [#59293](https://github.com/ClickHouse/ClickHouse/pull/59293) ([Serge Klochkov](https://github.com/slvrtrn)). +* Fix a bug in the experimental `_block_number` column which could lead to a logical error during a complex combination of `ALTER`s and merges. Fixes [#56202](https://github.com/ClickHouse/ClickHouse/issues/56202).
Replaces [#58601](https://github.com/ClickHouse/ClickHouse/issues/58601). CC @SmitaRKulkarni. [#59295](https://github.com/ClickHouse/ClickHouse/pull/59295) ([alesapin](https://github.com/alesapin)). +* The Play UI understands when an exception is returned inside JSON. Adjustment for [#52853](https://github.com/ClickHouse/ClickHouse/issues/52853). [#59303](https://github.com/ClickHouse/ClickHouse/pull/59303) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* The `/binary` HTTP handler allows specifying user, host, and, optionally, password in the query string. [#59311](https://github.com/ClickHouse/ClickHouse/pull/59311) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Support backups for compressed in-memory tables. This closes [#57893](https://github.com/ClickHouse/ClickHouse/issues/57893). [#59315](https://github.com/ClickHouse/ClickHouse/pull/59315) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Improve the exception message of function regexp_extract, closes [#56393](https://github.com/ClickHouse/ClickHouse/issues/56393). [#59319](https://github.com/ClickHouse/ClickHouse/pull/59319) ([李扬](https://github.com/taiyang-li)). +* Support the FORMAT clause in BACKUP and RESTORE queries. [#59338](https://github.com/ClickHouse/ClickHouse/pull/59338) ([Vitaly Baranov](https://github.com/vitlibar)). +* Function `concatWithSeparator()` now supports arbitrary argument types (instead of only `String` and `FixedString` arguments). For example, `SELECT concatWithSeparator('.', 'number', 1)` now returns `number.1`. [#59341](https://github.com/ClickHouse/ClickHouse/pull/59341) ([Robert Schulze](https://github.com/rschu1ze)). + +#### Build/Testing/Packaging Improvement +* Improve aliases for the clickhouse binary (now `ch`/`clickhouse` is `clickhouse-local` or `clickhouse` depending on the arguments) and add bash completion for the new aliases. [#58344](https://github.com/ClickHouse/ClickHouse/pull/58344) ([Azat Khuzhin](https://github.com/azat)). +* Add a settings-changes check to CI to verify that all settings changes are reflected in the settings changes history. [#58555](https://github.com/ClickHouse/ClickHouse/pull/58555) ([Kruglov Pavel](https://github.com/Avogar)). +* Use tables directly attached from S3 in stateful tests. [#58791](https://github.com/ClickHouse/ClickHouse/pull/58791) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Save the whole `fuzzer.log` as an archive instead of the last 100k lines. `tail -n 100000` often removes lines with table definitions. [#58821](https://github.com/ClickHouse/ClickHouse/pull/58821) ([Dmitry Novik](https://github.com/novikd)). +* Enable Rust on OSX ARM64 (this adds fuzzy search in the client with skim and the PRQL language; few people host ClickHouse on Darwin, so it is mostly useful for fuzzy search in the client). [#59272](https://github.com/ClickHouse/ClickHouse/pull/59272) ([Azat Khuzhin](https://github.com/azat)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Add join keys conversion for nested lowcardinality [#51550](https://github.com/ClickHouse/ClickHouse/pull/51550) ([vdimir](https://github.com/vdimir)). +* Flatten only true Nested type if flatten_nested=1, not all Array(Tuple) [#56132](https://github.com/ClickHouse/ClickHouse/pull/56132) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix a bug with projections and the aggregate_functions_null_for_empty setting during insertion.
[#56944](https://github.com/ClickHouse/ClickHouse/pull/56944) ([Amos Bird](https://github.com/amosbird)). +* Fixed potential exception due to stale profile UUID [#57263](https://github.com/ClickHouse/ClickHouse/pull/57263) ([Vasily Nemkov](https://github.com/Enmk)). +* Fix working with read buffers in StreamingFormatExecutor [#57438](https://github.com/ClickHouse/ClickHouse/pull/57438) ([Kruglov Pavel](https://github.com/Avogar)). +* Ignore MVs with dropped target table during pushing to views [#57520](https://github.com/ClickHouse/ClickHouse/pull/57520) ([Kruglov Pavel](https://github.com/Avogar)). +* [RFC] Eliminate possible race between ALTER_METADATA and MERGE_PARTS [#57755](https://github.com/ClickHouse/ClickHouse/pull/57755) ([Azat Khuzhin](https://github.com/azat)). +* Fix the exprs order bug in group by with rollup [#57786](https://github.com/ClickHouse/ClickHouse/pull/57786) ([Chen768959](https://github.com/Chen768959)). +* Fix lost blobs after dropping a replica with broken detached parts [#58333](https://github.com/ClickHouse/ClickHouse/pull/58333) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Allow users to work with symlinks in user_files_path (again) [#58447](https://github.com/ClickHouse/ClickHouse/pull/58447) ([Duc Canh Le](https://github.com/canhld94)). +* Fix segfault when graphite table does not have agg function [#58453](https://github.com/ClickHouse/ClickHouse/pull/58453) ([Duc Canh Le](https://github.com/canhld94)). +* Delay reading from StorageKafka to allow multiple reads in materialized views [#58477](https://github.com/ClickHouse/ClickHouse/pull/58477) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix a stupid case of intersecting parts [#58482](https://github.com/ClickHouse/ClickHouse/pull/58482) ([Alexander Tokmakov](https://github.com/tavplubix)). +* MergeTreePrefetchedReadPool disable for LIMIT only queries [#58505](https://github.com/ClickHouse/ClickHouse/pull/58505) ([Maksim Kita](https://github.com/kitaisreal)). +* Enable ordinary databases during restoration [#58520](https://github.com/ClickHouse/ClickHouse/pull/58520) ([Jihyuk Bok](https://github.com/tomahawk28)). +* Fix hive threadpool read ORC/Parquet/... Failed [#58537](https://github.com/ClickHouse/ClickHouse/pull/58537) ([sunny](https://github.com/sunny19930321)). +* Hide credentials in system.backup_log base_backup_name column [#58550](https://github.com/ClickHouse/ClickHouse/pull/58550) ([Daniel Pozo Escalona](https://github.com/danipozo)). +* toStartOfInterval rounding for milli- and microsecond values [#58557](https://github.com/ClickHouse/ClickHouse/pull/58557) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Disable max_joined_block_rows in ConcurrentHashJoin [#58595](https://github.com/ClickHouse/ClickHouse/pull/58595) ([vdimir](https://github.com/vdimir)). +* Fix join using nullable in old analyzer [#58596](https://github.com/ClickHouse/ClickHouse/pull/58596) ([vdimir](https://github.com/vdimir)). +* `makeDateTime64()`: Allow non-const fraction argument [#58597](https://github.com/ClickHouse/ClickHouse/pull/58597) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix possible NULL dereference during symbolizing inline frames [#58607](https://github.com/ClickHouse/ClickHouse/pull/58607) ([Azat Khuzhin](https://github.com/azat)). +* Improve isolation of query cache entries under re-created users or role switches [#58611](https://github.com/ClickHouse/ClickHouse/pull/58611) ([Robert Schulze](https://github.com/rschu1ze)).
+* Fix broken partition key analysis when doing projection optimization [#58638](https://github.com/ClickHouse/ClickHouse/pull/58638) ([Amos Bird](https://github.com/amosbird)). +* Query cache: Fix per-user quota [#58731](https://github.com/ClickHouse/ClickHouse/pull/58731) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix stream partitioning in parallel window functions [#58739](https://github.com/ClickHouse/ClickHouse/pull/58739) ([Dmitry Novik](https://github.com/novikd)). +* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)). +* Don't process requests in Keeper during shutdown [#58765](https://github.com/ClickHouse/ClickHouse/pull/58765) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix Segfault in `SlabsPolygonIndex::find` [#58771](https://github.com/ClickHouse/ClickHouse/pull/58771) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Fix JSONExtract function for LowCardinality(Nullable) columns [#58808](https://github.com/ClickHouse/ClickHouse/pull/58808) ([vdimir](https://github.com/vdimir)). +* Table CREATE DROP Poco::Logger memory leak fix [#58831](https://github.com/ClickHouse/ClickHouse/pull/58831) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix HTTP compressors finalization [#58846](https://github.com/ClickHouse/ClickHouse/pull/58846) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Multiple read file log storage in mv [#58877](https://github.com/ClickHouse/ClickHouse/pull/58877) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Restriction for the access key id for s3. [#58900](https://github.com/ClickHouse/ClickHouse/pull/58900) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* Fix possible crash in clickhouse-local during loading suggestions [#58907](https://github.com/ClickHouse/ClickHouse/pull/58907) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix crash when indexHint() is used [#58911](https://github.com/ClickHouse/ClickHouse/pull/58911) ([Dmitry Novik](https://github.com/novikd)). +* Fix StorageURL forgetting headers on server restart [#58933](https://github.com/ClickHouse/ClickHouse/pull/58933) ([Michael Kolupaev](https://github.com/al13n321)). +* Analyzer: fix storage replacement with insertion block [#58958](https://github.com/ClickHouse/ClickHouse/pull/58958) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix seek in ReadBufferFromZipArchive [#58966](https://github.com/ClickHouse/ClickHouse/pull/58966) ([Michael Kolupaev](https://github.com/al13n321)). +* `DROP INDEX` of inverted index now removes all relevant files from persistence [#59040](https://github.com/ClickHouse/ClickHouse/pull/59040) ([mochi](https://github.com/MochiXu)). +* Fix data race on query_factories_info [#59049](https://github.com/ClickHouse/ClickHouse/pull/59049) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Disable "Too many redirects" error retry [#59099](https://github.com/ClickHouse/ClickHouse/pull/59099) ([skyoct](https://github.com/skyoct)). +* Fix aggregation issue in mixed x86_64 and ARM clusters [#59132](https://github.com/ClickHouse/ClickHouse/pull/59132) ([Harry Lee](https://github.com/HarryLeeIBM)). +* Fix not started database shutdown deadlock [#59137](https://github.com/ClickHouse/ClickHouse/pull/59137) ([Sergei Trifonov](https://github.com/serxa)). 
+* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix crash with nullable timezone for `toString` [#59190](https://github.com/ClickHouse/ClickHouse/pull/59190) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* Fix abort in iceberg metadata on bad file paths [#59275](https://github.com/ClickHouse/ClickHouse/pull/59275) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix architecture name in select of Rust target [#59307](https://github.com/ClickHouse/ClickHouse/pull/59307) ([p1rattttt](https://github.com/p1rattttt)). +* Fix not-ready set for system.tables [#59351](https://github.com/ClickHouse/ClickHouse/pull/59351) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix lazy initialization in RabbitMQ [#59352](https://github.com/ClickHouse/ClickHouse/pull/59352) ([Kruglov Pavel](https://github.com/Avogar)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Revert "Refreshable materialized views (takeover)"'. [#58296](https://github.com/ClickHouse/ClickHouse/pull/58296) ([Alexander Tokmakov](https://github.com/tavplubix)). +* NO CL ENTRY: 'Revert "Fix an error in the release script - it didn't allow to make 23.12."'. [#58381](https://github.com/ClickHouse/ClickHouse/pull/58381) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* NO CL ENTRY: 'Revert "Use CH Buffer for HTTP out stream, add metrics for interfaces"'. [#58450](https://github.com/ClickHouse/ClickHouse/pull/58450) ([Raúl Marín](https://github.com/Algunenano)). +* NO CL ENTRY: 'Second attempt: Use CH Buffer for HTTP out stream, add metrics for interfaces'. [#58475](https://github.com/ClickHouse/ClickHouse/pull/58475) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* NO CL ENTRY: 'Revert "Merging [#53757](https://github.com/ClickHouse/ClickHouse/issues/53757)"'. [#58542](https://github.com/ClickHouse/ClickHouse/pull/58542) ([Raúl Marín](https://github.com/Algunenano)). +* NO CL ENTRY: 'Revert "Add support for MySQL `net_write_timeout` and `net_read_timeout` settings"'. [#58872](https://github.com/ClickHouse/ClickHouse/pull/58872) ([Alexander Tokmakov](https://github.com/tavplubix)). +* NO CL ENTRY: 'Revert "Extend performance test norm_dist.xml"'. [#58989](https://github.com/ClickHouse/ClickHouse/pull/58989) ([Raúl Marín](https://github.com/Algunenano)). +* NO CL ENTRY: 'Revert "Add a test for [#47892](https://github.com/ClickHouse/ClickHouse/issues/47892)"'. [#58990](https://github.com/ClickHouse/ClickHouse/pull/58990) ([Raúl Marín](https://github.com/Algunenano)). +* NO CL ENTRY: 'Revert "Allow parallel replicas for JOIN with analyzer [part 1]."'. [#59059](https://github.com/ClickHouse/ClickHouse/pull/59059) ([Alexander Tokmakov](https://github.com/tavplubix)). +* NO CL ENTRY: 'Revert "Consume leading zeroes when parsing a number in ConstantExpressionTemplate"'. [#59070](https://github.com/ClickHouse/ClickHouse/pull/59070) ([Alexander Tokmakov](https://github.com/tavplubix)). +* NO CL ENTRY: 'Revert "Revert "Allow parallel replicas for JOIN with analyzer [part 1].""'. [#59076](https://github.com/ClickHouse/ClickHouse/pull/59076) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* NO CL ENTRY: 'Revert "Allow to attach partition from table with different partition expression when destination partition expression doesn't re-partition"'. [#59120](https://github.com/ClickHouse/ClickHouse/pull/59120) ([Robert Schulze](https://github.com/rschu1ze)). 
+* NO CL ENTRY: 'DisksApp.cpp: fix typo (specifiged → specified)'. [#59140](https://github.com/ClickHouse/ClickHouse/pull/59140) ([Nikolay Edigaryev](https://github.com/edigaryev)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Analyzer: Fix resolving subcolumns in JOIN [#49703](https://github.com/ClickHouse/ClickHouse/pull/49703) ([vdimir](https://github.com/vdimir)). +* Analyzer: always qualify execution names [#53705](https://github.com/ClickHouse/ClickHouse/pull/53705) ([Dmitry Novik](https://github.com/novikd)). +* Insert quorum: check host node version in addition [#55528](https://github.com/ClickHouse/ClickHouse/pull/55528) ([Igor Nikonov](https://github.com/devcrafter)). +* Remove more old code of projection analysis [#55579](https://github.com/ClickHouse/ClickHouse/pull/55579) ([Anton Popov](https://github.com/CurtizJ)). +* Better exception messages in input formats [#57053](https://github.com/ClickHouse/ClickHouse/pull/57053) ([Kruglov Pavel](https://github.com/Avogar)). +* Parallel replicas custom key: skip unavailable replicas [#57235](https://github.com/ClickHouse/ClickHouse/pull/57235) ([Igor Nikonov](https://github.com/devcrafter)). +* Small change in log message in MergeTreeDataMergerMutator [#57550](https://github.com/ClickHouse/ClickHouse/pull/57550) ([Nikita Taranov](https://github.com/nickitat)). +* fs cache: small optimization [#57615](https://github.com/ClickHouse/ClickHouse/pull/57615) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Customizable dependency failure handling for AsyncLoader [#57697](https://github.com/ClickHouse/ClickHouse/pull/57697) ([Sergei Trifonov](https://github.com/serxa)). +* Bring test back [#57700](https://github.com/ClickHouse/ClickHouse/pull/57700) ([Nikita Taranov](https://github.com/nickitat)). +* Change default database name in clickhouse-local to 'default' [#57774](https://github.com/ClickHouse/ClickHouse/pull/57774) ([Kruglov Pavel](https://github.com/Avogar)). +* Add option `--show-whitespaces-in-diff` to clickhouse-test [#57870](https://github.com/ClickHouse/ClickHouse/pull/57870) ([vdimir](https://github.com/vdimir)). +* Update `query_masking_rules` when reloading the config, attempt 2 [#57993](https://github.com/ClickHouse/ClickHouse/pull/57993) ([Mikhail Koviazin](https://github.com/mkmkme)). +* Remove unneeded parameter `use_external_buffer` from `AsynchronousReadBuffer*` [#58077](https://github.com/ClickHouse/ClickHouse/pull/58077) ([Nikita Taranov](https://github.com/nickitat)). +* Print another message in Bugfix check if internal check had been failed [#58091](https://github.com/ClickHouse/ClickHouse/pull/58091) ([vdimir](https://github.com/vdimir)). +* Refactor StorageMerge virtual columns filtering. [#58255](https://github.com/ClickHouse/ClickHouse/pull/58255) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Analyzer: fix tuple comparison when result is always null [#58266](https://github.com/ClickHouse/ClickHouse/pull/58266) ([vdimir](https://github.com/vdimir)). +* Fix an error in the release script - it didn't allow to make 23.12. [#58288](https://github.com/ClickHouse/ClickHouse/pull/58288) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update version_date.tsv and changelogs after v23.12.1.1368-stable [#58290](https://github.com/ClickHouse/ClickHouse/pull/58290) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Fix test_storage_s3_queue/test.py::test_drop_table [#58293](https://github.com/ClickHouse/ClickHouse/pull/58293) ([Kseniia Sumarokova](https://github.com/kssenii)). 
+* Fix timeout in 01732_race_condition_storage_join_long [#58298](https://github.com/ClickHouse/ClickHouse/pull/58298) ([vdimir](https://github.com/vdimir)). +* Handle another case for preprocessing in Keeper [#58308](https://github.com/ClickHouse/ClickHouse/pull/58308) ([Antonio Andelic](https://github.com/antonio2368)). +* Disable max_bytes_before_external* in 00172_hits_joins [#58309](https://github.com/ClickHouse/ClickHouse/pull/58309) ([vdimir](https://github.com/vdimir)). +* Analyzer: support functional arguments in USING clause [#58317](https://github.com/ClickHouse/ClickHouse/pull/58317) ([Dmitry Novik](https://github.com/novikd)). +* Fixed logical error in CheckSortedTransform [#58318](https://github.com/ClickHouse/ClickHouse/pull/58318) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Refreshable materialized views again [#58320](https://github.com/ClickHouse/ClickHouse/pull/58320) ([Michael Kolupaev](https://github.com/al13n321)). +* Organize symbols from src/* into DB namespace [#58336](https://github.com/ClickHouse/ClickHouse/pull/58336) ([Amos Bird](https://github.com/amosbird)). +* Add a style check against DOS and Windows [#58345](https://github.com/ClickHouse/ClickHouse/pull/58345) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Check what happen if remove array joined columns from KeyCondition [#58346](https://github.com/ClickHouse/ClickHouse/pull/58346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Upload time of the perf tests into artifacts as test_duration_ms [#58348](https://github.com/ClickHouse/ClickHouse/pull/58348) ([Azat Khuzhin](https://github.com/azat)). +* Keep exception format string in retries ctl [#58351](https://github.com/ClickHouse/ClickHouse/pull/58351) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix replication.lib helper (system.mutations has database not current_database) [#58352](https://github.com/ClickHouse/ClickHouse/pull/58352) ([Azat Khuzhin](https://github.com/azat)). +* Refactor StorageHDFS and StorageFile virtual columns filtering [#58353](https://github.com/ClickHouse/ClickHouse/pull/58353) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix suspended workers for AsyncLoader [#58362](https://github.com/ClickHouse/ClickHouse/pull/58362) ([Sergei Trifonov](https://github.com/serxa)). +* Remove stale events from README [#58364](https://github.com/ClickHouse/ClickHouse/pull/58364) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Do not fail the CI on an expired token [#58384](https://github.com/ClickHouse/ClickHouse/pull/58384) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add a test for [#38534](https://github.com/ClickHouse/ClickHouse/issues/38534) [#58391](https://github.com/ClickHouse/ClickHouse/pull/58391) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* fix database engine validation inside database factory [#58395](https://github.com/ClickHouse/ClickHouse/pull/58395) ([Bharat Nallan](https://github.com/bharatnc)). +* Fix bad formatting of the `timeDiff` compatibility alias [#58398](https://github.com/ClickHouse/ClickHouse/pull/58398) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix a comment; remove unused method; stop using pointers [#58399](https://github.com/ClickHouse/ClickHouse/pull/58399) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix test_user_valid_until [#58409](https://github.com/ClickHouse/ClickHouse/pull/58409) ([Nikolay Degterinsky](https://github.com/evillique)). 
+* Make a test not depend on the lack of floating point associativity [#58439](https://github.com/ClickHouse/ClickHouse/pull/58439) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix `02944_dynamically_change_filesystem_cache_size` [#58445](https://github.com/ClickHouse/ClickHouse/pull/58445) ([Nikolay Degterinsky](https://github.com/evillique)). +* Analyzer: Fix LOGICAL_ERROR with LowCardinality [#58457](https://github.com/ClickHouse/ClickHouse/pull/58457) ([Dmitry Novik](https://github.com/novikd)). +* Replace `std::regex` by re2 [#58458](https://github.com/ClickHouse/ClickHouse/pull/58458) ([Robert Schulze](https://github.com/rschu1ze)). +* Improve perf tests [#58478](https://github.com/ClickHouse/ClickHouse/pull/58478) ([Raúl Marín](https://github.com/Algunenano)). +* Check if I can remove KeyCondition analysis on AST. [#58480](https://github.com/ClickHouse/ClickHouse/pull/58480) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix some thread pool settings not updating at runtime [#58485](https://github.com/ClickHouse/ClickHouse/pull/58485) ([Michael Kolupaev](https://github.com/al13n321)). +* Lower log levels for some Raft logs to new test level [#58487](https://github.com/ClickHouse/ClickHouse/pull/58487) ([Antonio Andelic](https://github.com/antonio2368)). +* PartsSplitter small refactoring [#58506](https://github.com/ClickHouse/ClickHouse/pull/58506) ([Maksim Kita](https://github.com/kitaisreal)). +* Sync content of the docker test images [#58507](https://github.com/ClickHouse/ClickHouse/pull/58507) ([Max K.](https://github.com/maxknv)). +* CI: move ci-specifics from job scripts to ci.py [#58516](https://github.com/ClickHouse/ClickHouse/pull/58516) ([Max K.](https://github.com/maxknv)). +* Minor fixups for `sqid()` [#58517](https://github.com/ClickHouse/ClickHouse/pull/58517) ([Robert Schulze](https://github.com/rschu1ze)). +* Update version_date.tsv and changelogs after v23.12.2.59-stable [#58545](https://github.com/ClickHouse/ClickHouse/pull/58545) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.11.4.24-stable [#58546](https://github.com/ClickHouse/ClickHouse/pull/58546) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.8.9.54-lts [#58547](https://github.com/ClickHouse/ClickHouse/pull/58547) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.10.6.60-stable [#58548](https://github.com/ClickHouse/ClickHouse/pull/58548) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.3.19.32-lts [#58549](https://github.com/ClickHouse/ClickHouse/pull/58549) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update CHANGELOG.md [#58559](https://github.com/ClickHouse/ClickHouse/pull/58559) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Fix test 02932_kill_query_sleep [#58560](https://github.com/ClickHouse/ClickHouse/pull/58560) ([Vitaly Baranov](https://github.com/vitlibar)). +* CI fix. Add packager script to build digest [#58571](https://github.com/ClickHouse/ClickHouse/pull/58571) ([Max K.](https://github.com/maxknv)). +* fix and test that S3Clients are reused [#58573](https://github.com/ClickHouse/ClickHouse/pull/58573) ([Sema Checherinda](https://github.com/CheSema)). 
+* Follow-up to [#58482](https://github.com/ClickHouse/ClickHouse/issues/58482) [#58574](https://github.com/ClickHouse/ClickHouse/pull/58574) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Do not load database engines in suggest [#58586](https://github.com/ClickHouse/ClickHouse/pull/58586) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix wrong message in Keeper [#58588](https://github.com/ClickHouse/ClickHouse/pull/58588) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add some missing LLVM includes [#58594](https://github.com/ClickHouse/ClickHouse/pull/58594) ([Raúl Marín](https://github.com/Algunenano)). +* Small fix in Keeper [#58598](https://github.com/ClickHouse/ClickHouse/pull/58598) ([Antonio Andelic](https://github.com/antonio2368)). +* Update analyzer_tech_debt.txt [#58599](https://github.com/ClickHouse/ClickHouse/pull/58599) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Simplify release.py script [#58600](https://github.com/ClickHouse/ClickHouse/pull/58600) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update analyzer_tech_debt.txt [#58602](https://github.com/ClickHouse/ClickHouse/pull/58602) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Refactor stacktrace symbolizer to avoid copy-paste [#58610](https://github.com/ClickHouse/ClickHouse/pull/58610) ([Azat Khuzhin](https://github.com/azat)). +* Add intel AMX checking [#58617](https://github.com/ClickHouse/ClickHouse/pull/58617) ([Roman Glinskikh](https://github.com/omgronny)). +* Optional `client` argument for `S3Helper` [#58619](https://github.com/ClickHouse/ClickHouse/pull/58619) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add sorting to 02366_kql_summarize.sql [#58621](https://github.com/ClickHouse/ClickHouse/pull/58621) ([Raúl Marín](https://github.com/Algunenano)). +* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Remove more projections code [#58628](https://github.com/ClickHouse/ClickHouse/pull/58628) ([Anton Popov](https://github.com/CurtizJ)). +* Remove finalize() from ~WriteBufferFromEncryptedFile [#58629](https://github.com/ClickHouse/ClickHouse/pull/58629) ([Vitaly Baranov](https://github.com/vitlibar)). +* Update test_replicated_database/test.py [#58647](https://github.com/ClickHouse/ClickHouse/pull/58647) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Try disabling `muzzy_decay_ms` in jemalloc [#58648](https://github.com/ClickHouse/ClickHouse/pull/58648) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix test_replicated_database::test_startup_without_zk flakiness [#58649](https://github.com/ClickHouse/ClickHouse/pull/58649) ([Azat Khuzhin](https://github.com/azat)). +* Fix 01600_remerge_sort_lowered_memory_bytes_ratio flakiness (due to settings randomization) [#58650](https://github.com/ClickHouse/ClickHouse/pull/58650) ([Azat Khuzhin](https://github.com/azat)). +* Analyzer: Fix assertion in HashJoin with duplicate columns [#58652](https://github.com/ClickHouse/ClickHouse/pull/58652) ([vdimir](https://github.com/vdimir)). +* Document that `match()` can use `ngrambf_v1` and `tokenbf_v1` indexes [#58655](https://github.com/ClickHouse/ClickHouse/pull/58655) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix perf tests duration (checks.test_duration_ms) [#58656](https://github.com/ClickHouse/ClickHouse/pull/58656) ([Azat Khuzhin](https://github.com/azat)). 
+* Analyzer: Correctly handle constant set in index [#58657](https://github.com/ClickHouse/ClickHouse/pull/58657) ([Dmitry Novik](https://github.com/novikd)). +* fix a typo in stress randomization setting [#58658](https://github.com/ClickHouse/ClickHouse/pull/58658) ([Sema Checherinda](https://github.com/CheSema)). +* Small follow-up to `std::regex` --> `re2` conversion ([#58458](https://github.com/ClickHouse/ClickHouse/issues/58458)) [#58678](https://github.com/ClickHouse/ClickHouse/pull/58678) ([Robert Schulze](https://github.com/rschu1ze)). +* Remove `` from libcxx [#58681](https://github.com/ClickHouse/ClickHouse/pull/58681) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix bad log message [#58698](https://github.com/ClickHouse/ClickHouse/pull/58698) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Some small improvements to version_helper from [#57203](https://github.com/ClickHouse/ClickHouse/issues/57203) [#58712](https://github.com/ClickHouse/ClickHouse/pull/58712) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Small fixes in different helpers [#58717](https://github.com/ClickHouse/ClickHouse/pull/58717) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix bug in new (not released yet) parallel replicas coordinator [#58722](https://github.com/ClickHouse/ClickHouse/pull/58722) ([Nikita Taranov](https://github.com/nickitat)). +* Analyzer: Fix LOGICAL_ERROR in CountDistinctPass [#58723](https://github.com/ClickHouse/ClickHouse/pull/58723) ([Dmitry Novik](https://github.com/novikd)). +* Fix reading of offsets subcolumn (`size0`) from `Nested` [#58729](https://github.com/ClickHouse/ClickHouse/pull/58729) ([Anton Popov](https://github.com/CurtizJ)). +* Fix Mac OS X [#58733](https://github.com/ClickHouse/ClickHouse/pull/58733) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* fix stress with generate-template-key [#58740](https://github.com/ClickHouse/ClickHouse/pull/58740) ([Sema Checherinda](https://github.com/CheSema)). +* more relaxed check [#58751](https://github.com/ClickHouse/ClickHouse/pull/58751) ([Sema Checherinda](https://github.com/CheSema)). +* Fix usage of small buffers for remote reading [#58768](https://github.com/ClickHouse/ClickHouse/pull/58768) ([Nikita Taranov](https://github.com/nickitat)). +* Add missing includes when _LIBCPP_REMOVE_TRANSITIVE_INCLUDES enabled [#58770](https://github.com/ClickHouse/ClickHouse/pull/58770) ([Artem Alperin](https://github.com/hdnpth)). +* Remove some code [#58772](https://github.com/ClickHouse/ClickHouse/pull/58772) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove some code [#58790](https://github.com/ClickHouse/ClickHouse/pull/58790) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix trash in performance tests [#58794](https://github.com/ClickHouse/ClickHouse/pull/58794) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix data race in Keeper [#58806](https://github.com/ClickHouse/ClickHouse/pull/58806) ([Antonio Andelic](https://github.com/antonio2368)). +* Increase log level to trace to help debug `00993_system_parts_race_condition_drop_zookeeper` [#58809](https://github.com/ClickHouse/ClickHouse/pull/58809) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* DatabaseCatalog background tasks add log names [#58832](https://github.com/ClickHouse/ClickHouse/pull/58832) ([Maksim Kita](https://github.com/kitaisreal)). 
+* Analyzer: Resolve GROUPING function on shards [#58833](https://github.com/ClickHouse/ClickHouse/pull/58833) ([Dmitry Novik](https://github.com/novikd)). +* Allow parallel replicas for JOIN with analyzer [part 1]. [#58838](https://github.com/ClickHouse/ClickHouse/pull/58838) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix `isRetry` method [#58839](https://github.com/ClickHouse/ClickHouse/pull/58839) ([alesapin](https://github.com/alesapin)). +* fs cache: fix data race in slru [#58842](https://github.com/ClickHouse/ClickHouse/pull/58842) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix reading from an invisible part in new (not released yet) parallel replicas coordinator [#58844](https://github.com/ClickHouse/ClickHouse/pull/58844) ([Nikita Taranov](https://github.com/nickitat)). +* Fix bad log message [#58849](https://github.com/ClickHouse/ClickHouse/pull/58849) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Set max_bytes_before_external_group_by in 01961_roaring_memory_tracking [#58863](https://github.com/ClickHouse/ClickHouse/pull/58863) ([vdimir](https://github.com/vdimir)). +* Fix `00089_group_by_arrays_of_fixed` with external aggregation [#58873](https://github.com/ClickHouse/ClickHouse/pull/58873) ([Antonio Andelic](https://github.com/antonio2368)). +* DiskWeb minor improvement in loading [#58874](https://github.com/ClickHouse/ClickHouse/pull/58874) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix RPN construction for indexHint [#58875](https://github.com/ClickHouse/ClickHouse/pull/58875) ([Dmitry Novik](https://github.com/novikd)). +* Analyzer: add test with GROUP BY on shards [#58876](https://github.com/ClickHouse/ClickHouse/pull/58876) ([Dmitry Novik](https://github.com/novikd)). +* Jepsen job to reuse builds [#58881](https://github.com/ClickHouse/ClickHouse/pull/58881) ([Max K.](https://github.com/maxknv)). +* Fix ambiguity in the setting description [#58883](https://github.com/ClickHouse/ClickHouse/pull/58883) ([Denny Crane](https://github.com/den-crane)). +* Less error prone interface of read buffers [#58886](https://github.com/ClickHouse/ClickHouse/pull/58886) ([Anton Popov](https://github.com/CurtizJ)). +* Add metric for keeper memory soft limit [#58890](https://github.com/ClickHouse/ClickHouse/pull/58890) ([Pradeep Chhetri](https://github.com/chhetripradeep)). +* Add a test for [#47988](https://github.com/ClickHouse/ClickHouse/issues/47988) [#58893](https://github.com/ClickHouse/ClickHouse/pull/58893) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Whitespaces [#58894](https://github.com/ClickHouse/ClickHouse/pull/58894) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix data race in `AggregatingTransform` [#58896](https://github.com/ClickHouse/ClickHouse/pull/58896) ([Antonio Andelic](https://github.com/antonio2368)). +* Update SLRUFileCachePriority.cpp [#58898](https://github.com/ClickHouse/ClickHouse/pull/58898) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add tests for [#57193](https://github.com/ClickHouse/ClickHouse/issues/57193) [#58899](https://github.com/ClickHouse/ClickHouse/pull/58899) ([Raúl Marín](https://github.com/Algunenano)). +* Add log for already download binary in Jepsen [#58901](https://github.com/ClickHouse/ClickHouse/pull/58901) ([Antonio Andelic](https://github.com/antonio2368)). +* fs cache: minor refactoring [#58902](https://github.com/ClickHouse/ClickHouse/pull/58902) ([Kseniia Sumarokova](https://github.com/kssenii)). 
+* Checking on flaky test_parallel_replicas_custom_key_failover [#58909](https://github.com/ClickHouse/ClickHouse/pull/58909) ([Igor Nikonov](https://github.com/devcrafter)). +* Style fix [#58913](https://github.com/ClickHouse/ClickHouse/pull/58913) ([Dmitry Novik](https://github.com/novikd)). +* Opentelemetry spans to analyze CPU and S3 bottlenecks on inserts [#58914](https://github.com/ClickHouse/ClickHouse/pull/58914) ([Alexander Gololobov](https://github.com/davenger)). +* Fix fault handler in case of thread (for fault handler) cannot be spawned [#58917](https://github.com/ClickHouse/ClickHouse/pull/58917) ([Azat Khuzhin](https://github.com/azat)). +* Analyzer: Support GROUP BY injective function elimination [#58919](https://github.com/ClickHouse/ClickHouse/pull/58919) ([Dmitry Novik](https://github.com/novikd)). +* Cancel MasterCI in PRs [#58920](https://github.com/ClickHouse/ClickHouse/pull/58920) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix and test for azure [#58697](https://github.com/ClickHouse/ClickHouse/issues/58697) [#58921](https://github.com/ClickHouse/ClickHouse/pull/58921) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Extend performance test norm_dist.xml [#58922](https://github.com/ClickHouse/ClickHouse/pull/58922) ([Robert Schulze](https://github.com/rschu1ze)). +* Add regression test for parallel replicas (follow up [#58722](https://github.com/ClickHouse/ClickHouse/issues/58722), [#58844](https://github.com/ClickHouse/ClickHouse/issues/58844)) [#58923](https://github.com/ClickHouse/ClickHouse/pull/58923) ([Nikita Taranov](https://github.com/nickitat)). +* Add a test for [#47892](https://github.com/ClickHouse/ClickHouse/issues/47892) [#58927](https://github.com/ClickHouse/ClickHouse/pull/58927) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix `FunctionToSubcolumnsPass` in debug build [#58930](https://github.com/ClickHouse/ClickHouse/pull/58930) ([Anton Popov](https://github.com/CurtizJ)). +* Call `getMaxFileDescriptorCount` once in Keeper [#58938](https://github.com/ClickHouse/ClickHouse/pull/58938) ([Antonio Andelic](https://github.com/antonio2368)). +* Add missing files to digests [#58942](https://github.com/ClickHouse/ClickHouse/pull/58942) ([Raúl Marín](https://github.com/Algunenano)). +* Analyzer: fix join column not found with compound identifiers [#58943](https://github.com/ClickHouse/ClickHouse/pull/58943) ([vdimir](https://github.com/vdimir)). +* CI: pr_info to provide event_type for job scripts [#58947](https://github.com/ClickHouse/ClickHouse/pull/58947) ([Max K.](https://github.com/maxknv)). +* Using the destination object for paths generation in S3copy. [#58949](https://github.com/ClickHouse/ClickHouse/pull/58949) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). +* Fix data race in slru (2) [#58950](https://github.com/ClickHouse/ClickHouse/pull/58950) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix flaky test_postgresql_replica_database_engine_2/test.py::test_dependent_loading [#58951](https://github.com/ClickHouse/ClickHouse/pull/58951) ([Kseniia Sumarokova](https://github.com/kssenii)). +* More safe way to dump system logs in tests [#58955](https://github.com/ClickHouse/ClickHouse/pull/58955) ([alesapin](https://github.com/alesapin)). +* Add a comment about sparse checkout [#58960](https://github.com/ClickHouse/ClickHouse/pull/58960) ([Alexander Tokmakov](https://github.com/tavplubix)). 
+* Follow up to [#58357](https://github.com/ClickHouse/ClickHouse/issues/58357) [#58963](https://github.com/ClickHouse/ClickHouse/pull/58963) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Better error message about tuples [#58971](https://github.com/ClickHouse/ClickHouse/pull/58971) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix timeout for prometheus exporter for HTTP/1.1 (due to keep-alive) [#58981](https://github.com/ClickHouse/ClickHouse/pull/58981) ([Azat Khuzhin](https://github.com/azat)). +* Fix 02891_array_shingles with analyzer [#58982](https://github.com/ClickHouse/ClickHouse/pull/58982) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix script name in SQL example in executable.md [#58984](https://github.com/ClickHouse/ClickHouse/pull/58984) ([Lino Uruñuela](https://github.com/Wachynaky)). +* Fix typo [#58986](https://github.com/ClickHouse/ClickHouse/pull/58986) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Revert flaky [#58992](https://github.com/ClickHouse/ClickHouse/pull/58992) ([Raúl Marín](https://github.com/Algunenano)). +* Revive: Parallel replicas custom key: skip unavailable replicas [#58993](https://github.com/ClickHouse/ClickHouse/pull/58993) ([Igor Nikonov](https://github.com/devcrafter)). +* Make performance test `test norm_dist.xml` more realistic [#58995](https://github.com/ClickHouse/ClickHouse/pull/58995) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix 02404_memory_bound_merging with analyzer (follow up [#56419](https://github.com/ClickHouse/ClickHouse/issues/56419)) [#58996](https://github.com/ClickHouse/ClickHouse/pull/58996) ([Nikita Taranov](https://github.com/nickitat)). +* Add test for [#58930](https://github.com/ClickHouse/ClickHouse/issues/58930) [#58999](https://github.com/ClickHouse/ClickHouse/pull/58999) ([Anton Popov](https://github.com/CurtizJ)). +* initialization ConnectionTimeouts [#59000](https://github.com/ClickHouse/ClickHouse/pull/59000) ([Sema Checherinda](https://github.com/CheSema)). +* DiskWeb fix loading [#59006](https://github.com/ClickHouse/ClickHouse/pull/59006) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Update log level for http buffer [#59008](https://github.com/ClickHouse/ClickHouse/pull/59008) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Change log level for super important message in Keeper [#59010](https://github.com/ClickHouse/ClickHouse/pull/59010) ([alesapin](https://github.com/alesapin)). +* Fix async loader stress test [#59011](https://github.com/ClickHouse/ClickHouse/pull/59011) ([Sergei Trifonov](https://github.com/serxa)). +* Remove `StaticResourceManager` [#59013](https://github.com/ClickHouse/ClickHouse/pull/59013) ([Sergei Trifonov](https://github.com/serxa)). +* preserve 'amz-sdk-invocation-id' and 'amz-sdk-request' headers with gcp [#59015](https://github.com/ClickHouse/ClickHouse/pull/59015) ([Sema Checherinda](https://github.com/CheSema)). +* Update rename.md [#59017](https://github.com/ClickHouse/ClickHouse/pull/59017) ([filimonov](https://github.com/filimonov)). +* Fix a typo ("очепятка") [#59024](https://github.com/ClickHouse/ClickHouse/pull/59024) ([edpyt](https://github.com/edpyt)). +* Split resource scheduler off `IO/` into `Common/Scheduler/` [#59025](https://github.com/ClickHouse/ClickHouse/pull/59025) ([Sergei Trifonov](https://github.com/serxa)). +* Add a parameter for testing purposes [#59027](https://github.com/ClickHouse/ClickHouse/pull/59027) ([Alexander Tokmakov](https://github.com/tavplubix)).
+* Fix test 02932_kill_query_sleep when running with query cache [#59041](https://github.com/ClickHouse/ClickHouse/pull/59041) ([Vitaly Baranov](https://github.com/vitlibar)). +* CI: Jepsen: fix sanity check in ci.py [#59043](https://github.com/ClickHouse/ClickHouse/pull/59043) ([Max K.](https://github.com/maxknv)). +* CI: add ci_config classes for job and build names [#59046](https://github.com/ClickHouse/ClickHouse/pull/59046) ([Max K.](https://github.com/maxknv)). +* remove flaky test [#59066](https://github.com/ClickHouse/ClickHouse/pull/59066) ([Sema Checherinda](https://github.com/CheSema)). +* Followup to 57853 [#59068](https://github.com/ClickHouse/ClickHouse/pull/59068) ([Dmitry Novik](https://github.com/novikd)). +* Follow-up to [#59027](https://github.com/ClickHouse/ClickHouse/issues/59027) [#59075](https://github.com/ClickHouse/ClickHouse/pull/59075) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix `test_parallel_replicas_invisible_parts` [#59077](https://github.com/ClickHouse/ClickHouse/pull/59077) ([Nikita Taranov](https://github.com/nickitat)). +* Increase max_bytes_before_external_group_by for 00165_jit_aggregate_functions [#59078](https://github.com/ClickHouse/ClickHouse/pull/59078) ([Raúl Marín](https://github.com/Algunenano)). +* Fix stateless/run.sh [#59079](https://github.com/ClickHouse/ClickHouse/pull/59079) ([Kseniia Sumarokova](https://github.com/kssenii)). +* CI: hot fix for reuse [#59081](https://github.com/ClickHouse/ClickHouse/pull/59081) ([Max K.](https://github.com/maxknv)). +* Fix server shutdown due to exception while loading metadata [#59083](https://github.com/ClickHouse/ClickHouse/pull/59083) ([Sergei Trifonov](https://github.com/serxa)). +* Coordinator returns ranges for reading in sorted order [#59089](https://github.com/ClickHouse/ClickHouse/pull/59089) ([Nikita Taranov](https://github.com/nickitat)). +* Raise timeout in 02294_decimal_second_errors [#59090](https://github.com/ClickHouse/ClickHouse/pull/59090) ([Raúl Marín](https://github.com/Algunenano)). +* Add `[[nodiscard]]` to a couple of methods [#59093](https://github.com/ClickHouse/ClickHouse/pull/59093) ([Nikita Taranov](https://github.com/nickitat)). +* Docs: Update integer and float aliases [#59100](https://github.com/ClickHouse/ClickHouse/pull/59100) ([Robert Schulze](https://github.com/rschu1ze)). +* Avoid election timeouts during startup in Keeper [#59102](https://github.com/ClickHouse/ClickHouse/pull/59102) ([Antonio Andelic](https://github.com/antonio2368)). +* Add missing setting max_estimated_execution_time in SettingsChangesHistory [#59104](https://github.com/ClickHouse/ClickHouse/pull/59104) ([Kruglov Pavel](https://github.com/Avogar)). +* Rename some inverted index test files [#59106](https://github.com/ClickHouse/ClickHouse/pull/59106) ([Robert Schulze](https://github.com/rschu1ze)). +* Further reduce runtime of `norm_distance.xml` [#59108](https://github.com/ClickHouse/ClickHouse/pull/59108) ([Robert Schulze](https://github.com/rschu1ze)). +* Minor follow-up to [#53710](https://github.com/ClickHouse/ClickHouse/issues/53710) [#59109](https://github.com/ClickHouse/ClickHouse/pull/59109) ([Robert Schulze](https://github.com/rschu1ze)). +* Update stateless/run.sh [#59116](https://github.com/ClickHouse/ClickHouse/pull/59116) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Followup 57875 [#59117](https://github.com/ClickHouse/ClickHouse/pull/59117) ([Dmitry Novik](https://github.com/novikd)). 
+* Fixing build [#59130](https://github.com/ClickHouse/ClickHouse/pull/59130) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Capability check for `s3_plain` [#59145](https://github.com/ClickHouse/ClickHouse/pull/59145) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix `02015_async_inserts_stress_long` [#59146](https://github.com/ClickHouse/ClickHouse/pull/59146) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix AggregateFunctionNothing result type issues introducing it with different names [#59147](https://github.com/ClickHouse/ClickHouse/pull/59147) ([vdimir](https://github.com/vdimir)). +* Fix url encoding issue [#59162](https://github.com/ClickHouse/ClickHouse/pull/59162) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Upgrade simdjson to v3.6.3 [#59166](https://github.com/ClickHouse/ClickHouse/pull/59166) ([Robert Schulze](https://github.com/rschu1ze)). +* Decrease log level for one log message [#59168](https://github.com/ClickHouse/ClickHouse/pull/59168) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix broken cache for non-existing temp_path [#59172](https://github.com/ClickHouse/ClickHouse/pull/59172) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Move some headers [#59175](https://github.com/ClickHouse/ClickHouse/pull/59175) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Analyzer: Fix CTE name clash resolution [#59177](https://github.com/ClickHouse/ClickHouse/pull/59177) ([Dmitry Novik](https://github.com/novikd)). +* Fix another place with special symbols in the URL [#59184](https://github.com/ClickHouse/ClickHouse/pull/59184) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Actions dag build filter actions refactoring [#59228](https://github.com/ClickHouse/ClickHouse/pull/59228) ([Maksim Kita](https://github.com/kitaisreal)). +* Minor cleanup of msan usage [#59229](https://github.com/ClickHouse/ClickHouse/pull/59229) ([Robert Schulze](https://github.com/rschu1ze)). +* Load server configs in clickhouse local [#59231](https://github.com/ClickHouse/ClickHouse/pull/59231) ([pufit](https://github.com/pufit)). +* Make libssh build dependent on `-DENABLE_LIBRARIES` [#59242](https://github.com/ClickHouse/ClickHouse/pull/59242) ([Robert Schulze](https://github.com/rschu1ze)). +* Disable copy constructor for MultiVersion [#59244](https://github.com/ClickHouse/ClickHouse/pull/59244) ([Vitaly Baranov](https://github.com/vitlibar)). +* CI: fix ci configuration for nightly job [#59252](https://github.com/ClickHouse/ClickHouse/pull/59252) ([Max K.](https://github.com/maxknv)). +* Fix 02475_bson_each_row_format flakiness (due to small parsing block) [#59253](https://github.com/ClickHouse/ClickHouse/pull/59253) ([Azat Khuzhin](https://github.com/azat)). +* Improve pytest --pdb experience by preserving dockerd on SIGINT (v2) [#59255](https://github.com/ClickHouse/ClickHouse/pull/59255) ([Azat Khuzhin](https://github.com/azat)). +* Fix fasttest by pinning pip dependencies [#59256](https://github.com/ClickHouse/ClickHouse/pull/59256) ([Azat Khuzhin](https://github.com/azat)). +* Added AtomicLogger [#59273](https://github.com/ClickHouse/ClickHouse/pull/59273) ([Maksim Kita](https://github.com/kitaisreal)). +* Update test_reload_after_fail_in_cache_dictionary for analyzer [#59274](https://github.com/ClickHouse/ClickHouse/pull/59274) ([vdimir](https://github.com/vdimir)). +* Update run.sh [#59280](https://github.com/ClickHouse/ClickHouse/pull/59280) ([Kseniia Sumarokova](https://github.com/kssenii)). 
+* Add missing setting optimize_injective_functions_in_group_by to SettingsChangesHistory [#59283](https://github.com/ClickHouse/ClickHouse/pull/59283) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix perf tests (after sumMap starts to filter out -0.) [#59287](https://github.com/ClickHouse/ClickHouse/pull/59287) ([Azat Khuzhin](https://github.com/azat)). +* Use fresh ZooKeeper client on DROP (to have higher chances on success) [#59288](https://github.com/ClickHouse/ClickHouse/pull/59288) ([Azat Khuzhin](https://github.com/azat)). +* Additional check [#59292](https://github.com/ClickHouse/ClickHouse/pull/59292) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* No debug symbols in Rust [#59306](https://github.com/ClickHouse/ClickHouse/pull/59306) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix deadlock in `AsyncLoader::stop()` [#59308](https://github.com/ClickHouse/ClickHouse/pull/59308) ([Sergei Trifonov](https://github.com/serxa)). +* Speed up `00165_jit_aggregate_functions` [#59312](https://github.com/ClickHouse/ClickHouse/pull/59312) ([Nikita Taranov](https://github.com/nickitat)). +* CI: WA for issue with perf test with artifact reuse [#59325](https://github.com/ClickHouse/ClickHouse/pull/59325) ([Max K.](https://github.com/maxknv)). +* Fix typo [#59329](https://github.com/ClickHouse/ClickHouse/pull/59329) ([Raúl Marín](https://github.com/Algunenano)). +* Simplify query_run_metric_arrays in perf tests [#59333](https://github.com/ClickHouse/ClickHouse/pull/59333) ([Raúl Marín](https://github.com/Algunenano)). +* IVolume constructor improve exception message [#59335](https://github.com/ClickHouse/ClickHouse/pull/59335) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix upgrade check for new setting [#59343](https://github.com/ClickHouse/ClickHouse/pull/59343) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix sccache when building without coverage [#59345](https://github.com/ClickHouse/ClickHouse/pull/59345) ([Raúl Marín](https://github.com/Algunenano)). +* Loggers initialization fix [#59347](https://github.com/ClickHouse/ClickHouse/pull/59347) ([Maksim Kita](https://github.com/kitaisreal)). +* Add setting update_insert_deduplication_token_in_dependent_materialized_views to settings changes history [#59349](https://github.com/ClickHouse/ClickHouse/pull/59349) ([Maksim Kita](https://github.com/kitaisreal)). +* Slightly better memory usage in `AsynchronousBoundedReadBuffer` [#59354](https://github.com/ClickHouse/ClickHouse/pull/59354) ([Anton Popov](https://github.com/CurtizJ)). +* Try to make variant tests a bit faster [#59355](https://github.com/ClickHouse/ClickHouse/pull/59355) ([Kruglov Pavel](https://github.com/Avogar)). +* Minor typos in Settings.h [#59371](https://github.com/ClickHouse/ClickHouse/pull/59371) ([Jordi Villar](https://github.com/jrdi)). +* Rename `quantileDDSketch` to `quantileDD` [#59372](https://github.com/ClickHouse/ClickHouse/pull/59372) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
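
A minimal usage sketch for a few of the user-facing additions listed above (`sqidDecode()`, `concatWithSeparator()` with non-String arguments, the `SIGNED`/`UNSIGNED` type aliases, and `SYSTEM JEMALLOC PURGE`). It assumes a server of this release or newer; the literals are illustrative, the round trip through `sqid()` is an expectation rather than an asserted output, and the `CAST` form is shown in native SQL on the assumption that the aliases also work outside the MySQL interface:

```sql
-- Decode a Sqid produced by sqid(); expected to round-trip back to the original numbers.
SELECT sqidDecode(sqid(1, 2, 3)) AS numbers;

-- concatWithSeparator() now accepts non-String arguments (example from the entry above).
SELECT concatWithSeparator('.', 'number', 1) AS joined;

-- SIGNED / UNSIGNED act as aliases for Int64 / UInt64 (added for MySQL compatibility).
SELECT CAST(42 AS SIGNED) AS i, CAST(42 AS UNSIGNED) AS u;

-- Ask jemalloc to release unused pages back to the operating system.
SYSTEM JEMALLOC PURGE;
```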
+ diff --git a/docs/changelogs/v24.1.2.5-stable.md b/docs/changelogs/v24.1.2.5-stable.md new file mode 100644 index 00000000000..bac25c9b9ed --- /dev/null +++ b/docs/changelogs/v24.1.2.5-stable.md @@ -0,0 +1,14 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.2.5-stable (b2605dd4a5a) FIXME as compared to v24.1.1.2048-stable (5a024dfc093) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)). +* Fix stacktraces for binaries without debug symbols [#59444](https://github.com/ClickHouse/ClickHouse/pull/59444) ([Azat Khuzhin](https://github.com/azat)). + diff --git a/docs/changelogs/v24.1.3.31-stable.md b/docs/changelogs/v24.1.3.31-stable.md new file mode 100644 index 00000000000..046ca451fbc --- /dev/null +++ b/docs/changelogs/v24.1.3.31-stable.md @@ -0,0 +1,34 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.3.31-stable (135b08cbd28) FIXME as compared to v24.1.2.5-stable (b2605dd4a5a) + +#### Improvement +* Backported in [#59569](https://github.com/ClickHouse/ClickHouse/issues/59569): Now the dashboard understands both compressed and uncompressed state of the URL's #hash (backward compatibility). Continuation of [#59124](https://github.com/ClickHouse/ClickHouse/issues/59124). [#59548](https://github.com/ClickHouse/ClickHouse/pull/59548) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#59776](https://github.com/ClickHouse/ClickHouse/issues/59776): Added settings `split_parts_ranges_into_intersecting_and_non_intersecting_final` and `split_intersecting_parts_ranges_into_layers_final`. These settings are needed to disable optimizations for queries with `FINAL` and are intended for debugging only (see the usage sketch below). [#59705](https://github.com/ClickHouse/ClickHouse/pull/59705) ([Maksim Kita](https://github.com/kitaisreal)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix `ASTAlterCommand::formatImpl` in case of column specific settings… [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Make MAX use the same rules as permutation for complex types [#59498](https://github.com/ClickHouse/ClickHouse/pull/59498) ([Raúl Marín](https://github.com/Algunenano)). +* Fix corner case when passing `update_insert_deduplication_token_in_dependent_materialized_views` [#59544](https://github.com/ClickHouse/ClickHouse/pull/59544) ([Jordi Villar](https://github.com/jrdi)). +* Fix incorrect result of arrayElement / map[] on empty value [#59594](https://github.com/ClickHouse/ClickHouse/pull/59594) ([Raúl Marín](https://github.com/Algunenano)). +* Fix crash in topK when merging empty states [#59603](https://github.com/ClickHouse/ClickHouse/pull/59603) ([Raúl Marín](https://github.com/Algunenano)). +* Maintain function alias in RewriteSumFunctionWithSumAndCountVisitor [#59658](https://github.com/ClickHouse/ClickHouse/pull/59658) ([Raúl Marín](https://github.com/Algunenano)). +* Fix leftPad / rightPad function with FixedString input [#59739](https://github.com/ClickHouse/ClickHouse/pull/59739) ([Raúl Marín](https://github.com/Algunenano)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Revert "Backport [#59650](https://github.com/ClickHouse/ClickHouse/issues/59650) to 24.1: MergeTree FINAL optimization diagnostics and settings"'.
[#59701](https://github.com/ClickHouse/ClickHouse/pull/59701) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix 02720_row_policy_column_with_dots [#59453](https://github.com/ClickHouse/ClickHouse/pull/59453) ([Duc Canh Le](https://github.com/canhld94)). +* Refactoring of dashboard state encoding [#59554](https://github.com/ClickHouse/ClickHouse/pull/59554) ([Sergei Trifonov](https://github.com/serxa)). +* MergeTree FINAL optimization diagnostics and settings [#59650](https://github.com/ClickHouse/ClickHouse/pull/59650) ([Maksim Kita](https://github.com/kitaisreal)). +* Pin python dependencies in stateless tests [#59663](https://github.com/ClickHouse/ClickHouse/pull/59663) ([Raúl Marín](https://github.com/Algunenano)). + diff --git a/docs/changelogs/v24.1.4.20-stable.md b/docs/changelogs/v24.1.4.20-stable.md new file mode 100644 index 00000000000..8612a485f12 --- /dev/null +++ b/docs/changelogs/v24.1.4.20-stable.md @@ -0,0 +1,28 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.4.20-stable (f59d842b3fa) FIXME as compared to v24.1.3.31-stable (135b08cbd28) + +#### Improvement +* Backported in [#59826](https://github.com/ClickHouse/ClickHouse/issues/59826): In case when `merge_max_block_size_bytes` is small enough and tables contain wide rows (strings or tuples) background merges may stuck in an endless loop. This behaviour is fixed. Follow-up for https://github.com/ClickHouse/ClickHouse/pull/59340. [#59812](https://github.com/ClickHouse/ClickHouse/pull/59812) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). + +#### Build/Testing/Packaging Improvement +* Backported in [#59885](https://github.com/ClickHouse/ClickHouse/issues/59885): If you want to run initdb scripts every time when ClickHouse container is starting you shoud initialize environment varible CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS. [#59808](https://github.com/ClickHouse/ClickHouse/pull/59808) ([Alexander Nikolaev](https://github.com/AlexNik)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix digest calculation in Keeper [#59439](https://github.com/ClickHouse/ClickHouse/pull/59439) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix distributed table with a constant sharding key [#59606](https://github.com/ClickHouse/ClickHouse/pull/59606) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix query start time on non initial queries [#59662](https://github.com/ClickHouse/ClickHouse/pull/59662) ([Raúl Marín](https://github.com/Algunenano)). +* Fix parsing of partition expressions surrounded by parens [#59901](https://github.com/ClickHouse/ClickHouse/pull/59901) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Temporarily remove a feature that doesn't work [#59688](https://github.com/ClickHouse/ClickHouse/pull/59688) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Make ZooKeeper actually sequentialy consistent [#59735](https://github.com/ClickHouse/ClickHouse/pull/59735) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix special build reports in release branches [#59797](https://github.com/ClickHouse/ClickHouse/pull/59797) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). 
+ diff --git a/docs/changelogs/v24.1.5.6-stable.md b/docs/changelogs/v24.1.5.6-stable.md new file mode 100644 index 00000000000..ce46c51e2f4 --- /dev/null +++ b/docs/changelogs/v24.1.5.6-stable.md @@ -0,0 +1,17 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.5.6-stable (7f67181ff31) FIXME as compared to v24.1.4.20-stable (f59d842b3fa) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* UniqExactSet read crash fix [#59928](https://github.com/ClickHouse/ClickHouse/pull/59928) ([Maksim Kita](https://github.com/kitaisreal)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* CI: do not reuse builds on release branches [#59798](https://github.com/ClickHouse/ClickHouse/pull/59798) ([Max K.](https://github.com/maxknv)). + diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md index cfdd2bbcc41..d3a29c9171b 100644 --- a/docs/en/development/architecture.md +++ b/docs/en/development/architecture.md @@ -166,11 +166,11 @@ For most external applications, we recommend using the HTTP interface because it ## Configuration {#configuration} -ClickHouse Server is based on POCO C++ Libraries and uses `Poco::Util::AbstractConfiguration` to represent it's configuration. Configuration is held by `Poco::Util::ServerApplication` class inherited by `DaemonBase` class, which in turn is inherited by `DB::Server` class, implementing clickhouse-server itself. So config can be accessed by `ServerApplication::config()` method. +ClickHouse Server is based on POCO C++ Libraries and uses `Poco::Util::AbstractConfiguration` to represent its configuration. Configuration is held by `Poco::Util::ServerApplication` class inherited by `DaemonBase` class, which in turn is inherited by `DB::Server` class, implementing clickhouse-server itself. So config can be accessed by `ServerApplication::config()` method. Config is read from multiple files (in XML or YAML format) and merged into single `AbstractConfiguration` by `ConfigProcessor` class. Configuration is loaded at server startup and can be reloaded later if one of config files is updated, removed or added. `ConfigReloader` class is responsible for periodic monitoring of these changes and reload procedure as well. `SYSTEM RELOAD CONFIG` query also triggers config to be reloaded. -For queries and subsystems other than `Server` config is accessible using `Context::getConfigRef()` method. Every subsystem that is capable of reloading it's config without server restart should register itself in reload callback in `Server::main()` method. Note that if newer config has an error, most subsystems will ignore new config, log warning messages and keep working with previously loaded config. Due to the nature of `AbstractConfiguration` it is not possible to pass reference to specific section, so `String config_prefix` is usually used instead. +For queries and subsystems other than `Server` config is accessible using `Context::getConfigRef()` method. Every subsystem that is capable of reloading its config without server restart should register itself in reload callback in `Server::main()` method. Note that if newer config has an error, most subsystems will ignore new config, log warning messages and keep working with previously loaded config. Due to the nature of `AbstractConfiguration` it is not possible to pass reference to specific section, so `String config_prefix` is usually used instead. 
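As a concrete illustration of the reload path described above, the merged configuration can also be exercised and inspected from SQL; a minimal sketch (the `system.server_settings` table and the particular setting name used here are assumptions about the running version, not part of the text above):

```sql
-- Ask the server to re-read and re-merge its configuration files.
SYSTEM RELOAD CONFIG;

-- Inspect one server-level setting from the merged configuration.
SELECT name, value, description
FROM system.server_settings
WHERE name = 'max_server_memory_usage_to_ram_ratio';
```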
## Threads and jobs {#threads-and-jobs} @@ -255,7 +255,7 @@ When we are going to read something from a part in `MergeTree`, we look at `prim When you `INSERT` a bunch of data into `MergeTree`, that bunch is sorted by primary key order and forms a new part. There are background threads that periodically select some parts and merge them into a single sorted part to keep the number of parts relatively low. That’s why it is called `MergeTree`. Of course, merging leads to “write amplification”. All parts are immutable: they are only created and deleted, but not modified. When SELECT is executed, it holds a snapshot of the table (a set of parts). After merging, we also keep old parts for some time to make a recovery after failure easier, so if we see that some merged part is probably broken, we can replace it with its source parts. -`MergeTree` is not an LSM tree because it does not contain MEMTABLE and LOG: inserted data is written directly to the filesystem. This behavior makes MergeTree much more suitable to insert data in batches. Therefore frequently inserting small amounts of rows is not ideal for MergeTree. For example, a couple of rows per second is OK, but doing it a thousand times a second is not optimal for MergeTree. However, there is an async insert mode for small inserts to overcome this limitation. We did it this way for simplicity’s sake, and because we are already inserting data in batches in our applications +`MergeTree` is not an LSM tree because it does not contain MEMTABLE and LOG: inserted data is written directly to the filesystem. This behavior makes MergeTree much more suitable to insert data in batches. Therefore, frequently inserting small amounts of rows is not ideal for MergeTree. For example, a couple of rows per second is OK, but doing it a thousand times a second is not optimal for MergeTree. However, there is an async insert mode for small inserts to overcome this limitation. We did it this way for simplicity’s sake, and because we are already inserting data in batches in our applications There are MergeTree engines that are doing additional work during background merges. Examples are `CollapsingMergeTree` and `AggregatingMergeTree`. This could be treated as special support for updates. Keep in mind that these are not real updates because users usually have no control over the time when background merges are executed, and data in a `MergeTree` table is almost always stored in more than one part, not in completely merged form. diff --git a/docs/en/development/build-cross-s390x.md b/docs/en/development/build-cross-s390x.md index b7cda515d77..a4a83c7989b 100644 --- a/docs/en/development/build-cross-s390x.md +++ b/docs/en/development/build-cross-s390x.md @@ -38,7 +38,7 @@ ninja ## Running -Once built, the binary can be run with, eg.: +Once built, the binary can be run with, e.g.: ```bash qemu-s390x-static -L /usr/s390x-linux-gnu ./clickhouse diff --git a/docs/en/development/building_and_benchmarking_deflate_qpl.md b/docs/en/development/building_and_benchmarking_deflate_qpl.md index 4e01b41ab3c..b9d39b8cc2d 100644 --- a/docs/en/development/building_and_benchmarking_deflate_qpl.md +++ b/docs/en/development/building_and_benchmarking_deflate_qpl.md @@ -95,7 +95,7 @@ Complete below three steps mentioned in [Star Schema Benchmark](https://clickhou - Inserting data. Here should use `./benchmark_sample/rawdata_dir/ssb-dbgen/*.tbl` as input data. 
- Converting “star schema” to de-normalized “flat schema” -Set up database with with IAA Deflate codec +Set up database with IAA Deflate codec ``` bash $ cd ./database_dir/deflate @@ -104,7 +104,7 @@ $ [CLICKHOUSE_EXE] client ``` Complete three steps same as lz4 above -Set up database with with ZSTD codec +Set up database with ZSTD codec ``` bash $ cd ./database_dir/zstd diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index 4b296c43db4..bbc5fbeebcb 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -13,7 +13,7 @@ ClickHouse utilizes third-party libraries for different purposes, e.g., to conne SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en'; ``` -(Note that the listed libraries are the ones located in the `contrib/` directory of the ClickHouse repository. Depending on the build options, some of of the libraries may have not been compiled, and as a result, their functionality may not be available at runtime. +Note that the listed libraries are the ones located in the `contrib/` directory of the ClickHouse repository. Depending on the build options, some of the libraries may have not been compiled, and as a result, their functionality may not be available at runtime. [Example](https://play.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==) diff --git a/docs/en/development/developer-instruction.md b/docs/en/development/developer-instruction.md index 31346c77949..e08096d8042 100644 --- a/docs/en/development/developer-instruction.md +++ b/docs/en/development/developer-instruction.md @@ -7,13 +7,13 @@ description: Prerequisites and an overview of how to build ClickHouse # Getting Started Guide for Building ClickHouse -ClickHouse can be build on Linux, FreeBSD and macOS. If you use Windows, you can still build ClickHouse in a virtual machine running Linux, e.g. [VirtualBox](https://www.virtualbox.org/) with Ubuntu. +ClickHouse can be built on Linux, FreeBSD and macOS. If you use Windows, you can still build ClickHouse in a virtual machine running Linux, e.g. [VirtualBox](https://www.virtualbox.org/) with Ubuntu. ClickHouse requires a 64-bit system to compile and run, 32-bit systems do not work. ## Creating a Repository on GitHub {#creating-a-repository-on-github} -To start developing for ClickHouse you will need a [GitHub](https://www.virtualbox.org/) account. Please also generate a SSH key locally (if you don't have one already) and upload the public key to GitHub as this is a prerequisite for contributing patches. +To start developing for ClickHouse you will need a [GitHub](https://www.virtualbox.org/) account. Please also generate an SSH key locally (if you don't have one already) and upload the public key to GitHub as this is a prerequisite for contributing patches. Next, create a fork of the [ClickHouse repository](https://github.com/ClickHouse/ClickHouse/) in your personal account by clicking the "fork" button in the upper right corner. @@ -37,7 +37,7 @@ git clone git@github.com:your_github_username/ClickHouse.git # replace placehol cd ClickHouse ``` -This command creates a directory `ClickHouse/` containing the source code of ClickHouse. If you specify a custom checkout directory after the URL but it is important that this path does not contain whitespaces as it may lead to problems with the build later on. 
+This command creates a directory `ClickHouse/` containing the source code of ClickHouse. If you specify a custom checkout directory after the URL, but it is important that this path does not contain whitespaces as it may lead to problems with the build later on. The ClickHouse repository uses Git submodules, i.e. references to external repositories (usually 3rd party libraries used by ClickHouse). These are not checked out by default. To do so, you can either @@ -45,7 +45,7 @@ The ClickHouse repository uses Git submodules, i.e. references to external repos - if `git clone` did not check out submodules, run `git submodule update --init --jobs ` (e.g. ` = 12` to parallelize the checkout) to achieve the same as the previous alternative, or -- if `git clone` did not check out submodules and you like to use [sparse](https://github.blog/2020-01-17-bring-your-monorepo-down-to-size-with-sparse-checkout/) and [shallow](https://github.blog/2020-12-21-get-up-to-speed-with-partial-clone-and-shallow-clone/) submodule checkout to omit unneeded files and history in submodules to save space (ca. 5 GB instead of ca. 15 GB), run `./contrib/update-submodules.sh`. Not really recommended as it generally makes working with submodules less convenient and slower. +- if `git clone` did not check out submodules, and you like to use [sparse](https://github.blog/2020-01-17-bring-your-monorepo-down-to-size-with-sparse-checkout/) and [shallow](https://github.blog/2020-12-21-get-up-to-speed-with-partial-clone-and-shallow-clone/) submodule checkout to omit unneeded files and history in submodules to save space (ca. 5 GB instead of ca. 15 GB), run `./contrib/update-submodules.sh`. Not really recommended as it generally makes working with submodules less convenient and slower. You can check the Git status with the command: `git submodule status`. @@ -91,7 +91,7 @@ If you use Arch or Gentoo, you probably know it yourself how to install CMake. ## C++ Compiler {#c-compiler} -Compilers Clang starting from version 15 is supported for building ClickHouse. +Compilers Clang starting from version 16 is supported for building ClickHouse. Clang should be used instead of gcc. Though, our continuous integration (CI) platform runs checks for about a dozen of build combinations. @@ -143,7 +143,7 @@ When a large amount of RAM is available on build machine you should limit the nu On machines with 4GB of RAM, it is recommended to specify 1, for 8GB of RAM `-j 2` is recommended. -If you get the message: `ninja: error: loading 'build.ninja': No such file or directory`, it means that generating a build configuration has failed and you need to inspect the message above. +If you get the message: `ninja: error: loading 'build.ninja': No such file or directory`, it means that generating a build configuration has failed, and you need to inspect the message above. Upon the successful start of the building process, you’ll see the build progress - the number of processed tasks and the total number of tasks. @@ -184,7 +184,7 @@ You can also run your custom-built ClickHouse binary with the config file from t **CLion (recommended)** -If you do not know which IDE to use, we recommend that you use [CLion](https://www.jetbrains.com/clion/). CLion is commercial software but it offers a 30 day free trial. It is also free of charge for students. CLion can be used on both Linux and macOS. +If you do not know which IDE to use, we recommend that you use [CLion](https://www.jetbrains.com/clion/). CLion is commercial software, but it offers a 30 day free trial. 
It is also free of charge for students. CLion can be used on both Linux and macOS. A few things to know when using CLion to develop ClickHouse: diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index 1d3e7d4964e..efbce54d44b 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -109,6 +109,9 @@ Do not check for a particular wording of error message, it may change in the fut If you want to use distributed queries in functional tests, you can leverage `remote` table function with `127.0.0.{1..2}` addresses for the server to query itself; or you can use predefined test clusters in server configuration file like `test_shard_localhost`. Remember to add the words `shard` or `distributed` to the test name, so that it is run in CI in correct configurations, where the server is configured to support distributed queries. +### Working with Temporary Files + +Sometimes in a shell test you may need to create a file on the fly to work with. Keep in mind that some CI checks run tests in parallel, so if you are creating or removing a temporary file in your script without a unique name this can cause some of the CI checks, such as Flaky, to fail. To get around this you should use environment variable `$CLICKHOUSE_TEST_UNIQUE_NAME` to give temporary files a name unique to the test that is running. That way you can be sure that the file you are creating during setup or removing during cleanup is the file only in use by that test and not some other test which is running in parallel. ## Known Bugs {#known-bugs} diff --git a/docs/en/engines/table-engines/integrations/nats.md b/docs/en/engines/table-engines/integrations/nats.md index e898d1f1b82..9f7409a6893 100644 --- a/docs/en/engines/table-engines/integrations/nats.md +++ b/docs/en/engines/table-engines/integrations/nats.md @@ -38,6 +38,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] [nats_username = 'user',] [nats_password = 'password',] [nats_token = 'clickhouse',] + [nats_credential_file = '/var/nats_credentials',] [nats_startup_connect_tries = '5'] [nats_max_rows_per_message = 1,] [nats_handle_error_mode = 'default'] @@ -63,6 +64,7 @@ Optional parameters: - `nats_username` - NATS username. - `nats_password` - NATS password. - `nats_token` - NATS auth token. +- `nats_credential_file` - Path to a NATS credentials file. - `nats_startup_connect_tries` - Number of connect tries at startup. Default: `5`. - `nats_max_rows_per_message` — The maximum number of rows written in one NATS message for row-based formats. (default : `1`). - `nats_handle_error_mode` — How to handle errors for RabbitMQ engine. Possible values: default (the exception will be thrown if we fail to parse a message), stream (the exception message and raw message will be saved in virtual columns `_error` and `_raw_message`). diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index be588f1764d..78a27d3ff86 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -2,7 +2,7 @@ Nearest neighborhood search is the problem of finding the M closest points for a given point in an N-dimensional vector space. The most straightforward approach to solve this problem is a brute force search where the distance between all points in the vector space and the -reference point is computed. 
This method guarantees perfect accuracy but it is usually too slow for practical applications. Thus, nearest +reference point is computed. This method guarantees perfect accuracy, but it is usually too slow for practical applications. Thus, nearest neighborhood search problems are often solved with [approximative algorithms](https://github.com/erikbern/ann-benchmarks). Approximative nearest neighborhood search techniques, in conjunction with [embedding methods](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning) allow to search huge @@ -24,7 +24,7 @@ LIMIT N `vectors` contains N-dimensional values of type [Array](../../../sql-reference/data-types/array.md) or [Tuple](../../../sql-reference/data-types/tuple.md), for example embeddings. Function `Distance` computes the distance between two vectors. -Often, the the Euclidean (L2) distance is chosen as distance function but [other +Often, the Euclidean (L2) distance is chosen as distance function but [other distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17, 0.33, ...)`, and `N` limits the number of search results. @@ -109,7 +109,7 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table_with_ann_inde **Restrictions**: Queries that contain both a `WHERE Distance(vectors, Point) < MaxDistance` and an `ORDER BY Distance(vectors, Point)` clause cannot use ANN indexes. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries -without `LIMIT` clause cannot utilize ANN indexes. Also ANN indexes are only used if the query has a `LIMIT` value smaller than setting +without `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). This is a safeguard to prevent large memory allocations by external libraries for approximate neighbor search. @@ -120,9 +120,9 @@ then each indexed block will contain 16384 rows. However, data structures and al provided by external libraries) are inherently row-oriented. They store a compact representation of a set of rows and also return rows for ANN queries. This causes some rather unintuitive differences in the way ANN indexes behave compared to normal skip indexes. -When a user defines a ANN index on a column, ClickHouse internally creates a ANN "sub-index" for each index block. The sub-index is "local" +When a user defines an ANN index on a column, ClickHouse internally creates an ANN "sub-index" for each index block. The sub-index is "local" in the sense that it only knows about the rows of its containing index block. In the previous example and assuming that a column has 65536 -rows, we obtain four index blocks (spanning eight granules) and a ANN sub-index for each index block. A sub-index is theoretically able to +rows, we obtain four index blocks (spanning eight granules) and an ANN sub-index for each index block. A sub-index is theoretically able to return the rows with the N closest points within its index block directly. However, since ClickHouse loads data from disk to memory at the granularity of granules, sub-indexes extrapolate matching rows to granule granularity. This is different from regular skip indexes which skip data at the granularity of index blocks. 
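To make the index-block/granule behaviour above concrete, here is a minimal sketch of a table with an ANN index and a query shaped so the index can be considered. The index type, its parameters and the experimental setting are illustrative assumptions; the exact options are covered in the Annoy and USearch sections.

```sql
-- Experimental ANN indexes may need to be enabled first (version-dependent).
SET allow_experimental_annoy_index = 1;

CREATE TABLE ann_demo
(
    id UInt64,
    vectors Array(Float32),
    -- One ANN "sub-index" is built per index block; GRANULARITY 4 means
    -- each sub-index covers four granules of the table.
    INDEX ann_idx vectors TYPE annoy('L2Distance', 100) GRANULARITY 4
)
ENGINE = MergeTree
ORDER BY id;

-- ANN-eligible query shape: ORDER BY the distance function plus a LIMIT
-- that stays below max_limit_for_ann_queries.
SELECT id
FROM ann_demo
ORDER BY L2Distance(vectors, [0.17, 0.33, 0.25])
LIMIT 10;
```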
@@ -231,7 +231,7 @@ The Annoy index currently does not work with per-table, non-default `index_granu ## USearch {#usearch} -This type of ANN index is based on the [the USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW +This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW algorithm](https://arxiv.org/abs/1603.09320), i.e., builds a hierarchical graph where each point represents a vector and the edges represent similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors, diff --git a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md index 0043e1b6748..ba4021d8422 100644 --- a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md @@ -125,7 +125,7 @@ For each resulting data part ClickHouse saves: 3. The first “cancel” row, if there are more “cancel” rows than “state” rows. 4. None of the rows, in all other cases. -Also when there are at least 2 more “state” rows than “cancel” rows, or at least 2 more “cancel” rows then “state” rows, the merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once. +Also, when there are at least 2 more “state” rows than “cancel” rows, or at least 2 more “cancel” rows then “state” rows, the merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once. Thus, collapsing should not change the results of calculating statistics. Changes gradually collapsed so that in the end only the last state of almost every object left. @@ -196,7 +196,7 @@ What do we see and where is collapsing? With two `INSERT` queries, we created 2 data parts. The `SELECT` query was performed in 2 threads, and we got a random order of rows. Collapsing not occurred because there was no merge of the data parts yet. ClickHouse merges data part in an unknown moment which we can not predict. -Thus we need aggregation: +Thus, we need aggregation: ``` sql SELECT diff --git a/docs/en/engines/table-engines/special/distributed.md b/docs/en/engines/table-engines/special/distributed.md index de8ae0357dc..c3b8a2f2048 100644 --- a/docs/en/engines/table-engines/special/distributed.md +++ b/docs/en/engines/table-engines/special/distributed.md @@ -72,7 +72,7 @@ Specifying the `sharding_key` is necessary for the following: #### fsync_directories -`fsync_directories` - do the `fsync` for directories. Guarantees that the OS refreshed directory metadata after operations related to background inserts on Distributed table (after insert, after sending the data to shard, etc). +`fsync_directories` - do the `fsync` for directories. Guarantees that the OS refreshed directory metadata after operations related to background inserts on Distributed table (after insert, after sending the data to shard, etc.). #### bytes_to_throw_insert @@ -220,7 +220,7 @@ Second, you can perform `INSERT` statements on a `Distributed` table. In this ca Each shard can have a `` defined in the config file. By default, the weight is `1`. 
Data is distributed across shards in the amount proportional to the shard weight. All shard weights are summed up, then each shard's weight is divided by the total to determine each shard's proportion. For example, if there are two shards and the first has a weight of 1 while the second has a weight of 2, the first will be sent one third (1 / 3) of inserted rows and the second will be sent two thirds (2 / 3). -Each shard can have the `internal_replication` parameter defined in the config file. If this parameter is set to `true`, the write operation selects the first healthy replica and writes data to it. Use this if the tables underlying the `Distributed` table are replicated tables (e.g. any of the `Replicated*MergeTree` table engines). One of the table replicas will receive the write and it will be replicated to the other replicas automatically. +Each shard can have the `internal_replication` parameter defined in the config file. If this parameter is set to `true`, the write operation selects the first healthy replica and writes data to it. Use this if the tables underlying the `Distributed` table are replicated tables (e.g. any of the `Replicated*MergeTree` table engines). One of the table replicas will receive the write, and it will be replicated to the other replicas automatically. If `internal_replication` is set to `false` (the default), data is written to all replicas. In this case, the `Distributed` table replicates data itself. This is worse than using replicated tables because the consistency of replicas is not checked and, over time, they will contain slightly different data. diff --git a/docs/en/engines/table-engines/special/memory.md b/docs/en/engines/table-engines/special/memory.md index 5cd766a318a..0d552a69804 100644 --- a/docs/en/engines/table-engines/special/memory.md +++ b/docs/en/engines/table-engines/special/memory.md @@ -6,6 +6,12 @@ sidebar_label: Memory # Memory Table Engine +:::note +When using the Memory table engine on ClickHouse Cloud, data is not replicated across all nodes (by design). To guarantee that all queries are routed to the same node and that the Memory table engine works as expected, you can do one of the following: +- Execute all operations in the same session +- Use a client that uses TCP or the native interface (which enables support for sticky connections) such as [clickhouse-client](/en/interfaces/cli) +::: + The Memory engine stores data in RAM, in uncompressed form. Data is stored in exactly the same form as it is received when read. In other words, reading from this table is completely free. Concurrent data access is synchronized. Locks are short: read and write operations do not block each other. Indexes are not supported. Reading is parallelized. diff --git a/docs/en/getting-started/example-datasets/amazon-reviews.md b/docs/en/getting-started/example-datasets/amazon-reviews.md index 00dc553782c..c07ffa86dd9 100644 --- a/docs/en/getting-started/example-datasets/amazon-reviews.md +++ b/docs/en/getting-started/example-datasets/amazon-reviews.md @@ -12,7 +12,7 @@ The queries below were executed on a **Production** instance of [ClickHouse Clou ::: -1. Without inserting the data into ClickHouse, we can query it in place. Let's grab some rows so we can see what they look like: +1. Without inserting the data into ClickHouse, we can query it in place. 
Let's grab some rows, so we can see what they look like: ```sql SELECT * diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md index a84eb5d561f..090de1b32fd 100644 --- a/docs/en/getting-started/example-datasets/cell-towers.md +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -29,7 +29,7 @@ Here is a preview of the dashboard created in this guide: This dataset is from [OpenCelliD](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers. -As of 2021, it contains more than 40 million records about cell towers (GSM, LTE, UMTS, etc.) around the world with their geographical coordinates and metadata (country code, network, etc). +As of 2021, it contains more than 40 million records about cell towers (GSM, LTE, UMTS, etc.) around the world with their geographical coordinates and metadata (country code, network, etc.). OpenCelliD Project is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License, and we redistribute a snapshot of this dataset under the terms of the same license. The up-to-date version of the dataset is available to download after sign in. @@ -355,7 +355,7 @@ Click on **UPDATE CHART** to render the visualization. ### Add the charts to a **dashboard** -This screenshot shows cell tower locations with LTE, UMTS, and GSM radios. The charts are all created in the same way and they are added to a dashboard. +This screenshot shows cell tower locations with LTE, UMTS, and GSM radios. The charts are all created in the same way, and they are added to a dashboard. ![Dashboard of cell towers by radio type in mcc 204](@site/docs/en/getting-started/example-datasets/images/superset-cell-tower-dashboard.png) diff --git a/docs/en/getting-started/example-datasets/covid19.md b/docs/en/getting-started/example-datasets/covid19.md index 3a7fae89ae0..da9dc4aa96b 100644 --- a/docs/en/getting-started/example-datasets/covid19.md +++ b/docs/en/getting-started/example-datasets/covid19.md @@ -132,7 +132,7 @@ FROM covid19; └────────────────────────────────────────────┘ ``` -7. You will notice the data has a lot of 0's for dates - either weekends or days where numbers were not reported each day. We can use a window function to smooth out the daily averages of new cases: +7. You will notice the data has a lot of 0's for dates - either weekends or days when numbers were not reported each day. We can use a window function to smooth out the daily averages of new cases: ```sql SELECT @@ -262,4 +262,4 @@ The results look like :::note As mentioned in the [GitHub repo](https://github.com/GoogleCloudPlatform/covid-19-open-data), the dataset is no longer updated as of September 15, 2022. -::: \ No newline at end of file +::: diff --git a/docs/en/getting-started/example-datasets/noaa.md b/docs/en/getting-started/example-datasets/noaa.md new file mode 100644 index 00000000000..9a3ec7791b6 --- /dev/null +++ b/docs/en/getting-started/example-datasets/noaa.md @@ -0,0 +1,342 @@ +--- +slug: /en/getting-started/example-datasets/noaa +sidebar_label: NOAA Global Historical Climatology Network +sidebar_position: 1 +description: 2.5 billion rows of climate data for the last 120 yrs +--- + +# NOAA Global Historical Climatology Network + +This dataset contains weather measurements for the last 120 years. Each row is a measurement for a point in time and station. 
+ +More precisely and according to the [origin of this data](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn): + +> GHCN-Daily is a dataset that contains daily observations over global land areas. It contains station-based measurements from land-based stations worldwide, about two-thirds of which are for precipitation measurements only (Menne et al., 2012). GHCN-Daily is a composite of climate records from numerous sources that were merged together and subjected to a common suite of quality assurance reviews (Durre et al., 2010). The archive includes the following meteorological elements: + + - Daily maximum temperature + - Daily minimum temperature + - Temperature at the time of observation + - Precipitation (i.e., rain, melted snow) + - Snowfall + - Snow depth + - Other elements where available + +## Downloading the data + +- A [pre-prepared version](#pre-prepared-data) of the data for ClickHouse, which has been cleansed, re-structured, and enriched. This data covers the years 1900 to 2022. +- [Download the original data](#original-data) and convert to the format required by ClickHouse. Users wanting to add their own columns may wish to explore this approach. + +### Pre-prepared data + +More specifically, rows have been removed that did not fail any quality assurance checks by Noaa. The data has also been restructured from a measurement per line to a row per station id and date, i.e. + +```csv +"station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType" +"AEM00041194","2022-07-30",347,0,308,0,0,0,0,0,0,0 +"AEM00041194","2022-07-31",371,413,329,0,0,0,0,0,0,0 +"AEM00041194","2022-08-01",384,427,357,0,0,0,0,0,0,0 +"AEM00041194","2022-08-02",381,424,352,0,0,0,0,0,0,0 +``` + +This is simpler to query and ensures the resulting table is less sparse. Finally, the data has also been enriched with latitude and longitude. + +This data is available in the following S3 location. Either download the data to your local filesystem (and insert using the ClickHouse client) or insert directly into ClickHouse (see [Inserting from S3](#inserting-from-s3)). + +To download: + +```bash +wget https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet +``` + +### Original data + +The following details the steps to download and transform the original data in preparation for loading into ClickHouse. 
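Before downloading every year, it can be convenient to peek at a single year in place; a sketch using the `s3` table function against the same public bucket used in the download loop below (this assumes the server has outbound access to that bucket):

```sql
-- Sample one year of the raw GHCN data directly from the public bucket.
-- The files have no header row, so schema inference names the columns c1..c8.
SELECT *
FROM s3('https://noaa-ghcn-pds.s3.amazonaws.com/csv.gz/2021.csv.gz', 'CSV')
LIMIT 10;
```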
+ +#### Download + +To download the original data: + +```bash +for i in {1900..2023}; do wget https://noaa-ghcn-pds.s3.amazonaws.com/csv.gz/${i}.csv.gz; done +``` + +#### Sampling the data + +```bash +$ clickhouse-local --query "SELECT * FROM '2021.csv.gz' LIMIT 10" --format PrettyCompact +┌─c1──────────┬───────c2─┬─c3───┬──c4─┬─c5───┬─c6───┬─c7─┬───c8─┐ +│ AE000041196 │ 20210101 │ TMAX │ 278 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AE000041196 │ 20210101 │ PRCP │ 0 │ D │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AE000041196 │ 20210101 │ TAVG │ 214 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TMAX │ 266 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TMIN │ 178 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ PRCP │ 0 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041194 │ 20210101 │ TAVG │ 217 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TMAX │ 262 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TMIN │ 155 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +│ AEM00041217 │ 20210101 │ TAVG │ 202 │ H │ ᴺᵁᴸᴸ │ S │ ᴺᵁᴸᴸ │ +└─────────────┴──────────┴──────┴─────┴──────┴──────┴────┴──────┘ +``` + +Summarizing the [format documentation](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn): + + +Summarizing the format documentation and the columns in order: + + - An 11 character station identification code. This itself encodes some useful information + - YEAR/MONTH/DAY = 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986) + - ELEMENT = 4 character indicator of element type. Effectively the measurement type. While there are many measurements available, we select the following: + - PRCP - Precipitation (tenths of mm) + - SNOW - Snowfall (mm) + - SNWD - Snow depth (mm) + - TMAX - Maximum temperature (tenths of degrees C) + - TAVG - Average temperature (tenths of a degree C) + - TMIN - Minimum temperature (tenths of degrees C) + - PSUN - Daily percent of possible sunshine (percent) + - AWND - Average daily wind speed (tenths of meters per second) + - WSFG - Peak gust wind speed (tenths of meters per second) + - WT** = Weather Type where ** defines the weather type. Full list of weather types here. +- DATA VALUE = 5 character data value for ELEMENT i.e. the value of the measurement. +- M-FLAG = 1 character Measurement Flag. This has 10 possible values. Some of these values indicate questionable data accuracy. We accept data where this is set to “P” - identified as missing presumed zero, as this is only relevant to the PRCP, SNOW and SNWD measurements. +- Q-FLAG is the measurement quality flag with 14 possible values. We are only interested in data with an empty value i.e. it did not fail any quality assurance checks. +- S-FLAG is the source flag for the observation. Not useful for our analysis and ignored. +- OBS-TIME = 4-character time of observation in hour-minute format (i.e. 0700 =7:00 am). Typically not present in older data. We ignore this for our purposes. + +A measurement per line would result in a sparse table structure in ClickHouse. We should transform to a row per time and station, with measurements as columns. First, we limit the dataset to those rows without issues i.e. where `qFlag` is equal to an empty string. 
+ +#### Clean the data + +Using [ClickHouse local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local) we can filter rows that represent measurements of interest and pass our quality requirements: + +```bash +clickhouse local --query "SELECT count() +FROM file('*.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String') WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT'))" + +2679264563 +``` + +With over 2.6 billion rows, this isn’t a fast query since it involves parsing all the files. On our 8 core machine, this takes around 160 seconds. + + +### Pivot data + +While the measurement per line structure can be used with ClickHouse, it will unnecessarily complicate future queries. Ideally, we need a row per station id and date, where each measurement type and associated value are a column i.e. + +```csv +"station_id","date","tempAvg","tempMax","tempMin","precipitation","snowfall","snowDepth","percentDailySun","averageWindSpeed","maxWindSpeed","weatherType" +"AEM00041194","2022-07-30",347,0,308,0,0,0,0,0,0,0 +"AEM00041194","2022-07-31",371,413,329,0,0,0,0,0,0,0 +"AEM00041194","2022-08-01",384,427,357,0,0,0,0,0,0,0 +"AEM00041194","2022-08-02",381,424,352,0,0,0,0,0,0,0 +``` + +Using ClickHouse local and a simple `GROUP BY`, we can repivot our data to this structure. To limit memory overhead, we do this one file at a time. + +```bash +for i in {1900..2022} +do +clickhouse-local --query "SELECT station_id, + toDate32(date) as date, + anyIf(value, measurement = 'TAVG') as tempAvg, + anyIf(value, measurement = 'TMAX') as tempMax, + anyIf(value, measurement = 'TMIN') as tempMin, + anyIf(value, measurement = 'PRCP') as precipitation, + anyIf(value, measurement = 'SNOW') as snowfall, + anyIf(value, measurement = 'SNWD') as snowDepth, + anyIf(value, measurement = 'PSUN') as percentDailySun, + anyIf(value, measurement = 'AWND') as averageWindSpeed, + anyIf(value, measurement = 'WSFG') as maxWindSpeed, + toUInt8OrZero(replaceOne(anyIf(measurement, startsWith(measurement, 'WT') AND value = 1), 'WT', '')) as weatherType +FROM file('$i.csv.gz', CSV, 'station_id String, date String, measurement String, value Int64, mFlag String, qFlag String, sFlag String, obsTime String') + WHERE qFlag = '' AND (measurement IN ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TAVG', 'TMIN', 'PSUN', 'AWND', 'WSFG') OR startsWith(measurement, 'WT')) +GROUP BY station_id, date +ORDER BY station_id, date FORMAT CSV" >> "noaa.csv"; +done +``` + +This query produces a single 50GB file `noaa.csv`. + +### Enriching the data + +The data has no indication of location aside from a station id, which includes a prefix country code. Ideally, each station would have a latitude and longitude associated with it. To achieve this, NOAA conveniently provides the details of each station as a separate [ghcnd-stations.txt](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn#format-of-ghcnd-stationstxt-file). This file has [several columns](https://github.com/awslabs/open-data-docs/tree/main/docs/noaa/noaa-ghcn#format-of-ghcnd-stationstxt-file), of which five are useful to our future analysis: id, latitude, longitude, elevation, and name. 
+ +```bash +wget http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt +``` + +```bash +clickhouse local --query "WITH stations AS (SELECT id, lat, lon, elevation, splitByString(' GSN ',name)[1] as name FROM file('ghcnd-stations.txt', Regexp, 'id String, lat Float64, lon Float64, elevation Float32, name String')) +SELECT station_id, + date, + tempAvg, + tempMax, + tempMin, + precipitation, + snowfall, + snowDepth, + percentDailySun, + averageWindSpeed, + maxWindSpeed, + weatherType, + tuple(lon, lat) as location, + elevation, + name +FROM file('noaa.csv', CSV, + 'station_id String, date Date32, tempAvg Int32, tempMax Int32, tempMin Int32, precipitation Int32, snowfall Int32, snowDepth Int32, percentDailySun Int8, averageWindSpeed Int32, maxWindSpeed Int32, weatherType UInt8') as noaa LEFT OUTER + JOIN stations ON noaa.station_id = stations.id INTO OUTFILE 'noaa_enriched.parquet' FORMAT Parquet SETTINGS format_regexp='^(.{11})\s+(\-?\d{1,2}\.\d{4})\s+(\-?\d{1,3}\.\d{1,4})\s+(\-?\d*\.\d*)\s+(.*)\s+(?:[\d]*)'" +``` +This query takes a few minutes to run and produces a 6.4 GB file, `noaa_enriched.parquet`. + +## Create table + +Create a MergeTree table in ClickHouse (from the ClickHouse client). + +```sql +CREATE TABLE noaa +( + `station_id` LowCardinality(String), + `date` Date32, + `tempAvg` Int32 COMMENT 'Average temperature (tenths of a degrees C)', + `tempMax` Int32 COMMENT 'Maximum temperature (tenths of degrees C)', + `tempMin` Int32 COMMENT 'Minimum temperature (tenths of degrees C)', + `precipitation` UInt32 COMMENT 'Precipitation (tenths of mm)', + `snowfall` UInt32 COMMENT 'Snowfall (mm)', + `snowDepth` UInt32 COMMENT 'Snow depth (mm)', + `percentDailySun` UInt8 COMMENT 'Daily percent of possible sunshine (percent)', + `averageWindSpeed` UInt32 COMMENT 'Average daily wind speed (tenths of meters per second)', + `maxWindSpeed` UInt32 COMMENT 'Peak gust wind speed (tenths of meters per second)', + `weatherType` Enum8('Normal' = 0, 'Fog' = 1, 'Heavy Fog' = 2, 'Thunder' = 3, 'Small Hail' = 4, 'Hail' = 5, 'Glaze' = 6, 'Dust/Ash' = 7, 'Smoke/Haze' = 8, 'Blowing/Drifting Snow' = 9, 'Tornado' = 10, 'High Winds' = 11, 'Blowing Spray' = 12, 'Mist' = 13, 'Drizzle' = 14, 'Freezing Drizzle' = 15, 'Rain' = 16, 'Freezing Rain' = 17, 'Snow' = 18, 'Unknown Precipitation' = 19, 'Ground Fog' = 21, 'Freezing Fog' = 22), + `location` Point, + `elevation` Float32, + `name` LowCardinality(String) +) ENGINE = MergeTree() ORDER BY (station_id, date); + +``` + +## Inserting into ClickHouse + +### Inserting from local file + +Data can be inserted from a local file as follows (from the ClickHouse client): + +```sql +INSERT INTO noaa FROM INFILE '/noaa_enriched.parquet' +``` + +where `` represents the full path to the local file on disk. + +See [here](https://clickhouse.com/blog/real-world-data-noaa-climate-data#load-the-data) for how to speed this load up. + +### Inserting from S3 + +```sql +INSERT INTO noaa SELECT * +FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/noaa/noaa_enriched.parquet') + +``` +For how to speed this up, see our blog post on [tuning large data loads](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part2). 
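Whichever load path is used, a quick sanity check confirms the table is populated before moving on to the sample queries; a minimal sketch (the figures depend on which years were loaded):

```sql
-- Verify volume, date coverage and the number of distinct stations.
SELECT
    count() AS rows,
    min(date) AS first_day,
    max(date) AS last_day,
    uniqExact(station_id) AS stations
FROM noaa;
```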
+ +## Sample queries + +### Highest temperature ever + +```sql +SELECT + tempMax / 10 AS maxTemp, + location, + name, + date +FROM blogs.noaa +WHERE tempMax > 500 +ORDER BY + tempMax DESC, + date ASC +LIMIT 5 + +┌─maxTemp─┬─location──────────┬─name───────────────────────────────────────────┬───────date─┐ +│ 56.7 │ (-116.8667,36.45) │ CA GREENLAND RCH │ 1913-07-10 │ +│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1949-08-20 │ +│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1949-09-18 │ +│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1952-07-17 │ +│ 56.7 │ (-115.4667,32.55) │ MEXICALI (SMN) │ 1952-09-04 │ +└─────────┴───────────────────┴────────────────────────────────────────────────┴────────────┘ + +5 rows in set. Elapsed: 0.514 sec. Processed 1.06 billion rows, 4.27 GB (2.06 billion rows/s., 8.29 GB/s.) +``` + +Reassuringly consistent with the [documented record](https://en.wikipedia.org/wiki/List_of_weather_records#Highest_temperatures_ever_recorded) at [Furnace Creek](https://www.google.com/maps/place/36%C2%B027'00.0%22N+116%C2%B052'00.1%22W/@36.1329666,-116.1104099,8.95z/data=!4m5!3m4!1s0x0:0xf2ed901b860f4446!8m2!3d36.45!4d-116.8667) as of 2023. + +### Best ski resorts + +Using a [list of ski resorts](https://gist.githubusercontent.com/gingerwizard/dd022f754fd128fdaf270e58fa052e35/raw/622e03c37460f17ef72907afe554cb1c07f91f23/ski_resort_stats.csv) in the united states and their respective locations, we join these against the top 1000 weather stations with the most in any month in the last 5 yrs. Sorting this join by [geoDistance](https://clickhouse.com/docs/en/sql-reference/functions/geo/coordinates/#geodistance) and restricting the results to those where the distance is less than 20km, we select the top result per resort and sort this by total snow. Note we also restrict resorts to those above 1800m, as a broad indicator of good skiing conditions. + +```sql +SELECT + resort_name, + total_snow / 1000 AS total_snow_m, + resort_location, + month_year +FROM +( + WITH resorts AS + ( + SELECT + resort_name, + state, + (lon, lat) AS resort_location, + 'US' AS code + FROM url('https://gist.githubusercontent.com/gingerwizard/dd022f754fd128fdaf270e58fa052e35/raw/622e03c37460f17ef72907afe554cb1c07f91f23/ski_resort_stats.csv', CSVWithNames) + ) + SELECT + resort_name, + highest_snow.station_id, + geoDistance(resort_location.1, resort_location.2, station_location.1, station_location.2) / 1000 AS distance_km, + highest_snow.total_snow, + resort_location, + station_location, + month_year + FROM + ( + SELECT + sum(snowfall) AS total_snow, + station_id, + any(location) AS station_location, + month_year, + substring(station_id, 1, 2) AS code + FROM noaa + WHERE (date > '2017-01-01') AND (code = 'US') AND (elevation > 1800) + GROUP BY + station_id, + toYYYYMM(date) AS month_year + ORDER BY total_snow DESC + LIMIT 1000 + ) AS highest_snow + INNER JOIN resorts ON highest_snow.code = resorts.code + WHERE distance_km < 20 + ORDER BY + resort_name ASC, + total_snow DESC + LIMIT 1 BY + resort_name, + station_id +) +ORDER BY total_snow DESC +LIMIT 5 + +┌─resort_name──────────┬─total_snow_m─┬─resort_location─┬─month_year─┐ +│ Sugar Bowl, CA │ 7.799 │ (-120.3,39.27) │ 201902 │ +│ Donner Ski Ranch, CA │ 7.799 │ (-120.34,39.31) │ 201902 │ +│ Boreal, CA │ 7.799 │ (-120.35,39.33) │ 201902 │ +│ Homewood, CA │ 4.926 │ (-120.17,39.08) │ 201902 │ +│ Alpine Meadows, CA │ 4.926 │ (-120.22,39.17) │ 201902 │ +└──────────────────────┴──────────────┴─────────────────┴────────────┘ + +5 rows in set. Elapsed: 0.750 sec. 
Processed 689.10 million rows, 3.20 GB (918.20 million rows/s., 4.26 GB/s.) +Peak memory usage: 67.66 MiB. +``` + +## Credits + +We would like to acknowledge the efforts of the Global Historical Climatology Network for preparing, cleansing, and distributing this data. We appreciate your efforts. + +Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, R.S. Vose, B.E.Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - Daily (GHCN-Daily), Version 3. [indicate subset used following decimal, e.g. Version 3.25]. NOAA National Centers for Environmental Information. http://doi.org/10.7289/V5D21VHZ [17/08/2020] diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index a53844e792f..2b90d684c13 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -197,6 +197,29 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va Instead of `--host`, `--port`, `--user` and `--password` options, ClickHouse client also supports connection strings (see next section). +## Aliases {#cli_aliases} + +- `\l` - SHOW DATABASES +- `\d` - SHOW TABLES +- `\c ` - USE DATABASE +- `.` - repeat the last query + + +## Shortkeys {#shortkeys_aliases} + +- `Alt (Option) + Shift + e` - open editor with current query. It is possible to set up an environment variable - `EDITOR`, by default vim is used. +- `Alt (Option) + #` - comment line. +- `Ctrl + r` - fuzzy history search. + +:::tip +To configure the correct work of meta key (Option) on MacOS: + +iTerm2: Go to Preferences -> Profile -> Keys -> Left Option key and click Esc+ +::: + +The full list with all available shortkeys - [replxx](https://github.com/AmokHuginnsson/replxx/blob/1f149bf/src/replxx_impl.cxx#L262). + + ## Connection string {#connection_string} clickhouse-client alternatively supports connecting to clickhouse server using a connection string similar to [MongoDB](https://www.mongodb.com/docs/manual/reference/connection-string/), [PostgreSQL](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), [MySQL](https://dev.mysql.com/doc/refman/8.0/en/connecting-using-uri-or-key-value-pairs.html#connecting-using-uri). It has the following syntax: @@ -220,7 +243,7 @@ If no database is specified, the `default` database will be used. If the user name, password or database was specified in the connection string, it cannot be specified using `--user`, `--password` or `--database` (and vice versa). -The host component can either be an a host name and IP address. Put an IPv6 address in square brackets to specify it: +The host component can either be a host name and IP address. Put an IPv6 address in square brackets to specify it: ```text clickhouse://[2001:db8::1234] diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a11c3e5ef19..285737312bd 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -33,7 +33,7 @@ The supported formats are: | [JSONAsString](#jsonasstring) | ✔ | ✗ | | [JSONStrings](#jsonstrings) | ✔ | ✔ | | [JSONColumns](#jsoncolumns) | ✔ | ✔ | -| [JSONColumnsWithMetadata](#jsoncolumnsmonoblock)) | ✔ | ✔ | +| [JSONColumnsWithMetadata](#jsoncolumnsmonoblock) | ✔ | ✔ | | [JSONCompact](#jsoncompact) | ✔ | ✔ | | [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ | | [JSONCompactColumns](#jsoncompactcolumns) | ✔ | ✔ | @@ -253,7 +253,7 @@ This format is also available under the name `TSVRawWithNamesAndNames`. 
This format allows specifying a custom format string with placeholders for values with a specified escaping rule. -It uses settings `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) +It uses settings `format_template_resultset`, `format_template_row` (`format_template_row_format`), `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) Setting `format_template_row` specifies the path to the file containing format strings for rows with the following syntax: @@ -279,9 +279,11 @@ the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quo `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` +In cases where it is challenging or not possible to deploy format output configuration for the template format to a directory on all nodes in a cluster, or if the format is trivial then `format_template_row_format` can be used to set the template string directly in the query, rather than a path to the file which contains it. + The `format_template_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) -Setting `format_template_resultset` specifies the path to the file, which contains a format string for resultset. Format string for resultset has the same syntax as a format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: +Setting `format_template_resultset` specifies the path to the file, which contains a format string for resultset. Setting `format_template_resultset_format` can be used to set the template string for the result set directly in the query itself. Format string for resultset has the same syntax as a format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: - `data` is the rows with data in `format_template_row` format, separated by `format_template_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. - `totals` is the row with total values in `format_template_row` format (when using WITH TOTALS) diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index 4db1d53987a..39ae69eaef4 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -13,7 +13,7 @@ can control it. Schema inference is used when ClickHouse needs to read the data in a specific data format and the structure is unknown. -## Table functions [file](../sql-reference/table-functions/file.md), [s3](../sql-reference/table-functions/s3.md), [url](../sql-reference/table-functions/url.md), [hdfs](../sql-reference/table-functions/hdfs.md). +## Table functions [file](../sql-reference/table-functions/file.md), [s3](../sql-reference/table-functions/s3.md), [url](../sql-reference/table-functions/url.md), [hdfs](../sql-reference/table-functions/hdfs.md), [azureBlobStorage](../sql-reference/table-functions/azureBlobStorage.md). These table functions have the optional argument `structure` with the structure of input data. 
If this argument is not specified or set to `auto`, the structure will be inferred from the data. @@ -55,7 +55,7 @@ DESCRIBE file('hobbies.jsonl') └─────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` -## Table engines [File](../engines/table-engines/special/file.md), [S3](../engines/table-engines/integrations/s3.md), [URL](../engines/table-engines/special/url.md), [HDFS](../engines/table-engines/integrations/hdfs.md) +## Table engines [File](../engines/table-engines/special/file.md), [S3](../engines/table-engines/integrations/s3.md), [URL](../engines/table-engines/special/url.md), [HDFS](../engines/table-engines/integrations/hdfs.md), [azureBlobStorage](../engines/table-engines/integrations/azureBlobStorage.md) If the list of columns is not specified in `CREATE TABLE` query, the structure of the table will be inferred automatically from the data. @@ -1061,7 +1061,7 @@ $$) └──────────────┴───────────────┘ ``` -## Values {#values} +### Values {#values} In Values format ClickHouse extracts column value from the row and then parses it using the recursive parser similar to how literals are parsed. @@ -1986,3 +1986,46 @@ Note: - As some of the files may not contain some columns from the resulting schema, union mode is supported only for formats that support reading subset of columns (like JSONEachRow, Parquet, TSVWithNames, etc) and won't work for other formats (like CSV, TSV, JSONCompactEachRow, etc). - If ClickHouse cannot infer the schema from one of the files, the exception will be thrown. - If you have a lot of files, reading schema from all of them can take a lot of time. + + +## Automatic format detection {#automatic-format-detection} + +If data format is not specified and cannot be determined by the file extension, ClickHouse will try to detect the file format by its content. + +**Examples:** + +Let's say we have `data` with the following content: +``` +"a","b" +1,"Data1" +2,"Data2" +3,"Data3" +``` + +We can inspect and query this file without specifying format or structure: +```sql +:) desc file(data); +``` + +```text +┌─name─┬─type─────────────┐ +│ a │ Nullable(Int64) │ +│ b │ Nullable(String) │ +└──────┴──────────────────┘ +``` + +```sql +:) select * from file(data); +``` + +```text +┌─a─┬─b─────┐ +│ 1 │ Data1 │ +│ 2 │ Data2 │ +│ 3 │ Data3 │ +└───┴───────┘ +``` + +:::note +ClickHouse can detect only some subset of formats and this detection takes some time, it's always better to specify the format explicitly. +::: \ No newline at end of file diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index d45885ee816..44df05a7260 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -451,3 +451,24 @@ To disallow concurrent backup/restore, you can use these settings respectively. The default value for both is true, so by default concurrent backup/restores are allowed. When these settings are false on a cluster, only 1 backup/restore is allowed to run on a cluster at a time. 
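For clusters where concurrent backups are disallowed, it can be useful to watch the state of a running backup before starting another one. A minimal sketch (the `backups` disk name and the `default.events` table are assumptions for illustration, not part of this guide):

```sql
-- Start a backup without blocking the client session.
BACKUP TABLE default.events TO Disk('backups', 'events_backup.zip') ASYNC;

-- Each backup gets a row in system.backups; while the status is still
-- CREATING_BACKUP, starting another backup on the cluster would be rejected
-- when concurrent backups are disabled.
SELECT id, name, status, error
FROM system.backups
ORDER BY start_time DESC
LIMIT 5;
```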
+ +## Configuring BACKUP/RESTORE to use an AzureBlobStorage Endpoint + +To write backups to an AzureBlobStorage container you need the following pieces of information: +- AzureBlobStorage endpoint connection string / url, +- Container, +- Path, +- Account name (if url is specified) +- Account Key (if url is specified) + +The destination for a backup will be specified like this: +``` +AzureBlobStorage('/', '', '', '', '') +``` + +```sql +BACKUP TABLE data TO AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'test_container', 'data_backup'); +RESTORE TABLE data AS data_restored FROM AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'test_container', 'data_backup'); +``` diff --git a/docs/en/operations/configuration-files.md b/docs/en/operations/configuration-files.md index dfe62d591e3..9f17f4af1e8 100644 --- a/docs/en/operations/configuration-files.md +++ b/docs/en/operations/configuration-files.md @@ -6,15 +6,66 @@ sidebar_label: Configuration Files # Configuration Files -The ClickHouse server can be configured with configuration files in XML or YAML syntax. In most installation types, the ClickHouse server runs with `/etc/clickhouse-server/config.xml` as default configuration file but it is also possible to specify the location of the configuration file manually at server startup using command line option `--config-file=` or `-C`. Additional configuration files may be placed into directory `config.d/` relative to the main configuration file, for example into directory `/etc/clickhouse-server/config.d/`. Files in this directory and the main configuration are merged in a preprocessing step before the configuration is applied in ClickHouse server. Configuration files are merged in alphabetical order. To simplify updates and improve modularization, it is best practice to keep the default `config.xml` file unmodified and place additional customization into `config.d/`. +The ClickHouse server can be configured with configuration files in XML or YAML syntax. In most installation types, the ClickHouse server runs with `/etc/clickhouse-server/config.xml` as default configuration file, but it is also possible to specify the location of the configuration file manually at server startup using command line option `--config-file=` or `-C`. Additional configuration files may be placed into directory `config.d/` relative to the main configuration file, for example into directory `/etc/clickhouse-server/config.d/`. Files in this directory and the main configuration are merged in a preprocessing step before the configuration is applied in ClickHouse server. Configuration files are merged in alphabetical order. To simplify updates and improve modularization, it is best practice to keep the default `config.xml` file unmodified and place additional customization into `config.d/`. It is possible to mix XML and YAML configuration files, for example you could have a main configuration file `config.xml` and additional configuration files `config.d/network.xml`, `config.d/timezone.yaml` and `config.d/keeper.yaml`. Mixing XML and YAML within a single configuration file is not supported. XML configuration files should use `...` as top-level tag. 
In YAML configuration files, `clickhouse:` is optional, the parser inserts it implicitly if absent. -## Overriding Configuration {#override} +## Merging Configuration {#merging} -The merge of configuration files behaves as one intuitively expects: The contents of both files are combined recursively, children with the same name are replaced by the element of the more specific configuration file. The merge can be customized using attributes `replace` and `remove`. -- Attribute `replace` means that the element is replaced by the specified one. -- Attribute `remove` means that the element is deleted. +Two configuration files (usually the main configuration file and another configuration files from `config.d/`) are merged as follows: + +- If a node (i.e. a path leading to an element) appears in both files and does not have attributes `replace` or `remove`, it is included in the merged configuration file and children from both nodes are included and merged recursively. +- If one of both nodes contains attribute `replace`, it is included in the merged configuration file but only children from the node with attribute `replace` are included. +- If one of both nodes contains attribute `remove`, the node is not included in the merged configuration file (if it exists already, it is deleted). + +Example: + + +```xml + + + + 1 + + + 2 + + + 3 + + +``` + +and + +```xml + + + + 4 + + + 5 + + + 6 + + +``` + +generates merged configuration file: + +```xml + + + 1 + 4 + + + 5 + + +``` To specify that a value of an element should be replaced by the value of an environment variable, you can use attribute `from_env`. @@ -36,7 +87,7 @@ which is equal to - 150000 + 150000 @@ -63,7 +114,7 @@ XML substitution example: ``` -Substitutions can also be performed from ZooKeeper. To do this, specify the attribute `from_zk = "/path/to/node"`. The element value is replaced with the contents of the node at `/path/to/node` in ZooKeeper. You can also put an entire XML subtree on the ZooKeeper node and it will be fully inserted into the source element. +Substitutions can also be performed from ZooKeeper. To do this, specify the attribute `from_zk = "/path/to/node"`. The element value is replaced with the contents of the node at `/path/to/node` in ZooKeeper. You can also put an entire XML subtree on the ZooKeeper node, and it will be fully inserted into the source element. ## Encrypting and Hiding Configuration {#encryption} @@ -125,7 +176,7 @@ Users configuration can be split into separate files similar to `config.xml` and Directory name is defined as `users_config` setting without `.xml` postfix concatenated with `.d`. Directory `users.d` is used by default, as `users_config` defaults to `users.xml`. -Note that configuration files are first merged taking into account [Override](#override) settings and includes are processed after that. +Note that configuration files are first [merged](#merging) taking into account settings, and includes are processed after that. ## XML example {#example} @@ -163,7 +214,7 @@ key: value Corresponding XML: ``` xml -value +value ``` A nested XML node is represented by a YAML map: diff --git a/docs/en/operations/query-cache.md b/docs/en/operations/query-cache.md index fbff622ae38..a8532bc22b7 100644 --- a/docs/en/operations/query-cache.md +++ b/docs/en/operations/query-cache.md @@ -31,6 +31,10 @@ This reduces maintenance effort and avoids redundancy. 
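After the main file and everything in `config.d/` have been merged, the effective values can be checked from SQL, which is often the quickest way to confirm that an override was picked up. A small sketch (the setting names below are just common examples):

```sql
-- Effective server settings after all configuration files are merged;
-- changed = 1 means the value differs from the compiled-in default.
SELECT name, value, changed
FROM system.server_settings
WHERE name IN ('max_connections', 'max_concurrent_queries');
```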
## Configuration Settings and Usage +:::note +In ClickHouse Cloud, you must use [query level settings](/en/operations/settings/query-level) to edit query cache settings. Editing [config level settings](/en/operations/configuration-files) is currently not supported. +::: + Setting [use_query_cache](settings/settings.md#use-query-cache) can be used to control whether a specific query or all queries of the current session should utilize the query cache. For example, the first execution of query diff --git a/docs/en/operations/settings/query-complexity.md b/docs/en/operations/settings/query-complexity.md index 9a80f977ed1..d86f18ff982 100644 --- a/docs/en/operations/settings/query-complexity.md +++ b/docs/en/operations/settings/query-complexity.md @@ -28,6 +28,8 @@ The maximum amount of RAM to use for running a query on a single server. The default setting is unlimited (set to `0`). +Cloud default value: depends on the amount of RAM on the replica. + The setting does not consider the volume of available memory or the total volume of memory on the machine. The restriction applies to a single query within a single server. You can use `SHOW PROCESSLIST` to see the current memory consumption for each query. @@ -104,7 +106,9 @@ Possible values: - Maximum volume of RAM (in bytes) that can be used by the single [GROUP BY](../../sql-reference/statements/select/group-by.md#select-group-by-clause) operation. - 0 — `GROUP BY` in external memory disabled. -Default value: 0. +Default value: `0`. + +Cloud default value: half the memory amount per replica. ## max_bytes_before_external_sort {#settings-max_bytes_before_external_sort} @@ -115,6 +119,8 @@ Enables or disables execution of `ORDER BY` clauses in external memory. See [ORD Default value: 0. +Cloud default value: half the memory amount per replica. + ## max_rows_to_sort {#max-rows-to-sort} A maximum number of rows before sorting. This allows you to limit memory consumption when sorting. @@ -129,7 +135,11 @@ What to do if the number of rows received before sorting exceeds one of the limi ## max_result_rows {#setting-max_result_rows} -Limit on the number of rows in the result. Also checked for subqueries, and on remote servers when running parts of a distributed query. +Limit on the number of rows in the result. Also checked for subqueries, and on remote servers when running parts of a distributed query. No limit is applied when value is `0`. + +Default value: `0`. + +Cloud default value: `0`. ## max_result_bytes {#max-result-bytes} @@ -137,10 +147,14 @@ Limit on the number of bytes in the result. The same as the previous setting. ## result_overflow_mode {#result-overflow-mode} -What to do if the volume of the result exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. +What to do if the volume of the result exceeds one of the limits: ‘throw’ or ‘break’. Using ‘break’ is similar to using LIMIT. `Break` interrupts execution only at the block level. This means that amount of returned rows is greater than [max_result_rows](#setting-max_result_rows), multiple of [max_block_size](../../operations/settings/settings.md#setting-max_block_size) and depends on [max_threads](../../operations/settings/settings.md#max_threads). +Default value: `throw`. + +Cloud default value: `throw`. 
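To make the `break` behaviour concrete, a short sketch: with a small `max_result_rows`, the query below returns more rows than the limit because execution stops only at a block boundary, instead of throwing an exception as it would in `throw` mode.

```sql
SET max_result_rows = 100, result_overflow_mode = 'break';
-- Returns a whole number of blocks (a multiple of max_block_size),
-- i.e. more than the 100-row limit, and then stops.
SELECT number FROM numbers(1000000);
```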
+ Example: ``` sql diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index eb09af44efd..9265fffa323 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -212,6 +212,8 @@ Possible values: Default value: `'basic'`. +Cloud default value: `'best_effort'`. + See also: - [DateTime data type.](../../sql-reference/data-types/datetime.md) @@ -1660,6 +1662,10 @@ Result: Path to file which contains format string for result set (for Template format). +### format_template_resultset_format {#format_template_resultset_format} + +Format string for result set (for Template format) + ### format_template_row {#format_template_row} Path to file which contains format string for rows (for Template format). @@ -1668,6 +1674,10 @@ Path to file which contains format string for rows (for Template format). Delimiter between rows (for Template format). +### format_template_row_format {#format_template_row_format} + +Format string for rows (for Template format) + ## CustomSeparated format settings {custom-separated-format-settings} ### format_custom_escaping_rule {#format_custom_escaping_rule} diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 75d05d55366..1bdec81ae88 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -508,7 +508,9 @@ Possible values: - Any positive integer number of hops. - 0 — No hops allowed. -Default value: 0. +Default value: `0`. + +Cloud default value: `10`. ## insert_null_as_default {#insert_null_as_default} @@ -1126,7 +1128,9 @@ Possible values: - 0 (or 1) — `INSERT SELECT` no parallel execution. - Positive integer. Bigger than 1. -Default value: 0. +Default value: `0`. + +Cloud default value: from `2` to `4`, depending on the service size. Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max_threads](#max_threads) setting. Higher values will lead to higher memory usage. @@ -1207,7 +1211,9 @@ Default value: 10000. Cancels HTTP read-only queries (e.g. SELECT) when a client closes the connection without waiting for the response. -Default value: 0 +Default value: `0`. + +Cloud default value: `1`. ## poll_interval {#poll-interval} @@ -1769,6 +1775,10 @@ Default value: 0 (no restriction). ## insert_quorum {#insert_quorum} +:::note +`insert_quorum` does not apply when using the [`SharedMergeTree` table engine](/en/cloud/reference/shared-merge-tree) in ClickHouse Cloud as all inserts are quorum inserted. +::: + Enables the quorum writes. - If `insert_quorum < 2`, the quorum writes are disabled. @@ -1808,6 +1818,10 @@ See also: ## insert_quorum_parallel {#insert_quorum_parallel} +:::note +`insert_quorum_parallel` does not apply when using the [`SharedMergeTree` table engine](/en/cloud/reference/shared-merge-tree) in ClickHouse Cloud as all inserts are quorum inserted. +::: + Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected. Possible values: @@ -1922,7 +1936,7 @@ Possible values: - Positive integer. - 0 — Asynchronous insertions are disabled. -Default value: `100000`. +Default value: `1000000`. ### async_insert_max_query_number {#async-insert-max-query-number} @@ -1935,7 +1949,7 @@ Possible values: Default value: `450`. 
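Several of the asynchronous-insert settings in this section only take effect once asynchronous inserts are enabled for the query or session. A minimal sketch (the `default.events` table and its columns are assumptions for illustration):

```sql
-- Enable asynchronous inserts; wait_for_async_insert = 1 makes the client wait
-- until the buffered batch has actually been flushed to the table.
SET async_insert = 1, wait_for_async_insert = 1;
INSERT INTO default.events (ts, message) VALUES (now(), 'hello');
```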
-### async_insert_busy_timeout_ms {#async-insert-busy-timeout-ms} +### async_insert_busy_timeout_max_ms {#async-insert-busy-timeout-max-ms} The maximum timeout in milliseconds since the first `INSERT` query before inserting collected data. @@ -1946,6 +1960,63 @@ Possible values: Default value: `200`. +Cloud default value: `1000`. + +### async_insert_poll_timeout_ms {#async-insert-poll-timeout-ms} + +Timeout in milliseconds for polling data from asynchronous insert queue. + +Possible values: + +- Positive integer. + +Default value: `10`. + +### async_insert_use_adaptive_busy_timeout {#allow-experimental-async-insert-adaptive-busy-timeout} + +Use adaptive asynchronous insert timeout. + +Possible values: + +- 0 - Disabled. +- 1 - Enabled. + +Default value: `0`. + +### async_insert_busy_timeout_min_ms {#async-insert-busy-timeout-min-ms} + +If adaptive asynchronous insert timeout is allowed through [async_insert_use_adaptive_busy_timeout](#allow-experimental-async-insert-adaptive-busy-timeout), the setting specifies the minimum value of the asynchronous insert timeout in milliseconds. It also serves as the initial value, which may be increased later by the adaptive algorithm, up to the [async_insert_busy_timeout_ms](#async_insert_busy_timeout_ms). + +Possible values: + +- Positive integer. + +Default value: `50`. + +### async_insert_busy_timeout_ms {#async-insert-busy-timeout-ms} + +Alias for [`async_insert_busy_timeout_max_ms`](#async_insert_busy_timeout_max_ms). + +### async_insert_busy_timeout_increase_rate {#async-insert-busy-timeout-increase-rate} + +If adaptive asynchronous insert timeout is allowed through [async_insert_use_adaptive_busy_timeout](#allow-experimental-async-insert-adaptive-busy-timeout), the setting specifies the exponential growth rate at which the adaptive asynchronous insert timeout increases. + +Possible values: + +- A positive floating-point number. + +Default value: `0.2`. + +### async_insert_busy_timeout_decrease_rate {#async-insert-busy-timeout-decrease-rate} + +If adaptive asynchronous insert timeout is allowed through [async_insert_use_adaptive_busy_timeout](#allow-experimental-async-insert-adaptive-busy-timeout), the setting specifies the exponential growth rate at which the adaptive asynchronous insert timeout decreases. + +Possible values: + +- A positive floating-point number. + +Default value: `0.2`. + ### async_insert_stale_timeout_ms {#async-insert-stale-timeout-ms} The maximum timeout in milliseconds since the last `INSERT` query before dumping collected data. If enabled, the settings prolongs the [async_insert_busy_timeout_ms](#async-insert-busy-timeout-ms) with every `INSERT` query as long as [async_insert_max_data_size](#async-insert-max-data-size) is not exceeded. @@ -2040,6 +2111,32 @@ SELECT * FROM test_table └───┘ ``` +## update_insert_deduplication_token_in_dependent_materialized_views {#update-insert-deduplication-token-in-dependent-materialized-views} + +Allows to update `insert_deduplication_token` with view identifier during insert in dependent materialized views, if setting `deduplicate_blocks_in_dependent_materialized_views` is enabled and `insert_deduplication_token` is set. + +Possible values: + + 0 — Disabled. + 1 — Enabled. + +Default value: 0. + +Usage: + +If setting `deduplicate_blocks_in_dependent_materialized_views` is enabled, `insert_deduplication_token` is passed to dependent materialized views. But in complex INSERT flows it is possible that we want to avoid deduplication for dependent materialized views. 
+ +Example: +``` +landing -┬--> mv_1_1 ---> ds_1_1 ---> mv_2_1 --┬-> ds_2_1 ---> mv_3_1 ---> ds_3_1 + | | + └--> mv_1_2 ---> ds_1_2 ---> mv_2_2 --┘ +``` + +In this example we want to avoid deduplication for two different blocks generated from `mv_2_1` and `mv_2_2` that will be inserted into `ds_2_1`. Without `update_insert_deduplication_token_in_dependent_materialized_views` setting enabled, those two different blocks will be deduplicated, because different blocks from `mv_2_1` and `mv_2_2` will have the same `insert_deduplication_token`. + +If setting `update_insert_deduplication_token_in_dependent_materialized_views` is enabled, during each insert into dependent materialized views `insert_deduplication_token` is updated with table identifier, so block from `mv_2_1` and block from `mv_2_2` will have different `insert_deduplication_token` and will not be deduplicated. + ## insert_keeper_max_retries The setting sets the maximum number of retries for ClickHouse Keeper (or ZooKeeper) requests during insert into replicated MergeTree. Only Keeper requests which failed due to network error, Keeper session timeout, or request timeout are considered for retries. @@ -2049,7 +2146,9 @@ Possible values: - Positive integer. - 0 — Retries are disabled -Default value: 0 +Default value: 20 + +Cloud default value: `20`. Keeper request retries are done after some timeout. The timeout is controlled by the following settings: `insert_keeper_retry_initial_backoff_ms`, `insert_keeper_retry_max_backoff_ms`. The first retry is done after `insert_keeper_retry_initial_backoff_ms` timeout. The consequent timeouts will be calculated as follows: @@ -2579,6 +2678,8 @@ Type: [UInt64](../../sql-reference/data-types/int-uint.md). Default value: 1000000000 nanoseconds (once a second). +**Temporarily disabled in ClickHouse Cloud.** + See also: - System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) @@ -2602,6 +2703,8 @@ Type: [UInt64](../../sql-reference/data-types/int-uint.md). Default value: 1000000000 nanoseconds. +**Temporarily disabled in ClickHouse Cloud.** + See also: - System table [trace_log](../../operations/system-tables/trace_log.md/#system_tables-trace_log) @@ -2723,6 +2826,8 @@ Possible values: Default value: `0`. +Cloud default value: `1`. + **See Also** - [Distributed Table Engine](../../engines/table-engines/special/distributed.md/#distributed) @@ -3238,7 +3343,9 @@ Possible values: - a string representing any valid table engine name -Default value: `None` +Default value: `MergeTree`. + +Cloud default value: `SharedMergeTree`. **Example** @@ -3814,6 +3921,8 @@ Possible values: Default value: `0`. +Cloud default value: `1`. + ## database_replicated_initial_query_timeout_sec {#database_replicated_initial_query_timeout_sec} Sets how long initial DDL query should wait for Replicated database to process previous DDL queue entries in seconds. @@ -3852,6 +3961,8 @@ Possible values: Default value: `throw`. +Cloud default value: `none`. + ## flatten_nested {#flatten-nested} Sets the data format of a [nested](../../sql-reference/data-types/nested-data-structures/index.md) columns. @@ -3987,6 +4098,8 @@ Possible values: Default value: `1`. +Cloud default value: `0`. + :::note `alter_sync` is applicable to `Replicated` tables only, it does nothing to alters of not `Replicated` tables. ::: @@ -4642,6 +4755,8 @@ other connections are cancelled. Queries with `max_parallel_replicas > 1` are su Enabled by default. +Disabled by default on Cloud. 
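As a small illustration, this behaviour can be switched off per session; the setting name `use_hedged_requests` is assumed here to match the behaviour described above.

```sql
-- Disable hedged requests for the current session and confirm the change.
SET use_hedged_requests = 0;
SELECT name, value FROM system.settings WHERE name = 'use_hedged_requests';
```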
+ ## hedged_connection_timeout {#hedged_connection_timeout} If we can't establish connection with replica after this timeout in hedged requests, we start working with the next replica without cancelling connection to the previous. @@ -5165,7 +5280,7 @@ SETTINGS(dictionary_use_async_executor=1, max_threads=8); ## storage_metadata_write_full_object_key {#storage_metadata_write_full_object_key} When set to `true` the metadata files are written with `VERSION_FULL_OBJECT_KEY` format version. With that format full object storage key names are written to the metadata files. -When set to `false` the metadata files are written with the previous format version, `VERSION_INLINE_DATA`. With that format only suffixes of object storage key names are are written to the metadata files. The prefix for all of object storage key names is set in configurations files at `storage_configuration.disks` section. +When set to `false` the metadata files are written with the previous format version, `VERSION_INLINE_DATA`. With that format only suffixes of object storage key names are are written to the metadata files. The prefix for all of object storage key names is set in configurations files at `storage_configuration.disks` section. Default value: `false`. @@ -5176,12 +5291,102 @@ When set to `false` than all attempts are made with identical timeouts. Default value: `true`. +## allow_experimental_variant_type {#allow_experimental_variant_type} + +Allows creation of experimental [Variant](../../sql-reference/data-types/variant.md). + +Default value: `false`. + +## use_variant_as_common_type {#use_variant_as_common_type} + +Allows to use `Variant` type as a result type for [if](../../sql-reference/functions/conditional-functions.md/#if)/[multiIf](../../sql-reference/functions/conditional-functions.md/#multiif)/[array](../../sql-reference/functions/array-functions.md)/[map](../../sql-reference/functions/tuple-map-functions.md) functions when there is no common type for argument types. + +Example: + +```sql +SET use_variant_as_common_type = 1; +SELECT toTypeName(if(number % 2, number, range(number))) as variant_type FROM numbers(1); +SELECT if(number % 2, number, range(number)) as variant FROM numbers(5); +``` + +```text +┌─variant_type───────────────────┐ +│ Variant(Array(UInt64), UInt64) │ +└────────────────────────────────┘ +┌─variant───┐ +│ [] │ +│ 1 │ +│ [0,1] │ +│ 3 │ +│ [0,1,2,3] │ +└───────────┘ +``` + +```sql +SET use_variant_as_common_type = 1; +SELECT toTypeName(multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL)) AS variant_type FROM numbers(1); +SELECT multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL) AS variant FROM numbers(4); +``` + +```text +─variant_type─────────────────────────┐ +│ Variant(Array(UInt8), String, UInt8) │ +└──────────────────────────────────────┘ + +┌─variant───────┐ +│ 42 │ +│ [1,2,3] │ +│ Hello, World! 
│ +│ ᴺᵁᴸᴸ │ +└───────────────┘ +``` + +```sql +SET use_variant_as_common_type = 1; +SELECT toTypeName(array(range(number), number, 'str_' || toString(number))) as array_of_variants_type from numbers(1); +SELECT array(range(number), number, 'str_' || toString(number)) as array_of_variants FROM numbers(3); +``` + +```text +┌─array_of_variants_type────────────────────────┐ +│ Array(Variant(Array(UInt64), String, UInt64)) │ +└───────────────────────────────────────────────┘ + +┌─array_of_variants─┐ +│ [[],0,'str_0'] │ +│ [[0],1,'str_1'] │ +│ [[0,1],2,'str_2'] │ +└───────────────────┘ +``` + +```sql +SET use_variant_as_common_type = 1; +SELECT toTypeName(map('a', range(number), 'b', number, 'c', 'str_' || toString(number))) as map_of_variants_type from numbers(1); +SELECT map('a', range(number), 'b', number, 'c', 'str_' || toString(number)) as map_of_variants FROM numbers(3); +``` + +```text +┌─map_of_variants_type────────────────────────────────┐ +│ Map(String, Variant(Array(UInt64), String, UInt64)) │ +└─────────────────────────────────────────────────────┘ + +┌─map_of_variants───────────────┐ +│ {'a':[],'b':0,'c':'str_0'} │ +│ {'a':[0],'b':1,'c':'str_1'} │ +│ {'a':[0,1],'b':2,'c':'str_2'} │ +└───────────────────────────────┘ +``` + + +Default value: `false`. + ## max_partition_size_to_drop -Restriction on dropping partitions in query time. +Restriction on dropping partitions in query time. The value 0 means that you can drop partitions without any restrictions. Default value: 50 GB. -The value 0 means that you can drop partitions without any restrictions. + +Cloud default value: 1 TB. :::note This query setting overwrites its server setting equivalent, see [max_partition_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-partition-size-to-drop) @@ -5189,10 +5394,11 @@ This query setting overwrites its server setting equivalent, see [max_partition_ ## max_table_size_to_drop -Restriction on deleting tables in query time. +Restriction on deleting tables in query time. The value 0 means that you can delete all tables without any restrictions. Default value: 50 GB. -The value 0 means that you can delete all tables without any restrictions. + +Cloud default value: 1 TB. :::note This query setting overwrites its server setting equivalent, see [max_table_size_to_drop](/docs/en/operations/server-configuration-parameters/settings.md/#max-table-size-to-drop) @@ -5206,4 +5412,4 @@ Allow to ignore schema evolution in Iceberg table engine and read all data using Enabling this setting can lead to incorrect result as in case of evolved schema all data files will be read using the same schema. ::: -Default value: 'false'. \ No newline at end of file +Default value: 'false'. diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index b3ef1128c42..003277c8d4f 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -206,7 +206,7 @@ Some of these settings will disable cache features per query/profile that are en - `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. -- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. Default: `false`. +- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. 
This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. Default: `false`. Cloud default value: `true`. - `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. It can be turn on for specific queries or enabled in a profile. Default: `false`. diff --git a/docs/en/operations/system-tables/asynchronous_loader.md b/docs/en/operations/system-tables/asynchronous_loader.md index af9aa4ecd09..75d98e4549d 100644 --- a/docs/en/operations/system-tables/asynchronous_loader.md +++ b/docs/en/operations/system-tables/asynchronous_loader.md @@ -49,6 +49,6 @@ Every job has a pool associated with it and is started in this pool. Each pool h Time instants during job lifetime: - `schedule_time` (`DateTime64`) - Time when job was created and scheduled to be executed (usually with all its dependencies). -- `enqueue_time` (`Nullable(DateTime64)`) - Time when job became ready and was enqueued into a ready queue of it's pool. Null if the job is not ready yet. +- `enqueue_time` (`Nullable(DateTime64)`) - Time when job became ready and was enqueued into a ready queue of its pool. Null if the job is not ready yet. - `start_time` (`Nullable(DateTime64)`) - Time when worker dequeues the job from ready queue and start its execution. Null if the job is not started yet. - `finish_time` (`Nullable(DateTime64)`) - Time when job execution is finished. Null if the job is not finished yet. diff --git a/docs/en/operations/system-tables/asynchronous_metrics.md b/docs/en/operations/system-tables/asynchronous_metrics.md index fe8f963b1ec..81725b97e41 100644 --- a/docs/en/operations/system-tables/asynchronous_metrics.md +++ b/docs/en/operations/system-tables/asynchronous_metrics.md @@ -297,11 +297,11 @@ Total number of databases on the server. ### NumberOfDetachedByUserParts -The total number of parts detached from MergeTree tables by users with the `ALTER TABLE DETACH` query (as opposed to unexpected, broken or ignored parts). The server does not care about detached parts and they can be removed. +The total number of parts detached from MergeTree tables by users with the `ALTER TABLE DETACH` query (as opposed to unexpected, broken or ignored parts). The server does not care about detached parts, and they can be removed. ### NumberOfDetachedParts -The total number of parts detached from MergeTree tables. A part can be detached by a user with the `ALTER TABLE DETACH` query or by the server itself it the part is broken, unexpected or unneeded. The server does not care about detached parts and they can be removed. +The total number of parts detached from MergeTree tables. A part can be detached by a user with the `ALTER TABLE DETACH` query or by the server itself it the part is broken, unexpected or unneeded. The server does not care about detached parts, and they can be removed. ### NumberOfTables @@ -393,7 +393,7 @@ The amount of free memory plus OS page cache memory on the host system, in bytes ### OSMemoryFreeWithoutCached -The amount of free memory on the host system, in bytes. This does not include the memory used by the OS page cache memory, in bytes. The page cache memory is also available for usage by programs, so the value of this metric can be confusing. See the `OSMemoryAvailable` metric instead. For convenience we also provide the `OSMemoryFreePlusCached` metric, that should be somewhat similar to OSMemoryAvailable. See also https://www.linuxatemyram.com/. 
This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server. +The amount of free memory on the host system, in bytes. This does not include the memory used by the OS page cache memory, in bytes. The page cache memory is also available for usage by programs, so the value of this metric can be confusing. See the `OSMemoryAvailable` metric instead. For convenience, we also provide the `OSMemoryFreePlusCached` metric, that should be somewhat similar to OSMemoryAvailable. See also https://www.linuxatemyram.com/. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server. ### OSMemoryTotal @@ -493,7 +493,7 @@ Number of threads in the server of the PostgreSQL compatibility protocol. ### QueryCacheBytes -Total size of the query cache cache in bytes. +Total size of the query cache in bytes. ### QueryCacheEntries @@ -549,7 +549,7 @@ Total amount of bytes (compressed, including data and indices) stored in all tab ### TotalPartsOfMergeTreeTables -Total amount of data parts in all tables of MergeTree family. Numbers larger than 10 000 will negatively affect the server startup time and it may indicate unreasonable choice of the partition key. +Total amount of data parts in all tables of MergeTree family. Numbers larger than 10 000 will negatively affect the server startup time, and it may indicate unreasonable choice of the partition key. ### TotalPrimaryKeyBytesInMemory diff --git a/docs/en/operations/system-tables/clusters.md b/docs/en/operations/system-tables/clusters.md index 63cc083e4bc..7a9f1438b87 100644 --- a/docs/en/operations/system-tables/clusters.md +++ b/docs/en/operations/system-tables/clusters.md @@ -19,7 +19,7 @@ Columns: - `default_database` ([String](../../sql-reference/data-types/string.md)) — The default database name. - `errors_count` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of times this host failed to reach replica. - `slowdowns_count` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of slowdowns that led to changing replica when establishing a connection with hedged requests. -- `estimated_recovery_time` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Seconds remaining until the replica error count is zeroed and it is considered to be back to normal. +- `estimated_recovery_time` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Seconds remaining until the replica error count is zeroed, and it is considered to be back to normal. - `database_shard_name` ([String](../../sql-reference/data-types/string.md)) — The name of the `Replicated` database shard (for clusters that belong to a `Replicated` database). - `database_replica_name` ([String](../../sql-reference/data-types/string.md)) — The name of the `Replicated` database replica (for clusters that belong to a `Replicated` database). - `is_active` ([Nullable(UInt8)](../../sql-reference/data-types/int-uint.md)) — The status of the `Replicated` database replica (for clusters that belong to a `Replicated` database): 1 means "replica is online", 0 means "replica is offline", `NULL` means "unknown". 
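These columns can be queried directly to check replica health; a small sketch (the cluster name `default` is only an example):

```sql
-- Error counters and liveness per replica of one cluster.
SELECT cluster, shard_num, replica_num, host_name,
       errors_count, estimated_recovery_time, is_active
FROM system.clusters
WHERE cluster = 'default'
ORDER BY shard_num, replica_num;
```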
diff --git a/docs/en/operations/system-tables/dictionaries.md b/docs/en/operations/system-tables/dictionaries.md index 8632581144c..c4cf7ba8bfb 100644 --- a/docs/en/operations/system-tables/dictionaries.md +++ b/docs/en/operations/system-tables/dictionaries.md @@ -18,7 +18,7 @@ Columns: - `LOADED_AND_RELOADING` — Dictionary is loaded successfully, and is being reloaded right now (frequent reasons: [SYSTEM RELOAD DICTIONARY](../../sql-reference/statements/system.md#query_language-system-reload-dictionary) query, timeout, dictionary config has changed). - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. - `origin` ([String](../../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. -- `type` ([String](../../sql-reference/data-types/string.md)) — Type of a dictionary allocation. [Storing Dictionaries in Memory](../../sql-reference/dictionaries/index.md#storig-dictionaries-in-memory). +- `type` ([String](../../sql-reference/data-types/string.md)) — Type of dictionary allocation. [Storing Dictionaries in Memory](../../sql-reference/dictionaries/index.md#storig-dictionaries-in-memory). - `key.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [key names](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-key) provided by the dictionary. - `key.types` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Corresponding array of [key types](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-key) provided by the dictionary. - `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [attribute names](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-attributes) provided by the dictionary. diff --git a/docs/en/operations/system-tables/metrics.md b/docs/en/operations/system-tables/metrics.md index 3dec6345eb6..898e6ae2e2c 100644 --- a/docs/en/operations/system-tables/metrics.md +++ b/docs/en/operations/system-tables/metrics.md @@ -287,7 +287,7 @@ Number of threads in the HashedDictionary thread pool running a task. ### IOPrefetchThreads -Number of threads in the IO prefertch thread pool. +Number of threads in the IO prefetch thread pool. ### IOPrefetchThreadsActive diff --git a/docs/en/operations/system-tables/quota_usage.md b/docs/en/operations/system-tables/quota_usage.md index 0dca7c525f2..3d4b8f62d2d 100644 --- a/docs/en/operations/system-tables/quota_usage.md +++ b/docs/en/operations/system-tables/quota_usage.md @@ -25,6 +25,8 @@ Columns: - `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of rows read from all tables and table functions participated in queries. - `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of bytes read from all tables and table functions participated in queries. - `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum of bytes read from all tables and table functions. 
+- `failed_sequential_authentications` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/float.md))) — The total count of sequential authentication failures. If the user entered the correct password before exceed `failed_sequential_authentications` threshold then the counter will be reset. +- `max_failed_sequential_authentications` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/float.md))) — Maximum count of sequential authentication failures. - `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — The total query execution time, in seconds (wall time). - `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Maximum of query execution time. diff --git a/docs/en/operations/system-tables/quotas_usage.md b/docs/en/operations/system-tables/quotas_usage.md index a04018ac2c8..960903fa25f 100644 --- a/docs/en/operations/system-tables/quotas_usage.md +++ b/docs/en/operations/system-tables/quotas_usage.md @@ -28,8 +28,10 @@ Columns: - `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of rows read from all tables and table functions participated in queries. - `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of bytes read from all tables and table functions participated in queries. - `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum of bytes read from all tables and table functions. -- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — The total query execution time, in seconds (wall time). -- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Maximum of query execution time. +- `failed_sequential_authentications` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — The total count of sequential authentication failures. If the user entered the correct password before exceed `failed_sequential_authentications` threshold then the counter will be reset. +- `max_failed_sequential_authentications` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Maximum count of sequential authentication failures. +- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/float.md))) — The total query execution time, in seconds (wall time). +- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/float.md))) — Maximum of query execution time. ## See Also {#see-also} diff --git a/docs/en/operations/system-tables/tables.md b/docs/en/operations/system-tables/tables.md index 8049ab091c0..2132f69319e 100644 --- a/docs/en/operations/system-tables/tables.md +++ b/docs/en/operations/system-tables/tables.md @@ -27,6 +27,8 @@ Columns: - `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - Time of latest modification of the table metadata. 
+- `metadata_version` ([Int32](../../sql-reference/data-types/int-uint.md)) - Metadata version for ReplicatedMergeTree table, 0 for non ReplicatedMergeTree table. + - `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Database dependencies. - `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Table dependencies ([materialized views](../../sql-reference/statements/create/view.md#materialized-view) the current table). diff --git a/docs/en/operations/utilities/clickhouse-benchmark.md b/docs/en/operations/utilities/clickhouse-benchmark.md index 8b7d7f85552..6d5148ad965 100644 --- a/docs/en/operations/utilities/clickhouse-benchmark.md +++ b/docs/en/operations/utilities/clickhouse-benchmark.md @@ -45,11 +45,11 @@ clickhouse-benchmark [keys] < queries_file; - `-c N`, `--concurrency=N` — Number of queries that `clickhouse-benchmark` sends simultaneously. Default value: 1. - `-d N`, `--delay=N` — Interval in seconds between intermediate reports (to disable reports set 0). Default value: 1. - `-h HOST`, `--host=HOST` — Server host. Default value: `localhost`. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-h` keys. -- `-p N`, `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-p` keys. - `-i N`, `--iterations=N` — Total number of queries. Default value: 0 (repeat forever). - `-r`, `--randomize` — Random order of queries execution if there is more than one input query. - `-s`, `--secure` — Using `TLS` connection. - `-t N`, `--timelimit=N` — Time limit in seconds. `clickhouse-benchmark` stops sending queries when the specified time limit is reached. Default value: 0 (time limit disabled). +- `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `--port` keys. - `--confidence=N` — Level of confidence for T-test. Possible values: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Default value: 5. In the [comparison mode](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` performs the [Independent two-sample Student’s t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) to determine whether the two distributions aren’t different with the selected level of confidence. - `--cumulative` — Printing cumulative data instead of data per interval. - `--database=DATABASE_NAME` — ClickHouse database name. Default value: `default`. diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index c863282efc1..437a5f0fff0 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -34,7 +34,7 @@ The binary you just downloaded can run all sorts of ClickHouse tools and utiliti A common use of `clickhouse-local` is to run ad-hoc queries on files: where you don't have to insert the data into a table. `clickhouse-local` can stream the data from a file into a temporary table and execute your SQL. -If the file is sitting on the same machine as `clickhouse-local`, you can simple specify the file to load. The following `reviews.tsv` file contains a sampling of Amazon product reviews: +If the file is sitting on the same machine as `clickhouse-local`, you can simply specify the file to load. 
The following `reviews.tsv` file contains a sampling of Amazon product reviews: ```bash ./clickhouse local -q "SELECT * FROM 'reviews.tsv'" @@ -220,7 +220,7 @@ Arguments: - `--help` — arguments references for `clickhouse-local`. - `-V`, `--version` — print version information and exit. -Also there are arguments for each ClickHouse configuration variable which are more commonly used instead of `--config-file`. +Also, there are arguments for each ClickHouse configuration variable which are more commonly used instead of `--config-file`. ## Examples {#examples} diff --git a/docs/en/operations/utilities/clickhouse-obfuscator.md b/docs/en/operations/utilities/clickhouse-obfuscator.md index ad51e9c7776..f9a94713be7 100644 --- a/docs/en/operations/utilities/clickhouse-obfuscator.md +++ b/docs/en/operations/utilities/clickhouse-obfuscator.md @@ -38,7 +38,7 @@ For example, you have a column `IsMobile` in your table with values 0 and 1. In So, the user will be able to count the exact ratio of mobile traffic. -Let's give another example. When you have some private data in your table, like user email and you don't want to publish any single email address. +Let's give another example. When you have some private data in your table, like user email, and you don't want to publish any single email address. If your table is large enough and contains multiple different emails and no email has a very high frequency than all others, it will anonymize all data. But if you have a small number of different values in a column, it can reproduce some of them. You should look at the working algorithm of this tool works, and fine-tune its command line parameters. diff --git a/docs/en/sql-reference/aggregate-functions/reference/any.md b/docs/en/sql-reference/aggregate-functions/reference/any.md index a45eb1b409f..4631060f33f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/any.md +++ b/docs/en/sql-reference/aggregate-functions/reference/any.md @@ -9,7 +9,7 @@ Selects the first encountered value of a column. By default, it ignores NULL values and returns the first NOT NULL value found in the column. As [`first_value`](../../../sql-reference/aggregate-functions/reference/first_value.md) if supports `RESPECT NULLS`, in which case it will select the first value passed, independently on whether it's NULL or not. -The return type of the function is the same as the input, except for LowCardinality which is discarded). This means that given no rows as input it will return the default value of that type (0 for integers, or Null for a Nullable() column). You might use the `-OrNull` [combinator](../../../sql-reference/aggregate-functions/combinators.md) ) to modify this behaviour. +The return type of the function is the same as the input, except for LowCardinality which is discarded. This means that given no rows as input it will return the default value of that type (0 for integers, or Null for a Nullable() column). You might use the `-OrNull` [combinator](../../../sql-reference/aggregate-functions/combinators.md) ) to modify this behaviour. The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate. To get a determinate result, you can use the ‘min’ or ‘max’ function instead of ‘any’. 
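A short sketch of the default-value behaviour described above: over an empty input, `any` returns the default value of the column type, while the `-OrNull` combinator returns `NULL`.

```sql
-- The filter matches no rows, so any() falls back to the default UInt64 value (0),
-- while anyOrNull() returns NULL instead.
SELECT any(number) AS any_default, anyOrNull(number) AS any_or_null
FROM numbers(10)
WHERE number > 100;
```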
diff --git a/docs/en/sql-reference/aggregate-functions/reference/contingency.md b/docs/en/sql-reference/aggregate-functions/reference/contingency.md index 1b53ca1528f..902c1f4af80 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/contingency.md +++ b/docs/en/sql-reference/aggregate-functions/reference/contingency.md @@ -20,7 +20,7 @@ contingency(column1, column2) **Returned value** -- a value between 0 to 1. The larger the result, the closer the association of the two columns. +- a value between 0 and 1. The larger the result, the closer the association of the two columns. **Return type** is always [Float64](../../../sql-reference/data-types/float.md). @@ -48,4 +48,4 @@ Result: ┌──────cramersV(a, b)─┬───contingency(a, b)─┐ │ 0.41171788506213564 │ 0.05812725261759165 │ └─────────────────────┴─────────────────────┘ -``` \ No newline at end of file +``` diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md b/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md new file mode 100644 index 00000000000..cc601c097fe --- /dev/null +++ b/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md @@ -0,0 +1,48 @@ + --- + toc_priority: 112 + --- + + # groupArraySorted {#groupArraySorted} + + Returns an array with the first N items in ascending order. + + ``` sql + groupArraySorted(N)(column) + ``` + + **Arguments** + + - `N` – The number of elements to return. + + If the parameter is omitted, default value is the size of input. + + - `column` – The value (Integer, String, Float and other Generic types). + + **Example** + + Gets the first 10 numbers: + + ``` sql + SELECT groupArraySorted(10)(number) FROM numbers(100) + ``` + + ``` text + ┌─groupArraySorted(10)(number)─┐ + │ [0,1,2,3,4,5,6,7,8,9] │ + └──────────────────────────────┘ + ``` + + + Gets all the String implementations of all numbers in column: + + ``` sql +SELECT groupArraySorted(str) FROM (SELECT toString(number) as str FROM numbers(5)); + + ``` + + ``` text + ┌─groupArraySorted(str)────────┐ + │ ['0','1','2','3','4'] │ + └──────────────────────────────┘ + ``` + \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/index.md b/docs/en/sql-reference/aggregate-functions/reference/index.md index 10bd3e11064..93d4282c32b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/index.md +++ b/docs/en/sql-reference/aggregate-functions/reference/index.md @@ -54,6 +54,7 @@ ClickHouse-specific aggregate functions: - [groupArrayMovingAvg](/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md) - [groupArrayMovingSum](/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md) - [groupArraySample](./grouparraysample.md) +- [groupArraySorted](/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md) - [groupBitAnd](/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md) - [groupBitOr](/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md) - [groupBitXor](/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md) @@ -88,7 +89,7 @@ ClickHouse-specific aggregate functions: - [quantileTDigestWeighted](/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md) - [quantileBFloat16](/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16) - [quantileBFloat16Weighted](/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16weighted) -- 
[quantileDDSketch](/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md#quantileddsketch) +- [quantileDD](/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md#quantileddsketch) - [simpleLinearRegression](/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md) - [stochasticLinearRegression](/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md) - [stochasticLogisticRegression](/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md) @@ -105,4 +106,3 @@ ClickHouse-specific aggregate functions: - [sparkBar](./sparkbar.md) - [sumCount](./sumcount.md) - [largestTriangleThreeBuckets](./largestTriangleThreeBuckets.md) - diff --git a/docs/en/sql-reference/aggregate-functions/reference/median.md b/docs/en/sql-reference/aggregate-functions/reference/median.md index 7467a47cf5f..2a166c83dad 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/median.md +++ b/docs/en/sql-reference/aggregate-functions/reference/median.md @@ -18,7 +18,7 @@ Functions: - `medianTDigest` — Alias for [quantileTDigest](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md#quantiletdigest). - `medianTDigestWeighted` — Alias for [quantileTDigestWeighted](../../../sql-reference/aggregate-functions/reference/quantiletdigestweighted.md#quantiletdigestweighted). - `medianBFloat16` — Alias for [quantileBFloat16](../../../sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16). -- `medianDDSketch` — Alias for [quantileDDSketch](../../../sql-reference/aggregate-functions/reference/quantileddsketch.md#quantileddsketch). +- `medianDD` — Alias for [quantileDD](../../../sql-reference/aggregate-functions/reference/quantileddsketch.md#quantileddsketch). **Example** diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md b/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md index 9cb73dfc9d8..f9acd2e20cb 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md @@ -1,10 +1,10 @@ --- slug: /en/sql-reference/aggregate-functions/reference/quantileddsketch sidebar_position: 211 -title: quantileDDSketch +title: quantileDD --- -Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a sample with relative-error guarantees. It works by building a [DDSketch](https://www.vldb.org/pvldb/vol12/p2195-masson.pdf). +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a sample with relative-error guarantees. It works by building a [DD](https://www.vldb.org/pvldb/vol12/p2195-masson.pdf). 
**Syntax** @@ -44,13 +44,13 @@ Input table has an integer and a float columns: Query to calculate 0.75-quantile (third quartile): ``` sql -SELECT quantileDDSketch(0.01, 0.75)(a), quantileDDSketch(0.01, 0.75)(b) FROM example_table; +SELECT quantileDD(0.01, 0.75)(a), quantileDD(0.01, 0.75)(b) FROM example_table; ``` Result: ``` text -┌─quantileDDSketch(0.01, 0.75)(a)─┬─quantileDDSketch(0.01, 0.75)(b)─┐ +┌─quantileDD(0.01, 0.75)(a)─┬─quantileDD(0.01, 0.75)(b)─┐ │ 2.974233423476717 │ 1.01 │ └─────────────────────────────────┴─────────────────────────────────┘ ``` diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md index e5da6a9c1de..e2a5bc53e32 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md @@ -9,7 +9,7 @@ sidebar_position: 201 Syntax: `quantiles(level1, level2, …)(x)` -All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`, `quantilesDDSketch`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. +All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`, `quantilesDD`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. ## quantilesExactExclusive diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index 8c7fa17ae92..504d0e2b0a6 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -9,7 +9,7 @@ sidebar_label: DateTime64 Allows to store an instant in time, that can be expressed as a calendar date and a time of a day, with defined sub-second precision Tick size (precision): 10-precision seconds. Valid range: [ 0 : 9 ]. -Typically are used - 3 (milliseconds), 6 (microseconds), 9 (nanoseconds). +Typically, are used - 3 (milliseconds), 6 (microseconds), 9 (nanoseconds). **Syntax:** diff --git a/docs/en/sql-reference/data-types/decimal.md b/docs/en/sql-reference/data-types/decimal.md index e082eb29fbd..2b32e72a28f 100644 --- a/docs/en/sql-reference/data-types/decimal.md +++ b/docs/en/sql-reference/data-types/decimal.md @@ -10,7 +10,7 @@ Signed fixed-point numbers that keep precision during add, subtract and multiply ## Parameters -- P - precision. Valid range: \[ 1 : 76 \]. Determines how many decimal digits number can have (including fraction). By default the precision is 10. +- P - precision. Valid range: \[ 1 : 76 \]. Determines how many decimal digits number can have (including fraction). By default, the precision is 10. - S - scale. Valid range: \[ 0 : P \]. Determines how many decimal digits fraction can have. Decimal(P) is equivalent to Decimal(P, 0). Similarly, the syntax Decimal is equivalent to Decimal(10, 0). 
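To make the `P`/`S` parameters above concrete, here is a minimal illustrative query; it is not taken from the patch, and the exact rendering of the type names may differ between versions:

``` sql
-- Bare Decimal resolves to Decimal(10, 0); Decimal(P) fixes the scale at 0.
SELECT
    toTypeName(CAST(42.42, 'Decimal'))       AS bare_decimal,  -- expected: Decimal(10, 0)
    toTypeName(CAST(42.42, 'Decimal(6)'))    AS p_only,        -- expected: Decimal(6, 0)
    toTypeName(CAST(42.42, 'Decimal(6, 3)')) AS p_and_s;       -- expected: Decimal(6, 3)
```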
diff --git a/docs/en/sql-reference/data-types/json.md b/docs/en/sql-reference/data-types/json.md index f727f0d75f7..fd548a0d5a2 100644 --- a/docs/en/sql-reference/data-types/json.md +++ b/docs/en/sql-reference/data-types/json.md @@ -7,7 +7,7 @@ sidebar_label: JSON # JSON :::note -This feature is experimental and is not production ready. If you need to work with JSON documents, consider using [this guide](/docs/en/integrations/data-ingestion/data-formats/json.md) instead. +This feature is experimental and is not production-ready. If you need to work with JSON documents, consider using [this guide](/docs/en/integrations/data-ingestion/data-formats/json.md) instead. ::: Stores JavaScript Object Notation (JSON) documents in a single column. @@ -15,7 +15,8 @@ Stores JavaScript Object Notation (JSON) documents in a single column. `JSON` is an alias for `Object('json')`. :::note -The JSON data type is an experimental feature. To use it, set `allow_experimental_object_type = 1`. +The JSON data type is an obsolete feature. Do not use it. +If you want to use it, set `allow_experimental_object_type = 1`. ::: ## Example diff --git a/docs/en/sql-reference/data-types/variant.md b/docs/en/sql-reference/data-types/variant.md new file mode 100644 index 00000000000..f027e3fe343 --- /dev/null +++ b/docs/en/sql-reference/data-types/variant.md @@ -0,0 +1,274 @@ +--- +slug: /en/sql-reference/data-types/variant +sidebar_position: 55 +sidebar_label: Variant +--- + +# Variant(T1, T2, T3, ...) + +This type represents a union of other data types. Type `Variant(T1, T2, ..., TN)` means that each row of this type +has a value of either type `T1` or `T2` or ... or `TN` or none of them (`NULL` value). + +The order of nested types doesn't matter: Variant(T1, T2) = Variant(T2, T1). +Nested types can be arbitrary types except Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types. + +:::note +The Variant data type is an experimental feature. To use it, set `allow_experimental_variant_type = 1`. +::: + +## Creating Variant + +Using `Variant` type in table column definition: + +```sql +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT v FROM test; +``` + +```text +┌─v─────────────┐ +│ ᴺᵁᴸᴸ │ +│ 42 │ +│ Hello, World! │ +│ [1,2,3] │ +└───────────────┘ +``` + +Using CAST from ordinary columns: + +```sql +SELECT toTypeName(variant) as type_name, 'Hello, World!'::Variant(UInt64, String, Array(UInt64)) as variant; +``` + +```text +┌─type_name──────────────────────────────┬─variant───────┐ +│ Variant(Array(UInt64), String, UInt64) │ Hello, World! │ +└────────────────────────────────────────┴───────────────┘ +``` + +Using functions `if/multiIf` when arguments don't have common type (setting `use_variant_as_common_type` should be enabled for it): + +```sql +SET use_variant_as_common_type = 1; +SELECT if(number % 2, number, range(number)) as variant FROM numbers(5); +``` + +```text +┌─variant───┐ +│ [] │ +│ 1 │ +│ [0,1] │ +│ 3 │ +│ [0,1,2,3] │ +└───────────┘ +``` + +```sql +SET use_variant_as_common_type = 1; +SELECT multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL) AS variant FROM numbers(4); +``` + +```text +┌─variant───────┐ +│ 42 │ +│ [1,2,3] │ +│ Hello, World! 
│ +│ ᴺᵁᴸᴸ │ +└───────────────┘ +``` + +Using functions 'array/map' if array elements/map values don't have common type (setting `use_variant_as_common_type` should be enabled for it): + +```sql +SET use_variant_as_common_type = 1; +SELECT array(range(number), number, 'str_' || toString(number)) as array_of_variants FROM numbers(3); +``` + +```text +┌─array_of_variants─┐ +│ [[],0,'str_0'] │ +│ [[0],1,'str_1'] │ +│ [[0,1],2,'str_2'] │ +└───────────────────┘ +``` + +```sql +SET use_variant_as_common_type = 1; +SELECT map('a', range(number), 'b', number, 'c', 'str_' || toString(number)) as map_of_variants FROM numbers(3); +``` + +```text +┌─map_of_variants───────────────┐ +│ {'a':[],'b':0,'c':'str_0'} │ +│ {'a':[0],'b':1,'c':'str_1'} │ +│ {'a':[0,1],'b':2,'c':'str_2'} │ +└───────────────────────────────┘ +``` + +## Reading Variant nested types as subcolumns + +Variant type supports reading a single nested type from a Variant column using the type name as a subcolumn. +So, if you have column `variant Variant(T1, T2, T3)` you can read a subcolumn of type `T2` using syntax `variant.T2`, +this subcolumn will have type `Nullable(T2)` if `T2` can be inside `Nullable` and `T2` otherwise. This subcolumn will +be the same size as original `Variant` column and will contain `NULL` values (or empty values if `T2` cannot be inside `Nullable`) +in all rows in which original `Variant` column doesn't have type `T2`. + +Variant subcolumns can be also read using function `variantElement(variant_column, type_name)`. + +Examples: + +```sql +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT v, v.String, v.UInt64, v.`Array(UInt64)` FROM test; +``` + +```text +┌─v─────────────┬─v.String──────┬─v.UInt64─┬─v.Array(UInt64)─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │ +│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└───────────────┴───────────────┴──────────┴─────────────────┘ +``` + +```sql +SELECT toTypeName(v.String), toTypeName(v.UInt64), toTypeName(v.`Array(UInt64)`) FROM test LIMIT 1; +``` + +```text +┌─toTypeName(v.String)─┬─toTypeName(v.UInt64)─┬─toTypeName(v.Array(UInt64))─┐ +│ Nullable(String) │ Nullable(UInt64) │ Array(UInt64) │ +└──────────────────────┴──────────────────────┴─────────────────────────────┘ +``` + +```sql +SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test; +``` + +```text +┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │ +│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘ +``` + +To know what variant is stored in each row function `variantType(variant_column)` can be used. It returns `Enum` with variant type name for each row (or `'None'` if row is `NULL`). 
+ +Example: + +```sql +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT variantType(v) from test; +``` + +```text +┌─variantType(v)─┐ +│ None │ +│ UInt64 │ +│ String │ +│ Array(UInt64) │ +└────────────────┘ +``` + +```sql +SELECT toTypeName(variantType(v)) FROM test LIMIT 1; +``` + +```text +┌─toTypeName(variantType(v))──────────────────────────────────────────┐ +│ Enum8('None' = -1, 'Array(UInt64)' = 0, 'String' = 1, 'UInt64' = 2) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +## Conversion between Variant column and other columns + +There are 3 possible conversions that can be performed with Variant column. + +### Converting an ordinary column to a Variant column + +It is possible to convert ordinary column with type `T` to a `Variant` column containing this type: + +```sql +SELECT toTypeName(variant) as type_name, 'Hello, World!'::Variant(UInt64, String, Array(UInt64)) as variant; +``` + +```text +┌─type_name──────────────────────────────┬─variant───────┐ +│ Variant(Array(UInt64), String, UInt64) │ Hello, World! │ +└────────────────────────────────────────┴───────────────┘ +``` + +### Converting a Variant column to an ordinary column + +It is possible to convert a `Variant` column to an ordinary column. In this case all nested variants will be converted to a destination type: + +```sql +CREATE TABLE test (v Variant(UInt64, String)) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('42.42'); +SELECT v::Nullable(Float64) FROM test; +``` + +```text +┌─CAST(v, 'Nullable(Float64)')─┐ +│ ᴺᵁᴸᴸ │ +│ 42 │ +│ 42.42 │ +└──────────────────────────────┘ +``` + +### Converting a Variant to another Variant + +It is possible to convert a `Variant` column to another `Variant` column, but only if the destination `Variant` column contains all nested types from the original `Variant`: + +```sql +CREATE TABLE test (v Variant(UInt64, String)) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('String'); +SELECT v::Variant(UInt64, String, Array(UInt64)) FROM test; +``` + +```text +┌─CAST(v, 'Variant(UInt64, String, Array(UInt64))')─┐ +│ ᴺᵁᴸᴸ │ +│ 42 │ +│ String │ +└───────────────────────────────────────────────────┘ +``` + + +## Reading Variant type from the data + +All text formats (TSV, CSV, CustomSeparated, Values, JSONEachRow, etc) supports reading `Variant` type. During data parsing ClickHouse tries to insert value into most appropriate variant type. + +Example: + +```sql +SELECT + v, + variantElement(v, 'String') AS str, + variantElement(v, 'UInt64') AS num, + variantElement(v, 'Float64') AS float, + variantElement(v, 'DateTime') AS date, + variantElement(v, 'Array(UInt64)') AS arr +FROM format(JSONEachRow, 'v Variant(String, UInt64, Float64, DateTime, Array(UInt64))', $$ +{"v" : "Hello, World!"}, +{"v" : 42}, +{"v" : 42.42}, +{"v" : "2020-01-01 00:00:00"}, +{"v" : [1, 2, 3]} +$$) +``` + +```text +┌─v───────────────────┬─str───────────┬──num─┬─float─┬────────────────date─┬─arr─────┐ +│ Hello, World! │ Hello, World! 
│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ ᴺᵁᴸᴸ │ 42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42.42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 42.42 │ ᴺᵁᴸᴸ │ [] │ +│ 2020-01-01 00:00:00 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-01 00:00:00 │ [] │ +│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└─────────────────────┴───────────────┴──────┴───────┴─────────────────────┴─────────┘ +``` diff --git a/docs/en/sql-reference/distributed-ddl.md b/docs/en/sql-reference/distributed-ddl.md index d170f3765c2..7952792cbf4 100644 --- a/docs/en/sql-reference/distributed-ddl.md +++ b/docs/en/sql-reference/distributed-ddl.md @@ -6,7 +6,7 @@ sidebar_label: Distributed DDL # Distributed DDL Queries (ON CLUSTER Clause) -By default the `CREATE`, `DROP`, `ALTER`, and `RENAME` queries affect only the current server where they are executed. In a cluster setup, it is possible to run such queries in a distributed manner with the `ON CLUSTER` clause. +By default, the `CREATE`, `DROP`, `ALTER`, and `RENAME` queries affect only the current server where they are executed. In a cluster setup, it is possible to run such queries in a distributed manner with the `ON CLUSTER` clause. For example, the following query creates the `all_hits` `Distributed` table on each host in `cluster`: diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index 9b66d00656b..379be302881 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -372,7 +372,7 @@ Result: ## bitmapAnd -Computes the logical conjunction of two two bitmaps. +Computes the logical conjunction of two bitmaps. **Syntax** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 5622097537e..c5b3b4cc3ae 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1564,7 +1564,7 @@ Alias: `TO_DAYS` **Arguments** - `date` — The date to calculate the number of days passed since year zero from. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). -- `time_zone` — A String type const value or a expression represent the time zone. [String types](../../sql-reference/data-types/string.md) +- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md) **Returned value** @@ -2218,7 +2218,7 @@ now64([scale], [timezone]) **Arguments** -- `scale` - Tick size (precision): 10-precision seconds. Valid range: [ 0 : 9 ]. Typically are used - 3 (default) (milliseconds), 6 (microseconds), 9 (nanoseconds). +- `scale` - Tick size (precision): 10-precision seconds. Valid range: [ 0 : 9 ]. Typically, are used - 3 (default) (milliseconds), 6 (microseconds), 9 (nanoseconds). - `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../../sql-reference/data-types/string.md). **Returned value** @@ -2305,7 +2305,7 @@ Rounds the time to the half hour. Converts a date or date with time to a UInt32 number containing the year and month number (YYYY \* 100 + MM). Accepts a second optional timezone argument. If provided, the timezone must be a string constant. -This functions is the opposite of function `YYYYMMDDToDate()`. 
+This function is the opposite of function `YYYYMMDDToDate()`. **Example** @@ -2362,7 +2362,7 @@ Result: Converts a number containing the year, month and day number to a [Date](../../sql-reference/data-types/date.md). -This functions is the opposite of function `toYYYYMMDD()`. +This function is the opposite of function `toYYYYMMDD()`. The output is undefined if the input does not encode a valid Date value. @@ -2406,7 +2406,7 @@ Converts a number containing the year, month, day, hours, minute and second numb The output is undefined if the input does not encode a valid DateTime value. -This functions is the opposite of function `toYYYYMMDDhhmmss()`. +This function is the opposite of function `toYYYYMMDDhhmmss()`. **Syntax** @@ -2981,8 +2981,8 @@ toUTCTimestamp(time_val, time_zone) **Arguments** -- `time_val` — A DateTime/DateTime64 type const value or a expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md) -- `time_zone` — A String type const value or a expression represent the time zone. [String types](../../sql-reference/data-types/string.md) +- `time_val` — A DateTime/DateTime64 type const value or an expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md) +- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md) **Returned value** @@ -3014,8 +3014,8 @@ fromUTCTimestamp(time_val, time_zone) **Arguments** -- `time_val` — A DateTime/DateTime64 type const value or a expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md) -- `time_zone` — A String type const value or a expression represent the time zone. [String types](../../sql-reference/data-types/string.md) +- `time_val` — A DateTime/DateTime64 type const value or an expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md) +- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md) **Returned value** diff --git a/docs/en/sql-reference/functions/distance-functions.md b/docs/en/sql-reference/functions/distance-functions.md index 1774c22014d..e20c35c6b6f 100644 --- a/docs/en/sql-reference/functions/distance-functions.md +++ b/docs/en/sql-reference/functions/distance-functions.md @@ -509,7 +509,7 @@ Result: ## cosineDistance -Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The less the returned value is, the more similar are the vectors. +Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The smaller the returned value is, the more similar are the vectors. **Syntax** diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 35f9c7af2ce..d05e7bbfe51 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2832,6 +2832,88 @@ Result: └─────────────────────────────────────────────────────────────────────────┘ ``` +## variantElement + +Extracts a column with specified type from a `Variant` column. + +**Syntax** + +```sql +variantElement(variant, type_name, [, default_value]) +``` + +**Arguments** + +- `variant` — Variant column. [Variant](../../sql-reference/data-types/variant.md). +- `type_name` — The name of the variant type to extract. [String](../../sql-reference/data-types/string.md). 
+- `default_value` - The default value that will be used if variant doesn't have variant with specified type. Can be any type. Optional. + +**Returned value** + +- Subcolumn of a `Variant` column with specified type. + +**Example** + +```sql +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test; +``` + +```text +┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │ +│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘ +``` + +## variantType + +Returns the variant type name for each row of `Variant` column. If row contains NULL, it returns `'None'` for it. + +**Syntax** + +```sql +variantType(variant) +``` + +**Arguments** + +- `variant` — Variant column. [Variant](../../sql-reference/data-types/variant.md). + +**Returned value** + +- Enum8 column with variant type name for each row. + +**Example** + +```sql +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT variantType(v) FROM test; +``` + +```text +┌─variantType(v)─┐ +│ None │ +│ UInt64 │ +│ String │ +│ Array(UInt64) │ +└────────────────┘ +``` + +```sql +SELECT toTypeName(variantType(v)) FROM test LIMIT 1; +``` + +```text +┌─toTypeName(variantType(v))──────────────────────────────────────────┐ +│ Enum8('None' = -1, 'Array(UInt64)' = 0, 'String' = 1, 'UInt64' = 2) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + ## minSampleSizeConversion Calculates minimum required sample size for an A/B test comparing conversions (proportions) in two samples. diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index a2f1b0d7752..9ae403be524 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -4,6 +4,8 @@ sidebar_position: 170 sidebar_label: Strings --- +import VersionBadge from '@theme/badges/VersionBadge'; + # Functions for Working with Strings Functions for [searching](string-search-functions.md) in strings and for [replacing](string-replace-functions.md) in strings are described separately. @@ -515,7 +517,7 @@ Alias: `concat_ws` **Arguments** - sep — separator. Const [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). -- exprN — expression to be concatenated. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- exprN — expression to be concatenated. Arguments which are not of types [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md) are converted to strings using their default serialization. As this decreases performance, it is not recommended to use non-String/FixedString arguments. 
**Returned values** @@ -783,6 +785,8 @@ SELECT startsWith('Spider-Man', 'Spi'); ## startsWithUTF8 + + Returns whether string `str` starts with `prefix`, the difference between `startsWithUTF8` and `startsWith` is that `startsWithUTF8` match `str` and `suffix` by UTF-8 characters. diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index d5dbca3f2b7..22f879c62ae 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -590,6 +590,10 @@ Result: └───────────────────────────────┘ ``` +## countMatchesCaseInsensitive + +Like `countMatches(haystack, pattern)` but matching ignores the case. + ## regexpExtract Extracts the first string in haystack that matches the regexp pattern and corresponds to the regex group index. diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 144d832b36a..ce36c89f473 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -6,11 +6,67 @@ sidebar_label: Time Series # Time Series Functions -Below functions are used for time series analysis. +The functions below are used for series data analysis. + +## seriesOutliersDetectTukey + +Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). + +**Syntax** + +``` sql +seriesOutliersDetectTukey(series); +seriesOutliersDetectTukey(series, min_percentile, max_percentile, K); +``` + +**Arguments** + +- `series` - An array of numeric values. +- `min_percentile` - The minimum percentile used to calculate the interquartile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in the range [2, 98]. The default is 25. +- `max_percentile` - The maximum percentile used to calculate the interquartile range (IQR). The value must be in the range [2, 98]. The default is 75. +- `K` - A non-negative constant used to detect mild or stronger outliers. The default value is 1.5. + +At least four data points are required in `series` to detect outliers. + +**Returned value** + +- Returns an array of the same length as the input array where each value represents the anomaly score of the corresponding element in the series. A non-zero score indicates a possible anomaly. + +Type: [Array](../../sql-reference/data-types/array.md). 
+ +**Examples** + +Query: + +``` sql +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0; +``` + +Result: + +``` text +┌───────────print_0─────────────────┐ +│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │ +└───────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0; +``` + +Result: + +``` text +┌─print_0──────────────────────────────┐ +│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │ +└──────────────────────────────────────┘ +``` ## seriesPeriodDetectFFT -Finds the period of the given time series data using FFT +Finds the period of the given series data using FFT FFT - [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) **Syntax** @@ -25,7 +81,7 @@ seriesPeriodDetectFFT(series); **Returned value** -- A real value equal to the period of time series +- A real value equal to the period of series data - Returns NAN when number of data points are less than four. Type: [Float64](../../sql-reference/data-types/float.md). @@ -60,7 +116,7 @@ Result: ## seriesDecomposeSTL -Decomposes a time series using STL [(Seasonal-Trend Decomposition Procedure Based on Loess)](https://www.wessa.net/download/stl.pdf) into a season, a trend and a residual component. +Decomposes series data using STL [(Seasonal-Trend Decomposition Procedure Based on Loess)](https://www.wessa.net/download/stl.pdf) into a season, a trend and a residual component. **Syntax** @@ -77,8 +133,8 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of three arrays where the first array include seasonal components, the second array - trend, -and the third array - residue component. +- An array of four arrays where the first array includes seasonal components, the second array - trend, +the third array - residue component, and the fourth array - baseline (seasonal + trend) component. Type: [Array](../../sql-reference/data-types/array.md). @@ -107,6 +163,10 @@ Result: [ 0, 0.0000019073486, -0.0000019073486, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.0000019073486, 0, 0 + ], + [ + 10.1, 20.449999, 40.340004, 10.100001, 20.45, 40.34, 10.100001, 20.45, 40.34, 10.1, 20.45, 40.34, + 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.100002, 20.45, 40.34 ]] │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` diff --git a/docs/en/sql-reference/statements/alter/apply-deleted-mask.md b/docs/en/sql-reference/statements/alter/apply-deleted-mask.md index 7a11d66e739..1afc2a0ff5a 100644 --- a/docs/en/sql-reference/statements/alter/apply-deleted-mask.md +++ b/docs/en/sql-reference/statements/alter/apply-deleted-mask.md @@ -10,7 +10,7 @@ sidebar_label: APPLY DELETED MASK ALTER TABLE [db].name [ON CLUSTER cluster] APPLY DELETED MASK [IN PARTITION partition_id] ``` -The command applies mask created by [lightweight delete](/docs/en/sql-reference/statements/delete) and forcefully removes rows marked as deleted from disk. 
This command is a heavyweight mutation, and it semantically equals to query ```ALTER TABLE [db].name DELETE WHERE _row_exists = 0```. :::note It only works for tables in the [`MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 676d30f5e44..f6d9668e628 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -139,8 +139,8 @@ ALTER TABLE visits COMMENT COLUMN browser 'This column shows the browser used fo ## MODIFY COLUMN ``` sql -MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [codec] [TTL] [AFTER name_after | FIRST] -ALTER COLUMN [IF EXISTS] name TYPE [type] [default_expr] [codec] [TTL] [AFTER name_after | FIRST] +MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [codec] [TTL] [settings] [AFTER name_after | FIRST] +ALTER COLUMN [IF EXISTS] name TYPE [type] [default_expr] [codec] [TTL] [settings] [AFTER name_after | FIRST] ``` This query changes the `name` column properties: @@ -153,10 +153,14 @@ This query changes the `name` column properties: - TTL +- Column-level Settings + For examples of columns compression CODECS modifying, see [Column Compression Codecs](../create/table.md/#codecs). For examples of columns TTL modifying, see [Column TTL](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#mergetree-column-ttl). +For examples of column-level settings modifying, see [Column-level Settings](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#column-level-settings). + If the `IF EXISTS` clause is specified, the query won’t return an error if the column does not exist. When changing the type, values are converted as if the [toType](/docs/en/sql-reference/functions/type-conversion-functions.md) functions were applied to them. If only the default expression is changed, the query does not do anything complex, and is completed almost instantly. @@ -209,7 +213,7 @@ The `ALTER` query for changing columns is replicated. The instructions are saved ## MODIFY COLUMN REMOVE -Removes one of the column properties: `DEFAULT`, `ALIAS`, `MATERIALIZED`, `CODEC`, `COMMENT`, `TTL`, `SETTING`. +Removes one of the column properties: `DEFAULT`, `ALIAS`, `MATERIALIZED`, `CODEC`, `COMMENT`, `TTL`, `SETTINGS`. Syntax: @@ -237,7 +241,7 @@ Modify a column setting. 
Syntax: ```sql -ALTER TABLE table_name MODIFY COLUMN MODIFY SETTING name=value,...; +ALTER TABLE table_name MODIFY COLUMN column_name MODIFY SETTING name=value,...; ``` **Example** @@ -245,7 +249,7 @@ ALTER TABLE table_name MODIFY COLUMN MODIFY SETTING name=value,...; Modify column's `max_compress_block_size` to `1MB`: ```sql -ALTER TABLE table_name MODIFY COLUMN MODIFY SETTING max_compress_block_size = 1048576; +ALTER TABLE table_name MODIFY COLUMN column_name MODIFY SETTING max_compress_block_size = 1048576; ``` ## MODIFY COLUMN RESET SETTING @@ -255,21 +259,21 @@ Reset a column setting, also removes the setting declaration in the column expre Syntax: ```sql -ALTER TABLE table_name MODIFY COLUMN RESET SETTING name,...; +ALTER TABLE table_name MODIFY COLUMN column_name RESET SETTING name,...; ``` **Example** -Remove column setting `max_compress_block_size` to `1MB`: +Reset column setting `max_compress_block_size` to its default value: ```sql -ALTER TABLE table_name MODIFY COLUMN REMOVE SETTING max_compress_block_size; +ALTER TABLE table_name MODIFY COLUMN column_name RESET SETTING max_compress_block_size; ``` ## MATERIALIZE COLUMN Materializes or updates a column with an expression for a default value (`DEFAULT` or `MATERIALIZED`). -It is used if it is necessary to add or update a column with a complicated expression, because evaluating such an expression directly on `SELECT` executing turns out to be expensive. +It is used if it is necessary to add or update a column with a complicated expression, because evaluating such an expression directly when executing `SELECT` turns out to be expensive. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). Syntax: diff --git a/docs/en/sql-reference/statements/alter/constraint.md b/docs/en/sql-reference/statements/alter/constraint.md index 7a8f5809320..29675f704b5 100644 --- a/docs/en/sql-reference/statements/alter/constraint.md +++ b/docs/en/sql-reference/statements/alter/constraint.md @@ -15,7 +15,7 @@ ALTER TABLE [db].name [ON CLUSTER cluster] DROP CONSTRAINT constraint_name; See more on [constraints](../../../sql-reference/statements/create/table.md#constraints). -Queries will add or remove metadata about constraints from table so they are processed immediately. +Queries will add or remove metadata about constraints from the table, so they are processed immediately. :::tip Constraint check **will not be executed** on existing data if it was added. diff --git a/docs/en/sql-reference/statements/create/quota.md b/docs/en/sql-reference/statements/create/quota.md index a6ced870c18..d16b40876c7 100644 --- a/docs/en/sql-reference/statements/create/quota.md +++ b/docs/en/sql-reference/statements/create/quota.md @@ -21,7 +21,7 @@ CREATE QUOTA [IF NOT EXISTS | OR REPLACE] name [ON CLUSTER cluster_name] Keys `user_name`, `ip_address`, `client_key`, `client_key, user_name` and `client_key, ip_address` correspond to the fields in the [system.quotas](../../../operations/system-tables/quotas.md) table. -Parameters `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time` correspond to the fields in the [system.quotas_usage](../../../operations/system-tables/quotas_usage.md) table. 
+Parameters `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time`, `failed_sequential_authentications` correspond to the fields in the [system.quotas_usage](../../../operations/system-tables/quotas_usage.md) table. `ON CLUSTER` clause allows creating quotas on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 7322bc17b76..0edf158e981 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -514,6 +514,10 @@ ENGINE = MergeTree ORDER BY x; ## Temporary Tables +:::note +Please note that temporary tables are not replicated. As a result, there is no guarantee that data inserted into a temporary table will be available in other replicas. The primary use case where temporary tables can be useful is for querying or joining small external datasets during a single session. +::: + ClickHouse supports temporary tables which have the following characteristics: - Temporary tables disappear when the session ends, including if the connection is lost. diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index 938a5f9c3cb..e88e625aed1 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -16,13 +16,13 @@ DETACH TABLE|VIEW|DICTIONARY|DATABASE [IF EXISTS] [db.]name [ON CLUSTER cluster] Detaching does not delete the data or metadata of a table, a materialized view, a dictionary or a database. If an entity was not detached `PERMANENTLY`, on the next server launch the server will read the metadata and recall the table/view/dictionary/database again. If an entity was detached `PERMANENTLY`, there will be no automatic recall. Whether a table, a dictionary or a database was detached permanently or not, in both cases you can reattach them using the [ATTACH](../../sql-reference/statements/attach.md) query. -System log tables can be also attached back (e.g. `query_log`, `text_log`, etc). Other system tables can't be reattached. On the next server launch the server will recall those tables again. +System log tables can be also attached back (e.g. `query_log`, `text_log`, etc.). Other system tables can't be reattached. On the next server launch the server will recall those tables again. `ATTACH MATERIALIZED VIEW` does not work with short syntax (without `SELECT`), but you can attach it using the `ATTACH TABLE` query. Note that you can not detach permanently the table which is already detached (temporary). But you can attach it back and then detach permanently again. -Also you can not [DROP](../../sql-reference/statements/drop.md#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with the other table with [RENAME TABLE](../../sql-reference/statements/rename.md) query. +Also, you can not [DROP](../../sql-reference/statements/drop.md#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with the other table with [RENAME TABLE](../../sql-reference/statements/rename.md) query. The `SYNC` modifier executes the action without delay. 
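To make the permanent/non-permanent distinction above concrete, a minimal sketch (the table name and schema are arbitrary):

``` sql
CREATE TABLE detach_demo (id UInt64) ENGINE = MergeTree ORDER BY id;

DETACH TABLE detach_demo PERMANENTLY;  -- stays detached across restarts; cannot be dropped, recreated or replaced by RENAME
ATTACH TABLE detach_demo;              -- an explicit reattach is always possible
DETACH TABLE detach_demo SYNC;         -- non-permanent detach, executed without delay; recalled on the next server launch
```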
diff --git a/docs/en/sql-reference/statements/insert-into.md b/docs/en/sql-reference/statements/insert-into.md index f9d93305071..f5544f96750 100644 --- a/docs/en/sql-reference/statements/insert-into.md +++ b/docs/en/sql-reference/statements/insert-into.md @@ -204,6 +204,20 @@ Result: └─────┴───────────────────────┘ ``` +## Inserts into ClickHouse Cloud + +By default, services on ClickHouse Cloud provide multiple replicas for high availability. When you connect to a service, a connection is established to one of these replicas. + +After an `INSERT` succeeds, data is written to the underlying storage. However, it may take some time for replicas to receive these updates. Therefore, if you use a different connection that executes a `SELECT` query on one of these other replicas, the updated data may not yet be reflected. + +It is possible to use the `select_sequential_consistency` to force the replica to receive the latest updates. Here is an example of a SELECT query using this setting: + +```sql +SELECT .... SETTINGS select_sequential_consistency = 1; +``` + +Note that using `select_sequential_consistency` will increase the load on ClickHouse Keeper (used by ClickHouse Cloud internally) and may result in slower performance depending on the load on the service. We recommend against enabling this setting unless necessary. The recommended approach is to execute read/writes in the same session or to use a client driver that uses the native protocol (and thus supports sticky connections). + ## Performance Considerations `INSERT` sorts the input data by primary key and splits them into partitions by a partition key. If you insert data into several partitions at once, it can significantly reduce the performance of the `INSERT` query. To avoid this: diff --git a/docs/en/sql-reference/statements/select/distinct.md b/docs/en/sql-reference/statements/select/distinct.md index 10326b0ef8f..08359b035ae 100644 --- a/docs/en/sql-reference/statements/select/distinct.md +++ b/docs/en/sql-reference/statements/select/distinct.md @@ -5,7 +5,7 @@ sidebar_label: DISTINCT # DISTINCT Clause -If `SELECT DISTINCT` is specified, only unique rows will remain in a query result. Thus only a single row will remain out of all the sets of fully matching rows in the result. +If `SELECT DISTINCT` is specified, only unique rows will remain in a query result. Thus, only a single row will remain out of all the sets of fully matching rows in the result. You can specify the list of columns that must have unique values: `SELECT DISTINCT ON (column1, column2,...)`. If the columns are not specified, all of them are taken into consideration. diff --git a/docs/ru/development/architecture.md b/docs/ru/development/architecture.md index b2e851a78cd..575799cccc4 100644 --- a/docs/ru/development/architecture.md +++ b/docs/ru/development/architecture.md @@ -63,7 +63,7 @@ ClickHouse — полноценная столбцовая СУБД. Данны Для байт-ориентированного ввода-вывода существуют абстрактные классы `ReadBuffer` и `WriteBuffer`. Они используются вместо `iostream`. Не волнуйтесь: каждый зрелый проект C++ использует что-то другое вместо `iostream` по уважительным причинам. -`ReadBuffer` и `WriteBuffer` — это просто непрерывный буфер и курсор, указывающий на позицию в этом буфере. Реализации могут как владеть так и не владеть памятью буфера. Существует виртуальный метод заполнения буфера следующими данными (для `ReadBuffer`) или сброса буфера куда-нибудь (например `WriteBuffer`). Виртуальные методы редко вызываются. 
+`ReadBuffer` и `WriteBuffer` — это просто непрерывный буфер и курсор, указывающий на позицию в этом буфере. Реализации могут как владеть, так и не владеть памятью буфера. Существует виртуальный метод заполнения буфера следующими данными (для `ReadBuffer`) или сброса буфера куда-нибудь (например `WriteBuffer`). Виртуальные методы редко вызываются. Реализации `ReadBuffer`/`WriteBuffer` используются для работы с файлами и файловыми дескрипторами, а также сетевыми сокетами, для реализации сжатия (`CompressedWriteBuffer` инициализируется вместе с другим `WriteBuffer` и осуществляет сжатие данных перед записью в него), и для других целей – названия `ConcatReadBuffer`, `LimitReadBuffer`, и `HashingWriteBuffer` говорят сами за себя. diff --git a/docs/ru/development/developer-instruction.md b/docs/ru/development/developer-instruction.md index c63622594e4..01ff4dd5f28 100644 --- a/docs/ru/development/developer-instruction.md +++ b/docs/ru/development/developer-instruction.md @@ -71,7 +71,7 @@ ClickHouse не работает и не собирается на 32-битны Please make sure you have the correct access rights and the repository exists. -Как правило это означает, что отсутствуют ssh ключи для соединения с GitHub. Ключи расположены в директории `~/.ssh`. В интерфейсе GitHub, в настройках, необходимо загрузить публичные ключи, чтобы он их понимал. +Как правило, это означает, что отсутствуют ssh ключи для соединения с GitHub. Ключи расположены в директории `~/.ssh`. В интерфейсе GitHub, в настройках, необходимо загрузить публичные ключи, чтобы он их понимал. Вы также можете клонировать репозиторий по протоколу https: @@ -199,7 +199,7 @@ sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" В случае успешного запуска, вы увидите прогресс сборки - количество обработанных задач и общее количество задач. -В процессе сборки могут появится сообщения `libprotobuf WARNING` про protobuf файлы в библиотеке libhdfs2. Это не имеет значения. +В процессе сборки могут появиться сообщения `libprotobuf WARNING` про protobuf файлы в библиотеке libhdfs2. Это не имеет значения. При успешной сборке, вы получите готовый исполняемый файл `ClickHouse/build/programs/clickhouse`: @@ -207,7 +207,7 @@ sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" ## Запуск собранной версии ClickHouse {#zapusk-sobrannoi-versii-clickhouse} -Для запуска сервера из под текущего пользователя, с выводом логов в терминал и с использованием примеров конфигурационных файлов, расположенных в исходниках, перейдите в директорию `ClickHouse/programs/server/` (эта директория находится не в директории build) и выполните: +Для запуска сервера из-под текущего пользователя, с выводом логов в терминал и с использованием примеров конфигурационных файлов, расположенных в исходниках, перейдите в директорию `ClickHouse/programs/server/` (эта директория находится не в директории build) и выполните: ../../build/programs/clickhouse server diff --git a/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md index cfafddf0bc2..4a7d81d38fc 100644 --- a/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md @@ -37,7 +37,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Секции запроса** -При создании таблицы с движком `CollapsingMergeTree` используются те же [секции запроса](mergetree.md#table_engine-mergetree-creating-a-table) что и при создании таблицы с движком `MergeTree`. 
+При создании таблицы с движком `CollapsingMergeTree` используются те же [секции запроса](mergetree.md#table_engine-mergetree-creating-a-table), что и при создании таблицы с движком `MergeTree`.
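For reference, a minimal sketch of such a `CREATE TABLE` statement using the same clauses as for `MergeTree`, plus the required sign column (the schema and column names are illustrative):

``` sql
CREATE TABLE UAct
(
    UserID UInt64,
    PageViews UInt8,
    Duration UInt8,
    Sign Int8  -- 1 marks a "state" row, -1 marks a cancelling row
)
ENGINE = CollapsingMergeTree(Sign)
ORDER BY UserID;
```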
diff --git a/docs/ru/engines/table-engines/special/buffer.md b/docs/ru/engines/table-engines/special/buffer.md index 1fd8483e54d..3d2f1ee850d 100644 --- a/docs/ru/engines/table-engines/special/buffer.md +++ b/docs/ru/engines/table-engines/special/buffer.md @@ -42,7 +42,7 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10 В качестве имени базы данных и имени таблицы можно указать пустые строки в одинарных кавычках. Это обозначает отсутствие таблицы назначения. В таком случае, при достижении условий на сброс данных, буфер будет просто очищаться. Это может быть полезным, чтобы хранить в оперативке некоторое окно данных. При чтении из таблицы типа Buffer, будут обработаны данные, как находящиеся в буфере, так и данные из таблицы назначения (если такая есть). -Но следует иметь ввиду, что таблица Buffer не поддерживает индекс. То есть, данные в буфере будут просканированы полностью, что может быть медленно для буферов большого размера. (Для данных в подчинённой таблице, будет использоваться тот индекс, который она поддерживает.) +Но следует иметь в виду, что таблица Buffer не поддерживает индекс. То есть, данные в буфере будут просканированы полностью, что может быть медленно для буферов большого размера. (Для данных в подчинённой таблице, будет использоваться тот индекс, который она поддерживает.) Если множество столбцов таблицы Buffer не совпадает с множеством столбцов подчинённой таблицы, то будут вставлено подмножество столбцов, которое присутствует в обеих таблицах. @@ -66,4 +66,4 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10 Таблицы типа Buffer используются в тех случаях, когда от большого количества серверов поступает слишком много INSERT-ов в единицу времени, и нет возможности заранее самостоятельно буферизовать данные перед вставкой, в результате чего, INSERT-ы не успевают выполняться. -Заметим, что даже для таблиц типа Buffer не имеет смысла вставлять данные по одной строке, так как таким образом будет достигнута скорость всего лишь в несколько тысяч строк в секунду, тогда как при вставке более крупными блоками, достижимо более миллиона строк в секунду (смотрите раздел [«Производительность»](../../../introduction/performance.md). +Заметим, что даже для таблиц типа Buffer не имеет смысла вставлять данные по одной строке, так как таким образом будет достигнута скорость всего лишь в несколько тысяч строк в секунду, тогда как при вставке более крупными блоками, достижимо более миллиона строк в секунду (смотрите раздел [«Производительность»](../../../introduction/performance.md)). diff --git a/docs/ru/faq/general/ne-tormozit.md b/docs/ru/faq/general/ne-tormozit.md index 0f888de839f..6d0803680a8 100644 --- a/docs/ru/faq/general/ne-tormozit.md +++ b/docs/ru/faq/general/ne-tormozit.md @@ -20,6 +20,6 @@ sidebar_position: 11 Если вы не видели наших футболок, посмотрите видео о ClickHouse. Например, вот это: -![iframe](https://www.youtube.com/embed/bSyQahMVZ7w) + P.S. Эти футболки не продаются, а распространяются бесплатно на большинстве митапов [ClickHouse](https://clickhouse.com/#meet), обычно в награду за самые интересные вопросы или другие виды активного участия. diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 8910c258788..4d19cf50ae1 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -177,11 +177,11 @@ URI позволяет подключаться к нескольким хост -Строка подключения должна быть указана в первом аргументе clickhouse-client. 
Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме `--host/-h` и `--port`. +Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки](#command-line-options) кроме `--host/-h` и `--port`. Для компонента `query_parameter` разрешены следующие ключи: -- `secure` или сокращенно `s` - без значение. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. `secure` в [command-line-options](#command-line-options). +- `secure` или сокращенно `s` - без значения. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. `secure` в [command-line-options](#command-line-options). ### Кодирование URI {#connection_string_uri_percent_encoding} @@ -206,7 +206,7 @@ clickhouse-client clickhouse://john:secret@127.0.0.1:9000 clickhouse-client clickhouse://[::1]:9000 ``` -Подключиться к localhost через порт 9000 многострочном режиме. +Подключиться к localhost через порт 9000 в многострочном режиме. ``` bash clickhouse-client clickhouse://localhost:9000 '-m' diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index b4794b02743..a9280de9c7b 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -201,7 +201,7 @@ SELECT * FROM nestedt FORMAT TSV Этот формат позволяет указать произвольную форматную строку, в которую подставляются значения, сериализованные выбранным способом. -Для этого используются настройки `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) +Для этого используются настройки `format_template_resultset`, `format_template_row` (`format_template_row_format`), `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) Настройка `format_template_row` задаёт путь к файлу, содержащему форматную строку для строк таблицы, которая должна иметь вид: @@ -227,9 +227,11 @@ SELECT * FROM nestedt FORMAT TSV `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` +В тех случаях, когда не удобно или не возможно указать произвольную форматную строку в файле, можно использовать `format_template_row_format` указать произвольную форматную строку в запросе. + Настройка `format_template_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. -Настройка `format_template_resultset` задаёт путь к файлу, содержащему форматную строку для результата. Форматная строка для результата имеет синтаксис аналогичный форматной строке для строк таблицы и позволяет указать префикс, суффикс и способ вывода дополнительной информации. Вместо имён столбцов в ней указываются следующие имена подстановок: +Настройка `format_template_resultset` задаёт путь к файлу, содержащему форматную строку для результата. Настройка `format_template_resultset_format` используется для установки форматной строки для результата непосредственно в запросе. Форматная строка для результата имеет синтаксис аналогичный форматной строке для строк таблицы и позволяет указать префикс, суффикс и способ вывода дополнительной информации. 
Вместо имён столбцов в ней указываются следующие имена подстановок: - `data` - строки с данными в формате `format_template_row`, разделённые `format_template_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. - `totals` - строка с тотальными значениями в формате `format_template_row` (при использовании WITH TOTALS) diff --git a/docs/ru/operations/clickhouse-keeper.md b/docs/ru/operations/clickhouse-keeper.md index 3a931529b32..9f1301d817d 100644 --- a/docs/ru/operations/clickhouse-keeper.md +++ b/docs/ru/operations/clickhouse-keeper.md @@ -69,7 +69,7 @@ ClickHouse Keeper может использоваться как равноце :::note -В случае изменения топологии кластера ClickHouse Keeper(например, замены сервера), удостоверьтесь, что вы сохраняеете отношение `server_id` - `hostname`, не переиспользуете существующие `server_id` для для новых серверов и не перемешиваете идентификаторы. Подобные ошибки могут случаться, если вы используете автоматизацию при разворачивании кластера без логики сохранения идентификаторов. +В случае изменения топологии кластера ClickHouse Keeper(например, замены сервера), удостоверьтесь, что вы сохраняеете отношение `server_id` - `hostname`, не переиспользуете существующие `server_id` для новых серверов и не перемешиваете идентификаторы. Подобные ошибки могут случаться, если вы используете автоматизацию при разворачивании кластера без логики сохранения идентификаторов. ::: Примеры конфигурации кворума с тремя узлами можно найти в [интеграционных тестах](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) с префиксом `test_keeper_`. Пример конфигурации для сервера №1: @@ -337,7 +337,7 @@ clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 -- После того, как выполнили действия выше выполните следующие шаги. 1. Выберете одну ноду Keeper, которая станет новым лидером. Учтите, что данные с этой ноды будут использованы всем кластером, поэтому рекомендуется выбрать ноду с наиболее актуальным состоянием. -2. Перед дальнейшими действиям сделайте резервную копию данных из директорий `log_storage_path` и `snapshot_storage_path`. +2. Перед дальнейшими действиями сделайте резервную копию данных из директорий `log_storage_path` и `snapshot_storage_path`. 3. Измените настройки на всех нодах кластера, которые вы собираетесь использовать. 4. Отправьте команду `rcvr` на ноду, которую вы выбрали, или остановите ее и запустите заново с аргументом `--force-recovery`. Это переведет ноду в режим восстановления. 5. Запускайте остальные ноды кластера по одной и проверяйте, что команда `mntr` возвращает `follower` в выводе состояния `zk_server_state` перед тем, как запустить следующую ноду. diff --git a/docs/ru/operations/configuration-files.md b/docs/ru/operations/configuration-files.md index 3b037521692..74f7d217fb7 100644 --- a/docs/ru/operations/configuration-files.md +++ b/docs/ru/operations/configuration-files.md @@ -89,7 +89,7 @@ $ cat /etc/clickhouse-server/users.d/alice.xml Вы можете использовать симметричное шифрование для зашифровки элемента конфигурации, например, поля password. Чтобы это сделать, сначала настройте [кодек шифрования](../sql-reference/statements/create/table.md#encryption-codecs), затем добавьте аттибут`encrypted_by` с именем кодека шифрования как значение к элементу, который надо зашифровать. -В отличии от аттрибутов `from_zk`, `from_env` и `incl` (или элемента `include`), подстановка, т.е. расшифровка зашифрованного значения, не выподняется в файле предобработки. 
Расшифровка происходит только во время исполнения в серверном процессе. +В отличие от аттрибутов `from_zk`, `from_env` и `incl` (или элемента `include`), подстановка, т.е. расшифровка зашифрованного значения, не выподняется в файле предобработки. Расшифровка происходит только во время исполнения в серверном процессе. Пример: @@ -110,7 +110,7 @@ $ cat /etc/clickhouse-server/users.d/alice.xml ``` -Чтобы получить зашифрованное значение может быть использовано приложение-пример `encrypt_decrypt` . +Чтобы получить зашифрованное значение, может быть использовано приложение-пример `encrypt_decrypt` . Пример: diff --git a/docs/ru/operations/system-tables/grants.md b/docs/ru/operations/system-tables/grants.md index b3ef789e95b..4485b684218 100644 --- a/docs/ru/operations/system-tables/grants.md +++ b/docs/ru/operations/system-tables/grants.md @@ -19,7 +19,7 @@ slug: /ru/operations/system-tables/grants - `column` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Имя столбца, к которому предоставляется доступ. - `is_partial_revoke` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Логическое значение. Показывает, были ли отменены некоторые привилегии. Возможные значения: -- `0` — Строка описывает частичный отзыв. -- `1` — Строка описывает грант. +- `0` — Строка описывает грант. +- `1` — Строка описывает частичный отзыв. - `grant_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Разрешение предоставлено с опцией `WITH GRANT OPTION`, подробнее см. [GRANT](../../sql-reference/statements/grant.md#grant-privigele-syntax). diff --git a/docs/ru/operations/system-tables/quota_usage.md b/docs/ru/operations/system-tables/quota_usage.md index 96f6debd24e..46305e59da6 100644 --- a/docs/ru/operations/system-tables/quota_usage.md +++ b/docs/ru/operations/system-tables/quota_usage.md @@ -26,8 +26,11 @@ slug: /ru/operations/system-tables/quota_usage - `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество строк, считываемых из всех таблиц и табличных функций, участвующих в запросах. - `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество байт, считанных из всех таблиц и табличных функций, участвующих в запросах. - `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество байт, считываемых из всех таблиц и табличных функций. -- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — общее время выполнения запроса, в секундах. -- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — максимальное время выполнения запроса. +- `failed_sequential_authentications` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Общее количество неудачных попыток подряд ввести пароль. Если пользователь ввел верный пароль до преодоления порогового значения `max_failed_sequential_authentications` то счетчик неудачных попыток будет сброшен. +- `max_failed_sequential_authentications` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Максимальное количество неудачных попыток подряд ввести пароль. 
+- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — общее время выполнения запроса, в секундах. +- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — максимальное время выполнения запроса. + ## Смотрите также {#see-also} diff --git a/docs/ru/operations/system-tables/quotas_usage.md b/docs/ru/operations/system-tables/quotas_usage.md index 27e7cdf8abe..4bc0f2e81ca 100644 --- a/docs/ru/operations/system-tables/quotas_usage.md +++ b/docs/ru/operations/system-tables/quotas_usage.md @@ -29,9 +29,10 @@ slug: /ru/operations/system-tables/quotas_usage - `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество строк, считываемых из всех таблиц и табличных функций, участвующих в запросах. - `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — общее количество байт, считанных из всех таблиц и табличных функций, участвующих в запросах. - `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — максимальное количество байт, считываемых из всех таблиц и табличных функций. +- `failed_sequential_authentications` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Общее количество неудачных попыток подряд ввести пароль. Если пользователь ввел верный пароль до преодоления порогового значения `max_failed_sequential_authentications`, то счетчик неудачных попыток будет сброшен. +- `max_failed_sequential_authentications` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Максимальное количество неудачных попыток подряд ввести пароль. - `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — общее время выполнения запроса, в секундах. - `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — максимальное время выполнения запроса. - ## Смотрите также {#see-also} - [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement) diff --git a/docs/ru/operations/utilities/clickhouse-benchmark.md b/docs/ru/operations/utilities/clickhouse-benchmark.md index 73de78d1c15..eb342bea9a7 100644 --- a/docs/ru/operations/utilities/clickhouse-benchmark.md +++ b/docs/ru/operations/utilities/clickhouse-benchmark.md @@ -50,7 +50,7 @@ clickhouse-benchmark [keys] < queries_file; - `-r`, `--randomize` — использовать случайный порядок выполнения запросов при наличии более одного входного запроса. - `-s`, `--secure` — используется `TLS` соединение. - `-t N`, `--timelimit=N` — лимит по времени в секундах. `clickhouse-benchmark` перестает отправлять запросы при достижении лимита по времени. Значение по умолчанию: 0 (лимит отключен). -- `--confidence=N` — уровень доверия для T-критерия. Возможные значения: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Значение по умолчанию: 5.
В [режиме сравнения](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` проверяет [двухвыборочный t-критерий Стьюдента для независимых выборок](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) чтобы определить, различны ли две выборки при выбранном уровне доверия. +- `--confidence=N` — уровень доверия для T-критерия. Возможные значения: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Значение по умолчанию: 5. В [режиме сравнения](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` проверяет [двухвыборочный t-критерий Стьюдента для независимых выборок](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test), чтобы определить, различны ли две выборки при выбранном уровне доверия. - `--cumulative` — выводить статистику за все время работы, а не за последний временной интервал. - `--database=DATABASE_NAME` — имя базы данных ClickHouse. Значение по умолчанию: `default`. - `--json=FILEPATH` — дополнительный вывод в формате `JSON`. Когда этот ключ указан, `clickhouse-benchmark` выводит отчет в указанный JSON-файл. diff --git a/docs/ru/sql-reference/data-types/datetime.md b/docs/ru/sql-reference/data-types/datetime.md index 80d844a1713..57f24786bb7 100644 --- a/docs/ru/sql-reference/data-types/datetime.md +++ b/docs/ru/sql-reference/data-types/datetime.md @@ -33,7 +33,7 @@ ClickHouse отображает значения в зависимости от ## Примеры {#primery} -**1.** Создание таблицы с столбцом типа `DateTime` и вставка данных в неё: +**1.** Создание таблицы со столбцом типа `DateTime` и вставка данных в неё: ``` sql CREATE TABLE dt diff --git a/docs/ru/sql-reference/functions/arithmetic-functions.md b/docs/ru/sql-reference/functions/arithmetic-functions.md index 73bac0595e1..ca7a4566c6c 100644 --- a/docs/ru/sql-reference/functions/arithmetic-functions.md +++ b/docs/ru/sql-reference/functions/arithmetic-functions.md @@ -172,7 +172,7 @@ multiplyDecimal(a, b[, result_scale]) ``` :::note -Эта функция работают гораздо медленнее обычной `multiply`. +Эта функция работает гораздо медленнее обычной `multiply`. В случае, если нет необходимости иметь фиксированную точность и/или нужны быстрые вычисления, следует использовать [multiply](#multiply). ::: diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 659e2d3f75e..1f06bdf264a 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -488,7 +488,7 @@ arrayPushBack(array, single_value) **Аргументы** - `array` – массив. -- `single_value` – значение добавляемого элемента. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. +- `single_value` – значение добавляемого элемента. В массив с числами можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». 
Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. **Пример** @@ -513,7 +513,7 @@ arrayPushFront(array, single_value) **Аргументы** - `array` – массив. -- `single_value` – значение добавляемого элемента. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. +- `single_value` – значение добавляемого элемента. В массив с числами можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. **Пример** diff --git a/docs/ru/sql-reference/statements/alter/quota.md b/docs/ru/sql-reference/statements/alter/quota.md index 709baea6af0..c14b81c9bf3 100644 --- a/docs/ru/sql-reference/statements/alter/quota.md +++ b/docs/ru/sql-reference/statements/alter/quota.md @@ -22,7 +22,7 @@ ALTER QUOTA [IF EXISTS] name [ON CLUSTER cluster_name] Ключи `user_name`, `ip_address`, `client_key`, `client_key, user_name` и `client_key, ip_address` соответствуют полям таблицы [system.quotas](../../../operations/system-tables/quotas.md). -Параметры `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time` соответствуют полям таблицы [system.quotas_usage](../../../operations/system-tables/quotas_usage.md). +Параметры `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time`, `failed_sequential_authentications` соответствуют полям таблицы [system.quotas_usage](../../../operations/system-tables/quotas_usage.md). В секции `ON CLUSTER` можно указать кластеры, на которых создается квота, см. [Распределенные DDL запросы](../../../sql-reference/distributed-ddl.md). diff --git a/docs/ru/sql-reference/statements/create/quota.md b/docs/ru/sql-reference/statements/create/quota.md index 18eba6b5b1a..398c52fdc73 100644 --- a/docs/ru/sql-reference/statements/create/quota.md +++ b/docs/ru/sql-reference/statements/create/quota.md @@ -20,7 +20,7 @@ CREATE QUOTA [IF NOT EXISTS | OR REPLACE] name [ON CLUSTER cluster_name] ``` Ключи `user_name`, `ip_address`, `client_key`, `client_key, user_name` и `client_key, ip_address` соответствуют полям таблицы [system.quotas](../../../operations/system-tables/quotas.md). -Параметры `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time` соответствуют полям таблицы [system.quotas_usage](../../../operations/system-tables/quotas_usage.md). +Параметры `queries`, `query_selects`, `query_inserts`, `errors`, `result_rows`, `result_bytes`, `read_rows`, `read_bytes`, `execution_time`, `failed_sequential_authentications` соответствуют полям таблицы [system.quotas_usage](../../../operations/system-tables/quotas_usage.md). 
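+
+Например, квота, ограничивающая число неудачных попыток аутентификации подряд, может выглядеть примерно так (набросок: имя квоты и конкретные значения здесь условные):
+
+``` sql
+-- Иллюстративный пример: подставьте свои имя квоты, лимит и пользователей.
+CREATE QUOTA IF NOT EXISTS quota_auth
+    FOR INTERVAL 1 hour
+    MAX failed_sequential_authentications = 5
+    TO default;
+```
+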
В секции `ON CLUSTER` можно указать кластеры, на которых создается квота, см. [Распределенные DDL запросы](../../../sql-reference/distributed-ddl.md). diff --git a/docs/ru/sql-reference/statements/select/distinct.md b/docs/ru/sql-reference/statements/select/distinct.md index 58fe16b16d9..ad310434598 100644 --- a/docs/ru/sql-reference/statements/select/distinct.md +++ b/docs/ru/sql-reference/statements/select/distinct.md @@ -92,7 +92,7 @@ ClickHouse поддерживает использование секций `DIS ## Обработка NULL {#null-processing} -`DISTINCT` работает с [NULL](../../syntax.md#null-literal) как-будто `NULL` — обычное значение и `NULL==NULL`. Другими словами, в результате `DISTINCT`, различные комбинации с `NULL` встретятся только один раз. Это отличается от обработки `NULL` в большинстве других контекстов. +`DISTINCT` работает с [NULL](../../syntax.md#null-literal) как будто `NULL` — обычное значение и `NULL==NULL`. Другими словами, в результате `DISTINCT`, различные комбинации с `NULL` встретятся только один раз. Это отличается от обработки `NULL` в большинстве других контекстов. ## Альтернативы {#alternatives} diff --git a/docs/ru/sql-reference/table-functions/cluster.md b/docs/ru/sql-reference/table-functions/cluster.md index f148a21294a..bb22b38f8f9 100644 --- a/docs/ru/sql-reference/table-functions/cluster.md +++ b/docs/ru/sql-reference/table-functions/cluster.md @@ -33,7 +33,7 @@ clusterAllReplicas('cluster_name', db, table[, sharding_key]) **Использование макросов** -`cluster_name` может содержать макрос — подстановку в фигурных скобках. Эта подстановка заменяется на соответствующее значение из секции [macros](../../operations/server-configuration-parameters/settings.md#macros) конфигурационного файла . +`cluster_name` может содержать макрос — подстановку в фигурных скобках. Эта подстановка заменяется на соответствующее значение из секции [macros](../../operations/server-configuration-parameters/settings.md#macros) конфигурационного файла. Пример: diff --git a/docs/zh/operations/system-tables/dictionaries.md b/docs/zh/operations/system-tables/dictionaries.md index 105a591cf69..0cf91e45e86 100644 --- a/docs/zh/operations/system-tables/dictionaries.md +++ b/docs/zh/operations/system-tables/dictionaries.md @@ -20,7 +20,7 @@ machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 - `LOADED_AND_RELOADING` — Dictionary is loaded successfully, and is being reloaded right now (frequent reasons: [SYSTEM RELOAD DICTIONARY](../../sql-reference/statements/system.md#query_language-system-reload-dictionary) 查询,超时,字典配置已更改)。 - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. - `origin` ([字符串](../../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. -- `type` ([字符串](../../sql-reference/data-types/string.md)) — Type of a dictionary allocation. [在内存中存储字典](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). +- `type` ([字符串](../../sql-reference/data-types/string.md)) — Type of dictionary allocation. [在内存中存储字典](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). - `key` — [密钥类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key):数字键 ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) or Сomposite key ([字符串](../../sql-reference/data-types/string.md)) — form “(type 1, type 2, …, type n)”. 
- `attribute.names` ([阵列](../../sql-reference/data-types/array.md)([字符串](../../sql-reference/data-types/string.md))) — Array of [属性名称](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) 由字典提供。 - `attribute.types` ([阵列](../../sql-reference/data-types/array.md)([字符串](../../sql-reference/data-types/string.md))) — Corresponding array of [属性类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) 这是由字典提供。 diff --git a/packages/build b/packages/build index c2285b8ee7c..b2dd085d9dd 100755 --- a/packages/build +++ b/packages/build @@ -130,6 +130,8 @@ if [ -n "$SANITIZER" ]; then fi elif [[ $BUILD_TYPE == 'debug' ]]; then VERSION_POSTFIX+="+debug" +elif [[ $BUILD_TYPE =~ 'coverage' ]]; then + VERSION_POSTFIX+="+coverage" fi if [[ "$PKG_ROOT" != "$SOURCE" ]]; then diff --git a/packages/clickhouse-client.yaml b/packages/clickhouse-client.yaml index 4d707b28ad9..34b42d92adf 100644 --- a/packages/clickhouse-client.yaml +++ b/packages/clickhouse-client.yaml @@ -49,6 +49,12 @@ contents: dst: /usr/bin/clickhouse-client - src: root/usr/bin/clickhouse-local dst: /usr/bin/clickhouse-local +- src: root/usr/bin/ch + dst: /usr/bin/ch +- src: root/usr/bin/chc + dst: /usr/bin/chc +- src: root/usr/bin/chl + dst: /usr/bin/chl - src: root/usr/bin/clickhouse-obfuscator dst: /usr/bin/clickhouse-obfuscator # docs diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index b3a5af6d6c9..62bcf068879 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -7,35 +7,16 @@ endif () include(${ClickHouse_SOURCE_DIR}/cmake/split_debug_symbols.cmake) # The `clickhouse` binary is a multi purpose tool that contains multiple execution modes (client, server, etc.), -# each of them may be built and linked as a separate library. -# If you do not know what modes you need, turn this option OFF and enable SERVER and CLIENT only. +# so client/server/... is just a symlink to the `clickhouse` binary. +# +# However, several components require extra libraries: for example, keeper +# requires NuRaft, which the regular binary does not require, so you can disable +# compilation of these components. +# +# If you do not know which modes you need, turn them all on.
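+#
+# For example (a sketch; assumes an out-of-source build directory and the default
+# configure step), the Keeper-related components can be skipped at configure time:
+#
+#     cmake -DENABLE_CLICKHOUSE_KEEPER=OFF -DENABLE_CLICKHOUSE_KEEPER_CONVERTER=OFF ..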
option (ENABLE_CLICKHOUSE_ALL "Enable all ClickHouse modes by default" ON) -option (ENABLE_CLICKHOUSE_SERVER "Server mode (main mode)" ${ENABLE_CLICKHOUSE_ALL}) -option (ENABLE_CLICKHOUSE_CLIENT "Client mode (interactive tui/shell that connects to the server)" - ${ENABLE_CLICKHOUSE_ALL}) - -# https://clickhouse.com/docs/en/operations/utilities/clickhouse-local/ -option (ENABLE_CLICKHOUSE_LOCAL "Local files fast processing mode" ${ENABLE_CLICKHOUSE_ALL}) - -# https://clickhouse.com/docs/en/operations/utilities/clickhouse-benchmark/ -option (ENABLE_CLICKHOUSE_BENCHMARK "Queries benchmarking mode" ${ENABLE_CLICKHOUSE_ALL}) - -option (ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG "Configs processor (extract values etc.)" ${ENABLE_CLICKHOUSE_ALL}) - -# https://clickhouse.com/docs/en/operations/utilities/clickhouse-compressor/ -option (ENABLE_CLICKHOUSE_COMPRESSOR "Data compressor and decompressor" ${ENABLE_CLICKHOUSE_ALL}) - -# https://clickhouse.com/docs/en/operations/utilities/clickhouse-copier/ -option (ENABLE_CLICKHOUSE_COPIER "Inter-cluster data copying mode" ${ENABLE_CLICKHOUSE_ALL}) - -option (ENABLE_CLICKHOUSE_FORMAT "Queries pretty-printer and formatter with syntax highlighting" - ${ENABLE_CLICKHOUSE_ALL}) - # https://clickhouse.com/docs/en/operations/utilities/clickhouse-obfuscator/ -option (ENABLE_CLICKHOUSE_OBFUSCATOR "Table data obfuscator (convert real data to benchmark-ready one)" - ${ENABLE_CLICKHOUSE_ALL}) - # https://clickhouse.com/docs/en/operations/utilities/odbc-bridge/ # TODO Also needs NANODBC. if (ENABLE_ODBC AND NOT USE_MUSL) @@ -51,18 +32,12 @@ endif () # https://presentations.clickhouse.com/matemarketing_2020/ option (ENABLE_CLICKHOUSE_GIT_IMPORT "A tool to analyze Git repositories" ${ENABLE_CLICKHOUSE_ALL}) -option (ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER "A tool to export table data files to be later put to a static files web server" ${ENABLE_CLICKHOUSE_ALL}) - option (ENABLE_CLICKHOUSE_KEEPER "ClickHouse alternative to ZooKeeper" ${ENABLE_CLICKHOUSE_ALL}) option (ENABLE_CLICKHOUSE_KEEPER_CONVERTER "Util allows to convert ZooKeeper logs and snapshots into clickhouse-keeper snapshot" ${ENABLE_CLICKHOUSE_ALL}) option (ENABLE_CLICKHOUSE_KEEPER_CLIENT "ClickHouse Keeper Client" ${ENABLE_CLICKHOUSE_ALL}) -option (ENABLE_CLICKHOUSE_SU "A tool similar to 'su'" ${ENABLE_CLICKHOUSE_ALL}) - -option (ENABLE_CLICKHOUSE_DISKS "A tool to manage disks" ${ENABLE_CLICKHOUSE_ALL}) - if (NOT ENABLE_NURAFT) # RECONFIGURE_MESSAGE_LEVEL should not be used here, # since ENABLE_NURAFT is set to OFF for FreeBSD and Darwin. @@ -71,27 +46,7 @@ if (NOT ENABLE_NURAFT) set(ENABLE_CLICKHOUSE_KEEPER_CONVERTER OFF) endif() -option(ENABLE_CLICKHOUSE_INSTALL "Install ClickHouse without .deb/.rpm/.tgz packages (having the binary only)" ${ENABLE_CLICKHOUSE_ALL}) - -message(STATUS "ClickHouse modes:") - -if (NOT ENABLE_CLICKHOUSE_SERVER) - message(WARNING "ClickHouse server mode is not going to be built.") -else() - message(STATUS "Server mode: ON") -endif() - -if (NOT ENABLE_CLICKHOUSE_CLIENT) - message(WARNING "ClickHouse client mode is not going to be built. 
You won't be able to connect to the server and run tests") -else() - message(STATUS "Client mode: ON") -endif() - -if (ENABLE_CLICKHOUSE_LOCAL) - message(STATUS "Local mode: ON") -else() - message(STATUS "Local mode: OFF") -endif() +message(STATUS "ClickHouse extra components:") if (ENABLE_CLICKHOUSE_SELF_EXTRACTING) message(STATUS "Self-extracting executable: ON") @@ -99,42 +54,6 @@ else() message(STATUS "Self-extracting executable: OFF") endif() -if (ENABLE_CLICKHOUSE_BENCHMARK) - message(STATUS "Benchmark mode: ON") -else() - message(STATUS "Benchmark mode: OFF") -endif() - -if (ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG) - message(STATUS "Extract from config mode: ON") -else() - message(STATUS "Extract from config mode: OFF") -endif() - -if (ENABLE_CLICKHOUSE_COMPRESSOR) - message(STATUS "Compressor mode: ON") -else() - message(STATUS "Compressor mode: OFF") -endif() - -if (ENABLE_CLICKHOUSE_COPIER) - message(STATUS "Copier mode: ON") -else() - message(STATUS "Copier mode: OFF") -endif() - -if (ENABLE_CLICKHOUSE_FORMAT) - message(STATUS "Format mode: ON") -else() - message(STATUS "Format mode: OFF") -endif() - -if (ENABLE_CLICKHOUSE_OBFUSCATOR) - message(STATUS "Obfuscator mode: ON") -else() - message(STATUS "Obfuscator mode: OFF") -endif() - if (ENABLE_CLICKHOUSE_ODBC_BRIDGE) message(STATUS "ODBC bridge mode: ON") else() @@ -147,18 +66,6 @@ else() message(STATUS "Library bridge mode: OFF") endif() -if (ENABLE_CLICKHOUSE_INSTALL) - message(STATUS "ClickHouse install: ON") -else() - message(STATUS "ClickHouse install: OFF") -endif() - -if (ENABLE_CLICKHOUSE_GIT_IMPORT) - message(STATUS "ClickHouse git-import: ON") -else() - message(STATUS "ClickHouse git-import: OFF") -endif() - if (ENABLE_CLICKHOUSE_KEEPER) message(STATUS "ClickHouse keeper mode: ON") else() @@ -177,19 +84,6 @@ else() message(STATUS "ClickHouse keeper-client mode: OFF") endif() - -if (ENABLE_CLICKHOUSE_DISKS) - message(STATUS "Clickhouse disks mode: ON") -else() - message(STATUS "ClickHouse disks mode: OFF") -endif() - -if (ENABLE_CLICKHOUSE_SU) - message(STATUS "ClickHouse su: ON") -else() - message(STATUS "ClickHouse su: OFF") -endif() - configure_file (config_tools.h.in ${CONFIG_INCLUDE_PATH}/config_tools.h) macro(clickhouse_target_link_split_lib target name) @@ -272,42 +166,6 @@ endif () target_link_libraries (clickhouse PRIVATE clickhouse_common_io string_utils ${HARMFUL_LIB}) target_include_directories (clickhouse PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -if (ENABLE_CLICKHOUSE_SERVER) - clickhouse_target_link_split_lib(clickhouse server) -endif () -if (ENABLE_CLICKHOUSE_CLIENT) - clickhouse_target_link_split_lib(clickhouse client) -endif () -if (ENABLE_CLICKHOUSE_LOCAL) - clickhouse_target_link_split_lib(clickhouse local) -endif () -if (ENABLE_CLICKHOUSE_BENCHMARK) - clickhouse_target_link_split_lib(clickhouse benchmark) -endif () -if (ENABLE_CLICKHOUSE_COPIER) - clickhouse_target_link_split_lib(clickhouse copier) -endif () -if (ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG) - clickhouse_target_link_split_lib(clickhouse extract-from-config) -endif () -if (ENABLE_CLICKHOUSE_COMPRESSOR) - clickhouse_target_link_split_lib(clickhouse compressor) -endif () -if (ENABLE_CLICKHOUSE_FORMAT) - clickhouse_target_link_split_lib(clickhouse format) -endif () -if (ENABLE_CLICKHOUSE_OBFUSCATOR) - clickhouse_target_link_split_lib(clickhouse obfuscator) -endif () -if (ENABLE_CLICKHOUSE_GIT_IMPORT) - clickhouse_target_link_split_lib(clickhouse git-import) -endif () -if (ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER) - 
clickhouse_target_link_split_lib(clickhouse static-files-disk-uploader) -endif () -if (ENABLE_CLICKHOUSE_SU) - clickhouse_target_link_split_lib(clickhouse su) -endif () if (ENABLE_CLICKHOUSE_KEEPER) clickhouse_target_link_split_lib(clickhouse keeper) endif() @@ -317,77 +175,40 @@ endif() if (ENABLE_CLICKHOUSE_KEEPER_CLIENT) clickhouse_target_link_split_lib(clickhouse keeper-client) endif() -if (ENABLE_CLICKHOUSE_INSTALL) - clickhouse_target_link_split_lib(clickhouse install) -endif () -if (ENABLE_CLICKHOUSE_DISKS) - clickhouse_target_link_split_lib(clickhouse disks) -endif () +clickhouse_target_link_split_lib(clickhouse install) set (CLICKHOUSE_BUNDLE) +macro(clickhouse_program_install name lib_name) + clickhouse_target_link_split_lib(clickhouse ${lib_name}) + add_custom_target (${name} ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse ${name} DEPENDS clickhouse) + install (FILES "${CMAKE_CURRENT_BINARY_DIR}/${name}" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) + list(APPEND CLICKHOUSE_BUNDLE ${name}) + + foreach(alias ${ARGN}) + message(STATUS "Adding alias ${alias} for ${name}") + add_custom_target (${alias} ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse ${alias} DEPENDS clickhouse) + install (FILES "${CMAKE_CURRENT_BINARY_DIR}/${alias}" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) + list(APPEND CLICKHOUSE_BUNDLE ${alias}) + endforeach() +endmacro() + if (ENABLE_CLICKHOUSE_SELF_EXTRACTING) list(APPEND CLICKHOUSE_BUNDLE self-extracting) endif () -if (ENABLE_CLICKHOUSE_SERVER) - add_custom_target (clickhouse-server ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-server DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-server" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-server) -endif () -if (ENABLE_CLICKHOUSE_CLIENT) - add_custom_target (clickhouse-client ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-client DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-client" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-client) -endif () -if (ENABLE_CLICKHOUSE_LOCAL) - add_custom_target (clickhouse-local ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-local DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-local" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-local) -endif () -if (ENABLE_CLICKHOUSE_BENCHMARK) - add_custom_target (clickhouse-benchmark ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-benchmark DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-benchmark" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-benchmark) -endif () -if (ENABLE_CLICKHOUSE_COPIER) - add_custom_target (clickhouse-copier ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-copier DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-copier" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-copier) -endif () -if (ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG) - add_custom_target (clickhouse-extract-from-config ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-extract-from-config DEPENDS clickhouse) - install (FILES 
"${CMAKE_CURRENT_BINARY_DIR}/clickhouse-extract-from-config" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-extract-from-config) -endif () -if (ENABLE_CLICKHOUSE_COMPRESSOR) - add_custom_target (clickhouse-compressor ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-compressor DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-compressor" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-compressor) -endif () -if (ENABLE_CLICKHOUSE_FORMAT) - add_custom_target (clickhouse-format ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-format DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-format" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-format) -endif () -if (ENABLE_CLICKHOUSE_OBFUSCATOR) - add_custom_target (clickhouse-obfuscator ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-obfuscator DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-obfuscator" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-obfuscator) -endif () -if (ENABLE_CLICKHOUSE_GIT_IMPORT) - add_custom_target (clickhouse-git-import ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-git-import DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-git-import" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-git-import) -endif () -if (ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER) - add_custom_target (clickhouse-static-files-disk-uploader ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-static-files-disk-uploader DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-static-files-disk-uploader" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-static-files-disk-uploader) -endif () -if (ENABLE_CLICKHOUSE_SU) - add_custom_target (clickhouse-su ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-su DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-su" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-su) -endif () + +clickhouse_program_install(clickhouse-server server) +clickhouse_program_install(clickhouse-client client chc) +clickhouse_program_install(clickhouse-local local chl ch) +clickhouse_program_install(clickhouse-benchmark benchmark) +clickhouse_program_install(clickhouse-copier copier) +clickhouse_program_install(clickhouse-extract-from-config extract-from-config) +clickhouse_program_install(clickhouse-compressor compressor) +clickhouse_program_install(clickhouse-format format) +clickhouse_program_install(clickhouse-obfuscator obfuscator) +clickhouse_program_install(clickhouse-git-import git-import) +clickhouse_program_install(clickhouse-static-files-disk-uploader static-files-disk-uploader) +clickhouse_program_install(clickhouse-disks disks) +clickhouse_program_install(clickhouse-su su) if (ENABLE_CLICKHOUSE_KEEPER) if (NOT BUILD_STANDALONE_KEEPER AND CREATE_KEEPER_SYMLINK) @@ -417,11 +238,6 @@ if (ENABLE_CLICKHOUSE_KEEPER_CLIENT) list(APPEND CLICKHOUSE_BUNDLE clickhouse-keeper-client) endif () -if (ENABLE_CLICKHOUSE_DISKS) - add_custom_target (clickhouse-disks ALL COMMAND ${CMAKE_COMMAND} -E 
create_symlink clickhouse clickhouse-disks DEPENDS clickhouse) - install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-disks" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - list(APPEND CLICKHOUSE_BUNDLE clickhouse-disks) -endif () add_custom_target (clickhouse-bundle ALL DEPENDS ${CLICKHOUSE_BUNDLE}) diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp index 59fc6c0c17f..961c678b936 100644 --- a/programs/benchmark/Benchmark.cpp +++ b/programs/benchmark/Benchmark.cpp @@ -640,7 +640,8 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv) { std::cout << "Usage: " << argv[0] << " [options] < queries.txt\n"; std::cout << desc << "\n"; - return 1; + std::cout << "\nSee also: https://clickhouse.com/docs/en/operations/utilities/clickhouse-benchmark/\n"; + return 0; } print_stacktrace = options.count("stacktrace"); diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 0988e1eb4a1..7a77b7dd0ec 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -504,7 +504,7 @@ void Client::connect() << "It may lack support for new features." << std::endl << std::endl; } - else if (client_version_tuple > server_version_tuple) + else if (client_version_tuple > server_version_tuple && server_display_name != "clickhouse-cloud") { std::cout << "ClickHouse server version is older than ClickHouse client. " << "It may indicate that the server is out of date and can be upgraded." << std::endl @@ -845,83 +845,7 @@ bool Client::processWithFuzzing(const String & full_query) have_error = true; } - // Check that after the query is formatted, we can parse it back, - // format again and get the same result. Unfortunately, we can't - // compare the ASTs, which would be more sensitive to errors. This - // double formatting check doesn't catch all errors, e.g. we can - // format query incorrectly, but to a valid SQL that we can then - // parse and format into the same SQL. - // There are some complicated cases where we can generate the SQL - // which we can't parse: - // * first argument of lambda() replaced by fuzzer with - // something else, leading to constructs such as - // arrayMap((min(x) + 3) -> x + 1, ....) - // * internals of Enum replaced, leading to: - // Enum(equals(someFunction(y), 3)). - // And there are even the cases when we can parse the query, but - // it's logically incorrect and its formatting is a mess, such as - // when `lambda()` function gets substituted into a wrong place. - // To avoid dealing with these cases, run the check only for the - // queries we were able to successfully execute. - // Another caveat is that sometimes WITH queries are not executed, - // if they are not referenced by the main SELECT, so they can still - // have the aforementioned problems. Disable this check for such - // queries, for lack of a better solution. - // There is also a problem that fuzzer substitutes positive Int64 - // literals or Decimal literals, which are then parsed back as - // UInt64, and suddenly duplicate alias substitution starts or stops - // working (ASTWithAlias::formatImpl) or something like that. - // So we compare not even the first and second formatting of the - // query, but second and third. - // If you have to add any more workarounds to this check, just remove - // it altogether, it's not so useful. 
- if (ast_to_process && !have_error && !queryHasWithClause(*ast_to_process)) - { - ASTPtr ast_2; - try - { - const auto * tmp_pos = query_to_execute.c_str(); - ast_2 = parseQuery(tmp_pos, tmp_pos + query_to_execute.size(), false /* allow_multi_statements */); - } - catch (Exception & e) - { - if (e.code() != ErrorCodes::SYNTAX_ERROR && - e.code() != ErrorCodes::TOO_DEEP_RECURSION) - throw; - } - - if (ast_2) - { - const auto text_2 = ast_2->formatForErrorMessage(); - const auto * tmp_pos = text_2.c_str(); - const auto ast_3 = parseQuery(tmp_pos, tmp_pos + text_2.size(), - false /* allow_multi_statements */); - const auto text_3 = ast_3 ? ast_3->formatForErrorMessage() : ""; - - if (text_3 != text_2) - { - fmt::print(stderr, "Found error: The query formatting is broken.\n"); - - printChangedSettings(); - - fmt::print(stderr, - "Got the following (different) text after formatting the fuzzed query and parsing it back:\n'{}'\n, expected:\n'{}'\n", - text_3, text_2); - fmt::print(stderr, "In more detail:\n"); - fmt::print(stderr, "AST-1 (generated by fuzzer):\n'{}'\n", ast_to_process->dumpTree()); - fmt::print(stderr, "Text-1 (AST-1 formatted):\n'{}'\n", query_to_execute); - fmt::print(stderr, "AST-2 (Text-1 parsed):\n'{}'\n", ast_2->dumpTree()); - fmt::print(stderr, "Text-2 (AST-2 formatted):\n'{}'\n", text_2); - fmt::print(stderr, "AST-3 (Text-2 parsed):\n'{}'\n", ast_3 ? ast_3->dumpTree() : ""); - fmt::print(stderr, "Text-3 (AST-3 formatted):\n'{}'\n", text_3); - fmt::print(stderr, "Text-3 must be equal to Text-2, but it is not.\n"); - - _exit(1); - } - } - } - - // The server is still alive so we're going to continue fuzzing. + // The server is still alive, so we're going to continue fuzzing. // Determine what we're going to use as the starting AST. 
if (have_error) { @@ -1000,6 +924,7 @@ void Client::printHelpMessage(const OptionsDescription & options_description) std::cout << options_description.external_description.value() << "\n"; std::cout << options_description.hosts_and_ports_description.value() << "\n"; std::cout << "In addition, --param_name=value can be specified for substitution of parameters for parametrized queries.\n"; + std::cout << "\nSee also: https://clickhouse.com/docs/en/integrations/sql-clients/cli\n"; } diff --git a/programs/compressor/Compressor.cpp b/programs/compressor/Compressor.cpp index cc25747702a..7125fdc744f 100644 --- a/programs/compressor/Compressor.cpp +++ b/programs/compressor/Compressor.cpp @@ -100,6 +100,7 @@ int mainEntryClickHouseCompressor(int argc, char ** argv) std::cout << "Usage: " << argv[0] << " [options] < INPUT > OUTPUT" << std::endl; std::cout << "Usage: " << argv[0] << " [options] INPUT OUTPUT" << std::endl; std::cout << desc << std::endl; + std::cout << "\nSee also: https://clickhouse.com/docs/en/operations/utilities/clickhouse-compressor/\n"; return 0; } diff --git a/programs/config_tools.h.in b/programs/config_tools.h.in index 65ef3ca762b..50a1de5628b 100644 --- a/programs/config_tools.h.in +++ b/programs/config_tools.h.in @@ -2,23 +2,8 @@ #pragma once -#cmakedefine01 ENABLE_CLICKHOUSE_SERVER -#cmakedefine01 ENABLE_CLICKHOUSE_CLIENT -#cmakedefine01 ENABLE_CLICKHOUSE_LOCAL -#cmakedefine01 ENABLE_CLICKHOUSE_BENCHMARK -#cmakedefine01 ENABLE_CLICKHOUSE_PERFORMANCE_TEST -#cmakedefine01 ENABLE_CLICKHOUSE_COPIER -#cmakedefine01 ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG -#cmakedefine01 ENABLE_CLICKHOUSE_COMPRESSOR -#cmakedefine01 ENABLE_CLICKHOUSE_FORMAT -#cmakedefine01 ENABLE_CLICKHOUSE_OBFUSCATOR -#cmakedefine01 ENABLE_CLICKHOUSE_GIT_IMPORT -#cmakedefine01 ENABLE_CLICKHOUSE_INSTALL #cmakedefine01 ENABLE_CLICKHOUSE_ODBC_BRIDGE #cmakedefine01 ENABLE_CLICKHOUSE_LIBRARY_BRIDGE #cmakedefine01 ENABLE_CLICKHOUSE_KEEPER #cmakedefine01 ENABLE_CLICKHOUSE_KEEPER_CLIENT #cmakedefine01 ENABLE_CLICKHOUSE_KEEPER_CONVERTER -#cmakedefine01 ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER -#cmakedefine01 ENABLE_CLICKHOUSE_SU -#cmakedefine01 ENABLE_CLICKHOUSE_DISKS diff --git a/programs/copier/ClusterCopierApp.cpp b/programs/copier/ClusterCopierApp.cpp index 53f79888573..fdf07dec61a 100644 --- a/programs/copier/ClusterCopierApp.cpp +++ b/programs/copier/ClusterCopierApp.cpp @@ -78,6 +78,7 @@ void ClusterCopierApp::handleHelp(const std::string &, const std::string &) help_formatter.setHeader("Copies tables from one cluster to another"); help_formatter.setUsage("--config-file --task-path "); + help_formatter.setFooter("See also: https://clickhouse.com/docs/en/operations/utilities/clickhouse-copier/"); help_formatter.format(std::cerr); stopOptionsProcessing(); } diff --git a/programs/disks/CMakeLists.txt b/programs/disks/CMakeLists.txt index 9477854a58b..f0949fcfceb 100644 --- a/programs/disks/CMakeLists.txt +++ b/programs/disks/CMakeLists.txt @@ -11,6 +11,10 @@ set (CLICKHOUSE_DISKS_SOURCES CommandRemove.cpp CommandWrite.cpp) +if (CLICKHOUSE_CLOUD) + set (CLICKHOUSE_DISKS_SOURCES ${CLICKHOUSE_DISKS_SOURCES} CommandPackedIO.cpp) +endif () + set (CLICKHOUSE_DISKS_LINK PRIVATE boost::program_options diff --git a/programs/disks/CommandRead.cpp b/programs/disks/CommandRead.cpp index 85041faf22c..0f3ac7ab98c 100644 --- a/programs/disks/CommandRead.cpp +++ b/programs/disks/CommandRead.cpp @@ -61,7 +61,6 @@ public: auto out = disk->writeFile(relative_path_output); copyData(*in, *out); out->finalize(); - return; } else { diff
--git a/programs/disks/DisksApp.cpp b/programs/disks/DisksApp.cpp index 4b3b83238a0..b7c3c7f5c97 100644 --- a/programs/disks/DisksApp.cpp +++ b/programs/disks/DisksApp.cpp @@ -65,6 +65,9 @@ void DisksApp::addOptions( positional_options_description.add("command_name", 1); supported_commands = {"list-disks", "list", "move", "remove", "link", "copy", "write", "read", "mkdir"}; +#ifdef CLICKHOUSE_CLOUD + supported_commands.insert("packed-io"); +#endif command_descriptions.emplace("list-disks", makeCommandListDisks()); command_descriptions.emplace("list", makeCommandList()); @@ -75,6 +78,9 @@ void DisksApp::addOptions( command_descriptions.emplace("write", makeCommandWrite()); command_descriptions.emplace("read", makeCommandRead()); command_descriptions.emplace("mkdir", makeCommandMkDir()); +#ifdef CLICKHOUSE_CLOUD + command_descriptions.emplace("packed-io", makeCommandPackedIO()); +#endif } void DisksApp::processOptions() @@ -89,6 +95,11 @@ void DisksApp::processOptions() config().setString("log-level", options["log-level"].as()); } +DisksApp::~DisksApp() +{ + global_context->shutdown(); +} + void DisksApp::init(std::vector & common_arguments) { stopOptionsProcessing(); @@ -134,6 +145,7 @@ void DisksApp::parseAndCheckOptions( .options(options_description_) .positional(positional_options_description) .allow_unregistered(); + po::parsed_options parsed = parser.run(); po::store(parsed, options); @@ -199,8 +211,8 @@ int DisksApp::main(const std::vector & /*args*/) po::parsed_options parsed = parser.run(); po::store(parsed, options); po::notify(options); - args = po::collect_unrecognized(parsed.options, po::collect_unrecognized_mode::include_positional); + args = po::collect_unrecognized(parsed.options, po::collect_unrecognized_mode::include_positional); command->processOptions(config(), options); } else diff --git a/programs/disks/DisksApp.h b/programs/disks/DisksApp.h index 0b596921707..51bc3f58dc4 100644 --- a/programs/disks/DisksApp.h +++ b/programs/disks/DisksApp.h @@ -21,6 +21,7 @@ class DisksApp : public Poco::Util::Application, public Loggers { public: DisksApp() = default; + ~DisksApp() override; void init(std::vector & common_arguments); @@ -52,9 +53,9 @@ protected: std::vector command_arguments; std::unordered_set supported_commands; - std::unordered_map command_descriptions; po::variables_map options; }; + } diff --git a/programs/disks/ICommand.h b/programs/disks/ICommand.h index da106e1084e..efe350fe87b 100644 --- a/programs/disks/ICommand.h +++ b/programs/disks/ICommand.h @@ -63,3 +63,4 @@ DB::CommandPtr makeCommandRead(); DB::CommandPtr makeCommandRemove(); DB::CommandPtr makeCommandWrite(); DB::CommandPtr makeCommandMkDir(); +DB::CommandPtr makeCommandPackedIO(); diff --git a/programs/git-import/git-import.cpp b/programs/git-import/git-import.cpp index 16244232bee..fdabeacd46e 100644 --- a/programs/git-import/git-import.cpp +++ b/programs/git-import/git-import.cpp @@ -172,6 +172,7 @@ clickhouse-client --query "INSERT INTO git.commits FORMAT TSV" < commits.tsv clickhouse-client --query "INSERT INTO git.file_changes FORMAT TSV" < file_changes.tsv clickhouse-client --query "INSERT INTO git.line_changes FORMAT TSV" < line_changes.tsv +Check out this presentation: https://presentations.clickhouse.com/matemarketing_2020/ )"; namespace po = boost::program_options; diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index 52f30098b38..c3d2c61d6d0 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -79,10 +79,6 @@ namespace ErrorCodes } -/// 
ANSI escape sequence for intense color in terminal. -#define HILITE "\033[1m" -#define END_HILITE "\033[0m" - #if defined(OS_DARWIN) /// Until createUser() and createGroup() are implemented, only sudo-less installations are supported/default for macOS. static constexpr auto DEFAULT_CLICKHOUSE_SERVER_USER = ""; @@ -216,6 +212,16 @@ int mainEntryClickHouseInstall(int argc, char ** argv) { try { + const char * start_hilite = ""; + const char * end_hilite = ""; + + if (isatty(STDOUT_FILENO)) + { + /// ANSI escape sequence for intense color in terminal. + start_hilite = "\033[1m"; + end_hilite = "\033[0m"; + } + po::options_description desc; desc.add_options() ("help,h", "produce help message") @@ -236,9 +242,10 @@ int mainEntryClickHouseInstall(int argc, char ** argv) if (options.count("help")) { + std::cout << "Install ClickHouse without .deb/.rpm/.tgz packages (having the binary only)\n\n"; std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " install [options]", getuid() != 0) << '\n'; std::cout << desc << '\n'; - return 1; + return 0; } /// We need to copy binary to the binary directory. @@ -707,7 +714,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) { fmt::print("Users config file {} already exists, will keep it and extract users info from it.\n", users_config_file.string()); - /// Check if password for default user already specified. + /// Check if password for the default user already specified. ConfigProcessor processor(users_config_file.string(), /* throw_on_bad_incl = */ false, /* log_to_console = */ false); ConfigurationPtr configuration(new Poco::Util::XMLConfiguration(processor.processConfig())); @@ -799,13 +806,13 @@ int mainEntryClickHouseInstall(int argc, char ** argv) /// Set up password for default user. if (has_password_for_default_user) { - fmt::print(HILITE "Password for default user is already specified. To remind or reset, see {} and {}." END_HILITE "\n", - users_config_file.string(), users_d.string()); + fmt::print("{}Password for the default user is already specified. To remind or reset, see {} and {}.{}\n", + start_hilite, users_config_file.string(), users_d.string(), end_hilite); } else if (!can_ask_password) { - fmt::print(HILITE "Password for default user is empty string. See {} and {} to change it." END_HILITE "\n", - users_config_file.string(), users_d.string()); + fmt::print("{}Password for the default user is an empty string. See {} and {} to change it.{}\n", + start_hilite, users_config_file.string(), users_d.string(), end_hilite); } else { @@ -814,7 +821,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) char buf[1000] = {}; std::string password; - if (auto * result = readpassphrase("Enter password for default user: ", buf, sizeof(buf), 0)) + if (auto * result = readpassphrase("Enter password for the default user: ", buf, sizeof(buf), 0)) password = result; if (!password.empty()) @@ -839,7 +846,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) "\n"; out.sync(); out.finalize(); - fmt::print(HILITE "Password for default user is saved in file {}." END_HILITE "\n", password_file); + fmt::print("{}Password for the default user is saved in file {}.{}\n", start_hilite, password_file, end_hilite); #else out << "\n" " \n" @@ -850,13 +857,13 @@ int mainEntryClickHouseInstall(int argc, char ** argv) "\n"; out.sync(); out.finalize(); - fmt::print(HILITE "Password for default user is saved in plaintext in file {}." 
END_HILITE "\n", password_file); + fmt::print("{}Password for the default user is saved in plaintext in file {}.{}\n", start_hilite, password_file, end_hilite); #endif has_password_for_default_user = true; } else - fmt::print(HILITE "Password for default user is empty string. See {} and {} to change it." END_HILITE "\n", - users_config_file.string(), users_d.string()); + fmt::print("{}Password for the default user is an empty string. See {} and {} to change it.{}\n", + start_hilite, users_config_file.string(), users_d.string(), end_hilite); } /** Set capabilities for the binary. diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index d19e2ffe00f..62b082ce15a 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -115,7 +115,7 @@ bool CreateCommand::parse(IParser::Pos & pos, std::shared_ptr & else if (ParserKeyword{"PERSISTENT SEQUENTIAL"}.ignore(pos, expected)) mode = zkutil::CreateMode::PersistentSequential; - node->args.push_back(mode); + node->args.push_back(std::move(mode)); return true; } diff --git a/programs/keeper-converter/KeeperConverter.cpp b/programs/keeper-converter/KeeperConverter.cpp index 2b2759412ab..92bdea28738 100644 --- a/programs/keeper-converter/KeeperConverter.cpp +++ b/programs/keeper-converter/KeeperConverter.cpp @@ -1,5 +1,4 @@ #include -#include #include #include diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index 143ded0ee85..b8a5d9c9c19 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -39,6 +39,7 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperContext.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStateManager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperConstants.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperAsynchronousMetrics.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/pathUtils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp @@ -69,6 +70,7 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/ServerType.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTPRequestHandlerFactoryMain.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/KeeperReadinessHandler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/CloudPlacementInfo.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/HTTPServer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/ReadHeaders.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/HTTPServerConnection.cpp diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index c751702dc6f..5b844e7d650 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -31,9 +32,10 @@ #include #include -#include #include #include +#include +#include #include "Core/Defines.h" #include "config.h" @@ -352,6 +354,11 @@ try std::string include_from_path = config().getString("include_from", "/etc/metrika.xml"); + if (config().has(DB::PlacementInfo::PLACEMENT_CONFIG_PREFIX)) + { + PlacementInfo::PlacementInfo::instance().initialize(config()); + } + GlobalThreadPool::initialize( config().getUInt("max_thread_pool_size", 100), config().getUInt("max_thread_pool_free_size", 1000), @@ -482,19 +489,28 @@ try /// Prometheus (if defined and not setup yet with http_port) port_name = "prometheus.port"; - 
createServer(listen_host, port_name, listen_try, [&, my_http_context = std::move(http_context)](UInt16 port) mutable - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(my_http_context->getReceiveTimeout()); - socket.setSendTimeout(my_http_context->getSendTimeout()); - servers->emplace_back( - listen_host, - port_name, - "Prometheus: http://" + address.toString(), - std::make_unique( - std::move(my_http_context), createPrometheusMainHandlerFactory(*this, config_getter(), async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params)); - }); + createServer( + listen_host, + port_name, + listen_try, + [&, my_http_context = std::move(http_context)](UInt16 port) mutable + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(my_http_context->getReceiveTimeout()); + socket.setSendTimeout(my_http_context->getSendTimeout()); + auto metrics_writer = std::make_shared(config, "prometheus", async_metrics); + servers->emplace_back( + listen_host, + port_name, + "Prometheus: http://" + address.toString(), + std::make_unique( + std::move(my_http_context), + createPrometheusMainHandlerFactory(*this, config_getter(), metrics_writer, "PrometheusHandler-factory"), + server_pool, + socket, + http_params)); + }); /// HTTP control endpoints port_name = "keeper_server.http_control.port"; diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 443d4a52fa3..e19cf17c9f4 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -249,7 +249,7 @@ void LocalServer::tryInitPath() default_path = parent_folder / fmt::format("clickhouse-local-{}-{}-{}", getpid(), time(nullptr), randomSeed()); if (exists(default_path)) - throw Exception(ErrorCodes::FILE_ALREADY_EXISTS, "Unsuccessful attempt to create working directory: {} exist!", default_path.string()); + throw Exception(ErrorCodes::FILE_ALREADY_EXISTS, "Unsuccessful attempt to create working directory: {} already exists.", default_path.string()); create_directory(default_path); temporary_directory_to_delete = default_path; @@ -336,23 +336,23 @@ std::string LocalServer::getInitialCreateTableQuery() auto table_structure = config().getString("table-structure", "auto"); String table_file; - String format_from_file_name; + std::optional format_from_file_name; if (!config().has("table-file") || config().getString("table-file") == "-") { /// Use Unix tools stdin naming convention table_file = "stdin"; - format_from_file_name = FormatFactory::instance().getFormatFromFileDescriptor(STDIN_FILENO); + format_from_file_name = FormatFactory::instance().tryGetFormatFromFileDescriptor(STDIN_FILENO); } else { /// Use regular file auto file_name = config().getString("table-file"); table_file = quoteString(file_name); - format_from_file_name = FormatFactory::instance().getFormatFromFileName(file_name, false); + format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(file_name); } auto data_format = backQuoteIfNeed( - config().getString("table-data-format", config().getString("format", format_from_file_name.empty() ? "TSV" : format_from_file_name))); + config().getString("table-data-format", config().getString("format", format_from_file_name ? 
*format_from_file_name : "TSV"))); if (table_structure == "auto") @@ -828,6 +828,7 @@ void LocalServer::printHelpMessage([[maybe_unused]] const OptionsDescription & o std::cout << options_description.main_description.value() << "\n"; std::cout << getHelpFooter() << "\n"; std::cout << "In addition, --param_name=value can be specified for substitution of parameters for parametrized queries.\n"; + std::cout << "\nSee also: https://clickhouse.com/docs/en/operations/utilities/clickhouse-local/\n"; #endif } diff --git a/programs/main.cpp b/programs/main.cpp index 7d07112de66..3896b3819a8 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -19,39 +20,32 @@ #include #include +#include /// Universal executable for various clickhouse applications -#if ENABLE_CLICKHOUSE_SERVER int mainEntryClickHouseServer(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_CLIENT int mainEntryClickHouseClient(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_LOCAL int mainEntryClickHouseLocal(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_BENCHMARK int mainEntryClickHouseBenchmark(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG int mainEntryClickHouseExtractFromConfig(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_COMPRESSOR int mainEntryClickHouseCompressor(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_FORMAT int mainEntryClickHouseFormat(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_COPIER int mainEntryClickHouseClusterCopier(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_OBFUSCATOR int mainEntryClickHouseObfuscator(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_GIT_IMPORT int mainEntryClickHouseGitImport(int argc, char ** argv); -#endif +int mainEntryClickHouseStaticFilesDiskUploader(int argc, char ** argv); +int mainEntryClickHouseSU(int argc, char ** argv); +int mainEntryClickHouseDisks(int argc, char ** argv); + +int mainEntryClickHouseHashBinary(int, char **) +{ + /// Intentionally without newline. So you can run: + /// objcopy --add-section .clickhouse.hash=<(./clickhouse hash-binary) clickhouse + std::cout << getHashOfLoadedBinaryHex(); + return 0; +} + #if ENABLE_CLICKHOUSE_KEEPER int mainEntryClickHouseKeeper(int argc, char ** argv); #endif @@ -61,30 +55,13 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv); #if ENABLE_CLICKHOUSE_KEEPER_CLIENT int mainEntryClickHouseKeeperClient(int argc, char ** argv); #endif -#if ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER -int mainEntryClickHouseStaticFilesDiskUploader(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_SU -int mainEntryClickHouseSU(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_INSTALL + +// install int mainEntryClickHouseInstall(int argc, char ** argv); int mainEntryClickHouseStart(int argc, char ** argv); int mainEntryClickHouseStop(int argc, char ** argv); int mainEntryClickHouseStatus(int argc, char ** argv); int mainEntryClickHouseRestart(int argc, char ** argv); -#endif -#if ENABLE_CLICKHOUSE_DISKS -int mainEntryClickHouseDisks(int argc, char ** argv); -#endif - -int mainEntryClickHouseHashBinary(int, char **) -{ - /// Intentionally without newline. 
So you can run: - /// objcopy --add-section .clickhouse.hash=<(./clickhouse hash-binary) clickhouse - std::cout << getHashOfLoadedBinaryHex(); - return 0; -} namespace { @@ -96,36 +73,22 @@ using MainFunc = int (*)(int, char**); /// Add an item here to register new application std::pair clickhouse_applications[] = { -#if ENABLE_CLICKHOUSE_LOCAL {"local", mainEntryClickHouseLocal}, -#endif -#if ENABLE_CLICKHOUSE_CLIENT {"client", mainEntryClickHouseClient}, -#endif -#if ENABLE_CLICKHOUSE_BENCHMARK {"benchmark", mainEntryClickHouseBenchmark}, -#endif -#if ENABLE_CLICKHOUSE_SERVER {"server", mainEntryClickHouseServer}, -#endif -#if ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG {"extract-from-config", mainEntryClickHouseExtractFromConfig}, -#endif -#if ENABLE_CLICKHOUSE_COMPRESSOR {"compressor", mainEntryClickHouseCompressor}, -#endif -#if ENABLE_CLICKHOUSE_FORMAT {"format", mainEntryClickHouseFormat}, -#endif -#if ENABLE_CLICKHOUSE_COPIER {"copier", mainEntryClickHouseClusterCopier}, -#endif -#if ENABLE_CLICKHOUSE_OBFUSCATOR {"obfuscator", mainEntryClickHouseObfuscator}, -#endif -#if ENABLE_CLICKHOUSE_GIT_IMPORT {"git-import", mainEntryClickHouseGitImport}, -#endif + {"static-files-disk-uploader", mainEntryClickHouseStaticFilesDiskUploader}, + {"su", mainEntryClickHouseSU}, + {"hash-binary", mainEntryClickHouseHashBinary}, + {"disks", mainEntryClickHouseDisks}, + + // keeper #if ENABLE_CLICKHOUSE_KEEPER {"keeper", mainEntryClickHouseKeeper}, #endif @@ -135,34 +98,20 @@ std::pair clickhouse_applications[] = #if ENABLE_CLICKHOUSE_KEEPER_CLIENT {"keeper-client", mainEntryClickHouseKeeperClient}, #endif -#if ENABLE_CLICKHOUSE_INSTALL + + // install {"install", mainEntryClickHouseInstall}, {"start", mainEntryClickHouseStart}, {"stop", mainEntryClickHouseStop}, {"status", mainEntryClickHouseStatus}, {"restart", mainEntryClickHouseRestart}, -#endif -#if ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER - {"static-files-disk-uploader", mainEntryClickHouseStaticFilesDiskUploader}, -#endif -#if ENABLE_CLICKHOUSE_SU - {"su", mainEntryClickHouseSU}, -#endif - {"hash-binary", mainEntryClickHouseHashBinary}, -#if ENABLE_CLICKHOUSE_DISKS - {"disks", mainEntryClickHouseDisks}, -#endif }; /// Add an item here to register a new short name std::pair clickhouse_short_names[] = { -#if ENABLE_CLICKHOUSE_LOCAL {"chl", "local"}, -#endif -#if ENABLE_CLICKHOUSE_CLIENT {"chc", "client"}, -#endif }; int printHelp(int, char **) @@ -392,6 +341,50 @@ void checkHarmfulEnvironmentVariables(char ** argv) } #endif + +#if defined(SANITIZE_COVERAGE) +__attribute__((no_sanitize("coverage"))) +void dumpCoverage() +{ + /// A user can request to dump the coverage information into files at exit. + /// This is useful for non-server applications such as clickhouse-format or clickhouse-client, + /// that cannot introspect it with SQL functions at runtime. + + /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for a filename 'prefix.pid' + /// containing the list of addresses of covered . + + /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header. + + if (const char * coverage_filename_prefix = getenv("CLICKHOUSE_WRITE_COVERAGE")) // NOLINT(concurrency-mt-unsafe) + { + auto dump = [](const std::string & name, auto span) + { + /// Write only non-zeros. 
+ std::vector data; + data.reserve(span.size()); + for (auto addr : span) + if (addr) + data.push_back(addr); + + int fd = ::open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0400); + if (-1 == fd) + { + writeError("Cannot open a file to write the coverage data\n"); + } + else + { + if (!writeRetry(fd, reinterpret_cast(data.data()), data.size() * sizeof(data[0]))) + writeError("Cannot write the coverage data to a file\n"); + if (0 != ::close(fd)) + writeError("Cannot close the file with coverage data\n"); + } + }; + + dump(fmt::format("{}.{}", coverage_filename_prefix, getpid()), getCumulativeCoverage()); + } +} +#endif + } bool isClickhouseApp(std::string_view app_suffix, std::vector & argv) @@ -512,6 +505,12 @@ int main(int argc_, char ** argv_) if (main_func == printHelp && !argv.empty() && (argv.size() == 1 || argv[1][0] == '-')) main_func = mainEntryClickHouseLocal; - return main_func(static_cast(argv.size()), argv.data()); + int exit_code = main_func(static_cast(argv.size()), argv.data()); + +#if defined(SANITIZE_COVERAGE) + dumpCoverage(); +#endif + + return exit_code; } #endif diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 7e09d5e8046..242e995e466 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -1310,7 +1310,7 @@ try throw ErrnoException(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Input must be seekable file (it will be read twice)"); SingleReadBufferIterator read_buffer_iterator(std::move(file)); - schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, false, context_const); + schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, context_const); } else { diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 75ec574c357..0a3c23d746a 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,7 @@ #include #include #include +#include #include #include #include @@ -557,7 +559,7 @@ static void sanityChecks(Server & server) { const char * filename = "/proc/sys/kernel/task_delayacct"; if (readNumber(filename) == 0) - server.context()->addWarningMessage("Delay accounting is not enabled, OSIOWaitMicroseconds will not be gathered. Check " + String(filename)); + server.context()->addWarningMessage("Delay accounting is not enabled, OSIOWaitMicroseconds will not be gathered. You can enable it using `echo 1 > " + String(filename) + "` or by using sysctl."); } catch (...) // NOLINT(bugprone-empty-catch) { @@ -711,6 +713,22 @@ try getNumberOfPhysicalCPUCores(), // on ARM processors it can show only enabled at current moment cores std::thread::hardware_concurrency()); +#if defined(__x86_64__) + String cpu_info; +#define COLLECT_FLAG(X) \ + if (CPU::have##X()) \ + { \ + if (!cpu_info.empty()) \ + cpu_info += ", "; \ + cpu_info += #X; \ + } + + CPU_ID_ENUMERATE(COLLECT_FLAG) +#undef COLLECT_FLAG + + LOG_INFO(log, "Available CPU instruction sets: {}", cpu_info); +#endif + sanityChecks(*this); // Initialize global thread pool. Do it before we fetch configs from zookeeper @@ -826,6 +844,13 @@ try 0, // We don't need any threads one all the parts will be deleted server_settings.max_parts_cleaning_thread_pool_size); + auto max_database_replicated_create_table_thread_pool_size = server_settings.max_database_replicated_create_table_thread_pool_size ? 
+ server_settings.max_database_replicated_create_table_thread_pool_size : getNumberOfPhysicalCPUCores(); + getDatabaseReplicatedCreateTablesThreadPool().initialize( + max_database_replicated_create_table_thread_pool_size, + 0, // We don't need any threads once all the tables will be created + max_database_replicated_create_table_thread_pool_size); + /// Initialize global local cache for remote filesystem. if (config().has("local_cache_for_remote_fs")) { @@ -1953,6 +1978,11 @@ try load_metadata_tasks); } + if (config().has(DB::PlacementInfo::PLACEMENT_CONFIG_PREFIX)) + { + PlacementInfo::PlacementInfo::instance().initialize(config()); + } + /// Do not keep tasks in server, they should be kept inside databases. Used here to make dependent tasks only. load_metadata_tasks.clear(); load_metadata_tasks.shrink_to_fit(); diff --git a/programs/server/binary.html b/programs/server/binary.html index 988dd33a72a..eec39cd4463 100644 --- a/programs/server/binary.html +++ b/programs/server/binary.html @@ -60,10 +60,29 @@ /// If it is hosted on server, assume that it is the address of ClickHouse. if (location.protocol != 'file:') { host = location.origin; - user = 'default'; add_http_cors_header = false; } + if (window.location.search) { + const params = new URLSearchParams(window.location.search); + if (params.has('host')) { host = params.get('host'); } + if (params.has('user')) { user = params.get('user'); } + if (params.has('password')) { password = params.get('password'); } + } + + let url = `${host}?allow_introspection_functions=1`; + + if (add_http_cors_header) { + url += '&add_http_cors_header=1'; + } + + if (user) { + url += `&user=${encodeURIComponent(user)}`; + } + if (password) { + url += `&password=${encodeURIComponent(password)}`; + } + let map = L.map('space', { crs: L.CRS.Simple, center: [-512, 512], @@ -97,24 +116,11 @@ const key = `${coords.z}-${coords.x}-${coords.y}`; let buf = cached_tiles[key]; if (!buf) { - let url = `${host}?default_format=RowBinary&allow_introspection_functions=1`; + let request_url = `${url}&default_format=RowBinary` + + `¶m_z=${coords.z}¶m_x=${coords.x}¶m_y=${coords.y}` + + `&enable_http_compression=1&network_compression_method=zstd&network_zstd_compression_level=6`; - if (add_http_cors_header) { - // For debug purposes, you may set add_http_cors_header from a browser console - url += '&add_http_cors_header=1'; - } - - if (user) { - url += `&user=${encodeURIComponent(user)}`; - } - if (password) { - url += `&password=${encodeURIComponent(password)}`; - } - - url += `¶m_z=${coords.z}¶m_x=${coords.x}¶m_y=${coords.y}`; - url += `&enable_http_compression=1&network_compression_method=zstd&network_zstd_compression_level=6`; - - const response = await fetch(url, { method: 'POST', body: sql }); + const response = await fetch(request_url, { method: 'POST', body: sql }); if (!response.ok) { const text = await response.text(); @@ -232,7 +238,7 @@ const addr_hex = '0x' + addr_int.toString(16); const response = fetch( - `http://localhost:8123/?default_format=JSON`, + `${url}&default_format=JSON`, { method: 'POST', body: `SELECT encodeXMLComponent(demangle(addressToSymbol(${addr_int}::UInt64))) AS name, diff --git a/programs/server/config.xml b/programs/server/config.xml index 0dc271692b8..23f3458110e 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -937,6 +937,11 @@ --> + + it is reasonable to use all the cores. 
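A note on the coverage dump added to programs/main.cpp above: the file written when CLICKHOUSE_WRITE_COVERAGE is set is named 'prefix.pid' and contains nothing but non-zero 64-bit addresses in native byte order, with no header. A minimal sketch of a reader for that format; the file name is a placeholder and the program is illustrative, not part of the patch:

    #include <cstdint>
    #include <cstdio>
    #include <fstream>
    #include <vector>

    int main()
    {
        /// Example path: CLICKHOUSE_WRITE_COVERAGE=coverage was exported and the process had pid 12345.
        std::ifstream in("coverage.12345", std::ios::binary | std::ios::ate);
        if (!in)
            return 1;

        std::vector<uint64_t> addresses(static_cast<size_t>(in.tellg()) / sizeof(uint64_t));
        in.seekg(0);
        in.read(reinterpret_cast<char *>(addresses.data()),
                static_cast<std::streamsize>(addresses.size() * sizeof(uint64_t)));

        for (uint64_t addr : addresses)
            std::printf("0x%016llx\n", static_cast<unsigned long long>(addr));
        return 0;
    }

Each address can then be symbolized against the same binary, for example with addr2line or the addressToSymbol() SQL function mentioned in the comments above.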
- if (cpu_count >= 32) - cpu_count = physical_concurrency(); + if (cores >= 32) + cores = physical_concurrency(); #endif #if defined(OS_LINUX) - cpu_count = getCGroupLimitedCPUCores(cpu_count); + cores = getCGroupLimitedCPUCores(cores); #endif - return cpu_count; + return cores; } } @@ -203,6 +203,6 @@ unsigned getNumberOfPhysicalCPUCoresImpl() unsigned getNumberOfPhysicalCPUCores() { /// Calculate once. - static auto res = getNumberOfPhysicalCPUCoresImpl(); - return res; + static auto cores = getNumberOfPhysicalCPUCoresImpl(); + return cores; } diff --git a/src/Common/logger_useful.h b/src/Common/logger_useful.h index 3899d060b7c..8e78e93e198 100644 --- a/src/Common/logger_useful.h +++ b/src/Common/logger_useful.h @@ -9,6 +9,7 @@ #include #include #include +#include namespace Poco { class Logger; } @@ -20,7 +21,8 @@ using LogSeriesLimiterPtr = std::shared_ptr; namespace { - [[maybe_unused]] const ::Poco::Logger * getLoggerHelper(const LoggerPtr & logger) { return logger.get(); } + [[maybe_unused]] LoggerPtr getLoggerHelper(const LoggerPtr & logger) { return logger; } + [[maybe_unused]] LoggerPtr getLoggerHelper(const AtomicLogger & logger) { return logger.load(); } [[maybe_unused]] const ::Poco::Logger * getLoggerHelper(const ::Poco::Logger * logger) { return logger; } [[maybe_unused]] std::unique_ptr getLoggerHelper(std::unique_ptr && logger) { return logger; } [[maybe_unused]] std::unique_ptr getLoggerHelper(std::unique_ptr && logger) { return logger; } diff --git a/src/Common/memcmpSmall.h b/src/Common/memcmpSmall.h index 36d5d7efab8..103eabb5b8d 100644 --- a/src/Common/memcmpSmall.h +++ b/src/Common/memcmpSmall.h @@ -7,6 +7,7 @@ #include #include +#include namespace detail @@ -26,9 +27,8 @@ inline int cmp(T a, T b) /// We can process uninitialized memory in the functions below. -/// Results don't depend on the values inside uninitialized memory but Memory Sanitizer cannot see it. -/// Disable optimized functions if compile with Memory Sanitizer. 
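The hunks that follow replace the old policy of compiling these SIMD routines out under MemorySanitizer with explicit unpoisoning of the up to 15 trailing bytes that the 16-byte-wide loads may read past the end of a buffer. A plausible sketch of what a helper like __msan_unpoison_overflow_15 amounts to; the name unpoisonOverflow15 is illustrative and the real definition in the common headers may compute the exact overhang differently:

    #include <cstddef>
    #if defined(MEMORY_SANITIZER)
    #    include <sanitizer/msan_interface.h>
    #endif

    /// The 16-byte loops may read at most 15 bytes beyond `size`; telling MSan that this tail is
    /// initialized suppresses false positives while keeping the fast path enabled.
    inline void unpoisonOverflow15(const void * data, size_t size)
    {
    #if defined(MEMORY_SANITIZER)
        __msan_unpoison(static_cast<const char *>(data) + size, 15);
    #else
        (void)data;
        (void)size;
    #endif
    }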
-#if defined(__AVX512BW__) && defined(__AVX512VL__) && !defined(MEMORY_SANITIZER) +/// Results don't depend on the values inside uninitialized memory +#if defined(__AVX512BW__) && defined(__AVX512VL__) # include @@ -42,6 +42,9 @@ inline int cmp(T a, T b) template inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) { + __msan_unpoison_overflow_15(a, a_size); + __msan_unpoison_overflow_15(b, b_size); + size_t min_size = std::min(a_size, b_size); for (size_t offset = 0; offset < min_size; offset += 16) @@ -74,6 +77,9 @@ inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char template inline int memcmpSmallLikeZeroPaddedAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) { + __msan_unpoison_overflow_15(a, a_size); + __msan_unpoison_overflow_15(b, b_size); + size_t min_size = std::min(a_size, b_size); for (size_t offset = 0; offset < min_size; offset += 16) @@ -144,6 +150,9 @@ inline int memcmpSmallLikeZeroPaddedAllowOverflow15(const Char * a, size_t a_siz template inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size) { + __msan_unpoison_overflow_15(a, size); + __msan_unpoison_overflow_15(b, size); + for (size_t offset = 0; offset < size; offset += 16) { uint16_t mask = _mm_cmp_epi8_mask( @@ -174,6 +183,9 @@ inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Ch if (a_size != b_size) return false; + __msan_unpoison_overflow_15(a, a_size); + __msan_unpoison_overflow_15(b, b_size); + for (size_t offset = 0; offset < a_size; offset += 16) { uint16_t mask = _mm_cmp_epi8_mask( @@ -246,6 +258,7 @@ inline bool memequal16(const void * a, const void * b) /** Compare memory region to zero */ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size) { + __msan_unpoison_overflow_15(reinterpret_cast(data), size); const __m128i zero16 = _mm_setzero_si128(); for (size_t offset = 0; offset < size; offset += 16) @@ -263,7 +276,7 @@ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size) return true; } -#elif defined(__SSE2__) && !defined(MEMORY_SANITIZER) +#elif defined(__SSE2__) # include @@ -277,6 +290,9 @@ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size) template inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) { + __msan_unpoison_overflow_15(a, a_size); + __msan_unpoison_overflow_15(b, b_size); + size_t min_size = std::min(a_size, b_size); for (size_t offset = 0; offset < min_size; offset += 16) @@ -309,6 +325,9 @@ inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char template inline int memcmpSmallLikeZeroPaddedAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) { + __msan_unpoison_overflow_15(a, a_size); + __msan_unpoison_overflow_15(b, b_size); + size_t min_size = std::min(a_size, b_size); for (size_t offset = 0; offset < min_size; offset += 16) @@ -380,6 +399,9 @@ inline int memcmpSmallLikeZeroPaddedAllowOverflow15(const Char * a, size_t a_siz template inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size) { + __msan_unpoison_overflow_15(a, size); + __msan_unpoison_overflow_15(b, size); + for (size_t offset = 0; offset < size; offset += 16) { uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8( @@ -410,6 +432,9 @@ inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Ch if (a_size != b_size) return false; + __msan_unpoison_overflow_15(a, 
a_size); + __msan_unpoison_overflow_15(b, b_size); + for (size_t offset = 0; offset < a_size; offset += 16) { uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8( @@ -483,6 +508,8 @@ inline bool memequal16(const void * a, const void * b) /** Compare memory region to zero */ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size) { + __msan_unpoison_overflow_15(reinterpret_cast(data), size); + const __m128i zero16 = _mm_setzero_si128(); for (size_t offset = 0; offset < size; offset += 16) @@ -509,6 +536,9 @@ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size) template inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) { + __msan_unpoison_overflow_15(a, a_size); + __msan_unpoison_overflow_15(b, b_size); + size_t min_size = std::min(a_size, b_size); for (size_t offset = 0; offset < min_size; offset += 16) @@ -534,6 +564,9 @@ inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char template inline int memcmpSmallLikeZeroPaddedAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size) { + __msan_unpoison_overflow_15(a, a_size); + __msan_unpoison_overflow_15(b, b_size); + size_t min_size = std::min(a_size, b_size); for (size_t offset = 0; offset < min_size; offset += 16) @@ -599,6 +632,9 @@ inline int memcmpSmallLikeZeroPaddedAllowOverflow15(const Char * a, size_t a_siz template inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size) { + __msan_unpoison_overflow_15(a, size); + __msan_unpoison_overflow_15(b, size); + for (size_t offset = 0; offset < size; offset += 16) { uint64_t mask = getNibbleMask(vceqq_u8( @@ -625,6 +661,9 @@ inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Ch if (a_size != b_size) return false; + __msan_unpoison_overflow_15(a, a_size); + __msan_unpoison_overflow_15(b, b_size); + for (size_t offset = 0; offset < a_size; offset += 16) { uint64_t mask = getNibbleMask(vceqq_u8( @@ -683,6 +722,7 @@ inline bool memequal16(const void * a, const void * b) inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size) { + __msan_unpoison_overflow_15(reinterpret_cast(data), size); for (size_t offset = 0; offset < size; offset += 16) { uint64_t mask = getNibbleMask(vceqzq_u8(vld1q_u8(reinterpret_cast(data) + offset))); diff --git a/src/Common/memcpySmall.h b/src/Common/memcpySmall.h index 0c2aee96250..f3d26c60380 100644 --- a/src/Common/memcpySmall.h +++ b/src/Common/memcpySmall.h @@ -1,5 +1,7 @@ #pragma once +#include + #include #include /// ssize_t @@ -38,6 +40,7 @@ namespace detail { inline void memcpySmallAllowReadWriteOverflow15Impl(char * __restrict dst, const char * __restrict src, ssize_t n) { + __msan_unpoison_overflow_15(src, n); while (n > 0) { _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), @@ -64,6 +67,7 @@ namespace detail { inline void memcpySmallAllowReadWriteOverflow15Impl(char * __restrict dst, const char * __restrict src, ssize_t n) { + __msan_unpoison_overflow_15(src, n); while (n > 0) { vst1q_s8(reinterpret_cast(dst), vld1q_s8(reinterpret_cast(src))); diff --git a/src/Common/tests/gtest_async_loader.cpp b/src/Common/tests/gtest_async_loader.cpp index d978d23750c..fc2537abcfc 100644 --- a/src/Common/tests/gtest_async_loader.cpp +++ b/src/Common/tests/gtest_async_loader.cpp @@ -427,9 +427,7 @@ TEST(AsyncLoader, CancelExecutingTask) } } -// This test is disabled due to `MemorySanitizer: use-of-uninitialized-value` issue in `collectSymbolsFromProgramHeaders` 
function -// More details: https://github.com/ClickHouse/ClickHouse/pull/48923#issuecomment-1545415482 -TEST(AsyncLoader, DISABLED_JobFailure) +TEST(AsyncLoader, JobFailure) { AsyncLoaderTest t; t.loader.start(); @@ -622,7 +620,13 @@ TEST(AsyncLoader, CustomDependencyFailure) auto dependent_job1 = makeLoadJob({ collect_job }, "dependent_job1", dependent_job_func); auto dependent_job2 = makeLoadJob({ collect_job }, "dependent_job2", dependent_job_func); auto dependent_job3 = makeLoadJob({ collect_job }, "dependent_job3", dependent_job_func); - auto task = t.schedule({ dependent_job1, dependent_job2, dependent_job3 }); // Other jobs should be discovery automatically + auto task = t.schedule({ + dependent_job1, dependent_job2, dependent_job3, + collect_job, + late_dep1, late_dep2, late_dep3, + good_dep1, good_dep2, good_dep3, + evil_dep1, evil_dep2, evil_dep3, + }); t.loader.wait(collect_job, true); canceled_sync.arrive_and_wait(); // (A) @@ -1022,8 +1026,10 @@ TEST(AsyncLoader, SetMaxThreads) }; // Generate enough independent jobs + std::vector tasks; + tasks.reserve(1000); for (int i = 0; i < 1000; i++) - t.schedule({makeLoadJob({}, "job", job_func)})->detach(); + tasks.push_back(t.schedule({makeLoadJob({}, "job", job_func)})); t.loader.start(); while (sync_index < syncs.size()) diff --git a/src/Common/tests/gtest_log.cpp b/src/Common/tests/gtest_log.cpp index 419aac370d6..6d2bd56ad77 100644 --- a/src/Common/tests/gtest_log.cpp +++ b/src/Common/tests/gtest_log.cpp @@ -9,6 +9,7 @@ #include #include #include +#include TEST(Logger, Log) @@ -32,7 +33,6 @@ TEST(Logger, TestLog) LOG_TEST(log, "Hello World"); EXPECT_EQ(oss.str(), "Hello World\n"); - Poco::Logger::destroy("TestLogger"); } { /// Test logs invisible for other levels @@ -45,8 +45,6 @@ TEST(Logger, TestLog) LOG_TEST(log, "Hello World"); EXPECT_EQ(oss.str(), ""); - - Poco::Logger::destroy(std::string{level} + "_Logger"); } } @@ -103,3 +101,75 @@ TEST(Logger, SideEffects) LOG_TRACE(log, "test no throw {}", getLogMessageParamOrThrow()); } + +TEST(Logger, SharedRawLogger) +{ + { + std::ostringstream stream; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + auto stream_channel = Poco::AutoPtr(new Poco::StreamChannel(stream)); + + auto shared_logger = getLogger("Logger_1"); + shared_logger->setChannel(stream_channel.get()); + shared_logger->setLevel("trace"); + + LOG_TRACE(shared_logger, "SharedLogger1Log1"); + LOG_TRACE(getRawLogger("Logger_1"), "RawLogger1Log"); + LOG_TRACE(shared_logger, "SharedLogger1Log2"); + + auto actual = stream.str(); + EXPECT_EQ(actual, "SharedLogger1Log1\nRawLogger1Log\nSharedLogger1Log2\n"); + } + { + std::ostringstream stream; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + auto stream_channel = Poco::AutoPtr(new Poco::StreamChannel(stream)); + + auto * raw_logger = getRawLogger("Logger_2"); + raw_logger->setChannel(stream_channel.get()); + raw_logger->setLevel("trace"); + + LOG_TRACE(getLogger("Logger_2"), "SharedLogger2Log1"); + LOG_TRACE(raw_logger, "RawLogger2Log"); + LOG_TRACE(getLogger("Logger_2"), "SharedLogger2Log2"); + + auto actual = stream.str(); + EXPECT_EQ(actual, "SharedLogger2Log1\nRawLogger2Log\nSharedLogger2Log2\n"); + } +} + +TEST(Logger, SharedLoggersThreadSafety) +{ + static size_t threads_count = std::thread::hardware_concurrency(); + static constexpr size_t loggers_count = 10; + static constexpr size_t logger_get_count = 1000; + + Poco::Logger::root(); + + std::vector names; + + Poco::Logger::names(names); + size_t loggers_size_before = names.size(); + + std::vector threads; + + for (size_t thread_index = 
0; thread_index < threads_count; ++thread_index) + { + threads.emplace_back([]() + { + for (size_t logger_index = 0; logger_index < loggers_count; ++logger_index) + { + for (size_t iteration = 0; iteration < logger_get_count; ++iteration) + { + getLogger("Logger_" + std::to_string(logger_index)); + } + } + }); + } + + for (auto & thread : threads) + thread.join(); + + Poco::Logger::names(names); + size_t loggers_size_after = names.size(); + + EXPECT_EQ(loggers_size_before, loggers_size_after); +} diff --git a/src/Interpreters/threadPoolCallbackRunner.h b/src/Common/threadPoolCallbackRunner.h similarity index 100% rename from src/Interpreters/threadPoolCallbackRunner.h rename to src/Common/threadPoolCallbackRunner.h diff --git a/src/Compression/CompressionCodecDeflateQpl.cpp b/src/Compression/CompressionCodecDeflateQpl.cpp index 631a12cc252..85023f82b17 100644 --- a/src/Compression/CompressionCodecDeflateQpl.cpp +++ b/src/Compression/CompressionCodecDeflateQpl.cpp @@ -6,14 +6,15 @@ #include #include #include -#include -#include -#include "libaccel_config.h" #include +#include +#include #include #include -#include +#include "libaccel_config.h" + +#include namespace DB { @@ -416,9 +417,7 @@ UInt32 CompressionCodecDeflateQpl::doCompressData(const char * source, UInt32 so { /// QPL library is using AVX-512 with some shuffle operations. /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. -#if defined(MEMORY_SANITIZER) __msan_unpoison(dest, getMaxCompressedDataSize(source_size)); -#endif Int32 res = HardwareCodecDeflateQpl::RET_ERROR; if (DeflateQplJobHWPool::instance().isJobPoolReady()) res = hw_codec->doCompressData(source, source_size, dest, getMaxCompressedDataSize(source_size)); @@ -439,9 +438,7 @@ void CompressionCodecDeflateQpl::doDecompressData(const char * source, UInt32 so { /// QPL library is using AVX-512 with some shuffle operations. /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. -#if defined(MEMORY_SANITIZER) __msan_unpoison(dest, uncompressed_size); -#endif /// Device IOTLB miss has big perf. impact for IAA accelerators. /// To avoid page fault, we need touch buffers related to accelerator in advance. 
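As the comment above notes, the destination buffer is touched before the job reaches the IAA accelerator so the device does not stall on IOTLB misses and page faults. A rough sketch of the idea behind a helper like touchBufferWithZeroFilling; the function name touchPages and the 4 KiB page size are assumptions and the actual implementation may differ:

    #include <cstddef>

    /// Writing one byte per page forces the kernel to map every page of the buffer up front,
    /// trading a cheap CPU pass for the absence of device page faults later.
    static void touchPages(char * buf, size_t size)
    {
        constexpr size_t page_size = 4096;
        for (size_t offset = 0; offset < size; offset += page_size)
            buf[offset] = 0;
        if (size != 0)
            buf[size - 1] = 0; /// make sure the last page is touched as well
    }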
touchBufferWithZeroFilling(dest, uncompressed_size); diff --git a/src/Compression/CompressionCodecT64.cpp b/src/Compression/CompressionCodecT64.cpp index bf9a9414bc1..3ddc56fe4f6 100644 --- a/src/Compression/CompressionCodecT64.cpp +++ b/src/Compression/CompressionCodecT64.cpp @@ -91,6 +91,7 @@ enum class MagicNumber : uint8_t Decimal32 = 19, Decimal64 = 20, IPv4 = 21, + Date32 = 22, }; MagicNumber serializeTypeId(std::optional type_id) @@ -109,6 +110,7 @@ MagicNumber serializeTypeId(std::optional type_id) case TypeIndex::Int32: return MagicNumber::Int32; case TypeIndex::Int64: return MagicNumber::Int64; case TypeIndex::Date: return MagicNumber::Date; + case TypeIndex::Date32: return MagicNumber::Date32; case TypeIndex::DateTime: return MagicNumber::DateTime; case TypeIndex::DateTime64: return MagicNumber::DateTime64; case TypeIndex::Enum8: return MagicNumber::Enum8; @@ -137,6 +139,7 @@ TypeIndex deserializeTypeId(uint8_t serialized_type_id) case MagicNumber::Int32: return TypeIndex::Int32; case MagicNumber::Int64: return TypeIndex::Int64; case MagicNumber::Date: return TypeIndex::Date; + case MagicNumber::Date32: return TypeIndex::Date32; case MagicNumber::DateTime: return TypeIndex::DateTime; case MagicNumber::DateTime64: return TypeIndex::DateTime64; case MagicNumber::Enum8: return TypeIndex::Enum8; @@ -165,6 +168,7 @@ TypeIndex baseType(TypeIndex type_idx) return TypeIndex::Int16; case TypeIndex::Int32: case TypeIndex::Decimal32: + case TypeIndex::Date32: return TypeIndex::Int32; case TypeIndex::Int64: case TypeIndex::Decimal64: @@ -205,6 +209,7 @@ TypeIndex typeIdx(const IDataType * data_type) case TypeIndex::UInt16: case TypeIndex::Enum16: case TypeIndex::Date: + case TypeIndex::Date32: case TypeIndex::Int32: case TypeIndex::UInt32: case TypeIndex::IPv4: diff --git a/src/Coordination/KeeperConstants.cpp b/src/Coordination/KeeperConstants.cpp new file mode 100644 index 00000000000..f788095334e --- /dev/null +++ b/src/Coordination/KeeperConstants.cpp @@ -0,0 +1,376 @@ +#include +#include + +/// Events which are useful for Keeper. +/// New events should be added manually. 
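The new KeeperConstants.cpp, whose header comment appears just above, builds its lists with the X-macro pattern: a single list macro enumerates all names and is expanded several times with different definitions of M. A stripped-down sketch of the technique with made-up names (APPLY_FOR_EXAMPLE_EVENTS, ExampleEvent and example_event_names are invented for illustration; the real lists below cover the ProfileEvents and CurrentMetrics that Keeper uses):

    #include <string>
    #include <vector>

    /// Hypothetical three-entry list; APPLY_FOR_KEEPER_PROFILE_EVENTS below works the same way
    /// with a few hundred entries.
    #define APPLY_FOR_EXAMPLE_EVENTS(M) \
        M(FileOpen) \
        M(Seek) \
        M(FileSync)

    /// First expansion: turn every name into an enumerator.
    enum class ExampleEvent
    {
    #define M(NAME) NAME,
        APPLY_FOR_EXAMPLE_EVENTS(M)
    #undef M
    };

    /// Second expansion: collect the same names as strings, e.g. for reporting.
    const std::vector<std::string> example_event_names
    {
    #define M(NAME) #NAME,
        APPLY_FOR_EXAMPLE_EVENTS(M)
    #undef M
    };

Adding a new entry to the list macro keeps every expansion in sync, which is why the comment above only asks for new events to be added to the list.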
+#define APPLY_FOR_KEEPER_PROFILE_EVENTS(M) \ + M(FileOpen) \ + M(Seek) \ + M(ReadBufferFromFileDescriptorRead) \ + M(ReadBufferFromFileDescriptorReadFailed) \ + M(ReadBufferFromFileDescriptorReadBytes) \ + M(WriteBufferFromFileDescriptorWrite) \ + M(WriteBufferFromFileDescriptorWriteFailed) \ + M(WriteBufferFromFileDescriptorWriteBytes) \ + M(FileSync) \ + M(DirectorySync) \ + M(FileSyncElapsedMicroseconds) \ + M(DirectorySyncElapsedMicroseconds) \ + M(ReadCompressedBytes) \ + M(CompressedReadBufferBlocks) \ + M(CompressedReadBufferBytes) \ + M(AIOWrite) \ + M(AIOWriteBytes) \ + M(AIORead) \ + M(AIOReadBytes) \ + M(IOBufferAllocs) \ + M(IOBufferAllocBytes) \ + M(ArenaAllocChunks) \ + M(ArenaAllocBytes) \ + M(CreatedReadBufferOrdinary) \ + M(CreatedReadBufferDirectIO) \ + M(CreatedReadBufferDirectIOFailed) \ + M(CreatedReadBufferMMap) \ + M(CreatedReadBufferMMapFailed) \ + M(DiskReadElapsedMicroseconds) \ + M(DiskWriteElapsedMicroseconds) \ + M(NetworkReceiveElapsedMicroseconds) \ + M(NetworkSendElapsedMicroseconds) \ + M(NetworkReceiveBytes) \ + M(NetworkSendBytes) \ +\ + M(DiskS3GetRequestThrottlerCount) \ + M(DiskS3GetRequestThrottlerSleepMicroseconds) \ + M(DiskS3PutRequestThrottlerCount) \ + M(DiskS3PutRequestThrottlerSleepMicroseconds) \ + M(S3GetRequestThrottlerCount) \ + M(S3GetRequestThrottlerSleepMicroseconds) \ + M(S3PutRequestThrottlerCount) \ + M(S3PutRequestThrottlerSleepMicroseconds) \ + M(RemoteReadThrottlerBytes) \ + M(RemoteReadThrottlerSleepMicroseconds) \ + M(RemoteWriteThrottlerBytes) \ + M(RemoteWriteThrottlerSleepMicroseconds) \ + M(LocalReadThrottlerBytes) \ + M(LocalReadThrottlerSleepMicroseconds) \ + M(LocalWriteThrottlerBytes) \ + M(LocalWriteThrottlerSleepMicroseconds) \ + M(ThrottlerSleepMicroseconds) \ +\ + M(SlowRead) \ + M(ReadBackoff) \ +\ + M(ContextLock) \ + M(ContextLockWaitMicroseconds) \ +\ + M(RWLockAcquiredReadLocks) \ + M(RWLockAcquiredWriteLocks) \ + M(RWLockReadersWaitMilliseconds) \ + M(RWLockWritersWaitMilliseconds) \ + M(DNSError) \ + M(RealTimeMicroseconds) \ + M(UserTimeMicroseconds) \ + M(SystemTimeMicroseconds) \ + M(MemoryOvercommitWaitTimeMicroseconds) \ + M(MemoryAllocatorPurge) \ + M(MemoryAllocatorPurgeTimeMicroseconds) \ + M(SoftPageFaults) \ + M(HardPageFaults) \ +\ + M(OSIOWaitMicroseconds) \ + M(OSCPUWaitMicroseconds) \ + M(OSCPUVirtualTimeMicroseconds) \ + M(OSReadBytes) \ + M(OSWriteBytes) \ + M(OSReadChars) \ + M(OSWriteChars) \ +\ + M(PerfCPUCycles) \ + M(PerfInstructions) \ + M(PerfCacheReferences) \ + M(PerfCacheMisses) \ + M(PerfBranchInstructions) \ + M(PerfBranchMisses) \ + M(PerfBusCycles) \ + M(PerfStalledCyclesFrontend) \ + M(PerfStalledCyclesBackend) \ + M(PerfRefCPUCycles) \ +\ + M(PerfCPUClock) \ + M(PerfTaskClock) \ + M(PerfContextSwitches) \ + M(PerfCPUMigrations) \ + M(PerfAlignmentFaults) \ + M(PerfEmulationFaults) \ + M(PerfMinEnabledTime) \ + M(PerfMinEnabledRunningTime) \ + M(PerfDataTLBReferences) \ + M(PerfDataTLBMisses) \ + M(PerfInstructionTLBReferences) \ + M(PerfInstructionTLBMisses) \ + M(PerfLocalMemoryReferences) \ + M(PerfLocalMemoryMisses) \ +\ + M(CreatedHTTPConnections) \ + M(CannotWriteToWriteBufferDiscard) \ +\ + M(S3ReadMicroseconds) \ + M(S3ReadRequestsCount) \ + M(S3ReadRequestsErrors) \ + M(S3ReadRequestsThrottling) \ + M(S3ReadRequestsRedirects) \ +\ + M(S3WriteMicroseconds) \ + M(S3WriteRequestsCount) \ + M(S3WriteRequestsErrors) \ + M(S3WriteRequestsThrottling) \ + M(S3WriteRequestsRedirects) \ +\ + M(DiskS3ReadMicroseconds) \ + M(DiskS3ReadRequestsCount) \ + M(DiskS3ReadRequestsErrors) \ 
+ M(DiskS3ReadRequestsThrottling) \ + M(DiskS3ReadRequestsRedirects) \ +\ + M(DiskS3WriteMicroseconds) \ + M(DiskS3WriteRequestsCount) \ + M(DiskS3WriteRequestsErrors) \ + M(DiskS3WriteRequestsThrottling) \ + M(DiskS3WriteRequestsRedirects) \ +\ + M(S3DeleteObjects) \ + M(S3CopyObject) \ + M(S3ListObjects) \ + M(S3HeadObject) \ + M(S3GetObjectAttributes) \ + M(S3CreateMultipartUpload) \ + M(S3UploadPartCopy) \ + M(S3UploadPart) \ + M(S3AbortMultipartUpload) \ + M(S3CompleteMultipartUpload) \ + M(S3PutObject) \ + M(S3GetObject) \ +\ + M(AzureUploadPart) \ + M(DiskAzureUploadPart) \ + M(AzureCopyObject) \ + M(DiskAzureCopyObject) \ + M(AzureDeleteObjects) \ + M(AzureListObjects) \ +\ + M(DiskS3DeleteObjects) \ + M(DiskS3CopyObject) \ + M(DiskS3ListObjects) \ + M(DiskS3HeadObject) \ + M(DiskS3GetObjectAttributes) \ + M(DiskS3CreateMultipartUpload) \ + M(DiskS3UploadPartCopy) \ + M(DiskS3UploadPart) \ + M(DiskS3AbortMultipartUpload) \ + M(DiskS3CompleteMultipartUpload) \ + M(DiskS3PutObject) \ + M(DiskS3GetObject) \ +\ + M(S3Clients) \ + M(TinyS3Clients) \ +\ + M(ReadBufferFromS3Microseconds) \ + M(ReadBufferFromS3InitMicroseconds) \ + M(ReadBufferFromS3Bytes) \ + M(ReadBufferFromS3RequestsErrors) \ + M(ReadBufferFromS3ResetSessions) \ + M(ReadBufferFromS3PreservedSessions) \ +\ + M(ReadWriteBufferFromHTTPPreservedSessions) \ +\ + M(WriteBufferFromS3Microseconds) \ + M(WriteBufferFromS3Bytes) \ + M(WriteBufferFromS3RequestsErrors) \ + M(WriteBufferFromS3WaitInflightLimitMicroseconds) \ + M(RemoteFSSeeks) \ + M(RemoteFSPrefetches) \ + M(RemoteFSCancelledPrefetches) \ + M(RemoteFSUnusedPrefetches) \ + M(RemoteFSPrefetchedReads) \ + M(RemoteFSPrefetchedBytes) \ + M(RemoteFSUnprefetchedReads) \ + M(RemoteFSUnprefetchedBytes) \ + M(RemoteFSLazySeeks) \ + M(RemoteFSSeeksWithReset) \ + M(RemoteFSBuffers) \ +\ + M(ThreadpoolReaderTaskMicroseconds) \ + M(ThreadpoolReaderPrepareMicroseconds) \ + M(ThreadpoolReaderReadBytes) \ + M(ThreadpoolReaderSubmit) \ + M(ThreadpoolReaderSubmitReadSynchronously) \ + M(ThreadpoolReaderSubmitReadSynchronouslyBytes) \ + M(ThreadpoolReaderSubmitReadSynchronouslyMicroseconds) \ + M(ThreadpoolReaderSubmitLookupInCacheMicroseconds) \ + M(AsynchronousReaderIgnoredBytes) \ +\ + M(FileSegmentWaitReadBufferMicroseconds) \ + M(FileSegmentReadMicroseconds) \ + M(FileSegmentCacheWriteMicroseconds) \ + M(FileSegmentPredownloadMicroseconds) \ + M(FileSegmentUsedBytes) \ +\ + M(ReadBufferSeekCancelConnection) \ +\ + M(SleepFunctionCalls) \ + M(SleepFunctionMicroseconds) \ + M(SleepFunctionElapsedMicroseconds) \ +\ + M(ThreadPoolReaderPageCacheHit) \ + M(ThreadPoolReaderPageCacheHitBytes) \ + M(ThreadPoolReaderPageCacheHitElapsedMicroseconds) \ + M(ThreadPoolReaderPageCacheMiss) \ + M(ThreadPoolReaderPageCacheMissBytes) \ + M(ThreadPoolReaderPageCacheMissElapsedMicroseconds) \ +\ + M(AsynchronousReadWaitMicroseconds) \ + M(SynchronousReadWaitMicroseconds) \ + M(AsynchronousRemoteReadWaitMicroseconds) \ + M(SynchronousRemoteReadWaitMicroseconds) \ +\ + M(ExternalDataSourceLocalCacheReadBytes) \ +\ + M(MainConfigLoads) \ +\ + M(KeeperPacketsSent) \ + M(KeeperPacketsReceived) \ + M(KeeperRequestTotal) \ + M(KeeperLatency) \ + M(KeeperCommits) \ + M(KeeperCommitsFailed) \ + M(KeeperSnapshotCreations) \ + M(KeeperSnapshotCreationsFailed) \ + M(KeeperSnapshotApplys) \ + M(KeeperSnapshotApplysFailed) \ + M(KeeperReadSnapshot) \ + M(KeeperSaveSnapshot) \ + M(KeeperCreateRequest) \ + M(KeeperRemoveRequest) \ + M(KeeperSetRequest) \ + M(KeeperReconfigRequest) \ + M(KeeperCheckRequest) \ + 
M(KeeperMultiRequest) \ + M(KeeperMultiReadRequest) \ + M(KeeperGetRequest) \ + M(KeeperListRequest) \ + M(KeeperExistsRequest) \ +\ + M(IOUringSQEsSubmitted) \ + M(IOUringSQEsResubmits) \ + M(IOUringCQEsCompleted) \ + M(IOUringCQEsFailed) \ +\ + M(LogTest) \ + M(LogTrace) \ + M(LogDebug) \ + M(LogInfo) \ + M(LogWarning) \ + M(LogError) \ + M(LogFatal) \ +\ + M(InterfaceHTTPSendBytes) \ + M(InterfaceHTTPReceiveBytes) \ + M(InterfaceNativeSendBytes) \ + M(InterfaceNativeReceiveBytes) \ + M(InterfacePrometheusSendBytes) \ + M(InterfacePrometheusReceiveBytes) \ + M(InterfaceInterserverSendBytes) \ + M(InterfaceInterserverReceiveBytes) \ + M(InterfaceMySQLSendBytes) \ + M(InterfaceMySQLReceiveBytes) \ + M(InterfacePostgreSQLSendBytes) \ + M(InterfacePostgreSQLReceiveBytes) + +namespace ProfileEvents +{ +#define M(NAME) extern const Event NAME; + APPLY_FOR_KEEPER_PROFILE_EVENTS(M) +#undef M + +#define M(NAME) NAME, +extern const std::vector keeper_profile_events +{ + APPLY_FOR_KEEPER_PROFILE_EVENTS(M) +}; +#undef M +} + +/// Metrics which are useful for Keeper. +/// New metrics should be added manually. +#define APPLY_FOR_KEEPER_METRICS(M) \ + M(BackgroundCommonPoolTask) \ + M(BackgroundCommonPoolSize) \ + M(TCPConnection) \ + M(HTTPConnection) \ + M(OpenFileForRead) \ + M(OpenFileForWrite) \ + M(Read) \ + M(RemoteRead) \ + M(Write) \ + M(NetworkReceive) \ + M(NetworkSend) \ + M(MemoryTracking) \ + M(ContextLockWait) \ + M(Revision) \ + M(VersionInteger) \ + M(RWLockWaitingReaders) \ + M(RWLockWaitingWriters) \ + M(RWLockActiveReaders) \ + M(RWLockActiveWriters) \ + M(GlobalThread) \ + M(GlobalThreadActive) \ + M(GlobalThreadScheduled) \ + M(LocalThread) \ + M(LocalThreadActive) \ + M(LocalThreadScheduled) \ + M(IOPrefetchThreads) \ + M(IOPrefetchThreadsActive) \ + M(IOPrefetchThreadsScheduled) \ + M(IOWriterThreads) \ + M(IOWriterThreadsActive) \ + M(IOWriterThreadsScheduled) \ + M(IOThreads) \ + M(IOThreadsActive) \ + M(IOThreadsScheduled) \ + M(ThreadPoolRemoteFSReaderThreads) \ + M(ThreadPoolRemoteFSReaderThreadsActive) \ + M(ThreadPoolRemoteFSReaderThreadsScheduled) \ + M(ThreadPoolFSReaderThreads) \ + M(ThreadPoolFSReaderThreadsActive) \ + M(ThreadPoolFSReaderThreadsScheduled) \ + M(DiskObjectStorageAsyncThreads) \ + M(DiskObjectStorageAsyncThreadsActive) \ + M(ObjectStorageS3Threads) \ + M(ObjectStorageS3ThreadsActive) \ + M(ObjectStorageS3ThreadsScheduled) \ + M(ObjectStorageAzureThreads) \ + M(ObjectStorageAzureThreadsActive) \ + M(ObjectStorageAzureThreadsScheduled) \ + M(MMappedFiles) \ + M(MMappedFileBytes) \ + M(AsynchronousReadWait) \ + M(S3Requests) \ + M(KeeperAliveConnections) \ + M(KeeperOutstandingRequets) \ + M(ThreadsInOvercommitTracker) \ + M(IOUringPendingEvents) \ + M(IOUringInFlightEvents) \ + +namespace CurrentMetrics +{ +#define M(NAME) extern const Metric NAME; + APPLY_FOR_KEEPER_METRICS(M) +#undef M + +#define M(NAME) NAME, +extern const std::vector keeper_metrics +{ + APPLY_FOR_KEEPER_METRICS(M) +}; +#undef M +} diff --git a/src/Coordination/KeeperContext.cpp b/src/Coordination/KeeperContext.cpp index baad8d98e6a..374571bae7e 100644 --- a/src/Coordination/KeeperContext.cpp +++ b/src/Coordination/KeeperContext.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -37,26 +38,11 @@ void KeeperContext::initialize(const Poco::Util::AbstractConfiguration & config, { dispatcher = dispatcher_; - if (config.hasProperty("keeper_server.availability_zone")) + const auto keeper_az = PlacementInfo::PlacementInfo::instance().getAvailabilityZone(); + if 
(!keeper_az.empty()) { - auto keeper_az = config.getString("keeper_server.availability_zone.value", ""); - const auto auto_detect_for_cloud = config.getBool("keeper_server.availability_zone.enable_auto_detection_on_cloud", false); - if (keeper_az.empty() && auto_detect_for_cloud) - { - try - { - keeper_az = DB::S3::getRunningAvailabilityZone(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - } - if (!keeper_az.empty()) - { - system_nodes_with_data[keeper_availability_zone_path] = keeper_az; - LOG_INFO(getLogger("KeeperContext"), "Initialize the KeeperContext with availability zone: '{}'", keeper_az); - } + system_nodes_with_data[keeper_availability_zone_path] = keeper_az; + LOG_INFO(getLogger("KeeperContext"), "Initialize the KeeperContext with availability zone: '{}'", keeper_az); } updateKeeperMemorySoftLimit(config); diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index f53b8031712..091571b4a1a 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -16,7 +16,7 @@ #include #include #include -#include "Core/Field.h" +#include #include @@ -79,20 +79,20 @@ namespace writeBinary(false, out); /// Serialize stat - writeBinary(node.stat.czxid, out); - writeBinary(node.stat.mzxid, out); - writeBinary(node.stat.ctime, out); - writeBinary(node.stat.mtime, out); - writeBinary(node.stat.version, out); - writeBinary(node.stat.cversion, out); - writeBinary(node.stat.aversion, out); - writeBinary(node.stat.ephemeralOwner, out); + writeBinary(node.czxid, out); + writeBinary(node.mzxid, out); + writeBinary(node.ctime(), out); + writeBinary(node.mtime, out); + writeBinary(node.version, out); + writeBinary(node.cversion, out); + writeBinary(node.aversion, out); + writeBinary(node.ephemeralOwner(), out); if (version < SnapshotVersion::V6) - writeBinary(static_cast(node.getData().size()), out); - writeBinary(node.stat.numChildren, out); - writeBinary(node.stat.pzxid, out); + writeBinary(static_cast(node.data_size), out); + writeBinary(node.numChildren(), out); + writeBinary(node.pzxid, out); - writeBinary(node.seq_num, out); + writeBinary(node.seqNum(), out); if (version >= SnapshotVersion::V4 && version <= SnapshotVersion::V5) writeBinary(node.sizeInBytes(), out); @@ -102,7 +102,7 @@ namespace { String new_data; readBinary(new_data, in); - node.setData(std::move(new_data)); + node.setData(new_data); if (version >= SnapshotVersion::V1) { @@ -138,22 +138,36 @@ namespace } /// Deserialize stat - readBinary(node.stat.czxid, in); - readBinary(node.stat.mzxid, in); - readBinary(node.stat.ctime, in); - readBinary(node.stat.mtime, in); - readBinary(node.stat.version, in); - readBinary(node.stat.cversion, in); - readBinary(node.stat.aversion, in); - readBinary(node.stat.ephemeralOwner, in); + readBinary(node.czxid, in); + readBinary(node.mzxid, in); + int64_t ctime; + readBinary(ctime, in); + node.setCtime(ctime); + readBinary(node.mtime, in); + readBinary(node.version, in); + readBinary(node.cversion, in); + readBinary(node.aversion, in); + int64_t ephemeral_owner = 0; + readBinary(ephemeral_owner, in); + if (ephemeral_owner != 0) + node.setEphemeralOwner(ephemeral_owner); + if (version < SnapshotVersion::V6) { int32_t data_length = 0; readBinary(data_length, in); } - readBinary(node.stat.numChildren, in); - readBinary(node.stat.pzxid, in); - readBinary(node.seq_num, in); + int32_t num_children = 0; + readBinary(num_children, in); + if (ephemeral_owner == 0) + 
node.setNumChildren(num_children); + + readBinary(node.pzxid, in); + + int32_t seq_num = 0; + readBinary(seq_num, in); + if (ephemeral_owner == 0) + node.setSeqNum(seq_num); if (version >= SnapshotVersion::V4 && version <= SnapshotVersion::V5) { @@ -238,7 +252,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr /// Benign race condition possible while taking snapshot: NuRaft decide to create snapshot at some log id /// and only after some time we lock storage and enable snapshot mode. So snapshot_container_size can be /// slightly bigger than required. - if (node.stat.mzxid > snapshot.zxid) + if (node.mzxid > snapshot.zxid) break; writeBinary(path, out); @@ -363,11 +377,6 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial if (recalculate_digest) storage.nodes_digest = 0; - const auto is_node_empty = [](const auto & node) - { - return node.getData().empty() && node.stat == KeeperStorage::Node::Stat{}; - }; - for (size_t nodes_read = 0; nodes_read < snapshot_container_size; ++nodes_read) { std::string path; @@ -395,7 +404,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial } else if (match_result == EXACT) { - if (!is_node_empty(node)) + if (!node.empty()) { if (keeper_context->ignoreSystemPathOnStartup() || keeper_context->getServerState() != KeeperContext::Phase::INIT) { @@ -412,8 +421,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial } storage.container.insertOrReplace(path, node); - if (node.stat.ephemeralOwner != 0) - storage.ephemerals[node.stat.ephemeralOwner].insert(path); + if (node.isEphemeral()) + storage.ephemerals[node.ephemeralOwner()].insert(path); if (recalculate_digest) storage.nodes_digest += node.getDigest(path); @@ -433,16 +442,16 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial { if (itr.key != "/") { - if (itr.value.stat.numChildren != static_cast(itr.value.getChildren().size())) + if (itr.value.numChildren() != static_cast(itr.value.getChildren().size())) { #ifdef NDEBUG /// TODO (alesapin) remove this, it should be always CORRUPTED_DATA. LOG_ERROR(getLogger("KeeperSnapshotManager"), "Children counter in stat.numChildren {}" - " is different from actual children size {} for node {}", itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key); + " is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key); #else throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}" " is different from actual children size {} for node {}", - itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key); + itr.value.numChildren(), itr.value.getChildren().size(), itr.key); #endif } } diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 8d50f0a76b1..c82f8301eff 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -136,12 +136,12 @@ namespace { void assertDigest( - const KeeperStorage::Digest & first, - const KeeperStorage::Digest & second, + const KeeperStorage::Digest & expected, + const KeeperStorage::Digest & actual, const Coordination::ZooKeeperRequest & request, bool committing) { - if (!KeeperStorage::checkDigest(first, second)) + if (!KeeperStorage::checkDigest(expected, actual)) { LOG_FATAL( getLogger("KeeperStateMachine"), @@ -149,9 +149,9 @@ void assertDigest( "{}). 
Keeper will terminate to avoid inconsistencies.\nExtra information about the request:\n{}", committing ? "committing" : "preprocessing", request.getOpNum(), - first.value, - second.value, - first.version, + expected.value, + actual.value, + expected.version, request.toString()); std::terminate(); } diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 992d4ca8a95..eaa0c3c9e68 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -166,56 +166,132 @@ KeeperStorage::ResponsesForSessions processWatchesImpl( } // When this function is updated, update CURRENT_DIGEST_VERSION!! -uint64_t calculateDigest(std::string_view path, std::string_view data, const KeeperStorage::Node::Stat & stat) +uint64_t calculateDigest(std::string_view path, const KeeperStorage::Node & node) { SipHash hash; hash.update(path); - hash.update(data); + auto data = node.getData(); + if (!data.empty()) + { + chassert(data.data() != nullptr); + hash.update(data); + } - hash.update(stat.czxid); - hash.update(stat.czxid); - hash.update(stat.mzxid); - hash.update(stat.ctime); - hash.update(stat.mtime); - hash.update(stat.version); - hash.update(stat.cversion); - hash.update(stat.aversion); - hash.update(stat.ephemeralOwner); - hash.update(data.length()); - hash.update(stat.numChildren); - hash.update(stat.pzxid); + hash.update(node.czxid); + hash.update(node.mzxid); + hash.update(node.ctime()); + hash.update(node.mtime); + hash.update(node.version); + hash.update(node.cversion); + hash.update(node.aversion); + hash.update(node.ephemeralOwner()); + hash.update(node.numChildren()); + hash.update(node.pzxid); - return hash.get64(); + auto digest = hash.get64(); + + /// 0 means no cached digest + if (digest == 0) + return 1; + + return digest; } } +KeeperStorage::Node & KeeperStorage::Node::operator=(const Node & other) +{ + if (this == &other) + return *this; + + czxid = other.czxid; + mzxid = other.mzxid; + pzxid = other.pzxid; + acl_id = other.acl_id; + mtime = other.mtime; + is_ephemeral_and_ctime = other.is_ephemeral_and_ctime; + ephemeral_or_children_data = other.ephemeral_or_children_data; + data_size = other.data_size; + version = other.version; + cversion = other.cversion; + aversion = other.aversion; + + if (data_size != 0) + { + data = std::unique_ptr(new char[data_size]); + memcpy(data.get(), other.data.get(), data_size); + } + + children = other.children; + + return *this; +} + +KeeperStorage::Node::Node(const Node & other) +{ + *this = other; +} + +bool KeeperStorage::Node::empty() const +{ + return data_size == 0 && mzxid == 0; +} + +void KeeperStorage::Node::copyStats(const Coordination::Stat & stat) +{ + czxid = stat.czxid; + mzxid = stat.mzxid; + pzxid = stat.pzxid; + + mtime = stat.mtime; + setCtime(stat.ctime); + + version = stat.version; + cversion = stat.cversion; + aversion = stat.aversion; + + if (stat.ephemeralOwner == 0) + { + is_ephemeral_and_ctime.is_ephemeral = false; + setNumChildren(stat.numChildren); + } + else + { + setEphemeralOwner(stat.ephemeralOwner); + } +} + void KeeperStorage::Node::setResponseStat(Coordination::Stat & response_stat) const { - response_stat.czxid = stat.czxid; - response_stat.mzxid = stat.mzxid; - response_stat.ctime = stat.ctime; - response_stat.mtime = stat.mtime; - response_stat.version = stat.version; - response_stat.cversion = stat.cversion; - response_stat.aversion = stat.aversion; - response_stat.ephemeralOwner = stat.ephemeralOwner; - response_stat.dataLength = static_cast(data.size()); - 
response_stat.numChildren = stat.numChildren; - response_stat.pzxid = stat.pzxid; + response_stat.czxid = czxid; + response_stat.mzxid = mzxid; + response_stat.ctime = ctime(); + response_stat.mtime = mtime; + response_stat.version = version; + response_stat.cversion = cversion; + response_stat.aversion = aversion; + response_stat.ephemeralOwner = ephemeralOwner(); + response_stat.dataLength = static_cast(data_size); + response_stat.numChildren = numChildren(); + response_stat.pzxid = pzxid; } uint64_t KeeperStorage::Node::sizeInBytes() const { - return sizeof(Node) + children.size() * sizeof(StringRef) + data.size(); + return sizeof(Node) + children.size() * sizeof(StringRef) + data_size; } -void KeeperStorage::Node::setData(String new_data) +void KeeperStorage::Node::setData(const String & new_data) { - data = std::move(new_data); + data_size = static_cast(new_data.size()); + if (data_size != 0) + { + data = std::unique_ptr(new char[new_data.size()]); + memcpy(data.get(), new_data.data(), data_size); + } } void KeeperStorage::Node::addChild(StringRef child_path) @@ -230,25 +306,41 @@ void KeeperStorage::Node::removeChild(StringRef child_path) void KeeperStorage::Node::invalidateDigestCache() const { - has_cached_digest = false; + cached_digest = 0; } UInt64 KeeperStorage::Node::getDigest(const std::string_view path) const { - if (!has_cached_digest) - { - cached_digest = calculateDigest(path, data, stat); - has_cached_digest = true; - } + if (cached_digest == 0) + cached_digest = calculateDigest(path, *this); return cached_digest; }; void KeeperStorage::Node::shallowCopy(const KeeperStorage::Node & other) { - stat = other.stat; - seq_num = other.seq_num; - setData(other.getData()); + czxid = other.czxid; + mzxid = other.mzxid; + pzxid = other.pzxid; + acl_id = other.acl_id; /// 0 -- no ACL by default + + mtime = other.mtime; + + is_ephemeral_and_ctime = other.is_ephemeral_and_ctime; + + ephemeral_or_children_data = other.ephemeral_or_children_data; + + data_size = other.data_size; + if (data_size != 0) + { + data = std::unique_ptr(new char[data_size]); + memcpy(data.get(), other.data.get(), data_size); + } + + version = other.version; + cversion = other.cversion; + aversion = other.aversion; + cached_digest = other.cached_digest; } @@ -280,13 +372,13 @@ void KeeperStorage::initializeSystemNodes() // update root and the digest based on it auto current_root_it = container.find("/"); - assert(current_root_it != container.end()); + chassert(current_root_it != container.end()); removeDigest(current_root_it->value, "/"); auto updated_root_it = container.updateValue( "/", - [](auto & node) + [](KeeperStorage::Node & node) { - ++node.stat.numChildren; + node.increaseNumChildren(); node.addChild(getBaseNodeName(keeper_system_path)); } ); @@ -296,7 +388,7 @@ void KeeperStorage::initializeSystemNodes() // insert child system nodes for (const auto & [path, data] : keeper_context->getSystemNodesWithData()) { - assert(path.starts_with(keeper_system_path)); + chassert(path.starts_with(keeper_system_path)); Node child_system_node; child_system_node.setData(data); auto [map_key, _] = container.insert(std::string{path}, child_system_node); @@ -341,7 +433,7 @@ std::shared_ptr KeeperStorage::UncommittedState::tryGetNode void KeeperStorage::UncommittedState::applyDelta(const Delta & delta) { - assert(!delta.path.empty()); + chassert(!delta.path.empty()); if (!nodes.contains(delta.path)) { if (auto storage_node = tryGetNodeFromStorage(delta.path)) @@ -357,22 +449,22 @@ void 
KeeperStorage::UncommittedState::applyDelta(const Delta & delta) if constexpr (std::same_as) { - assert(!node); + chassert(!node); node = std::make_shared(); - node->stat = operation.stat; + node->copyStats(operation.stat); node->setData(operation.data); acls = operation.acls; last_applied_zxid = delta.zxid; } else if constexpr (std::same_as) { - assert(node); + chassert(node); node = nullptr; last_applied_zxid = delta.zxid; } else if constexpr (std::same_as) { - assert(node); + chassert(node); node->invalidateDigestCache(); operation.update_fn(*node); last_applied_zxid = delta.zxid; @@ -386,6 +478,40 @@ void KeeperStorage::UncommittedState::applyDelta(const Delta & delta) delta.operation); } +bool KeeperStorage::UncommittedState::hasACL(int64_t session_id, bool is_local, std::function predicate) const +{ + const auto check_auth = [&](const auto & auth_ids) + { + for (const auto & auth : auth_ids) + { + using TAuth = std::remove_reference_t; + + const AuthID * auth_ptr = nullptr; + if constexpr (std::is_pointer_v) + auth_ptr = auth; + else + auth_ptr = &auth; + + if (predicate(*auth_ptr)) + return true; + } + return false; + }; + + if (is_local) + return check_auth(storage.session_and_auth[session_id]); + + if (check_auth(storage.session_and_auth[session_id])) + return true; + + // check if there are uncommitted + const auto auth_it = session_and_auth.find(session_id); + if (auth_it == session_and_auth.end()) + return false; + + return check_auth(auth_it->second); +} + void KeeperStorage::UncommittedState::addDelta(Delta new_delta) { const auto & added_delta = deltas.emplace_back(std::move(new_delta)); @@ -410,7 +536,7 @@ void KeeperStorage::UncommittedState::addDeltas(std::vector new_deltas) void KeeperStorage::UncommittedState::commit(int64_t commit_zxid) { - assert(deltas.empty() || deltas.front().zxid >= commit_zxid); + chassert(deltas.empty() || deltas.front().zxid >= commit_zxid); // collect nodes that have no further modification in the current transaction std::unordered_set modified_nodes; @@ -428,7 +554,7 @@ void KeeperStorage::UncommittedState::commit(int64_t commit_zxid) if (!front_delta.path.empty()) { auto & path_deltas = deltas_for_path.at(front_delta.path); - assert(path_deltas.front() == &front_delta); + chassert(path_deltas.front() == &front_delta); path_deltas.pop_front(); if (path_deltas.empty()) { @@ -446,7 +572,7 @@ void KeeperStorage::UncommittedState::commit(int64_t commit_zxid) else if (auto * add_auth = std::get_if(&front_delta.operation)) { auto & uncommitted_auth = session_and_auth[add_auth->session_id]; - assert(!uncommitted_auth.empty() && uncommitted_auth.front() == &add_auth->auth_id); + chassert(!uncommitted_auth.empty() && uncommitted_auth.front() == &add_auth->auth_id); uncommitted_auth.pop_front(); if (uncommitted_auth.empty()) session_and_auth.erase(add_auth->session_id); @@ -486,7 +612,7 @@ void KeeperStorage::UncommittedState::rollback(int64_t rollback_zxid) if (delta_it->zxid < rollback_zxid) break; - assert(delta_it->zxid == rollback_zxid); + chassert(delta_it->zxid == rollback_zxid); if (!delta_it->path.empty()) { std::visit( @@ -673,7 +799,7 @@ Coordination::Error KeeperStorage::commit(int64_t commit_zxid) if (node_it == container.end()) onStorageInconsistency(); - if (operation.version != -1 && operation.version != node_it->value.stat.version) + if (operation.version != -1 && operation.version != node_it->value.version) onStorageInconsistency(); removeDigest(node_it->value, path); @@ -695,7 +821,7 @@ Coordination::Error 
KeeperStorage::commit(int64_t commit_zxid) if (node_it == container.end()) onStorageInconsistency(); - if (operation.version != -1 && operation.version != node_it->value.stat.aversion) + if (operation.version != -1 && operation.version != node_it->value.aversion) onStorageInconsistency(); acl_map.removeUsage(node_it->value.acl_id); @@ -740,7 +866,7 @@ Coordination::Error KeeperStorage::commit(int64_t commit_zxid) bool KeeperStorage::createNode( const std::string & path, String data, - const KeeperStorage::Node::Stat & stat, + const Coordination::Stat & stat, Coordination::ACLs node_acls) { auto parent_path = parentNodePath(path); @@ -749,7 +875,7 @@ bool KeeperStorage::createNode( if (node_it == container.end()) return false; - if (node_it->value.stat.ephemeralOwner != 0) + if (node_it->value.isEphemeral()) return false; if (container.contains(path)) @@ -761,8 +887,8 @@ bool KeeperStorage::createNode( acl_map.addUsage(acl_id); created_node.acl_id = acl_id; - created_node.stat = stat; - created_node.setData(std::move(data)); + created_node.copyStats(stat); + created_node.setData(data); auto [map_key, _] = container.insert(path, created_node); /// Take child path from key owned by map. auto child_path = getBaseNodeName(map_key->getKey()); @@ -771,7 +897,7 @@ bool KeeperStorage::createNode( [child_path](KeeperStorage::Node & parent) { parent.addChild(child_path); - chassert(parent.stat.numChildren == static_cast(parent.getChildren().size())); + chassert(parent.numChildren() == static_cast(parent.getChildren().size())); } ); @@ -785,21 +911,22 @@ bool KeeperStorage::removeNode(const std::string & path, int32_t version) if (node_it == container.end()) return false; - if (version != -1 && version != node_it->value.stat.version) + if (version != -1 && version != node_it->value.version) return false; - if (node_it->value.stat.numChildren) + if (node_it->value.numChildren()) return false; - auto prev_node = node_it->value; - acl_map.removeUsage(prev_node.acl_id); + KeeperStorage::Node prev_node; + prev_node.shallowCopy(node_it->value); + acl_map.removeUsage(node_it->value.acl_id); container.updateValue( parentNodePath(path), [child_basename = getBaseNodeName(node_it->key)](KeeperStorage::Node & parent) { parent.removeChild(child_basename); - chassert(parent.stat.numChildren == static_cast(parent.getChildren().size())); + chassert(parent.numChildren() == static_cast(parent.getChildren().size())); } ); @@ -959,7 +1086,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr if (parent_node == nullptr) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; - else if (parent_node->stat.ephemeralOwner != 0) + else if (parent_node->isEphemeral()) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNOCHILDRENFOREPHEMERALS}}; std::string path_created = request.path; @@ -968,7 +1095,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr if (request.not_exists) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; - auto seq_num = parent_node->seq_num; + auto seq_num = parent_node->seqNum(); std::stringstream seq_num_str; // STYLE_CHECK_ALLOW_STD_STRING_STREAM seq_num_str.exceptions(std::ios::failbit); @@ -1008,20 +1135,20 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr auto parent_update = [parent_cversion, zxid](KeeperStorage::Node & node) { /// Increment sequential number even if node is not sequential - ++node.seq_num; + node.increaseSeqNum(); if (parent_cversion == -1) - 
++node.stat.cversion; - else if (parent_cversion > node.stat.cversion) - node.stat.cversion = parent_cversion; + ++node.cversion; + else if (parent_cversion > node.cversion) + node.cversion = parent_cversion; - if (zxid > node.stat.pzxid) - node.stat.pzxid = zxid; - ++node.stat.numChildren; + if (zxid > node.pzxid) + node.pzxid = zxid; + node.increaseNumChildren(); }; new_deltas.emplace_back(std::string{parent_path}, zxid, KeeperStorage::UpdateNodeDelta{std::move(parent_update)}); - KeeperStorage::Node::Stat stat; + Coordination::Stat stat; stat.czxid = zxid; stat.mzxid = zxid; stat.pzxid = zxid; @@ -1135,7 +1262,8 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce else { node_it->value.setResponseStat(response.stat); - response.data = node_it->value.getData(); + auto data = node_it->value.getData(); + response.data = std::string(data); response.error = Coordination::Error::ZOK; } @@ -1192,8 +1320,8 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr { [zxid](KeeperStorage::Node & parent) { - if (parent.stat.pzxid < zxid) - parent.stat.pzxid = zxid; + if (parent.pzxid < zxid) + parent.pzxid = zxid; } } ); @@ -1207,9 +1335,9 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr update_parent_pzxid(); return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; } - else if (request.version != -1 && request.version != node->stat.version) + else if (request.version != -1 && request.version != node->version) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADVERSION}}; - else if (node->stat.numChildren != 0) + else if (node->numChildren() != 0) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNOTEMPTY}}; if (request.restored_from_zookeeper_log) @@ -1220,14 +1348,14 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr zxid, KeeperStorage::UpdateNodeDelta{[](KeeperStorage::Node & parent) { - ++parent.stat.cversion; - --parent.stat.numChildren; + ++parent.cversion; + parent.decreaseNumChildren(); }}); - new_deltas.emplace_back(request.path, zxid, KeeperStorage::RemoveNodeDelta{request.version, node->stat.ephemeralOwner}); + new_deltas.emplace_back(request.path, zxid, KeeperStorage::RemoveNodeDelta{request.version, node->ephemeralOwner()}); - if (node->stat.ephemeralOwner != 0) - storage.unregisterEphemeralPath(node->stat.ephemeralOwner, request.path); + if (node->isEphemeral()) + storage.unregisterEphemeralPath(node->ephemeralOwner(), request.path); digest = storage.calculateNodesDigest(digest, new_deltas); @@ -1341,7 +1469,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce auto node = storage.uncommitted_state.getNode(request.path); - if (request.version != -1 && request.version != node->stat.version) + if (request.version != -1 && request.version != node->version) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADVERSION}}; new_deltas.emplace_back( @@ -1350,9 +1478,9 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce KeeperStorage::UpdateNodeDelta{ [zxid, data = request.data, time](KeeperStorage::Node & value) { - value.stat.version++; - value.stat.mzxid = zxid; - value.stat.mtime = time; + value.version++; + value.mzxid = zxid; + value.mtime = time; value.setData(data); }, request.version}); @@ -1364,7 +1492,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce { [](KeeperStorage::Node & parent) { - parent.stat.cversion++; + 
parent.cversion++; } } ); @@ -1455,7 +1583,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc { auto path_prefix = request.path; if (path_prefix.empty()) - throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: path cannot be empty"); + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Path cannot be empty"); const auto & children = node_it->value.getChildren(); response.names.reserve(children.size()); @@ -1466,9 +1594,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc auto list_request_type = ALL; if (auto * filtered_list = dynamic_cast(&request)) - { list_request_type = filtered_list->list_request_type; - } if (list_request_type == ALL) return true; @@ -1478,7 +1604,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc if (child_it == container.end()) onStorageInconsistency(); - const auto is_ephemeral = child_it->value.stat.ephemeralOwner != 0; + const auto is_ephemeral = child_it->value.isEphemeral(); return (is_ephemeral && list_request_type == EPHEMERAL_ONLY) || (!is_ephemeral && list_request_type == PERSISTENT_ONLY); }; @@ -1531,7 +1657,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro auto node = storage.uncommitted_state.getNode(request.path); if (check_not_exists) { - if (node && (request.version == -1 || request.version == node->stat.version)) + if (node && (request.version == -1 || request.version == node->version)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}}; } else @@ -1539,7 +1665,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro if (!node) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; - if (request.version != -1 && request.version != node->stat.version) + if (request.version != -1 && request.version != node->version) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADVERSION}}; } @@ -1575,7 +1701,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro if (check_not_exists) { - if (node_it != container.end() && (request.version == -1 || request.version == node_it->value.stat.version)) + if (node_it != container.end() && (request.version == -1 || request.version == node_it->value.version)) on_error(Coordination::Error::ZNODEEXISTS); else response.error = Coordination::Error::ZOK; @@ -1584,7 +1710,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro { if (node_it == container.end()) on_error(Coordination::Error::ZNONODE); - else if (request.version != -1 && request.version != node_it->value.stat.version) + else if (request.version != -1 && request.version != node_it->value.version) on_error(Coordination::Error::ZBADVERSION); else response.error = Coordination::Error::ZOK; @@ -1637,7 +1763,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr auto node = uncommitted_state.getNode(request.path); - if (request.version != -1 && request.version != node->stat.aversion) + if (request.version != -1 && request.version != node->aversion) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADVERSION}}; @@ -1657,7 +1783,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr zxid, KeeperStorage::UpdateNodeDelta { - [](KeeperStorage::Node & n) { ++n.stat.aversion; } + [](KeeperStorage::Node & n) { ++n.aversion; } } } }; @@ -1826,7 +1952,7 @@ struct KeeperStorageMultiRequestProcessor final : public KeeperStorageRequestPro } } - 
assert(request.requests.empty() || operation_type.has_value()); + chassert(request.requests.empty() || operation_type.has_value()); } std::vector @@ -1875,7 +2001,7 @@ struct KeeperStorageMultiRequestProcessor final : public KeeperStorageRequestPro auto & deltas = storage.uncommitted_state.deltas; // the deltas will have at least SubDeltaEnd or FailedMultiDelta - assert(!deltas.empty()); + chassert(!deltas.empty()); if (auto * failed_multi = std::get_if(&deltas.front().operation)) { for (size_t i = 0; i < concrete_requests.size(); ++i) @@ -2075,7 +2201,7 @@ UInt64 KeeperStorage::calculateNodesDigest(UInt64 current_digest, const std::vec [&](const CreateNodeDelta & create_delta) { auto node = std::make_shared(); - node->stat = create_delta.stat; + node->copyStats(create_delta.stat); node->setData(create_delta.data); updated_nodes.emplace(delta.path, node); }, @@ -2198,8 +2324,8 @@ void KeeperStorage::preprocessRequest( { [ephemeral_path](Node & parent) { - ++parent.stat.cversion; - --parent.stat.numChildren; + ++parent.cversion; + parent.decreaseNumChildren(); } } ); @@ -2302,7 +2428,7 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest( if (is_local) { - assert(zk_request->isReadRequest()); + chassert(zk_request->isReadRequest()); if (check_acl && !request_processor->checkAuth(*this, session_id, true)) { response = zk_request->makeResponse(); @@ -2531,6 +2657,17 @@ void KeeperStorage::recalculateStats() container.recalculateDataSize(); } +bool KeeperStorage::checkDigest(const Digest & first, const Digest & second) +{ + if (first.version != second.version) + return true; + + if (first.version == DigestVersion::NO_DIGEST) + return true; + + return first.value == second.value; +} + String KeeperStorage::generateDigest(const String & userdata) { std::vector user_password; diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index 01c1413a884..6618ec0bd85 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -5,17 +5,15 @@ #include #include #include -#include -#include -#include -#include -#include #include namespace DB { +class KeeperContext; +using KeeperContextPtr = std::shared_ptr; + struct KeeperStorageRequestProcessor; using KeeperStorageRequestProcessorPtr = std::shared_ptr; using ResponseCallback = std::function; @@ -35,40 +33,113 @@ public: /// New fields should be added to the struct only if it's really necessary struct Node { - /// to reduce size of the Node struct we use a custom Stat without dataLength - struct Stat - { - int64_t czxid{0}; - int64_t mzxid{0}; - int64_t ctime{0}; - int64_t mtime{0}; - int32_t version{0}; - int32_t cversion{0}; - int32_t aversion{0}; - int32_t numChildren{0}; /// NOLINT - int64_t ephemeralOwner{0}; /// NOLINT - int64_t pzxid{0}; - - bool operator==(const Stat &) const = default; - }; - + int64_t czxid{0}; + int64_t mzxid{0}; + int64_t pzxid{0}; uint64_t acl_id = 0; /// 0 -- no ACL by default - Stat stat{}; - int32_t seq_num = 0; - /// we cannot use `std::optional because we want to - /// pack the boolean with seq_num above - mutable bool has_cached_digest = false; + int64_t mtime{0}; + + std::unique_ptr data{nullptr}; + uint32_t data_size{0}; + + int32_t version{0}; + int32_t cversion{0}; + int32_t aversion{0}; + mutable uint64_t cached_digest = 0; + Node() = default; + + Node & operator=(const Node & other); + + Node(const Node & other); + + bool empty() const; + + bool isEphemeral() const + { + return is_ephemeral_and_ctime.is_ephemeral; + } + + int64_t ephemeralOwner() 
const + { + if (isEphemeral()) + return ephemeral_or_children_data.ephemeral_owner; + + return 0; + } + + void setEphemeralOwner(int64_t ephemeral_owner) + { + is_ephemeral_and_ctime.is_ephemeral = ephemeral_owner != 0; + ephemeral_or_children_data.ephemeral_owner = ephemeral_owner; + } + + int32_t numChildren() const + { + if (isEphemeral()) + return 0; + + return ephemeral_or_children_data.children_info.num_children; + } + + void setNumChildren(int32_t num_children) + { + ephemeral_or_children_data.children_info.num_children = num_children; + } + + void increaseNumChildren() + { + chassert(!isEphemeral()); + ++ephemeral_or_children_data.children_info.num_children; + } + + void decreaseNumChildren() + { + chassert(!isEphemeral()); + --ephemeral_or_children_data.children_info.num_children; + } + + int32_t seqNum() const + { + if (isEphemeral()) + return 0; + + return ephemeral_or_children_data.children_info.seq_num; + } + + void setSeqNum(int32_t seq_num) + { + ephemeral_or_children_data.children_info.seq_num = seq_num; + } + + void increaseSeqNum() + { + chassert(!isEphemeral()); + ++ephemeral_or_children_data.children_info.seq_num; + } + + int64_t ctime() const + { + return is_ephemeral_and_ctime.ctime; + } + + void setCtime(uint64_t ctime) + { + is_ephemeral_and_ctime.ctime = ctime; + } + + void copyStats(const Coordination::Stat & stat); + void setResponseStat(Coordination::Stat & response_stat) const; /// Object memory size uint64_t sizeInBytes() const; - void setData(String new_data); + void setData(const String & new_data); - const auto & getData() const noexcept { return data; } + std::string_view getData() const noexcept { return {data.get(), data_size}; } void addChild(StringRef child_path); @@ -87,18 +158,46 @@ public: // (e.g. we don't need to copy list of children) void shallowCopy(const Node & other); private: - String data; + /// as ctime can't be negative because it stores the timestamp when the + /// node was created, we can use the MSB for a bool + struct + { + bool is_ephemeral : 1; + int64_t ctime : 63; + } is_ephemeral_and_ctime{false, 0}; + + /// ephemeral nodes cannot have children so a node can set either + /// ephemeral_owner OR seq_num + num_children + union + { + int64_t ephemeral_owner; + struct + { + int32_t seq_num; + int32_t num_children; + } children_info; + } ephemeral_or_children_data{0}; + ChildrenSet children{}; }; +#if !defined(ADDRESS_SANITIZER) && !defined(MEMORY_SANITIZER) + static_assert( + sizeof(ListNode) <= 144, + "std::list node containing ListNode is > 160 bytes (sizeof(ListNode) + 16 bytes for pointers) which will increase " + "memory consumption"); +#endif + enum DigestVersion : uint8_t { NO_DIGEST = 0, V1 = 1, - V2 = 2 // added system nodes that modify the digest on startup so digest from V0 is invalid + V2 = 2, // added system nodes that modify the digest on startup so digest from V0 is invalid + V3 = 3, // fixed bug with casting, removed duplicate czxid usage + V4 = 4 // 0 is not a valid digest value }; - static constexpr auto CURRENT_DIGEST_VERSION = DigestVersion::V2; + static constexpr auto CURRENT_DIGEST_VERSION = DigestVersion::V4; struct ResponseForSession { @@ -113,16 +212,7 @@ public: uint64_t value{0}; }; - static bool checkDigest(const Digest & first, const Digest & second) - { - if (first.version != second.version) - return true; - - if (first.version == DigestVersion::NO_DIGEST) - return true; - - return first.value == second.value; - } + static bool checkDigest(const Digest & first, const Digest & second); + static String 
generateDigest(const String & userdata); @@ -177,7 +267,7 @@ public: // - quickly commit the changes to the storage struct CreateNodeDelta { - KeeperStorage::Node::Stat stat; + Coordination::Stat stat; Coordination::ACLs acls; String data; }; @@ -251,39 +341,7 @@ public: void applyDelta(const Delta & delta); - bool hasACL(int64_t session_id, bool is_local, std::function predicate) - { - const auto check_auth = [&](const auto & auth_ids) - { - for (const auto & auth : auth_ids) - { - using TAuth = std::remove_reference_t; - - const AuthID * auth_ptr = nullptr; - if constexpr (std::is_pointer_v) - auth_ptr = auth; - else - auth_ptr = &auth; - - if (predicate(*auth_ptr)) - return true; - } - return false; - }; - - if (is_local) - return check_auth(storage.session_and_auth[session_id]); - - if (check_auth(storage.session_and_auth[session_id])) - return true; - - // check if there are uncommitted - const auto auth_it = session_and_auth.find(session_id); - if (auth_it == session_and_auth.end()) - return false; - - return check_auth(auth_it->second); - } + bool hasACL(int64_t session_id, bool is_local, std::function predicate) const; void forEachAuthInSession(int64_t session_id, std::function func) const; @@ -342,7 +400,7 @@ public: bool createNode( const std::string & path, String data, - const KeeperStorage::Node::Stat & stat, + const Coordination::Stat & stat, Coordination::ACLs node_acls); // Remove node in the storage diff --git a/src/Coordination/SnapshotableHashTable.h b/src/Coordination/SnapshotableHashTable.h index ac8d36745c2..0c6af29d24a 100644 --- a/src/Coordination/SnapshotableHashTable.h +++ b/src/Coordination/SnapshotableHashTable.h @@ -2,72 +2,58 @@ #include #include #include -#include namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - template struct ListNode { StringRef key; V value; - /// |* * ****** | - /// ^ ^ ^ - /// active_in_map free_key version - /// (1 byte) (1 byte) (6 bytes) - uint64_t node_metadata = 0; + struct + { + bool active_in_map : 1; + bool free_key : 1; + uint64_t version : 62; + } node_metadata{false, false, 0}; void setInactiveInMap() { - node_metadata &= ~active_in_map_mask; + node_metadata.active_in_map = false; } void setActiveInMap() { - node_metadata |= active_in_map_mask; + node_metadata.active_in_map = true; } bool isActiveInMap() { - return node_metadata & active_in_map_mask; + return node_metadata.active_in_map; } void setFreeKey() { - node_metadata |= free_key_mask; + node_metadata.free_key = true; } bool getFreeKey() { - return node_metadata & free_key_mask; + return node_metadata.free_key; } uint64_t getVersion() { - return node_metadata & version_mask; + return node_metadata.version; } void setVersion(uint64_t version) { - if (version > version_mask) - throw Exception( - ErrorCodes::LOGICAL_ERROR, "Snapshot version {} is larger than maximum allowed value {}", version, version_mask); - - node_metadata &= ~version_mask; - node_metadata |= version; + node_metadata.version = version; } - - static constexpr uint64_t active_in_map_mask = static_cast(1) << 63; - static constexpr uint64_t free_key_mask = static_cast(1) << 62; - static constexpr uint64_t version_mask = ~(static_cast(3) << 62); }; template diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index 6b9d5f7c8eb..c7b1abf1d83 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -101,30 +101,37 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, 
LoggerP KeeperStorage::Node node{}; String data; Coordination::read(data, in); - node.setData(std::move(data)); + node.setData(data); Coordination::read(node.acl_id, in); /// Deserialize stat - Coordination::read(node.stat.czxid, in); - Coordination::read(node.stat.mzxid, in); + Coordination::read(node.czxid, in); + Coordination::read(node.mzxid, in); /// For some reason ZXID specified in filename can be smaller /// then actual zxid from nodes. In this case we will use zxid from nodes. - max_zxid = std::max(max_zxid, node.stat.mzxid); + max_zxid = std::max(max_zxid, node.mzxid); - Coordination::read(node.stat.ctime, in); - Coordination::read(node.stat.mtime, in); - Coordination::read(node.stat.version, in); - Coordination::read(node.stat.cversion, in); - Coordination::read(node.stat.aversion, in); - Coordination::read(node.stat.ephemeralOwner, in); - Coordination::read(node.stat.pzxid, in); + int64_t ctime; + Coordination::read(ctime, in); + node.setCtime(ctime); + Coordination::read(node.mtime, in); + Coordination::read(node.version, in); + Coordination::read(node.cversion, in); + Coordination::read(node.aversion, in); + int64_t ephemeral_owner; + Coordination::read(ephemeral_owner, in); + if (ephemeral_owner != 0) + node.setEphemeralOwner(ephemeral_owner); + Coordination::read(node.pzxid, in); if (!path.empty()) { - node.seq_num = node.stat.cversion; + if (ephemeral_owner == 0) + node.setSeqNum(node.cversion); + storage.container.insertOrReplace(path, node); - if (node.stat.ephemeralOwner != 0) - storage.ephemerals[node.stat.ephemeralOwner].insert(path); + if (ephemeral_owner != 0) + storage.ephemerals[ephemeral_owner].insert(path); storage.acl_map.addUsage(node.acl_id); } @@ -139,7 +146,7 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, LoggerP if (itr.key != "/") { auto parent_path = parentNodePath(itr.key); - storage.container.updateValue(parent_path, [my_path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseNodeName(my_path)); ++value.stat.numChildren; }); + storage.container.updateValue(parent_path, [my_path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseNodeName(my_path)); value.increaseNumChildren(); }); } } diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 59a550177a4..bd9dc4c3fd3 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -1508,7 +1508,7 @@ void addNode(DB::KeeperStorage & storage, const std::string & path, const std::s using Node = DB::KeeperStorage::Node; Node node{}; node.setData(data); - node.stat.ephemeralOwner = ephemeral_owner; + node.setEphemeralOwner(ephemeral_owner); storage.container.insertOrReplace(path, node); auto child_it = storage.container.find(path); auto child_path = DB::getBaseNodeName(child_it->key); @@ -1517,7 +1517,7 @@ void addNode(DB::KeeperStorage & storage, const std::string & path, const std::s [&](auto & parent) { parent.addChild(child_path); - parent.stat.numChildren++; + parent.increaseNumChildren(); }); } @@ -1530,12 +1530,12 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple) DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); - addNode(storage, "/hello", "world", 1); - addNode(storage, "/hello/somepath", "somedata", 3); + addNode(storage, "/hello1", "world", 1); + addNode(storage, "/hello2", "somedata", 3); storage.session_id_counter = 5; storage.zxid = 2; - 
storage.ephemerals[3] = {"/hello"}; - storage.ephemerals[1] = {"/hello/somepath"}; + storage.ephemerals[3] = {"/hello2"}; + storage.ephemerals[1] = {"/hello1"}; storage.getSessionID(130); storage.getSessionID(130); @@ -1556,13 +1556,13 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple) auto [restored_storage, snapshot_meta, _] = manager.deserializeSnapshotFromBuffer(debuf); EXPECT_EQ(restored_storage->container.size(), 6); - EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2); - EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1); - EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0); + EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 3); + EXPECT_EQ(restored_storage->container.getValue("/hello1").getChildren().size(), 0); + EXPECT_EQ(restored_storage->container.getValue("/hello2").getChildren().size(), 0); EXPECT_EQ(restored_storage->container.getValue("/").getData(), ""); - EXPECT_EQ(restored_storage->container.getValue("/hello").getData(), "world"); - EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getData(), "somedata"); + EXPECT_EQ(restored_storage->container.getValue("/hello1").getData(), "world"); + EXPECT_EQ(restored_storage->container.getValue("/hello2").getData(), "somedata"); EXPECT_EQ(restored_storage->session_id_counter, 7); EXPECT_EQ(restored_storage->zxid, 2); EXPECT_EQ(restored_storage->ephemerals.size(), 2); @@ -2251,12 +2251,12 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions) DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); - addNode(storage, "/hello", "world", 1); - addNode(storage, "/hello/somepath", "somedata", 3); + addNode(storage, "/hello1", "world", 1); + addNode(storage, "/hello2", "somedata", 3); storage.session_id_counter = 5; storage.zxid = 2; - storage.ephemerals[3] = {"/hello"}; - storage.ephemerals[1] = {"/hello/somepath"}; + storage.ephemerals[3] = {"/hello2"}; + storage.ephemerals[1] = {"/hello1"}; storage.getSessionID(130); storage.getSessionID(130); @@ -2273,13 +2273,13 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions) auto [restored_storage, snapshot_meta, _] = new_manager.deserializeSnapshotFromBuffer(debuf); EXPECT_EQ(restored_storage->container.size(), 6); - EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2); - EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1); - EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0); + EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 3); + EXPECT_EQ(restored_storage->container.getValue("/hello1").getChildren().size(), 0); + EXPECT_EQ(restored_storage->container.getValue("/hello2").getChildren().size(), 0); EXPECT_EQ(restored_storage->container.getValue("/").getData(), ""); - EXPECT_EQ(restored_storage->container.getValue("/hello").getData(), "world"); - EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getData(), "somedata"); + EXPECT_EQ(restored_storage->container.getValue("/hello1").getData(), "world"); + EXPECT_EQ(restored_storage->container.getValue("/hello2").getData(), "somedata"); EXPECT_EQ(restored_storage->session_id_counter, 7); EXPECT_EQ(restored_storage->zxid, 2); EXPECT_EQ(restored_storage->ephemerals.size(), 2); @@ -2948,7 +2948,7 @@ TEST_P(CoordinationTest, TestCheckNotExistsRequest) 
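An aside on the node layout introduced in the KeeperStorage.h hunk above: the ephemeral flag is packed into a single bit next to a 63-bit ctime, and, because an ephemeral node can never have children or a sequential counter, the ephemeral owner shares a union with the {seq_num, num_children} pair. The standalone sketch below shows only the packing idea and the invariant that the chasserts in increaseNumChildren/increaseSeqNum protect; PackedNode and its members are illustrative names, not the actual KeeperStorage::Node.

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    /// Minimal illustration of the packing idea: one bit of the ctime field
    /// doubles as the "is ephemeral" flag, and the ephemeral owner shares
    /// storage with the {seq_num, num_children} pair, because an ephemeral
    /// node can never have children or a sequential counter.
    struct PackedNode
    {
        struct
        {
            bool is_ephemeral : 1;
            int64_t ctime : 63;   /// ctime is never negative, so 63 bits are enough
        } flags{false, 0};

        union
        {
            int64_t ephemeral_owner;
            struct
            {
                int32_t seq_num;
                int32_t num_children;
            } children_info;
        } payload{0};

        void setEphemeralOwner(int64_t owner)
        {
            flags.is_ephemeral = owner != 0;
            payload.ephemeral_owner = owner;
        }

        int32_t numChildren() const
        {
            /// Reusing the union is safe only because of this invariant.
            return flags.is_ephemeral ? 0 : payload.children_info.num_children;
        }
    };

    int main()
    {
        PackedNode session_node;
        session_node.setEphemeralOwner(42);      /// owned by session 42
        assert(session_node.numChildren() == 0); /// ephemeral => no children

        PackedNode dir_node;
        dir_node.payload.children_info = {/*seq_num=*/0, /*num_children=*/3};
        std::cout << dir_node.numChildren() << '\n'; /// prints 3
        return 0;
    }

Keeping the flag and counters packed this way is what keeps the node small enough for the static_assert on node size added in the same header.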
create_path("/test_node"); auto node_it = storage.container.find("/test_node"); ASSERT_NE(node_it, storage.container.end()); - auto node_version = node_it->value.stat.version; + auto node_version = node_it->value.version; { SCOPED_TRACE("CheckNotExists returns ZNODEEXISTS"); diff --git a/src/Core/AccurateComparison.h b/src/Core/AccurateComparison.h index a201c136e3a..139ee4d88dc 100644 --- a/src/Core/AccurateComparison.h +++ b/src/Core/AccurateComparison.h @@ -152,7 +152,7 @@ bool notEqualsOp(A a, B b) } /// Converts numeric to an equal numeric of other type. -/// When `strict` is `true` check that result exactly same as input, otherwise just check overflow +/// When `strict` is `true` check that result exactly the same as input, otherwise just check overflow template inline bool NO_SANITIZE_UNDEFINED convertNumeric(From value, To & result) { diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index a7d5b0a869f..dfd60b994f4 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -601,7 +601,7 @@ Block Block::shrinkToFit() const { Columns new_columns(data.size(), nullptr); for (size_t i = 0; i < data.size(); ++i) - new_columns[i] = data[i].column->shrinkToFit(); + new_columns[i] = data[i].column->cloneResized(data[i].column->size()); return cloneWithColumns(new_columns); } diff --git a/src/Core/Field.h b/src/Core/Field.h index 445a5850ca4..414874310a6 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -216,9 +216,8 @@ using NearestFieldType = typename NearestFieldTypeImpl::Type; template <> struct NearestFieldTypeImpl { using Type = std::conditional_t, Int64, UInt64>; }; template <> struct NearestFieldTypeImpl { using Type = Int64; }; template <> struct NearestFieldTypeImpl { using Type = UInt64; }; -#ifdef __cpp_char8_t template <> struct NearestFieldTypeImpl { using Type = UInt64; }; -#endif +template <> struct NearestFieldTypeImpl { using Type = Int64; }; template <> struct NearestFieldTypeImpl { using Type = UInt64; }; template <> struct NearestFieldTypeImpl { using Type = UInt64; }; @@ -306,7 +305,6 @@ static constexpr auto DBMS_MIN_FIELD_SIZE = 32; */ class Field { - static constexpr int nan_direction_hint = 1; // When comparing Floats NaN are considered to be larger than all numbers public: struct Types { @@ -511,6 +509,7 @@ public: case Types::IPv4: return get() < rhs.get(); case Types::IPv6: return get() < rhs.get(); case Types::Float64: + static constexpr int nan_direction_hint = 1; /// Put NaN at the end return FloatCompareHelper::less(get(), rhs.get(), nan_direction_hint); case Types::String: return get() < rhs.get(); case Types::Array: return get() < rhs.get(); @@ -555,6 +554,7 @@ public: case Types::IPv6: return get() <= rhs.get(); case Types::Float64: { + static constexpr int nan_direction_hint = 1; /// Put NaN at the end Float64 f1 = get(); Float64 f2 = get(); return FloatCompareHelper::less(f1, f2, nan_direction_hint) @@ -595,6 +595,7 @@ public: case Types::UInt64: return get() == rhs.get(); case Types::Int64: return get() == rhs.get(); case Types::Float64: + static constexpr int nan_direction_hint = 1; /// Put NaN at the end return FloatCompareHelper::equals(get(), rhs.get(), nan_direction_hint); case Types::UUID: return get() == rhs.get(); case Types::IPv4: return get() == rhs.get(); diff --git a/src/Core/MySQL/PacketEndpoint.cpp b/src/Core/MySQL/PacketEndpoint.cpp index 97b5d3b4d11..085d7595167 100644 --- a/src/Core/MySQL/PacketEndpoint.cpp +++ b/src/Core/MySQL/PacketEndpoint.cpp @@ -40,7 +40,7 @@ bool PacketEndpoint::tryReceivePacket(IMySQLReadPacket & packet, 
UInt64 millisec ReadBufferFromPocoSocket * socket_in = typeid_cast(in); if (!socket_in) - throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: Attempt to pull the duration in a non socket stream"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to pull the duration in a non socket stream"); if (!socket_in->poll(millisecond * 1000)) return false; diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 1940646a22f..de2a4e9b755 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -114,6 +114,9 @@ namespace DB M(Bool, validate_tcp_client_information, false, "Validate client_information in the query packet over the native TCP protocol.", 0) \ M(Bool, storage_metadata_write_full_object_key, false, "Write disk metadata files with VERSION_FULL_OBJECT_KEY format", 0) \ M(UInt64, max_materialized_views_count_for_table, 0, "A limit on the number of materialized views attached to a table.", 0) \ + M(UInt32, max_database_replicated_create_table_thread_pool_size, 1, "The number of threads to create tables during replica recovery in DatabaseReplicated. Zero means number of threads equal number of cores.", 0) \ + M(String, default_replica_path, "/clickhouse/tables/{uuid}/{shard}", "The path to the table in ZooKeeper", 0) \ + M(String, default_replica_name, "{replica}", "The replica name in ZooKeeper", 0) \ /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in StorageSystemServerSettings.cpp diff --git a/src/Core/ServerUUID.cpp b/src/Core/ServerUUID.cpp index bcc1fecb529..c2de6be7794 100644 --- a/src/Core/ServerUUID.cpp +++ b/src/Core/ServerUUID.cpp @@ -14,6 +14,11 @@ namespace ErrorCodes } void ServerUUID::load(const fs::path & server_uuid_file, Poco::Logger * log) +{ + server_uuid = loadServerUUID(server_uuid_file, log); +} + +UUID loadServerUUID(const fs::path & server_uuid_file, Poco::Logger * log) { /// Write a uuid file containing a unique uuid if the file doesn't already exist during server start. @@ -25,8 +30,7 @@ void ServerUUID::load(const fs::path & server_uuid_file, Poco::Logger * log) ReadBufferFromFile in(server_uuid_file); readUUIDText(uuid, in); assertEOF(in); - server_uuid = uuid; - return; + return uuid; } catch (...) { @@ -44,7 +48,7 @@ void ServerUUID::load(const fs::path & server_uuid_file, Poco::Logger * log) out.write(uuid_str.data(), uuid_str.size()); out.sync(); out.finalize(); - server_uuid = new_uuid; + return new_uuid; } catch (...) 
{ diff --git a/src/Core/ServerUUID.h b/src/Core/ServerUUID.h index b5ea17426cb..71ae9edc00e 100644 --- a/src/Core/ServerUUID.h +++ b/src/Core/ServerUUID.h @@ -21,4 +21,6 @@ public: static void load(const fs::path & server_uuid_file, Poco::Logger * log); }; +UUID loadServerUUID(const fs::path & server_uuid_file, Poco::Logger * log); + } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e0b3ca39899..f488d5a84e7 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -40,6 +40,8 @@ class IColumn; M(UInt64, min_insert_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256), "Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough.", 0) \ M(UInt64, min_insert_block_size_rows_for_materialized_views, 0, "Like min_insert_block_size_rows, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_rows)", 0) \ M(UInt64, min_insert_block_size_bytes_for_materialized_views, 0, "Like min_insert_block_size_bytes, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_bytes)", 0) \ + M(UInt64, min_external_table_block_size_rows, DEFAULT_INSERT_BLOCK_SIZE, "Squash blocks passed to external table to specified size in rows, if blocks are not big enough.", 0) \ + M(UInt64, min_external_table_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256), "Squash blocks passed to external table to specified size in bytes, if blocks are not big enough.", 0) \ M(UInt64, max_joined_block_size_rows, DEFAULT_BLOCK_SIZE, "Maximum block size for JOIN result (if join algorithm supports it). 0 means unlimited.", 0) \ M(UInt64, max_insert_threads, 0, "The maximum number of threads to execute the INSERT SELECT query. Values 0 or 1 means that INSERT SELECT is not run in parallel. Higher values will lead to higher memory usage. Parallel INSERT SELECT has effect only if the SELECT part is run on parallel, see 'max_threads' setting.", 0) \ M(UInt64, max_insert_delayed_streams_for_parallel_write, 0, "The maximum number of streams (columns) to delay final part flush. Default - auto (1000 in case of underlying storage supports parallel write, for example S3 and disabled otherwise)", 0) \ @@ -82,9 +84,11 @@ class IColumn; M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited. 
You ", 0) \ M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ - M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ + M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ + M(UInt64, azure_max_single_part_copy_size, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage.", 0) \ M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single Azure blob storage read.", 0) \ + M(UInt64, azure_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write", 0) \ M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ @@ -153,7 +157,7 @@ class IColumn; M(Float, totals_auto_threshold, 0.5, "The threshold for totals_mode = 'auto'.", 0) \ \ M(Bool, allow_suspicious_low_cardinality_types, false, "In CREATE TABLE statement allows specifying LowCardinality modifier for types of small fixed size (8 or less). Enabling this may increase merge times and memory consumption.", 0) \ - M(Bool, allow_suspicious_fixed_string_types, false, "In CREATE TABLE statement allows creating columns of type FixedString(n) with n > 256. FixedString with length >= 256 is suspicious and most likely indicates misusage", 0) \ + M(Bool, allow_suspicious_fixed_string_types, false, "In CREATE TABLE statement allows creating columns of type FixedString(n) with n > 256. FixedString with length >= 256 is suspicious and most likely indicates misuse", 0) \ M(Bool, allow_suspicious_indices, false, "Reject primary/secondary indexes and sorting keys with identical expressions", 0) \ M(Bool, allow_suspicious_ttl_expressions, false, "Reject TTL expressions that don't depend on any of table's columns. It indicates a user error most of the time.", 0) \ M(Bool, compile_expressions, false, "Compile some scalar functions and operators to native code.", 0) \ @@ -184,6 +188,7 @@ class IColumn; M(Float, parallel_replicas_single_task_marks_count_multiplier, 2, "A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. This will be applied only for remote replicas.", 0) \ M(Bool, parallel_replicas_for_non_replicated_merge_tree, false, "If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables", 0) \ M(UInt64, parallel_replicas_min_number_of_rows_per_replica, 0, "Limit the number of replicas used in a query to (estimated rows to read / min_number_of_rows_per_replica). The max is still limited by 'max_parallel_replicas'", 0) \ + M(Bool, parallel_replicas_prefer_local_join, true, "If true, and JOIN can be executed with parallel replicas algorithm, and all storages of right JOIN part are *MergeTree, local JOIN will be used instead of GLOBAL JOIN.", 0) \ M(UInt64, parallel_replicas_mark_segment_size, 128, "Parts virtually divided into segments to be distributed between replicas for parallel reading. 
This setting controls the size of these segments. Not recommended to change until you're absolutely sure in what you're doing", 0) \ \ M(Bool, skip_unavailable_shards, false, "If true, ClickHouse silently skips unavailable shards. Shard is marked as unavailable when: 1) The shard cannot be reached due to a connection failure. 2) Shard is unresolvable through DNS. 3) Table does not exist on the shard.", 0) \ @@ -214,6 +219,8 @@ class IColumn; M(UInt64, merge_tree_max_rows_to_use_cache, (128 * 8192), "The maximum number of rows per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \ M(UInt64, merge_tree_max_bytes_to_use_cache, (192 * 10 * 1024 * 1024), "The maximum number of bytes per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \ M(Bool, do_not_merge_across_partitions_select_final, false, "Merge parts only in one partition in select final", 0) \ + M(Bool, split_parts_ranges_into_intersecting_and_non_intersecting_final, true, "Split parts ranges into intersecting and non intersecting during FINAL optimization", 0) \ + M(Bool, split_intersecting_parts_ranges_into_layers_final, true, "Split intersecting parts ranges into layers during FINAL optimization", 0) \ M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \ \ M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \ @@ -252,7 +259,7 @@ class IColumn; M(LogQueriesType, log_queries_min_type, QueryLogElementType::QUERY_START, "Minimal type in query_log to log, possible values (from low to high): QUERY_START, QUERY_FINISH, EXCEPTION_BEFORE_START, EXCEPTION_WHILE_PROCESSING.", 0) \ M(Milliseconds, log_queries_min_query_duration_ms, 0, "Minimal time for the query to run, to get to the query_log/query_thread_log/query_views_log.", 0) \ M(UInt64, log_queries_cut_to_length, 100000, "If query length is greater than specified threshold (in bytes), then cut query when writing to query log. 
Also limit length of printed query in ordinary text log.", 0) \ - M(Float, log_queries_probability, 1., "Log queries with the specified probabality.", 0) \ + M(Float, log_queries_probability, 1., "Log queries with the specified probability.", 0) \ \ M(Bool, log_processors_profiles, false, "Log Processors profile events.", 0) \ M(DistributedProductMode, distributed_product_mode, DistributedProductMode::DENY, "How are distributed subqueries performed inside IN or JOIN sections?", IMPORTANT) \ @@ -260,8 +267,8 @@ class IColumn; M(UInt64, max_concurrent_queries_for_all_users, 0, "The maximum number of concurrent requests for all users.", 0) \ M(UInt64, max_concurrent_queries_for_user, 0, "The maximum number of concurrent requests per user.", 0) \ \ - M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \ - M(Bool, async_insert_deduplicate, false, "For async INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \ + M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of inserting blocks should be performed", 0) \ + M(Bool, async_insert_deduplicate, false, "For async INSERT queries in the replicated table, specifies that deduplication of inserting blocks should be performed", 0) \ \ M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled, 'auto' - use majority", 0) \ M(Milliseconds, insert_quorum_timeout, 600000, "If the quorum of replicas did not meet in specified time (in milliseconds), exception will be thrown and insertion is aborted.", 0) \ @@ -586,10 +593,11 @@ class IColumn; M(Bool, optimize_substitute_columns, false, "Use constraints for column substitution", 0) \ M(Bool, optimize_append_index, false, "Use constraints in order to append index condition (indexHint)", 0) \ M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \ - M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \ + M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there are constants there", 0) \ M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views if the block is not a duplicate for the table. 
Use true to always deduplicate in dependent tables.", 0) \ + M(Bool, update_insert_deduplication_token_in_dependent_materialized_views, false, "Should update insert deduplication token with table identifier during insert in dependent materialized views.", 0) \ M(Bool, materialized_views_ignore_errors, false, "Allows to ignore errors for MATERIALIZED VIEW, and deliver original block to the table regardless of MVs", 0) \ - M(Bool, ignore_materialized_views_with_dropped_target_table, false, "Ignore MVs with dropped taraget table during pushing to views", 0) \ + M(Bool, ignore_materialized_views_with_dropped_target_table, false, "Ignore MVs with dropped target table during pushing to views", 0) \ M(Bool, allow_experimental_refreshable_materialized_view, false, "Allow refreshable materialized views (CREATE MATERIALIZED VIEW REFRESH ...).", 0) \ M(Bool, stop_refreshable_materialized_views_on_startup, false, "On server startup, prevent scheduling of refreshable materialized views, as if with SYSTEM STOP VIEWS. You can manually start them with SYSTEM START VIEWS or SYSTEM START VIEW afterwards. Also applies to newly created views. Has no effect on non-refreshable materialized views.", 0) \ M(Bool, use_compact_format_in_distributed_parts_names, true, "Changes format of directories names for distributed table insert parts.", 0) \ @@ -746,9 +754,14 @@ class IColumn; M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background. If wait_for_async_insert is false, INSERT query is processed almost instantly, otherwise client will wait until data will be flushed to table", 0) \ M(Bool, wait_for_async_insert, true, "If true wait for processing of asynchronous insertion", 0) \ M(Seconds, wait_for_async_insert_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "Timeout for waiting for processing asynchronous insertion", 0) \ - M(UInt64, async_insert_max_data_size, 1000000, "Maximum size in bytes of unparsed data collected per query before being inserted", 0) \ + M(UInt64, async_insert_max_data_size, 10485760, "Maximum size in bytes of unparsed data collected per query before being inserted", 0) \ M(UInt64, async_insert_max_query_number, 450, "Maximum number of insert queries before being inserted", 0) \ - M(Milliseconds, async_insert_busy_timeout_ms, 200, "Maximum time to wait before dumping collected data per query since the first data appeared", 0) \ + M(Milliseconds, async_insert_poll_timeout_ms, 10, "Timeout for polling data from asynchronous insert queue", 0) \ + M(Bool, async_insert_use_adaptive_busy_timeout, true, "If it is set to true, use adaptive busy timeout for asynchronous inserts", 0) \ + M(Milliseconds, async_insert_busy_timeout_min_ms, 50, "If auto-adjusting is enabled through async_insert_use_adaptive_busy_timeout, minimum time to wait before dumping collected data per query since the first data appeared. 
It also serves as the initial value for the adaptive algorithm", 0) \ + M(Milliseconds, async_insert_busy_timeout_max_ms, 200, "Maximum time to wait before dumping collected data per query since the first data appeared.", 0) ALIAS(async_insert_busy_timeout_ms) \ + M(Double, async_insert_busy_timeout_increase_rate, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout increases", 0) \ + M(Double, async_insert_busy_timeout_decrease_rate, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout decreases", 0) \ \ M(UInt64, remote_fs_read_max_backoff_ms, 10000, "Max wait time when trying to read data for remote disk", 0) \ M(UInt64, remote_fs_read_backoff_max_tries, 5, "Max attempts to read with backoff", 0) \ @@ -763,8 +776,8 @@ class IColumn; \ M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \ M(Bool, enable_filesystem_read_prefetches_log, false, "Log to system.filesystem prefetch_log during query. Should be used only for testing or debugging, not recommended to be turned on by default", 0) \ - M(Bool, allow_prefetched_read_pool_for_remote_filesystem, true, "Prefer prefethed threadpool if all parts are on remote filesystem", 0) \ - M(Bool, allow_prefetched_read_pool_for_local_filesystem, false, "Prefer prefethed threadpool if all parts are on remote filesystem", 0) \ + M(Bool, allow_prefetched_read_pool_for_remote_filesystem, true, "Prefer prefetched threadpool if all parts are on remote filesystem", 0) \ + M(Bool, allow_prefetched_read_pool_for_local_filesystem, false, "Prefer prefetched threadpool if all parts are on local filesystem", 0) \ \ M(UInt64, prefetch_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the prefetch buffer to read from the filesystem.", 0) \ M(UInt64, filesystem_prefetch_step_bytes, 0, "Prefetch step in bytes. Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task", 0) \ @@ -809,7 +822,7 @@ class IColumn; \ M(String, rename_files_after_processing, "", "Rename successfully processed files according to the specified pattern; Pattern can include the following placeholders: `%a` (full original file name), `%f` (original filename without extension), `%e` (file extension with dot), `%t` (current timestamp in µs), and `%%` (% sign)", 0) \ \ - M(Bool, parallelize_output_from_storages, true, "Parallelize output for reading step from storage. It allows parallelizing query processing right after reading from storage if possible", 0) \ + M(Bool, parallelize_output_from_storages, true, "Parallelize output for reading step from storage. It allows parallelization of query processing right after reading from storage if possible", 0) \ M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ M(Bool, count_distinct_optimization, false, "Rewrite count distinct to subquery of group by", 0) \ M(Bool, throw_if_no_data_to_insert, true, "Allows or forbids empty INSERTs, enabled by default (throws an error on an empty insert)", 0) \ @@ -821,12 +834,13 @@ class IColumn; M(UInt64, insert_keeper_retry_max_backoff_ms, 10000, "Max backoff timeout for keeper operations during insert", 0) \ M(Float, insert_keeper_fault_injection_probability, 0.0f, "Approximate probability of failure for a keeper request during insert. 
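The async_insert_busy_timeout_* settings above describe an adaptive flush timeout that is bounded by a minimum and a maximum and grows or shrinks exponentially. The sketch below only illustrates how such a timeout could evolve under those bounds and rates; it is not the actual AsynchronousInsertQueue logic, and adjustBusyTimeoutMs and its default values are invented for the example.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    /// Illustrative only: adjust a busy timeout exponentially inside [min, max],
    /// in the spirit of async_insert_busy_timeout_min_ms / _max_ms and the
    /// increase/decrease rates described above.
    uint64_t adjustBusyTimeoutMs(
        uint64_t current_ms, bool queue_stayed_busy,
        uint64_t min_ms = 50, uint64_t max_ms = 200,
        double increase_rate = 0.2, double decrease_rate = 0.2)
    {
        double next = queue_stayed_busy
            ? current_ms * (1.0 + increase_rate)   /// data keeps arriving: wait a bit longer
            : current_ms / (1.0 + decrease_rate);  /// queue drains quickly: react faster
        return std::clamp<uint64_t>(static_cast<uint64_t>(next), min_ms, max_ms);
    }

    int main()
    {
        uint64_t timeout = 50; /// starts at the minimum
        for (int i = 0; i < 8; ++i)
        {
            timeout = adjustBusyTimeoutMs(timeout, /*queue_stayed_busy=*/true);
            std::cout << timeout << ' ';
        }
        std::cout << '\n'; /// 60 72 86 103 123 147 176 200 -- capped at max_ms
        return 0;
    }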
Valid value is in interval [0.0f, 1.0f]", 0) \ M(UInt64, insert_keeper_fault_injection_seed, 0, "0 - random seed, otherwise the setting value", 0) \ - M(Bool, force_aggregation_in_order, false, "Force use of aggregation in order on remote nodes during distributed aggregation. PLEASE, NEVER CHANGE THIS SETTING VALUE MANUALLY!", IMPORTANT) \ + M(Bool, force_aggregation_in_order, false, "The setting is used by the server itself to support distributed queries. Do not change it manually, because it will break normal operations. (Forces use of aggregation in order on remote nodes during distributed aggregation).", IMPORTANT) \ M(UInt64, http_max_request_param_data_size, 10_MiB, "Limit on size of request data used as a query parameter in predefined HTTP requests.", 0) \ M(Bool, function_json_value_return_type_allow_nullable, false, "Allow function JSON_VALUE to return nullable type.", 0) \ M(Bool, function_json_value_return_type_allow_complex, false, "Allow function JSON_VALUE to return complex type, such as: struct, array, map.", 0) \ M(Bool, use_with_fill_by_sorting_prefix, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently", 0) \ M(Bool, optimize_uniq_to_count, true, "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause.", 0) \ + M(Bool, use_variant_as_common_type, false, "Use Variant as a result type for if/multiIf in case when there is no common type for arguments", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \ @@ -834,6 +848,7 @@ class IColumn; M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ + M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ @@ -846,7 +861,7 @@ class IColumn; M(UInt64, grace_hash_join_max_buckets, 1024, "Limit on the number of grace hash join buckets", 0) \ M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \ M(Bool, keeper_map_strict_mode, false, "Enforce additional checks during operations on KeeperMap. E.g. throw an exception on an insert for already existing key", 0) \ - M(UInt64, extract_kvp_max_pairs_per_row, 1000, "Max number pairs that can be produced by extractKeyValuePairs function. Used to safeguard against consuming too much memory.", 0) \ + M(UInt64, extract_key_value_pairs_max_pairs_per_row, 1000, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. 
Used as a safeguard against consuming too much memory.", 0) ALIAS(extract_kvp_max_pairs_per_row) \ M(Timezone, session_timezone, "", "This setting can be removed in the future due to potential caveats. It is experimental and is not suitable for production usage. The default timezone for current session or query. The server default timezone if empty.", 0) \ M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0) \ M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests.", 0) \ @@ -1004,6 +1019,7 @@ class IColumn; M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ + M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats", 0) \ M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \ M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \ M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \ @@ -1081,6 +1097,8 @@ class IColumn; M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ M(String, format_template_row, "", "Path to file which contains format string for rows (for Template format)", 0) \ + M(String, format_template_row_format, "", "Format string for rows (for Template format)", 0) \ + M(String, format_template_resultset_format, "", "Format string for result set (for Template format)", 0) \ M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ \ M(EscapingRule, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)", 0) \ @@ -1101,7 +1119,7 @@ class IColumn; M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \ \ M(Bool, exact_rows_before_limit, false, "When enabled, ClickHouse will provide exact value for rows_before_limit_at_least statistic, but with the cost that the data before limit will have to be read completely", 0) \ - M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if there're joining expressions in the WHERE section. Values: 0 - no rewrite, 1 - apply if possible for comma/cross, 2 - force rewrite all comma joins, cross - if possible", 0) \ + M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if there are joining expressions in the WHERE section. 
Values: 0 - no rewrite, 1 - apply if possible for comma/cross, 2 - force rewrite all comma joins, cross - if possible", 0) \ \ M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ M(Bool, output_format_arrow_use_signed_indexes_for_dictionary, true, "Use signed integers for dictionary indexes in Arrow format", 0) \ @@ -1129,6 +1147,8 @@ class IColumn; M(Bool, output_format_sql_insert_use_replace, false, "Use REPLACE statement instead of INSERT", 0) \ M(Bool, output_format_sql_insert_quote_names, true, "Quote column names with '`' characters", 0) \ \ + M(Bool, output_format_values_escape_quote_with_quote, false, "If true escape ' with '', otherwise quoted with \\'", 0) \ + \ M(Bool, output_format_bson_string_as_string, false, "Use BSON String type instead of Binary for String columns.", 0) \ M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \ \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index dff0ebb759c..e97a411e2c1 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -84,9 +84,31 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"24.2", { + {"output_format_values_escape_quote_with_quote", false, false, "If true escape ' with '', otherwise quoted with \\'"}, + {"input_format_try_infer_exponent_floats", true, false, "Don't infer floats in exponential notation by default"}, + {"async_insert_max_data_size", 1000000, 10485760, "The previous value appeared to be too small."}, + {"async_insert_poll_timeout_ms", 10, 10, "Timeout in milliseconds for polling data from asynchronous insert queue"}, + {"async_insert_use_adaptive_busy_timeout", true, true, "Use adaptive asynchronous insert timeout"}, + {"async_insert_busy_timeout_min_ms", 50, 50, "The minimum value of the asynchronous insert timeout in milliseconds; it also serves as the initial value, which may be increased later by the adaptive algorithm"}, + {"async_insert_busy_timeout_max_ms", 200, 200, "The minimum value of the asynchronous insert timeout in milliseconds; async_insert_busy_timeout_ms is aliased to async_insert_busy_timeout_max_ms"}, + {"async_insert_busy_timeout_increase_rate", 0.2, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout increases"}, + {"async_insert_busy_timeout_decrease_rate", 0.2, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout decreases"}, + {"format_template_row_format", "", "", "Template row format string can be set directly in query"}, + {"format_template_resultset_format", "", "", "Template result set format string can be set in query"}, + {"split_parts_ranges_into_intersecting_and_non_intersecting_final", true, true, "Allow to split parts ranges into intersecting and non intersecting during FINAL optimization"}, + {"split_intersecting_parts_ranges_into_layers_final", true, true, "Allow to split intersecting parts ranges into layers during FINAL optimization"}, + {"azure_max_single_part_copy_size", 256*1024*1024, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage."}, + {"min_external_table_block_size_rows", DEFAULT_INSERT_BLOCK_SIZE, DEFAULT_INSERT_BLOCK_SIZE, "Squash blocks passed to 
external table to specified size in rows, if blocks are not big enough"}, + {"min_external_table_block_size_bytes", DEFAULT_INSERT_BLOCK_SIZE * 256, DEFAULT_INSERT_BLOCK_SIZE * 256, "Squash blocks passed to external table to specified size in bytes, if blocks are not big enough."}, + {"parallel_replicas_prefer_local_join", true, true, "If true, and JOIN can be executed with parallel replicas algorithm, and all storages of right JOIN part are *MergeTree, local JOIN will be used instead of GLOBAL JOIN."}, + {"extract_key_value_pairs_max_pairs_per_row", 0, 0, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory."}, + }}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}, + {"allow_experimental_variant_type", false, false, "Add new experimental Variant type"}, + {"use_variant_as_common_type", false, false, "Allow to use Variant in if/multiIf if there is no common type"}, {"output_format_arrow_use_64_bit_indexes_for_dictionary", false, false, "Allow to use 64 bit indexes type in Arrow dictionaries"}, {"parallel_replicas_mark_segment_size", 128, 128, "Add new setting to control segment size in new parallel replicas coordinator implementation"}, {"ignore_materialized_views_with_dropped_target_table", false, false, "Add new setting to allow to ignore materialized views with dropped target table"}, @@ -100,7 +122,11 @@ static std::map sett {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, - {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}}}, + {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}, + {"update_insert_deduplication_token_in_dependent_materialized_views", false, false, "Allow to update insert deduplication token with table identifier during insert in dependent materialized views"}, + {"azure_max_unexpected_write_error_retries", 4, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write"}, + {"split_parts_ranges_into_intersecting_and_non_intersecting_final", false, true, "Allow to split parts ranges into intersecting and non intersecting during FINAL optimization"}, + {"split_intersecting_parts_ranges_into_layers_final", true, true, "Allow to split intersecting parts ranges into layers during FINAL optimization"}}}, {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, {"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"}, {"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"}, diff --git a/src/Core/SettingsFields.cpp b/src/Core/SettingsFields.cpp index 80197cfbe22..f72b64fd56d 100644 --- 
a/src/Core/SettingsFields.cpp +++ b/src/Core/SettingsFields.cpp @@ -1,8 +1,7 @@ #include - #include +#include #include -#include #include #include #include @@ -13,6 +12,7 @@ #include + namespace DB { namespace ErrorCodes @@ -20,6 +20,7 @@ namespace ErrorCodes extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH; extern const int CANNOT_PARSE_BOOL; extern const int CANNOT_PARSE_NUMBER; + extern const int CANNOT_CONVERT_TYPE; } @@ -48,9 +49,51 @@ namespace T fieldToNumber(const Field & f) { if (f.getType() == Field::Types::String) + { return stringToNumber(f.get()); + } + else if (f.getType() == Field::Types::UInt64) + { + T result; + if (!accurate::convertNumeric(f.get(), result)) + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Field value {} is out of range of {} type", f, demangle(typeid(T).name())); + return result; + } + else if (f.getType() == Field::Types::Int64) + { + T result; + if (!accurate::convertNumeric(f.get(), result)) + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Field value {} is out of range of {} type", f, demangle(typeid(T).name())); + return result; + } + else if (f.getType() == Field::Types::Bool) + { + return T(f.get()); + } + else if (f.getType() == Field::Types::Float64) + { + Float64 x = f.get(); + if constexpr (std::is_floating_point_v) + { + return T(x); + } + else + { + if (!isFinite(x)) + { + /// Conversion of infinite values to integer is undefined. + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert infinite value to integer type"); + } + else if (x > Float64(std::numeric_limits::max()) || x < Float64(std::numeric_limits::lowest())) + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert out of range floating point value to integer type"); + } + else + return T(x); + } + } else - return applyVisitor(FieldVisitorConvertToNumber(), f); + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Invalid value {} of the setting, which needs {}", f, demangle(typeid(T).name())); } Map stringToMap(const String & str) @@ -174,7 +217,7 @@ namespace if (f.getType() == Field::Types::String) return stringToMaxThreads(f.get()); else - return applyVisitor(FieldVisitorConvertToNumber(), f); + return fieldToNumber(f); } } diff --git a/src/Core/TypeId.h b/src/Core/TypeId.h index 9c634d2321c..7003e880cd5 100644 --- a/src/Core/TypeId.h +++ b/src/Core/TypeId.h @@ -49,6 +49,7 @@ enum class TypeIndex IPv4, IPv6, JSONPaths, + Variant, }; /** diff --git a/src/DataTypes/DataTypeAggregateFunction.cpp b/src/DataTypes/DataTypeAggregateFunction.cpp index 7dc036cafa4..14a3c6a4248 100644 --- a/src/DataTypes/DataTypeAggregateFunction.cpp +++ b/src/DataTypes/DataTypeAggregateFunction.cpp @@ -239,7 +239,7 @@ static DataTypePtr create(const ASTPtr & arguments) argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i])); if (function_name.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty name of aggregate function passed"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty name of aggregate function passed"); AggregateFunctionProperties properties; AggregateFunctionPtr function = AggregateFunctionFactory::instance().get(function_name, action, argument_types, params_row, properties); diff --git a/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp b/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp index aa3b154e49b..ee9870eb0ef 100644 --- a/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp +++ b/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp @@ -141,7 +141,7 @@ static std::pair create(const 
ASTPtr & argum argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i])); if (function_name.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty name of aggregate function passed"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty name of aggregate function passed"); AggregateFunctionProperties properties; /// NullsAction is not part of the type definition, instead it will have transformed the function into a different one diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index 2f17207cc07..0e08b9ba2ca 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -12,6 +12,7 @@ public: static constexpr auto family_name = "Date"; TypeIndex getTypeId() const override { return TypeIndex::Date; } + TypeIndex getColumnType() const override { return TypeIndex::UInt16; } const char * getFamilyName() const override { return family_name; } bool canBeUsedAsVersion() const override { return true; } diff --git a/src/DataTypes/DataTypeDate32.h b/src/DataTypes/DataTypeDate32.h index 9160b62dc15..02e818f10df 100644 --- a/src/DataTypes/DataTypeDate32.h +++ b/src/DataTypes/DataTypeDate32.h @@ -12,6 +12,7 @@ public: static constexpr auto family_name = "Date32"; TypeIndex getTypeId() const override { return TypeIndex::Date32; } + TypeIndex getColumnType() const override { return TypeIndex::Int32; } const char * getFamilyName() const override { return family_name; } Field getDefault() const override diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index a4a05917ba5..5519240dee1 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -40,6 +40,7 @@ public: const char * getFamilyName() const override { return family_name; } String doGetName() const override; TypeIndex getTypeId() const override { return TypeIndex::DateTime; } + TypeIndex getColumnType() const override { return TypeIndex::UInt32; } bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index 2f607fc2aa6..075d2d274ae 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -54,6 +54,7 @@ public: const char * getFamilyName() const override; TypeIndex getTypeId() const override { return type_id; } + TypeIndex getColumnType() const override { return sizeof(FieldType) == 1 ? 
TypeIndex::Int8 : TypeIndex::Int16; } FieldType readValue(ReadBuffer & istr) const { diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 415f24d8151..d154b386ace 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -290,6 +290,7 @@ DataTypeFactory::DataTypeFactory() registerDataTypeDomainGeo(*this); registerDataTypeMap(*this); registerDataTypeObject(*this); + registerDataTypeVariant(*this); } DataTypeFactory & DataTypeFactory::instance() diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index ba7c1a3d7fe..a2aeb6f3646 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -100,5 +100,6 @@ void registerDataTypeDomainBool(DataTypeFactory & factory); void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory); void registerDataTypeObject(DataTypeFactory & factory); +void registerDataTypeVariant(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeInterval.h b/src/DataTypes/DataTypeInterval.h index b0e747555e3..8bb9ae8d7b6 100644 --- a/src/DataTypes/DataTypeInterval.h +++ b/src/DataTypes/DataTypeInterval.h @@ -28,6 +28,7 @@ public: std::string doGetName() const override { return fmt::format("Interval{}", kind.toString()); } const char * getFamilyName() const override { return "Interval"; } TypeIndex getTypeId() const override { return TypeIndex::Interval; } + TypeIndex getColumnType() const override { return TypeIndex::Int64; } bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeLowCardinalityHelpers.cpp b/src/DataTypes/DataTypeLowCardinalityHelpers.cpp index 98eb76267a4..116e806f89c 100644 --- a/src/DataTypes/DataTypeLowCardinalityHelpers.cpp +++ b/src/DataTypes/DataTypeLowCardinalityHelpers.cpp @@ -20,6 +20,7 @@ namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; } DataTypePtr recursiveRemoveLowCardinality(const DataTypePtr & type) @@ -55,62 +56,61 @@ DataTypePtr recursiveRemoveLowCardinality(const DataTypePtr & type) ColumnPtr recursiveRemoveLowCardinality(const ColumnPtr & column) { - if (!column) - return column; + ColumnPtr res = column; if (const auto * column_array = typeid_cast(column.get())) { const auto & data = column_array->getDataPtr(); auto data_no_lc = recursiveRemoveLowCardinality(data); - if (data.get() == data_no_lc.get()) - return column; - - return ColumnArray::create(data_no_lc, column_array->getOffsetsPtr()); + if (data.get() != data_no_lc.get()) + res = ColumnArray::create(data_no_lc, column_array->getOffsetsPtr()); } - - if (const auto * column_const = typeid_cast(column.get())) + else if (const auto * column_const = typeid_cast(column.get())) { const auto & nested = column_const->getDataColumnPtr(); auto nested_no_lc = recursiveRemoveLowCardinality(nested); - if (nested.get() == nested_no_lc.get()) - return column; - - return ColumnConst::create(nested_no_lc, column_const->size()); + if (nested.get() != nested_no_lc.get()) + res = ColumnConst::create(nested_no_lc, column_const->size()); } - - if (const auto * column_tuple = typeid_cast(column.get())) + else if (const auto * column_tuple = typeid_cast(column.get())) { auto columns = column_tuple->getColumns(); for (auto & element : columns) element = recursiveRemoveLowCardinality(element); - return ColumnTuple::create(columns); + res = ColumnTuple::create(columns); } - - if (const auto * column_map 
= typeid_cast(column.get())) + else if (const auto * column_map = typeid_cast(column.get())) { const auto & nested = column_map->getNestedColumnPtr(); auto nested_no_lc = recursiveRemoveLowCardinality(nested); - if (nested.get() == nested_no_lc.get()) - return column; - - return ColumnMap::create(nested_no_lc); + if (nested.get() != nested_no_lc.get()) + res = ColumnMap::create(nested_no_lc); } - /// Special case when column is a lazy argument of short circuit function. /// We should call recursiveRemoveLowCardinality on the result column /// when function will be executed. - if (const auto * column_function = typeid_cast(column.get())) + else if (const auto * column_function = typeid_cast(column.get())) { - if (!column_function->isShortCircuitArgument()) - return column; - - return column_function->recursivelyConvertResultToFullColumnIfLowCardinality(); + if (column_function->isShortCircuitArgument()) + res = column_function->recursivelyConvertResultToFullColumnIfLowCardinality(); + } + else if (const auto * column_low_cardinality = typeid_cast(column.get())) + { + res = column_low_cardinality->convertToFullColumn(); } - if (const auto * column_low_cardinality = typeid_cast(column.get())) - return column_low_cardinality->convertToFullColumn(); + if (res != column) + { + /// recursiveRemoveLowCardinality() must not change the size of a passed column! + if (res->size() != column->size()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "recursiveRemoveLowCardinality() somehow changed the size of column {}. Old size={}, new size={}. It's a bug", + column->getName(), column->size(), res->size()); + } + } - return column; + return res; } ColumnPtr recursiveLowCardinalityTypeConversion(const ColumnPtr & column, const DataTypePtr & from_type, const DataTypePtr & to_type) diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index 41a9a1de543..484d779551f 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ b/src/DataTypes/DataTypeNullable.cpp @@ -114,5 +114,33 @@ DataTypePtr makeNullableOrLowCardinalityNullable(const DataTypePtr & type) return std::make_shared(type); } +DataTypePtr makeNullableOrLowCardinalityNullableSafe(const DataTypePtr & type) +{ + if (isNullableOrLowCardinalityNullable(type)) + return type; + + if (type->lowCardinality()) + { + const auto & dictionary_type = assert_cast(*type).getDictionaryType(); + return std::make_shared(makeNullable(dictionary_type)); + } + + return makeNullableSafe(type); +} + +DataTypePtr removeNullableOrLowCardinalityNullable(const DataTypePtr & type) +{ + if (type->isNullable()) + return static_cast(*type).getNestedType(); + + if (type->isLowCardinalityNullable()) + { + auto dict_type = removeNullable(static_cast(*type).getDictionaryType()); + return std::make_shared(dict_type); + } + + return type; + +} } diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 06d46fb15ed..7ad0e1ba5f1 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -54,5 +54,8 @@ DataTypePtr makeNullable(const DataTypePtr & type); DataTypePtr makeNullableSafe(const DataTypePtr & type); DataTypePtr removeNullable(const DataTypePtr & type); DataTypePtr makeNullableOrLowCardinalityNullable(const DataTypePtr & type); +DataTypePtr makeNullableOrLowCardinalityNullableSafe(const DataTypePtr & type); +/// Nullable(T) -> T, LowCardinality(Nullable(T)) -> T +DataTypePtr removeNullableOrLowCardinalityNullable(const DataTypePtr & type); } diff --git a/src/DataTypes/DataTypeTuple.cpp 
b/src/DataTypes/DataTypeTuple.cpp index f5d1ea5d877..5c9d5a3366e 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -189,11 +190,15 @@ MutableColumnPtr DataTypeTuple::createColumn() const MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const { + /// If we read Tuple as Variant subcolumn, it may be wrapped to SerializationVariantElement. + /// Here we don't need it, so we drop this wrapper. + const auto * current_serialization = &serialization; + while (const auto * serialization_variant_element = typeid_cast(current_serialization)) + current_serialization = serialization_variant_element->getNested().get(); + /// If we read subcolumn of nested Tuple, it may be wrapped to SerializationNamed /// several times to allow to reconstruct the substream path name. /// Here we don't need substream path name, so we drop first several wrapper serializations. - - const auto * current_serialization = &serialization; while (const auto * serialization_named = typeid_cast(current_serialization)) current_serialization = serialization_named->getNested().get(); diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp new file mode 100644 index 00000000000..456b4ea03b6 --- /dev/null +++ b/src/DataTypes/DataTypeVariant.cpp @@ -0,0 +1,220 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int EMPTY_DATA_PASSED; +} + + +DataTypeVariant::DataTypeVariant(const DataTypes & variants_) +{ + /// Sort nested types by their full names and squash identical types. + std::map name_to_type; + for (const auto & type : variants_) + { + /// Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types are not allowed inside Variant type. + if (isNullableOrLowCardinalityNullable(type)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nullable/LowCardinality(Nullable) types are not allowed inside Variant type"); + if (type->getTypeId() == TypeIndex::Variant) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nested Variant types are not allowed"); + /// Don't use Nothing type as a variant. 
+ if (!isNothing(type)) + name_to_type[type->getName()] = type; + } + + variants.reserve(name_to_type.size()); + for (const auto & [_, type] : name_to_type) + variants.push_back(type); + + if (variants.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); + + if (variants.size() > ColumnVariant::MAX_NESTED_COLUMNS) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Variant type with more than {} nested types is not allowed", ColumnVariant::MAX_NESTED_COLUMNS); +} + +std::string DataTypeVariant::doGetName() const +{ + size_t size = variants.size(); + WriteBufferFromOwnString s; + + s << "Variant("; + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + s << ", "; + + s << variants[i]->getName(); + } + s << ")"; + + return s.str(); +} + +std::string DataTypeVariant::doGetPrettyName(size_t indent) const +{ + size_t size = variants.size(); + WriteBufferFromOwnString s; + s << "Variant("; + + for (size_t i = 0; i != size; ++i) + { + if (i != 0) + s << ", "; + + s << variants[i]->getPrettyName(indent); + } + + s << ')'; + return s.str(); +} + +MutableColumnPtr DataTypeVariant::createColumn() const +{ + size_t size = variants.size(); + MutableColumns nested_columns; + nested_columns.reserve(size); + for (size_t i = 0; i < size; ++i) + nested_columns.push_back(variants[i]->createColumn()); + + return ColumnVariant::create(std::move(nested_columns)); +} + +Field DataTypeVariant::getDefault() const +{ + return Null(); +} + +bool DataTypeVariant::equals(const IDataType & rhs) const +{ + if (typeid(rhs) != typeid(*this)) + return false; + + const DataTypeVariant & rhs_variant = static_cast(rhs); + + size_t size = variants.size(); + if (size != rhs_variant.variants.size()) + return false; + + for (size_t i = 0; i < size; ++i) + if (!variants[i]->equals(*rhs_variant.variants[i])) + return false; + + return true; +} + +bool DataTypeVariant::textCanContainOnlyValidUTF8() const +{ + return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->textCanContainOnlyValidUTF8(); }); +} + +bool DataTypeVariant::haveMaximumSizeOfValue() const +{ + return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); }); +} + +bool DataTypeVariant::hasDynamicSubcolumns() const +{ + return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); }); +} + +std::optional DataTypeVariant::tryGetVariantDiscriminator(const DataTypePtr & type) const +{ + String type_name = type->getName(); + for (size_t i = 0; i != variants.size(); ++i) + { + /// We don't use equals here, because it doesn't respect custom type names. 
+ if (variants[i]->getName() == type_name) + return i; + } + + return std::nullopt; +} + +size_t DataTypeVariant::getMaximumSizeOfValueInMemory() const +{ + size_t max_size = 0; + for (const auto & elem : variants) + { + size_t elem_max_size = elem->getMaximumSizeOfValueInMemory(); + if (elem_max_size > max_size) + max_size = elem_max_size; + } + return max_size; +} + +SerializationPtr DataTypeVariant::doGetDefaultSerialization() const +{ + SerializationVariant::VariantSerializations serializations; + serializations.reserve(variants.size()); + Names variant_names; + variant_names.reserve(variants.size()); + + for (const auto & variant : variants) + { + serializations.push_back(variant->getDefaultSerialization()); + variant_names.push_back(variant->getName()); + } + + return std::make_shared(std::move(serializations), std::move(variant_names), SerializationVariant::getVariantsDeserializeTextOrder(variants), getName()); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); + + DataTypes nested_types; + nested_types.reserve(arguments->children.size()); + + for (const ASTPtr & child : arguments->children) + nested_types.emplace_back(DataTypeFactory::instance().get(child)); + + return std::make_shared(nested_types); +} + +bool isVariantExtension(const DataTypePtr & from_type, const DataTypePtr & to_type) +{ + const auto * from_variant = typeid_cast(from_type.get()); + const auto * to_variant = typeid_cast(to_type.get()); + if (!from_variant || !to_variant) + return false; + + const auto & to_variants = to_variant->getVariants(); + std::unordered_set to_variant_types; + to_variant_types.reserve(to_variants.size()); + for (const auto & variant : to_variants) + to_variant_types.insert(variant->getName()); + + for (const auto & variant : from_variant->getVariants()) + { + if (!to_variant_types.contains(variant->getName())) + return false; + } + + return true; +} + + +void registerDataTypeVariant(DataTypeFactory & factory) +{ + factory.registerDataType("Variant", create); +} + +} diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h new file mode 100644 index 00000000000..d26ce4ea90f --- /dev/null +++ b/src/DataTypes/DataTypeVariant.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +/** Variant data type. + * This type represents a union of other data types. + * For example, type Variant(T1, T2, ..., TN) means that each row of this type + * has a value of either type T1 or T2 or ... or TN or none of them (NULL value). + * Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types are not allowed + * inside Variant type. + * The order of nested types doesn't matter: Variant(T1, T2) = Variant(T2, T1). + * To have global order of nested types we sort variants by type names on Variant creation. + * The index of a variant in a sorted list is called global variant discriminator. 
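To make the discriminator rule concrete, here is a minimal standalone sketch (illustrative only, not part of the patch): it mimics the constructor's std::map-based normalization using plain type-name strings, so the position in the sorted, de-duplicated list plays the role of the global variant discriminator.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Sort the requested variant type names, squash duplicates, and use the
// position in the resulting list as the global discriminator.
int main()
{
    std::vector<std::string> requested = {"String", "UInt64", "Array(String)", "UInt64"};

    std::map<std::string, std::uint8_t> discriminators;
    for (const auto & name : requested)
        discriminators.emplace(name, 0);   // duplicates are squashed here

    std::uint8_t next_discriminator = 0;
    for (auto & [name, discriminator] : discriminators)
        discriminator = next_discriminator++;

    // Variant(String, UInt64, Array(String)) and Variant(UInt64, Array(String), String)
    // both end up with Array(String)=0, String=1, UInt64=2.
    for (const auto & [name, discriminator] : discriminators)
        std::cout << name << " -> " << int(discriminator) << '\n';
}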
+ */ +class DataTypeVariant final : public IDataType +{ +private: + DataTypes variants; + +public: + static constexpr bool is_parametric = true; + + explicit DataTypeVariant(const DataTypes & variants_); + + TypeIndex getTypeId() const override { return TypeIndex::Variant; } + const char * getFamilyName() const override { return "Variant"; } + + bool canBeInsideNullable() const override { return false; } + bool supportsSparseSerialization() const override { return false; } + bool canBeInsideSparseColumns() const override { return false; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return true; } + bool textCanContainOnlyValidUTF8() const override; + bool haveMaximumSizeOfValue() const override; + bool hasDynamicSubcolumns() const override; + size_t getMaximumSizeOfValueInMemory() const override; + + const DataTypePtr & getVariant(size_t i) const { return variants[i]; } + const DataTypes & getVariants() const { return variants; } + + /// Check if Variant has provided type in the list of variants and return its discriminator. + std::optional tryGetVariantDiscriminator(const DataTypePtr & type) const; + +private: + std::string doGetName() const override; + std::string doGetPrettyName(size_t indent) const override; + SerializationPtr doGetDefaultSerialization() const override; +}; + +/// Check if conversion from from_type to to_type is Variant extension +/// (both types are Variants and to_type contains all variants from from_type). +bool isVariantExtension(const DataTypePtr & from_type, const DataTypePtr & to_type); + +} + diff --git a/src/DataTypes/EnumValues.cpp b/src/DataTypes/EnumValues.cpp index 9df49e765a7..a15136b9335 100644 --- a/src/DataTypes/EnumValues.cpp +++ b/src/DataTypes/EnumValues.cpp @@ -74,6 +74,25 @@ T EnumValues::getValue(StringRef field_name, bool try_treat_as_id) const return it->getMapped(); } +template +bool EnumValues::tryGetValue(T & x, StringRef field_name, bool try_treat_as_id) const +{ + const auto it = name_to_value_map.find(field_name); + if (!it) + { + /// It is used in CSV and TSV input formats. If we fail to find given string in + /// enum names, we will try to treat it as enum id. 
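The comment above describes a name-or-id fallback; a rough standalone model of it follows (illustrative only: std::unordered_map and std::stoi stand in for ClickHouse's own hash maps and number parsing, and Int8 enum values are assumed).

#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>

// Look the field up as an enum name first; if that fails and try_treat_as_id is set,
// try to parse it as a numeric id and accept it only if that id is registered.
std::optional<std::int8_t> tryGetEnumValue(
    const std::unordered_map<std::string, std::int8_t> & name_to_value,
    const std::unordered_map<std::int8_t, std::string> & value_to_name,
    const std::string & field,
    bool try_treat_as_id)
{
    if (auto it = name_to_value.find(field); it != name_to_value.end())
        return it->second;

    if (try_treat_as_id)
    {
        try
        {
            std::size_t pos = 0;
            int id = std::stoi(field, &pos);
            if (pos == field.size() && id >= INT8_MIN && id <= INT8_MAX
                && value_to_name.count(static_cast<std::int8_t>(id)))
                return static_cast<std::int8_t>(id);
        }
        catch (...)
        {
            // not a number at all: fall through and report failure
        }
    }
    return std::nullopt;
}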
+ if (try_treat_as_id) + { + ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); + return tryReadText(x, tmp_buf) && tmp_buf.eof() && value_to_name_map.contains(x); + } + return false; + } + x = it->getMapped(); + return true; +} + template Names EnumValues::getAllRegisteredNames() const { diff --git a/src/DataTypes/EnumValues.h b/src/DataTypes/EnumValues.h index 5189f7a56f5..889878bc60f 100644 --- a/src/DataTypes/EnumValues.h +++ b/src/DataTypes/EnumValues.h @@ -7,7 +7,7 @@ namespace DB { -namespace ErrorCodes +namespace ErrorCodesEnumValues { extern const int BAD_ARGUMENTS; } @@ -42,6 +42,11 @@ public: return it; } + bool hasValue(const T & value) const + { + return value_to_name_map.contains(value); + } + /// throws exception if value is not valid const StringRef & getNameForValue(const T & value) const { @@ -60,6 +65,7 @@ public: } T getValue(StringRef field_name, bool try_treat_as_id = false) const; + bool tryGetValue(T & x, StringRef field_name, bool try_treat_as_id = false) const; template bool containsAll(const TValues & rhs_values) const diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 2a7e0f246de..392c56343e3 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -109,11 +109,26 @@ Ptr IDataType::getForSubcolumn( bool throw_if_null) const { Ptr res; - forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata) + + ISerialization::StreamCallback callback_with_data = [&](const auto & subpath) { - if (name == subcolumn_name) - res = subdata.*member; - }, data); + for (size_t i = 0; i < subpath.size(); ++i) + { + size_t prefix_len = i + 1; + if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, prefix_len)) + { + auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); + /// Create data from path only if it's requested subcolumn. + if (name == subcolumn_name) + res = ISerialization::createFromPath(subpath, prefix_len).*member; + } + subpath[i].visited = true; + } + }; + + ISerialization::EnumerateStreamsSettings settings; + settings.position_independent_encoding = false; + data.serialization->enumerateStreams(settings, callback_with_data, data); if (!res && throw_if_null) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index eabf066bc3d..48cc127746f 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -86,6 +86,8 @@ public: /// Data type id. It's used for runtime type checks. virtual TypeIndex getTypeId() const = 0; + /// Storage type (e.g. Int64 for Interval) + virtual TypeIndex getColumnType() const { return getTypeId(); } bool hasSubcolumn(std::string_view subcolumn_name) const; @@ -150,7 +152,7 @@ public: /** Create ColumnConst for corresponding type, with specified size and value. */ - ColumnPtr createColumnConst(size_t size, const Field & field) const; + virtual ColumnPtr createColumnConst(size_t size, const Field & field) const; ColumnPtr createColumnConstWithDefaultValue(size_t size) const; /** Get default value of data type. 
@@ -412,6 +414,8 @@ struct WhichDataType constexpr bool isSimple() const { return isInt() || isUInt() || isFloat() || isString(); } constexpr bool isLowCardinality() const { return idx == TypeIndex::LowCardinality; } + + constexpr bool isVariant() const { return idx == TypeIndex::Variant; } }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) @@ -464,6 +468,7 @@ template inline bool isTuple(const T & data_type) { return WhichDat template inline bool isMap(const T & data_type) {return WhichDataType(data_type).isMap(); } template inline bool isInterval(const T & data_type) {return WhichDataType(data_type).isInterval(); } template inline bool isObject(const T & data_type) { return WhichDataType(data_type).isObject(); } +template inline bool isVariant(const T & data_type) { return WhichDataType(data_type).isVariant(); } template inline bool isNothing(const T & data_type) { return WhichDataType(data_type).isNothing(); } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 80d498de38a..7d57d72090b 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -54,6 +54,7 @@ const std::set ISerialization::Substream::named_types TupleElement, NamedOffsets, NamedNullMap, + NamedVariantDiscriminators, }; String ISerialization::Substream::toString() const @@ -61,6 +62,9 @@ String ISerialization::Substream::toString() const if (named_types.contains(type)) return fmt::format("{}({})", type, name_of_substream); + if (type == VariantElement) + return fmt::format("VariantElement({})", variant_element_name); + return String(magic_enum::enum_name(type)); } @@ -186,6 +190,12 @@ String getNameForSubstreamPath( else stream_name += substream_name; } + else if (it->type == Substream::VariantDiscriminators) + stream_name += ".variant_discr"; + else if (it->type == Substream::VariantOffsets) + stream_name += ".variant_offsets"; + else if (it->type == Substream::VariantElement) + stream_name += "." + it->variant_element_name; } return stream_name; @@ -274,6 +284,53 @@ bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) return true; } +namespace +{ + +template +bool tryDeserializeText(const F deserialize, DB::IColumn & column) +{ + size_t prev_size = column.size(); + try + { + deserialize(column); + return true; + } + catch (...) 
+ { + if (column.size() > prev_size) + column.popBack(column.size() - prev_size); + return false; + } +} + +} + +bool ISerialization::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextCSV(my_column, istr, settings); }, column); +} + +bool ISerialization::tryDeserializeTextEscaped(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextEscaped(my_column, istr, settings); }, column); +} + +bool ISerialization::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextJSON(my_column, istr, settings); }, column); +} + +bool ISerialization::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextQuoted(my_column, istr, settings); }, column); +} + +bool ISerialization::tryDeserializeWholeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeWholeText(my_column, istr, settings); }, column); +} + void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String field; @@ -283,6 +340,15 @@ void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, con deserializeWholeText(column, buf, settings); } +bool ISerialization::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + /// Read until \t or \n. + readString(field, istr); + ReadBufferFromString buf(field); + return tryDeserializeWholeText(column, buf, settings); +} + void ISerialization::serializeTextMarkdown( const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const { @@ -310,7 +376,8 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref size_t last_elem = prefix_len - 1; return path[last_elem].type == Substream::NullMap || path[last_elem].type == Substream::TupleElement - || path[last_elem].type == Substream::ArraySizes; + || path[last_elem].type == Substream::ArraySizes + || path[last_elem].type == Substream::VariantElement; } ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) @@ -339,6 +406,8 @@ void ISerialization::throwUnexpectedDataAfterParsedValue(IColumn & column, ReadB { WriteBufferFromOwnString ostr; serializeText(column, column.size() - 1, ostr, settings); + /// Restore correct column size. + column.popBack(1); throw Exception( ErrorCodes::UNEXPECTED_DATA_AFTER_PARSED_VALUE, "Unexpected data '{}' after parsed {} value '{}'", diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index dcddd6a8161..7fba9db4acf 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -154,6 +154,12 @@ public: ObjectStructure, ObjectData, + VariantDiscriminators, + NamedVariantDiscriminators, + VariantOffsets, + VariantElements, + VariantElement, + Regular, }; @@ -162,6 +168,9 @@ public: Type type; + /// The name of a variant element type. 
+ String variant_element_name; + /// Name of substream for type from 'named_types'. String name_of_substream; @@ -321,17 +330,20 @@ public: virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization as a literal that may be inserted into a query. */ virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization for the CSV format. */ virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization for displaying on a terminal or saving into a text file, and the like. * Without escaping or quoting. @@ -341,11 +353,13 @@ public: /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. */ virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization intended for using in JSON format. */ virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; virtual void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t /*indent*/) const { serializeTextJSON(column, row_num, ostr, settings); @@ -365,6 +379,7 @@ public: * additional code in data types serialization and ReadHelpers. 
*/ virtual void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + virtual bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; virtual void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; virtual void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 6b597f2e699..e8aab615849 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -419,9 +419,11 @@ static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffe } -template -static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) +template +static ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) { + static constexpr bool throw_exception = std::is_same_v; + ColumnArray & column_array = assert_cast(column); ColumnArray::Offsets & offsets = column_array.getOffsets(); @@ -433,7 +435,18 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (checkChar('[', istr)) has_braces = true; else if (!allow_unenclosed) - throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + return ReturnType(false); + } + + auto on_error_no_throw = [&]() + { + if (size) + nested_column.popBack(size); + return ReturnType(false); + }; try { @@ -443,11 +456,17 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (!first) { if (*istr.position() == ',') + { ++istr.position(); + } else - throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, - "Cannot read array from text, expected comma or end of array, found '{}'", - *istr.position()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, + "Cannot read array from text, expected comma or end of array, found '{}'", + *istr.position()); + return on_error_no_throw(); + } } first = false; @@ -457,25 +476,42 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (*istr.position() == ']') break; - read_nested(nested_column); + if constexpr (throw_exception) + read_nested(nested_column); + else if (!read_nested(nested_column)) + return on_error_no_throw(); + ++size; skipWhitespaceIfAny(istr); } if (has_braces) - assertChar(']', istr); + { + if constexpr (throw_exception) + assertChar(']', istr); + else if (!checkChar(']', istr)) + return on_error_no_throw(); + } else /// If array is not enclosed in braces, we read until EOF. - assertEOF(istr); + { + if constexpr (throw_exception) + assertEOF(istr); + else if (!istr.eof()) + return on_error_no_throw(); + } } catch (...) 
{ if (size) nested_column.popBack(size); - throw; + if constexpr (throw_exception) + throw; + return ReturnType(false); } offsets.push_back(offsets.back() + size); + return ReturnType(true); } @@ -494,8 +530,8 @@ void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, co deserializeTextImpl(column, istr, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(nested_column, istr, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(nested_column, istr, settings, nested); else nested->deserializeTextQuoted(nested_column, istr, settings); }, false); @@ -504,6 +540,29 @@ void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, co throwUnexpectedDataAfterParsedValue(column, istr, settings, "Array"); } +bool SerializationArray::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(nested_column, istr, settings, nested); + return nested->tryDeserializeTextQuoted(nested_column, istr, settings); + }; + + bool ok = deserializeTextImpl(column, istr, std::move(read_nested), false); + + if (!ok) + return false; + + if (whole && !istr.eof()) + { + column.popBack(1); + return false; + } + + return true; +} + void SerializationArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { const ColumnArray & column_array = assert_cast(column); @@ -559,13 +618,25 @@ void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr deserializeTextImpl(column, istr, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextJSONImpl(nested_column, istr, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); else nested->deserializeTextJSON(nested_column, istr, settings); }, false); } +bool SerializationArray::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); + return nested->tryDeserializeTextJSON(nested_column, istr, settings); + }; + + return deserializeTextImpl(column, istr, std::move(read_nested), false); +} + void SerializationArray::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -608,8 +679,8 @@ void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeTextImpl(column, rb, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextCSVImpl(nested_column, rb, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(nested_column, rb, settings, nested); 
else nested->deserializeTextCSV(nested_column, rb, settings); }, true); @@ -619,12 +690,43 @@ void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeTextImpl(column, rb, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(nested_column, rb, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(nested_column, rb, settings, nested); else nested->deserializeTextQuoted(nested_column, rb, settings); }, true); } } +bool SerializationArray::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + if (!tryReadCSV(s, istr, settings.csv)) + return false; + ReadBufferFromString rb(s); + + if (settings.csv.arrays_as_nested_csv) + { + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(nested_column, rb, settings, nested); + return nested->tryDeserializeTextCSV(nested_column, rb, settings); + }; + + return deserializeTextImpl(column, rb, read_nested, true); + } + else + { + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(nested_column, rb, settings, nested); + return nested->tryDeserializeTextQuoted(nested_column, rb, settings); + }; + + return deserializeTextImpl(column, rb, read_nested, true); + } +} + } diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index de331169db5..82f5e8bce45 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -20,15 +20,18 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Streaming serialization of arrays is arranged in a special way: * - elements placed in a row are written/read without array sizes; diff --git a/src/DataTypes/Serializations/SerializationBool.cpp 
b/src/DataTypes/Serializations/SerializationBool.cpp index 41b5bf806e5..f745fac4d30 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -150,30 +150,42 @@ bool tryDeserializeAllVariants(ColumnUInt8 * column, ReadBuffer & istr) return true; } -void deserializeImpl( +template +ReturnType deserializeImpl( IColumn & column, ReadBuffer & istr, const FormatSettings & settings, std::function check_end_of_value) { + static constexpr bool throw_exception = std::is_same_v; + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + auto restore_column_if_needed = [&, prev_size = col->size()]() + { + if (col->size() > prev_size) + col->popBack(1); + }; PeekableReadBuffer buf(istr); buf.setCheckpoint(); if (checkString(settings.bool_true_representation, buf) && check_end_of_value(buf)) { col->insert(true); - return; + return ReturnType(true); } buf.rollbackToCheckpoint(); if (checkString(settings.bool_false_representation, buf) && check_end_of_value(buf)) { - col->insert(false); buf.dropCheckpoint(); if (buf.hasUnreadData()) - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " - "bool_true_representation or bool_false_representation contains some delimiters of input format"); - return; + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return ReturnType(false); + } + col->insert(false); + return ReturnType(true); } buf.rollbackToCheckpoint(); @@ -181,22 +193,31 @@ void deserializeImpl( { buf.dropCheckpoint(); if (buf.hasUnreadData()) - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " - "bool_true_representation or bool_false_representation contains some delimiters of input format"); - return; + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. 
It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + restore_column_if_needed(); + return ReturnType(false); + } + return ReturnType(true); } buf.makeContinuousMemoryFromCheckpointToPos(); buf.rollbackToCheckpoint(); - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " - "bool_false_representation or one of " - "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", - String(buf.position(), std::min(10lu, buf.available())), - settings.bool_true_representation, settings.bool_false_representation); + restore_column_if_needed(); + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " + "bool_false_representation or one of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", + String(buf.position(), std::min(10lu, buf.available())), + settings.bool_true_representation, settings.bool_false_representation); + + return ReturnType(false); } } @@ -225,6 +246,14 @@ void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & is deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } +bool SerializationBool::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); +} + void SerializationBool::serializeTextJSON(const IColumn &column, size_t row_num, WriteBuffer &ostr, const FormatSettings &settings) const { serializeSimple(column, row_num, ostr, settings); @@ -250,6 +279,33 @@ void SerializationBool::deserializeTextJSON(IColumn &column, ReadBuffer &istr, c col->insert(value); } +bool SerializationBool::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + if (istr.eof()) + return false; + + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + bool value = false; + char first_char = *istr.position(); + if (first_char == 't' || first_char == 'f') + { + if (!readBoolTextWord(value, istr)) + return false; + } + else if (first_char == '1' || first_char == '0') + { + /// Doesn't throw. 
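The first-character dispatch used by tryDeserializeTextJSON can be sketched in isolation as follows (illustrative only; a std::string stands in for the read buffer, and the helper name is made up).

#include <optional>
#include <string>

// JSON bools are the literals true/false; ClickHouse additionally accepts the
// digits 1/0. Anything else is rejected by returning nullopt instead of throwing.
std::optional<bool> tryParseJsonBool(const std::string & s)
{
    if (s.empty())
        return std::nullopt;

    char first = s.front();
    if (first == 't' || first == 'f')
    {
        if (s == "true")
            return true;
        if (s == "false")
            return false;
        return std::nullopt;
    }
    if (first == '1' || first == '0')
        return first == '1';   // mirrors the non-throwing single-digit branch

    return std::nullopt;
}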
+ readBoolText(value, istr); + } + else + { + return false; + } + + col->insert(value); + return true; +} + void SerializationBool::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeCustom(column, row_num, ostr, settings); @@ -263,6 +319,14 @@ void SerializationBool::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r'; }); } +bool SerializationBool::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r'; }); +} + void SerializationBool::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeCustom(column, row_num, ostr, settings); @@ -276,15 +340,30 @@ void SerializationBool::deserializeTextRaw(IColumn & column, ReadBuffer & istr, deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } +bool SerializationBool::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); +} + void SerializationBool::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeSimple(column, row_num, ostr, settings); } -void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) { + static constexpr bool throw_exception = std::is_same_v; + if (istr.eof()) - throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but got EOF."); + return ReturnType(false); + } auto * col = checkAndGetDeserializeColumnType(column); @@ -292,11 +371,17 @@ void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & ist switch (symbol) { case 't': - assertStringCaseInsensitive("true", istr); + if constexpr (throw_exception) + assertStringCaseInsensitive("true", istr); + else if (!checkStringCaseInsensitive("true", istr)) + return ReturnType(false); col->insert(true); break; case 'f': - assertStringCaseInsensitive("false", istr); + if constexpr (throw_exception) + assertStringCaseInsensitive("false", istr); + else if (!checkStringCaseInsensitive("false", istr)) + return ReturnType(false); col->insert(false); break; case '1': @@ -307,16 +392,40 @@ void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & ist break; case '\'': ++istr.position(); - deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return !buf.eof() && *buf.position() ==
'\''; }); + assertChar('\'', istr); + } + else + { + if (!deserializeImpl(column, istr, settings, [](ReadBuffer & buf) { return !buf.eof() && *buf.position() == '\''; }) || !checkChar('\'', istr)) + return ReturnType(false); + } break; default: - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot parse boolean value here: '{}', should be true/false, 1/0 or on of " - "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0 in quotes", - String(istr.position(), std::min(10ul, istr.available()))); + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be true/false, 1/0 or one of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0 in quotes", + String(istr.position(), std::min(10ul, istr.available()))); + return ReturnType(false); + } } + + return ReturnType(true); +} + +void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextQuotedImpl(column, istr, settings); +} + +bool SerializationBool::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return deserializeTextQuotedImpl(column, istr, settings); } void SerializationBool::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -327,6 +436,14 @@ void SerializationBool::deserializeWholeText(IColumn & column, ReadBuffer & istr deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof(); }); } +bool SerializationBool::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof(); }); +} + void SerializationBool::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeSimple(column, row_num, ostr, settings); diff --git a/src/DataTypes/Serializations/SerializationBool.h b/src/DataTypes/Serializations/SerializationBool.h index a5aa0ca80a2..3e511b7249e 100644 --- a/src/DataTypes/Serializations/SerializationBool.h +++ b/src/DataTypes/Serializations/SerializationBool.h @@ -15,21 +15,27 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn &
column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; }; diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp index 03564bac64b..abe443cab1b 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp @@ -24,6 +24,12 @@ void deserializeFromString(const SerializationCustomSimpleText & domain, IColumn domain.deserializeText(column, istr, settings, true); } +bool tryDeserializeFromString(const SerializationCustomSimpleText & domain, IColumn & column, const String & s, const FormatSettings & settings) +{ + ReadBufferFromString istr(s); + return domain.tryDeserializeText(column, istr, settings, true); +} + } namespace DB @@ -34,6 +40,19 @@ SerializationCustomSimpleText::SerializationCustomSimpleText(const Serialization { } +bool SerializationCustomSimpleText::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + try + { + deserializeText(column, istr, settings, whole); + return true; + } + catch (...) 
+ { + return false; + } +} + void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String str; @@ -41,6 +60,13 @@ void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadB deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readStringUntilEOF(str, istr); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeEscapedString(serializeToString(*this, column, row_num, settings), ostr); @@ -53,6 +79,13 @@ void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, Rea deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readEscapedString(str, istr); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeQuotedString(serializeToString(*this, column, row_num, settings), ostr); @@ -65,6 +98,14 @@ void SerializationCustomSimpleText::deserializeTextQuoted(IColumn & column, Read deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + if (!tryReadQuotedString(str, istr)) + return false; + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeCSVString(serializeToString(*this, column, row_num, settings), ostr); @@ -77,6 +118,13 @@ void SerializationCustomSimpleText::deserializeTextCSV(IColumn & column, ReadBuf deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readCSVStringInto(str, istr, settings.csv); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings); @@ -89,6 +137,14 @@ void SerializationCustomSimpleText::deserializeTextJSON(IColumn & column, ReadBu deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + if (!tryReadJSONStringInto(str, istr)) + return false; + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeXMLStringForTextElement(serializeToString(*this, column, row_num, settings), ostr); diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.h 
b/src/DataTypes/Serializations/SerializationCustomSimpleText.h index 0c909350002..c80a57e234c 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.h +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.h @@ -22,20 +22,24 @@ public: /// whole = true means that buffer contains only one value, so we should read until EOF. /// It's needed to check if there is garbage after parsed field. virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const = 0; + virtual bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. */ void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization with escaping but without quoting. */ void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization as a literal that may be inserted into a query. */ void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization for the CSV format. */ @@ -44,12 +48,14 @@ public: * (the delimiter is not consumed). */ void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization intended for using in JSON format. * force_quoting_64bit_integers parameter forces to brace UInt64 and Int64 types into quotes. */ void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization for putting into the XML format. 
*/ diff --git a/src/DataTypes/Serializations/SerializationDate.cpp b/src/DataTypes/Serializations/SerializationDate.cpp index 534f599a072..38e1bb87b6d 100644 --- a/src/DataTypes/Serializations/SerializationDate.cpp +++ b/src/DataTypes/Serializations/SerializationDate.cpp @@ -22,6 +22,15 @@ void SerializationDate::deserializeWholeText(IColumn & column, ReadBuffer & istr throwUnexpectedDataAfterParsedValue(column, istr, settings, "Date"); } +bool SerializationDate::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!tryReadDateText(x, istr, time_zone) || !istr.eof()) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { DayNum x; @@ -29,6 +38,15 @@ void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & is assert_cast(column).getData().push_back(x); } +bool SerializationDate::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!tryReadDateText(x, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeText(column, row_num, ostr, settings); @@ -50,6 +68,16 @@ void SerializationDate::deserializeTextQuoted(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. } +bool SerializationDate::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!checkChar('\'', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('\'', istr)) + return false; + + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -66,6 +94,15 @@ void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } +bool SerializationDate::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -80,6 +117,15 @@ void SerializationDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(value); } +bool SerializationDate::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum value; + if (!tryReadCSV(value, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(value); + return true; +} + SerializationDate::SerializationDate(const DateLUTImpl & time_zone_) : time_zone(time_zone_) { } diff --git a/src/DataTypes/Serializations/SerializationDate.h b/src/DataTypes/Serializations/SerializationDate.h index f751b06fba6..dcf79eb49da 100644 --- a/src/DataTypes/Serializations/SerializationDate.h +++ b/src/DataTypes/Serializations/SerializationDate.h @@ -13,14 +13,19 @@ public: void 
serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; protected: const DateLUTImpl & time_zone; diff --git a/src/DataTypes/Serializations/SerializationDate32.cpp b/src/DataTypes/Serializations/SerializationDate32.cpp index 851710de839..70a22d59e42 100644 --- a/src/DataTypes/Serializations/SerializationDate32.cpp +++ b/src/DataTypes/Serializations/SerializationDate32.cpp @@ -21,6 +21,15 @@ void SerializationDate32::deserializeWholeText(IColumn & column, ReadBuffer & is throwUnexpectedDataAfterParsedValue(column, istr, settings, "Date32"); } +bool SerializationDate32::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!tryReadDateText(x, istr, time_zone) || !istr.eof()) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { ExtendedDayNum x; @@ -28,6 +37,15 @@ void SerializationDate32::deserializeTextEscaped(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +bool SerializationDate32::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!tryReadDateText(x, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeText(column, row_num, ostr, settings); @@ -49,6 +67,15 @@ void SerializationDate32::deserializeTextQuoted(IColumn & column, ReadBuffer & i assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
} +bool SerializationDate32::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!checkChar('\'', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('\'', istr)) + return false; + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. + return true; +} + void SerializationDate32::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -65,6 +92,15 @@ void SerializationDate32::deserializeTextJSON(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(x); } +bool SerializationDate32::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -79,6 +115,15 @@ void SerializationDate32::deserializeTextCSV(IColumn & column, ReadBuffer & istr assert_cast(column).getData().push_back(value.getExtenedDayNum()); } +bool SerializationDate32::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + LocalDate value; + if (!tryReadCSV(value, istr)) + return false; + assert_cast(column).getData().push_back(value.getExtenedDayNum()); + return true; +} + SerializationDate32::SerializationDate32(const DateLUTImpl & time_zone_) : time_zone(time_zone_) { } diff --git a/src/DataTypes/Serializations/SerializationDate32.h b/src/DataTypes/Serializations/SerializationDate32.h index 49560fb6c7d..be2e2b76c1d 100644 --- a/src/DataTypes/Serializations/SerializationDate32.h +++ b/src/DataTypes/Serializations/SerializationDate32.h @@ -12,14 +12,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const 
FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; protected: const DateLUTImpl & time_zone; diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp index 77beb0d9b75..17465d85e9d 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -21,15 +21,56 @@ inline void readText(time_t & x, ReadBuffer & istr, const FormatSettings & setti switch (settings.date_time_input_format) { case FormatSettings::DateTimeInputFormat::Basic: - readDateTimeText(x, istr, time_zone); - return; + readDateTimeTextImpl<>(x, istr, time_zone); + break; case FormatSettings::DateTimeInputFormat::BestEffort: parseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); - return; + break; case FormatSettings::DateTimeInputFormat::BestEffortUS: parseDateTimeBestEffortUS(x, istr, time_zone, utc_time_zone); - return; + break; } + + if (x < 0) + x = 0; +} + +inline void readAsIntText(time_t & x, ReadBuffer & istr) +{ + readIntText(x, istr); + if (x < 0) + x = 0; +} + +inline bool tryReadText(time_t & x, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + bool res; + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + res = tryReadDateTimeText(x, istr, time_zone); + break; + case FormatSettings::DateTimeInputFormat::BestEffort: + res = tryParseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); + break; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + res = tryParseDateTimeBestEffortUS(x, istr, time_zone, utc_time_zone); + break; + } + + if (x < 0) + x = 0; + + return res; +} + +inline bool tryReadAsIntText(time_t & x, ReadBuffer & istr) +{ + if (!tryReadIntText(x, istr)) + return false; + if (x < 0) + x = 0; + return true; } } @@ -68,15 +109,32 @@ void SerializationDateTime::deserializeWholeText(IColumn & column, ReadBuffer & throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime"); } +bool SerializationDateTime::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !istr.eof()) + return false; + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { time_t x = 0; readText(x, istr, settings, time_zone, utc_time_zone); - if (x < 0) - x = 0; assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone)) + return false; + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('\'', ostr); @@ -94,15 +152,32 @@ void SerializationDateTime::deserializeTextQuoted(IColumn & column, ReadBuffer & } else /// Just 1504193808 or 01504193808 { - readIntText(x, istr); + readAsIntText(x, istr); } - if (x < 0) - x = 0; /// It's important to do this at the end - for exception safety. 
assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) + return false; + } + else /// Just 1504193808 or 01504193808 + { + if (!tryReadAsIntText(x, istr)) + return false; + } + + /// It's important to do this at the end - for exception safety. + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -120,13 +195,30 @@ void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & i } else { - readIntText(x, istr); + readAsIntText(x, istr); } - if (x < 0) - x = 0; + assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('"', istr)) + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar('"', istr)) + return false; + } + else + { + if (!tryReadIntText(x, istr)) + return false; + } + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -165,13 +257,48 @@ void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & is readCSVString(datetime_str, istr, settings.csv); ReadBufferFromString buf(datetime_str); readText(x, buf, settings, time_zone, utc_time_zone); + if (!buf.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime"); } } - if (x < 0) - x = 0; - assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + + if (istr.eof()) + return false; + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + { + ++istr.position(); + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar(maybe_quote, istr)) + return false; + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone)) + return false; + } + else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + if (!tryReadText(x, buf, settings, time_zone, utc_time_zone) || !buf.eof()) + return false; + } + } + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + } diff --git a/src/DataTypes/Serializations/SerializationDateTime.h b/src/DataTypes/Serializations/SerializationDateTime.h index f4a142483e5..584b0c4116b 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.h +++ b/src/DataTypes/Serializations/SerializationDateTime.h @@ -15,14 +15,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & 
settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; }; } diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index 93891886000..442e29edd52 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -47,6 +47,16 @@ void SerializationDateTime64::deserializeText(IColumn & column, ReadBuffer & ist throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime64"); } +bool SerializationDateTime64::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const +{ + DateTime64 result = 0; + if (!tryReadDateTime64Text(result, scale, istr, time_zone) || (whole && !istr.eof())) + return false; + + assert_cast(column).getData().push_back(result); + return true; +} + void SerializationDateTime64::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { deserializeTextEscaped(column, istr, settings); @@ -75,6 +85,29 @@ static inline void readText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, con } } +static inline bool tryReadText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + return tryReadDateTime64Text(x, scale, istr, time_zone); + case FormatSettings::DateTimeInputFormat::BestEffort: + return tryParseDateTime64BestEffort(x, scale, istr, time_zone, utc_time_zone); + case FormatSettings::DateTimeInputFormat::BestEffortUS: + return tryParseDateTime64BestEffortUS(x, scale, istr, time_zone, utc_time_zone); + } +} + + +bool SerializationDateTime64::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !istr.eof()) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void 
SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { DateTime64 x = 0; @@ -82,6 +115,15 @@ void SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffe assert_cast(column).getData().push_back(x); } +bool SerializationDateTime64::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDateTime64::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('\'', ostr); @@ -104,6 +146,23 @@ void SerializationDateTime64::deserializeTextQuoted(IColumn & column, ReadBuffer assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. } +bool SerializationDateTime64::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) + return false; + } + else /// Just 1504193808 or 01504193808 + { + if (!tryReadIntText(x, istr)) + return false; + } + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. + return true; +} + void SerializationDateTime64::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -126,6 +185,23 @@ void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +bool SerializationDateTime64::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('"', istr)) + { + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('"', istr)) + return false; + } + else + { + if (!tryReadIntText(x, istr)) + return false; + } + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDateTime64::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -170,4 +246,40 @@ void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +bool SerializationDateTime64::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + + if (istr.eof()) + return false; + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + { + ++istr.position(); + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar(maybe_quote, istr)) + return false; + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone)) + return false; + } + else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + if (!tryReadText(x, scale, buf, settings, time_zone, utc_time_zone) || !buf.eof()) + return false; + } + } + + 
assert_cast(column).getData().push_back(x); + return true; +} + } diff --git a/src/DataTypes/Serializations/SerializationDateTime64.h b/src/DataTypes/Serializations/SerializationDateTime64.h index f817edbf0dd..b49bd1e9098 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.h +++ b/src/DataTypes/Serializations/SerializationDateTime64.h @@ -15,15 +15,21 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; }; } diff --git a/src/DataTypes/Serializations/SerializationDecimal.cpp b/src/DataTypes/Serializations/SerializationDecimal.cpp index b576b7a048c..d632c224783 100644 --- a/src/DataTypes/Serializations/SerializationDecimal.cpp +++ b/src/DataTypes/Serializations/SerializationDecimal.cpp @@ -16,11 +16,19 @@ namespace ErrorCodes } template -bool SerializationDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale) +bool SerializationDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale, bool csv) { UInt32 unread_scale = scale; - if (!tryReadDecimalText(istr, x, precision, unread_scale)) - return false; + if (csv) + { + if (!tryReadCSVDecimalText(istr, x, precision, unread_scale)) + return false; + } + else + { + if (!tryReadDecimalText(istr, x, precision, unread_scale)) + return false; + } if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) return false; @@ -59,6 +67,16 @@ void SerializationDecimal::deserializeText(IColumn & column, ReadBuffer & ist ISerialization::throwUnexpectedDataAfterParsedValue(column, istr, settings, "Decimal"); } +template +bool SerializationDecimal::tryDeserializeText(IColumn & column, ReadBuffer & istr, 
const FormatSettings &, bool whole) const +{ + T x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { @@ -67,6 +85,16 @@ void SerializationDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +template +bool SerializationDecimal::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + if (!tryReadText(x, istr, true)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationDecimal::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -88,6 +116,18 @@ void SerializationDecimal::deserializeTextJSON(IColumn & column, ReadBuffer & assertChar('"', istr); } +template +bool SerializationDecimal::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + bool have_quotes = checkChar('"', istr); + T x; + if (!tryReadText(x, istr) || (have_quotes && !checkChar('"', istr))) + return false; + + assert_cast(column).getData().push_back(x); + return true; +} + template class SerializationDecimal; template class SerializationDecimal; diff --git a/src/DataTypes/Serializations/SerializationDecimal.h b/src/DataTypes/Serializations/SerializationDecimal.h index 57decdd0973..22a8eb1a47c 100644 --- a/src/DataTypes/Serializations/SerializationDecimal.h +++ b/src/DataTypes/Serializations/SerializationDecimal.h @@ -16,15 +16,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void readText(T & x, ReadBuffer & istr, bool csv = false) const { readText(x, istr, this->precision, this->scale, csv); } + bool tryReadText(T & x, ReadBuffer & istr, bool csv = false) const { return tryReadText(x, istr, this->precision, this->scale, csv); } static void readText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); - static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_); + static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); }; } diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp index 9b3a437e9cf..14b1a33e2ce 100644 --- a/src/DataTypes/Serializations/SerializationEnum.cpp +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -34,6 +34,27 @@ void SerializationEnum::deserializeTextEscaped(IColumn & column, ReadBuffe } } +template +bool 
SerializationEnum::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + if (settings.tsv.enum_as_number) + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readEscapedString(field_name, istr); + if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -48,6 +69,20 @@ void SerializationEnum::deserializeTextQuoted(IColumn & column, ReadBuffer assert_cast(column).getData().push_back(ref_enum_values.getValue(StringRef(field_name))); } +template +bool SerializationEnum::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + std::string field_name; + if (!tryReadQuotedStringWithSQLStyle(field_name, istr)) + return false; + + FieldType x; + if (!ref_enum_values.tryGetValue(x, StringRef(field_name))) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { @@ -65,6 +100,27 @@ void SerializationEnum::deserializeWholeText(IColumn & column, ReadBuffer } } +template +bool SerializationEnum::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + if (settings.tsv.enum_as_number) + { + if (!tryReadValue(istr, x) || !istr.eof()) + return false; + } + else + { + std::string field_name; + readStringUntilEOF(field_name, istr); + if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -90,6 +146,27 @@ void SerializationEnum::deserializeTextJSON(IColumn & column, ReadBuffer & } } +template +bool SerializationEnum::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + FieldType x; + if (!istr.eof() && *istr.position() != '"') + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readJSONString(field_name, istr); + if (!ref_enum_values.tryGetValue(x, StringRef(field_name))) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -109,6 +186,28 @@ void SerializationEnum::deserializeTextCSV(IColumn & column, ReadBuffer & } } +template +bool SerializationEnum::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + + if (settings.csv.enum_as_number) + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readCSVString(field_name, istr, settings.csv); + if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextMarkdown( const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git 
a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h index 03b134e59a6..bb720ee9b1f 100644 --- a/src/DataTypes/Serializations/SerializationEnum.h +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -34,15 +34,20 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -53,6 +58,11 @@ public: return ref_enum_values.findByValue(x)->first; } + bool tryReadValue(ReadBuffer & istr, FieldType & x) const + { + return tryReadText(x, istr) && ref_enum_values.hasValue(x); + } + std::optional> own_enum_values; std::shared_ptr> own_enum_type; const EnumValues & ref_enum_values; diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp index fa50af52f2f..23e959d80c9 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.cpp +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -150,12 +150,49 @@ static inline void read(const SerializationFixedString & self, IColumn & column, } } +bool SerializationFixedString::tryAlignStringLength(size_t n, PaddedPODArray & data, size_t string_start) +{ + size_t length = data.size() - string_start; + if (length < n) + { + data.resize_fill(string_start + n); + } + else if (length > n) + { + data.resize_assume_reserved(string_start); + return false; + } + + return true; +} + +template +static inline bool tryRead(const SerializationFixedString & self, IColumn & column, Reader && reader) +{ + ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + size_t prev_size = data.size(); + try + { + return reader(data) && 
SerializationFixedString::tryAlignStringLength(self.getN(), data, prev_size); + } + catch (...) + { + data.resize_assume_reserved(prev_size); + return false; + } +} + void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); return true; }); +} + void SerializationFixedString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -169,12 +206,22 @@ void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffe read(*this, column, [&istr](ColumnFixedString::Chars & data) { readQuotedStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { return tryReadQuotedStringInto(data, istr); }); +} + void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); return true; }); +} + void SerializationFixedString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -188,6 +235,10 @@ void SerializationFixedString::deserializeTextJSON(IColumn & column, ReadBuffer read(*this, column, [&istr](ColumnFixedString::Chars & data) { readJSONStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { return tryReadJSONStringInto(data, istr); }); +} void SerializationFixedString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -208,6 +259,11 @@ void SerializationFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & read(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); }); } +bool SerializationFixedString::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryRead(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); return true; }); +} + void SerializationFixedString::serializeTextMarkdown( const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const { diff --git a/src/DataTypes/Serializations/SerializationFixedString.h b/src/DataTypes/Serializations/SerializationFixedString.h index c27b10ad158..8eb4eacdbff 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.h +++ b/src/DataTypes/Serializations/SerializationFixedString.h @@ -26,20 +26,25 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer 
& ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -47,6 +52,7 @@ public: /// If the length is less than getN() the function will add zero characters up to getN(). /// If the length is greater than getN() the function will throw an exception. 
static void alignStringLength(size_t n, PaddedPODArray & data, size_t string_start); + static bool tryAlignStringLength(size_t n, PaddedPODArray & data, size_t string_start); }; } diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp new file mode 100644 index 00000000000..dfcd24aff58 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp @@ -0,0 +1,187 @@ +#include + +namespace DB +{ + +template +void SerializationIP::serializeText(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings &) const +{ + writeText(assert_cast &>(column).getData()[row_num], ostr); +} + +template +void SerializationIP::deserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + IPv x; + readText(x, istr); + + assert_cast &>(column).getData().push_back(x); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); +} + +template +bool SerializationIP::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &, bool whole) const +{ + IPv x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextQuoted(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +template +void SerializationIP::deserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + assertChar('\'', istr); + readText(x, istr); + assertChar('\'', istr); + assert_cast &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. +} + +template +bool SerializationIP::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if (!checkChar('\'', istr) || !tryReadText(x, istr) || !checkChar('\'', istr)) + return false; + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextJSON(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +template +void SerializationIP::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + IPv x; + assertChar('"', istr); + readText(x, istr); + /// this code looks weird, but we want to throw specific exception to match original behavior... 
+ if (istr.eof()) + assertChar('"', istr); + assert_cast &>(column).getData().push_back(x); + if (*istr.position() != '"') + throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); + istr.ignore(); +} + +template +bool SerializationIP::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) + return false; + + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextCSV(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +template +void SerializationIP::deserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv value; + readCSV(value, istr); + + assert_cast &>(column).getData().push_back(value); +} + +template +bool SerializationIP::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv value; + if (!tryReadCSV(value, istr)) + return false; + + assert_cast &>(column).getData().push_back(value); + return true; +} + +template +void SerializationIP::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const +{ + IPv x = field.get(); + if constexpr (std::is_same_v) + writeBinary(x, ostr); + else + writeBinaryLittleEndian(x, ostr); +} + +template +void SerializationIP::deserializeBinary(DB::Field & field, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if constexpr (std::is_same_v) + readBinary(x, istr); + else + readBinaryLittleEndian(x, istr); + field = NearestFieldType(x); +} + +template +void SerializationIP::serializeBinary(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings &) const +{ + writeBinary(assert_cast &>(column).getData()[row_num], ostr); +} + +template +void SerializationIP::deserializeBinary(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + readBinary(x.toUnderType(), istr); + assert_cast &>(column).getData().push_back(x); +} + +template +void SerializationIP::serializeBinaryBulk(const DB::IColumn & column, DB::WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + + size_t size = x.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit) + ostr.write(reinterpret_cast(&x[offset]), sizeof(IPv) * limit); +} + +template +void SerializationIP::deserializeBinaryBulk(DB::IColumn & column, DB::ReadBuffer & istr, size_t limit, double) const +{ + typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + size_t initial_size = x.size(); + x.resize(initial_size + limit); + size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(IPv) * limit); + x.resize(initial_size + size / sizeof(IPv)); +} + +template class SerializationIP; +template class SerializationIP; + +} diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h index 7d8669fd444..a53f257646b 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h @@ -13,123 +13,30 @@ template class SerializationIP : public SimpleTextSerialization { 
public: - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override - { - writeText(assert_cast &>(column).getData()[row_num], ostr); - } - void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override - { - IPv x; - readText(x, istr); + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; - if (whole && !istr.eof()) - throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - assert_cast &>(column).getData().push_back(x); - } - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - serializeText(column, row_num, ostr, settings); - } - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - deserializeText(column, istr, settings, false); - } - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - writeChar('\'', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('\'', ostr); - } - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override - { - IPv x; - assertChar('\'', istr); - readText(x, istr); - assertChar('\'', istr); - assert_cast &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. - } - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); - } - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - IPv x; - assertChar('"', istr); - readText(x, istr); - /// this code looks weird, but we want to throw specific exception to match original behavior... 
- if (istr.eof()) - assertChar('"', istr); - if (*istr.position() != '"') - throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); - istr.ignore(); + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - assert_cast &>(column).getData().push_back(x); - } - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - writeChar('"', ostr); - serializeText(column, row_num, ostr, settings); - writeChar('"', ostr); - } - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override - { - IPv value; - readCSV(value, istr); + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override; - assert_cast &>(column).getData().push_back(value); - } + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; - void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override - { - IPv x = field.get(); - if constexpr (std::is_same_v) - writeBinary(x, ostr); - else - writeBinaryLittleEndian(x, ostr); - } - void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override - { - IPv x; - if constexpr (std::is_same_v) - readBinary(x, istr); - else - readBinaryLittleEndian(x, istr); - field = NearestFieldType(x); - } - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override - { - writeBinary(assert_cast &>(column).getData()[row_num], ostr); - } - void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override - { - IPv x; - readBinary(x.toUnderType(), istr); - assert_cast &>(column).getData().push_back(x); - } - void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override - { - const typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - size_t size = x.size(); - - if (limit == 0 || offset + limit > size) - limit = size - offset; - - if (limit) - ostr.write(reinterpret_cast(&x[offset]), sizeof(IPv) * limit); - } - void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const override - { - typename ColumnVector::Container & x = typeid_cast &>(column).getData(); - size_t initial_size = x.size(); - x.resize(initial_size + limit); - size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(IPv) * limit); - x.resize(initial_size + size / sizeof(IPv)); - } + void serializeBinaryBulk(const IColumn & column, 
WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const override; }; using SerializationIPv4 = SerializationIP; diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 3e1cbdb00f5..9efe05042ed 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -700,6 +700,11 @@ void SerializationLowCardinality::deserializeTextEscaped(IColumn & column, ReadB deserializeImpl(column, &ISerialization::deserializeTextEscaped, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextEscaped, istr, settings); +} + void SerializationLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeImpl(column, row_num, &ISerialization::serializeTextQuoted, ostr, settings); @@ -710,11 +715,21 @@ void SerializationLowCardinality::deserializeTextQuoted(IColumn & column, ReadBu deserializeImpl(column, &ISerialization::deserializeTextQuoted, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextQuoted, istr, settings); +} + void SerializationLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { deserializeImpl(column, &ISerialization::deserializeWholeText, istr, settings); } +bool SerializationLowCardinality::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeWholeText, istr, settings); +} + void SerializationLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeImpl(column, row_num, &ISerialization::serializeTextCSV, ostr, settings); @@ -725,6 +740,11 @@ void SerializationLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffe deserializeImpl(column, &ISerialization::deserializeTextCSV, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextCSV, istr, settings); +} + void SerializationLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeImpl(column, row_num, &ISerialization::serializeText, ostr, settings); @@ -740,6 +760,11 @@ void SerializationLowCardinality::deserializeTextJSON(IColumn & column, ReadBuff deserializeImpl(column, &ISerialization::deserializeTextJSON, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextJSON, istr, settings); +} + void SerializationLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { 
serializeImpl(column, row_num, &ISerialization::serializeTextXML, ostr, settings); @@ -750,6 +775,11 @@ void SerializationLowCardinality::deserializeTextRaw(IColumn & column, ReadBuffe deserializeImpl(column, &ISerialization::deserializeTextRaw, istr, settings); } +bool SerializationLowCardinality::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextRaw, istr, settings); +} + void SerializationLowCardinality::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeImpl(column, row_num, &ISerialization::serializeTextRaw, ostr, settings); @@ -769,7 +799,7 @@ template void SerializationLowCardinality::deserializeImpl( IColumn & column, SerializationLowCardinality::DeserializeFunctionPtr func, Args &&... args) const { - auto & low_cardinality_column= getColumnLowCardinality(column); + auto & low_cardinality_column = getColumnLowCardinality(column); auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); auto serialization = dictionary_type->getDefaultSerialization(); @@ -778,4 +808,19 @@ void SerializationLowCardinality::deserializeImpl( low_cardinality_column.insertFromFullColumn(*temp_column, 0); } +template +bool SerializationLowCardinality::tryDeserializeImpl( + IColumn & column, SerializationLowCardinality::TryDeserializeFunctionPtr func, Args &&... args) const +{ + auto & low_cardinality_column = getColumnLowCardinality(column); + auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + + auto serialization = dictionary_type->getDefaultSerialization(); + if (!(serialization.get()->*func)(*temp_column, std::forward(args)...)) + return false; + + low_cardinality_column.insertFromFullColumn(*temp_column, 0); + return true; +} + } diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index 5f56bcf8108..d2c3a95c702 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -55,16 +55,22 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const 
override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; private: @@ -79,6 +85,12 @@ private: template void deserializeImpl(IColumn & column, DeserializeFunctionPtr func, Args &&... args) const; + + template + using TryDeserializeFunctionPtr = bool (ISerialization::*)(IColumn &, Params ...) const; + + template + bool tryDeserializeImpl(IColumn & column, TryDeserializeFunctionPtr func, Args &&... args) const; }; } diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 7588e630689..7b6f87baf2e 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -115,9 +115,11 @@ void SerializationMap::serializeTextImpl( writeChar('}', ostr); } -template -void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const +template +ReturnType SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const { + static constexpr bool throw_exception = std::is_same_v; + auto & column_map = assert_cast(column); auto & nested_array = column_map.getNestedColumn(); @@ -128,7 +130,21 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, auto & value_column = nested_tuple.getColumn(1); size_t size = 0; - assertChar('{', istr); + if constexpr (throw_exception) + assertChar('{', istr); + else if (!checkChar('{', istr)) + return ReturnType(false); + + auto on_error_no_throw = [&]() + { + if (size) + { + nested_tuple.getColumnPtr(0) = key_column.cut(0, offsets.back()); + nested_tuple.getColumnPtr(1) = value_column.cut(0, offsets.back()); + } + + return ReturnType(false); + }; try { @@ -138,9 +154,15 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, if (!first) { if (*istr.position() == ',') + { ++istr.position(); + } else - throw Exception(ErrorCodes::CANNOT_READ_MAP_FROM_TEXT, "Cannot read Map from text"); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_READ_MAP_FROM_TEXT, "Cannot read Map from text"); + return on_error_no_throw(); + } } first = false; @@ -150,19 +172,32 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, if (*istr.position() == '}') break; - reader(istr, key, key_column); + if constexpr (throw_exception) + reader(istr, key, key_column); + else if (!reader(istr, key, key_column)) + return on_error_no_throw(); + ++size; skipWhitespaceIfAny(istr); - assertChar(':', 
istr); + if constexpr (throw_exception) + assertChar(':', istr); + else if (!checkChar(':', istr)) + return on_error_no_throw(); skipWhitespaceIfAny(istr); - reader(istr, value, value_column); + if constexpr (throw_exception) + reader(istr, value, value_column); + else if (!reader(istr, value, value_column)) + return on_error_no_throw(); skipWhitespaceIfAny(istr); } - assertChar('}', istr); + if constexpr (throw_exception) + assertChar('}', istr); + else if (!checkChar('}', istr)) + return on_error_no_throw(); } catch (...) { @@ -171,10 +206,14 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, nested_tuple.getColumnPtr(0) = key_column.cut(0, offsets.back()); nested_tuple.getColumnPtr(1) = value_column.cut(0, offsets.back()); } - throw; + + if constexpr (throw_exception) + throw; + return ReturnType(false); } offsets.push_back(offsets.back() + size); + return ReturnType(true); } void SerializationMap::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -192,8 +231,8 @@ void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, cons deserializeTextImpl(column, istr, [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) { - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(subcolumn, buf, settings, subcolumn_serialization); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(subcolumn, buf, settings, subcolumn_serialization); else subcolumn_serialization->deserializeTextQuoted(subcolumn, buf, settings); }); @@ -202,6 +241,28 @@ void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, cons throwUnexpectedDataAfterParsedValue(column, istr, settings, "Map"); } +bool SerializationMap::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + auto reader = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(subcolumn, buf, settings, subcolumn_serialization); + return subcolumn_serialization->tryDeserializeTextQuoted(subcolumn, buf, settings); + }; + + auto ok = deserializeTextImpl(column, istr, reader); + if (!ok) + return false; + + if (whole && !istr.eof()) + { + column.popBack(1); + return false; + } + + return true; +} + void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeTextImpl(column, row_num, ostr, @@ -260,13 +321,25 @@ void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, deserializeTextImpl(column, istr, [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) { - if (settings.null_as_default) - SerializationNullable::deserializeTextJSONImpl(subcolumn, buf, settings, subcolumn_serialization); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); else subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings); }); } +bool SerializationMap::tryDeserializeTextJSON(IColumn & column, 
ReadBuffer & istr, const FormatSettings & settings) const +{ + auto reader = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization); + return subcolumn_serialization->tryDeserializeTextJSON(subcolumn, buf, settings); + }; + + return deserializeTextImpl(column, istr, reader); +} + void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { const auto & column_map = assert_cast(column); @@ -308,6 +381,15 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c deserializeText(column, rb, settings, true); } +bool SerializationMap::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + if (!tryReadCSV(s, istr, settings.csv)) + return false; + ReadBufferFromString rb(s); + return tryDeserializeText(column, rb, settings, true); +} + void SerializationMap::enumerateStreams( EnumerateStreamsSettings & settings, const StreamCallback & callback, diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index f32c656757d..3e27ef1b04a 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -24,13 +24,16 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void enumerateStreams( EnumerateStreamsSettings & settings, @@ -68,8 +71,8 @@ private: template void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const; - template - void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; + template + ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; }; } diff --git a/src/DataTypes/Serializations/SerializationNothing.h b/src/DataTypes/Serializations/SerializationNothing.h index 02974d1ca76..7d1fff55b01 100644 
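(A note on the SerializationMap changes above: when the non-throwing parse fails part-way through a row, the nested key and value columns are cut back to the last committed offset, so no half-parsed entries remain in the column. Below is a simplified sketch of that rollback bookkeeping, with plain vectors standing in for ClickHouse columns; the struct, function names and the toy "k=v,k=v" syntax are all invented for the example.)

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

/// Toy analogue of a Map column: flat key/value arrays plus per-row offsets.
struct MapColumn
{
    std::vector<std::string> keys;
    std::vector<int> values;
    std::vector<std::size_t> offsets = {0};   // offsets.back() = end of the last committed row
};

/// Parse "k1=1,k2=2" into one map row. On any error, roll back to the last
/// committed offset so the column never keeps a half-parsed row; return false
/// instead of throwing, in the spirit of the try* methods above.
bool tryParseMapRow(MapColumn & col, const std::string & text)
{
    const std::size_t committed = col.offsets.back();
    std::size_t added = 0;

    std::istringstream in(text);
    std::string item;
    while (std::getline(in, item, ','))
    {
        auto eq = item.find('=');
        if (eq == std::string::npos)
        {
            col.keys.resize(committed);     // rollback: drop everything appended for this row
            col.values.resize(committed);
            return false;
        }
        col.keys.push_back(item.substr(0, eq));
        try
        {
            col.values.push_back(std::stoi(item.substr(eq + 1)));
        }
        catch (...)
        {
            col.keys.resize(committed);
            col.values.resize(committed);
            return false;
        }
        ++added;
    }

    col.offsets.push_back(committed + added);
    return true;
}

int main()
{
    MapColumn col;
    std::cout << std::boolalpha << tryParseMapRow(col, "a=1,b=2") << '\n';  // true
    std::cout << tryParseMapRow(col, "a=1,broken") << '\n';                 // false, column unchanged
    std::cout << col.keys.size() << ' ' << col.offsets.back() << '\n';      // 2 2
}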
--- a/src/DataTypes/Serializations/SerializationNothing.h
+++ b/src/DataTypes/Serializations/SerializationNothing.h
@@ -25,6 +25,7 @@ public:
     void deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
     void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
     void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }
+    bool tryDeserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }

     /// These methods read and write zero bytes just to allow to figure out size of column.
     void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp
index 4b0ad0b54ba..4d31451f92d 100644
--- a/src/DataTypes/Serializations/SerializationNullable.cpp
+++ b/src/DataTypes/Serializations/SerializationNullable.cpp
@@ -189,55 +189,59 @@ void SerializationNullable::serializeBinary(const IColumn & column, size_t row_n
     nested->serializeBinary(col.getNestedColumn(), row_num, ostr, settings);
 }

-/// Deserialize value into ColumnNullable.
-/// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all.
-template <typename ReturnType, typename CheckForNull, typename DeserializeNested>
-requires std::same_as<ReturnType, void>
-static ReturnType
-safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
+template <typename ReturnType>
+ReturnType safeAppendToNullMap(ColumnNullable & column, bool is_null)
 {
-    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
-
-    if (check_for_null())
+    try
     {
-        col.insertDefault();
+        column.getNullMapData().push_back(is_null);
     }
-    else
+    catch (...)
     {
-        deserialize_nested(col.getNestedColumn());
-
-        try
-        {
-            col.getNullMapData().push_back(0);
-        }
-        catch (...)
-        {
-            col.getNestedColumn().popBack(1);
+        column.getNestedColumn().popBack(1);
+        if constexpr (std::is_same_v<ReturnType, void>)
             throw;
-        }
+        return ReturnType(false);
     }
+
+    return ReturnType(true);
 }

-/// Deserialize value into non-nullable column. In case of NULL, insert default value and return false.
+/// Deserialize value into non-nullable column. In case of NULL, insert default and set is_null to true.
+/// If ReturnType is bool, return true if parsing was successful and false in case of any error.
template -requires std::same_as -static ReturnType -safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested) +static ReturnType deserializeImpl(IColumn & column, ReadBuffer & buf, CheckForNull && check_for_null, DeserializeNested && deserialize_nested, bool & is_null) { - bool insert_default = check_for_null(); - if (insert_default) + is_null = check_for_null(buf); + if (is_null) + { column.insertDefault(); + } else - deserialize_nested(column); - return !insert_default; + { + if constexpr (std::is_same_v) + deserialize_nested(column, buf); + else if (!deserialize_nested(column, buf)) + return ReturnType(false); + } + + return ReturnType(true); } void SerializationNullable::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - safeDeserialize(column, *nested, - [&istr] { bool is_null = false; readBinary(is_null, istr); return is_null; }, - [this, &istr, settings] (IColumn & nested_column) { nested->deserializeBinary(nested_column, istr, settings); }); + ColumnNullable & col = assert_cast(column); + bool is_null; + auto check_for_null = [](ReadBuffer & buf) + { + bool is_null_ = false; + readBinary(is_null_, buf); + return is_null_; + }; + auto deserialize_nested = [this, &settings] (IColumn & nested_column, ReadBuffer & buf) { nested->deserializeBinary(nested_column, buf, settings); }; + deserializeImpl(col.getNestedColumn(), istr, check_for_null, deserialize_nested, is_null); + safeAppendToNullMap(col, is_null); } @@ -246,20 +250,19 @@ void SerializationNullable::serializeTextEscaped(const IColumn & column, size_t const ColumnNullable & col = assert_cast(column); if (col.isNullAt(row_num)) - writeString(settings.tsv.null_representation, ostr); + serializeNullEscaped(ostr, settings); else nested->serializeTextEscaped(col.getNestedColumn(), row_num, ostr, settings); } - -void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::serializeNullEscaped(DB::WriteBuffer & ostr, const DB::FormatSettings & settings) { - deserializeTextEscapedImpl(column, istr, settings, nested); + writeString(settings.tsv.null_representation, ostr); } -void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool SerializationNullable::tryDeserializeNullEscaped(DB::ReadBuffer & istr, const DB::FormatSettings & settings) { - deserializeTextRawImpl(column, istr, settings, nested); + return checkString(settings.tsv.null_representation, istr); } void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -267,72 +270,73 @@ void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_ const ColumnNullable & col = assert_cast(column); if (col.isNullAt(row_num)) - writeString(settings.tsv.null_representation, ostr); + serializeNullRaw(ostr, settings); else nested->serializeTextRaw(col.getNestedColumn(), row_num, ostr, settings); } -template -ReturnType SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) +void SerializationNullable::serializeNullRaw(DB::WriteBuffer & ostr, const DB::FormatSettings & settings) { - return deserializeTextEscapedAndRawImpl(column, istr, settings, nested); + writeString(settings.tsv.null_representation, ostr); } 
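(The pattern used throughout this file is worth spelling out: a single templated implementation serves both the throwing deserialize* entry point and the non-throwing tryDeserialize* one, selected by a ReturnType parameter and if constexpr. Below is a minimal, self-contained sketch of that dispatch with an invented integer parser, not ClickHouse's serializations; all names are illustrative.)

#include <charconv>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <type_traits>
#include <vector>

/// One templated implementation serves both entry points:
///   ReturnType == void -> report errors by throwing (deserialize-style),
///   ReturnType == bool -> report errors by returning false (tryDeserialize-style).
template <typename ReturnType>
ReturnType deserializeIntImpl(std::vector<int> & out, std::string_view text)
{
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    int value = 0;
    auto [ptr, ec] = std::from_chars(text.data(), text.data() + text.size(), value);
    if (ec != std::errc() || ptr != text.data() + text.size())
    {
        if constexpr (throw_exception)
            throw std::runtime_error("Cannot parse integer from '" + std::string(text) + "'");
        else
            return false;
    }

    out.push_back(value);
    return ReturnType(true);   /// valid for void too: void(true) is just a cast to void
}

void deserializeInt(std::vector<int> & out, std::string_view text)
{
    deserializeIntImpl<void>(out, text);
}

bool tryDeserializeInt(std::vector<int> & out, std::string_view text)
{
    return deserializeIntImpl<bool>(out, text);
}

int main()
{
    std::vector<int> column;
    std::cout << std::boolalpha << tryDeserializeInt(column, "42") << '\n';   // true
    std::cout << tryDeserializeInt(column, "oops") << '\n';                   // false, no exception
    deserializeInt(column, "7");                                              // appends 7, would throw on bad input
    std::cout << column.size() << '\n';                                       // 2
}

Keeping the error-reporting policy in a template parameter lets the parsing logic exist exactly once, which appears to be the same motivation behind routing both deserializeTextEscaped and tryDeserializeTextEscaped through the shared escaped/raw implementation here.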
-template -ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) +bool SerializationNullable::tryDeserializeNullRaw(DB::ReadBuffer & istr, const DB::FormatSettings & settings) { - return deserializeTextEscapedAndRawImpl(column, istr, settings, nested); + return checkString(settings.tsv.null_representation, istr); } template -ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested_serialization) +ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null) { + static constexpr bool throw_exception = std::is_same_v; + const String & null_representation = settings.tsv.null_representation; + auto deserialize_nested = [&nested_serialization, &settings] (IColumn & nested_column, ReadBuffer & buf_) + { + if constexpr (throw_exception) + { + if constexpr (escaped) + nested_serialization->deserializeTextEscaped(nested_column, buf_, settings); + else + nested_serialization->deserializeTextRaw(nested_column, buf_, settings); + } + else + { + if constexpr (escaped) + return nested_serialization->tryDeserializeTextEscaped(nested_column, buf_, settings); + else + return nested_serialization->tryDeserializeTextRaw(nested_column, buf_, settings); + } + }; /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok. if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. - return safeDeserialize(column, *nested_serialization, - [] { return false; }, - [&nested_serialization, &istr, &settings] (IColumn & nested_column) - { - if constexpr (escaped) - nested_serialization->deserializeTextEscaped(nested_column, istr, settings); - else - nested_serialization->deserializeTextRaw(nested_column, istr, settings); - }); + return deserializeImpl(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null); } /// Check if we have enough data in buffer to check if it's a null. if (istr.available() > null_representation.size()) { - auto check_for_null = [&istr, &null_representation]() + auto check_for_null = [&null_representation](ReadBuffer & buf) { - auto * pos = istr.position(); - if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n')) + auto * pos = buf.position(); + if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n')) return true; - istr.position() = pos; + buf.position() = pos; return false; }; - auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) - { - if constexpr (escaped) - nested_serialization->deserializeTextEscaped(nested_column, istr, settings); - else - nested_serialization->deserializeTextRaw(nested_column, istr, settings); - }; - return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } /// We don't have enough data in buffer to check if it's a null. /// Use PeekableReadBuffer to make a checkpoint before checking null /// representation and rollback if check was failed. 
- PeekableReadBuffer buf(istr, true); - auto check_for_null = [&buf, &null_representation]() + PeekableReadBuffer peekable_buf(istr, true); + auto check_for_null = [&null_representation](ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n')) @@ -342,16 +346,18 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col return false; }; - auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + auto deserialize_nested_with_check = [&deserialize_nested, &nested_serialization, &settings, &null_representation, &istr] (IColumn & nested_column, ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); auto * pos = buf.position(); - if constexpr (escaped) - nested_serialization->deserializeTextEscaped(nested_column, buf, settings); - else - nested_serialization->deserializeTextRaw(nested_column, buf, settings); + if constexpr (throw_exception) + deserialize_nested(nested_column, buf); + else if (!deserialize_nested(nested_column, buf)) + return ReturnType(false); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. if (likely(!buf.hasUnreadData())) - return; + return ReturnType(true); /// We have some unread data in PeekableReadBuffer own memory. /// It can happen only if there is a string instead of a number @@ -360,6 +366,9 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col /// We also should delete incorrectly deserialized value from nested column. nested_column.popBack(1); + if constexpr (!throw_exception) + return ReturnType(false); + if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " "containing '\\t' or '\\n' may not work correctly for large input."); @@ -377,7 +386,63 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col istr.count(), std::string(pos, buf.position() - pos), parsed_value.str()); }; - return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); + return deserializeImpl(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null); +} + +void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextEscapedAndRawImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextEscapedAndRawImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextEscapedAndRawImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const 
FormatSettings & settings, const SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextEscapedAndRawImpl(nested_column, istr, settings, nested_serialization, is_null); +} + +void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextEscapedAndRawImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextEscapedAndRawImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextEscapedAndRawImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextEscapedAndRawImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -385,45 +450,51 @@ void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t r const ColumnNullable & col = assert_cast(column); if (col.isNullAt(row_num)) - writeCString("NULL", ostr); + serializeNullQuoted(ostr); else nested->serializeTextQuoted(col.getNestedColumn(), row_num, ostr, settings); } - -void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::serializeNullQuoted(DB::WriteBuffer & ostr) { - deserializeTextQuotedImpl(column, istr, settings, nested); + writeCString("NULL", ostr); +} + +bool SerializationNullable::tryDeserializeNullQuoted(DB::ReadBuffer & istr) +{ + return checkStringCaseInsensitive("NULL", istr); } template -ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) +ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { + static constexpr bool throw_exception = std::is_same_v; + + auto deserialize_nested = [&nested, &settings] (IColumn & nested_column, ReadBuffer & buf) + { + if constexpr (!throw_exception) + return nested->tryDeserializeTextQuoted(nested_column, buf, settings); + nested->deserializeTextQuoted(nested_column, buf, settings); + }; + if (istr.eof() || (*istr.position() != 'N' && *istr.position() != 'n')) { /// This is not null, surely. - return safeDeserialize(column, *nested, - [] { return false; }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextQuoted(nested_column, istr, settings); }); + return deserializeImpl(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null); } /// Check if we have enough data in buffer to check if it's a null. 
if (istr.available() >= 4) { - auto check_for_null = [&istr]() + auto check_for_null = [](ReadBuffer & buf) { - auto * pos = istr.position(); - if (checkStringCaseInsensitive("NULL", istr)) + auto * pos = buf.position(); + if (checkStringCaseInsensitive("NULL", buf)) return true; - istr.position() = pos; + buf.position() = pos; return false; }; - auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) - { - nested->deserializeTextQuoted(nested_column, istr, settings); - }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } /// We don't have enough data in buffer to check if it's a NULL @@ -431,9 +502,10 @@ ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, Re /// to differentiate for example NULL and NaN for float) /// Use PeekableReadBuffer to make a checkpoint before checking /// null and rollback if the check was failed. - PeekableReadBuffer buf(istr, true); - auto check_for_null = [&buf]() + PeekableReadBuffer peekable_buf(istr, true); + auto check_for_null = [](ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); if (checkStringCaseInsensitive("NULL", buf)) @@ -443,39 +515,74 @@ ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, Re return false; }; - auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column) + auto deserialize_nested_with_check = [&deserialize_nested] (IColumn & nested_column, ReadBuffer & buf_) { - nested->deserializeTextQuoted(nested_column, buf, settings); + auto & buf = assert_cast(buf_); + + if constexpr (throw_exception) + deserialize_nested(nested_column, buf); + else if (!deserialize_nested(nested_column, buf)) + return false; + /// Check that we don't have any unread data in PeekableReadBuffer own memory. if (likely(!buf.hasUnreadData())) - return; + return ReturnType(true); /// We have some unread data in PeekableReadBuffer own memory. /// It can happen only if there is an unquoted string instead of a number. /// We also should delete incorrectly deserialized value from nested column. 
nested_column.popBack(1); + + if constexpr (!throw_exception) + return ReturnType(false); + throw DB::Exception( ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing Nullable: got an unquoted string {} instead of a number", String(buf.position(), std::min(10ul, buf.available()))); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return deserializeImpl(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null); } -void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - deserializeWholeTextImpl(column, istr, settings, nested); + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextQuotedImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextQuotedImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextQuotedImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextQuotedImpl(nested_column, istr, settings, nested_serialization, is_null); } template -ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) +ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { - PeekableReadBuffer buf(istr, true); - auto check_for_null = [&buf]() + static constexpr bool throw_exception = std::is_same_v; + + PeekableReadBuffer peekable_buf(istr, true); + auto check_for_null = [](ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); @@ -490,15 +597,46 @@ ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, Rea return false; }; - auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column) + auto deserialize_nested = [&nested, &settings] (IColumn & nested_column, ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); + if constexpr (!throw_exception) + return nested->tryDeserializeWholeText(nested_column, buf, settings); + nested->deserializeWholeText(nested_column, buf, settings); assert(!buf.hasUnreadData()); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return deserializeImpl(column, peekable_buf, check_for_null, deserialize_nested, is_null); } +void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + 
deserializeWholeTextImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeWholeTextImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedWholeText(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeWholeTextImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedWholeText(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeWholeTextImpl(nested_column, istr, settings, nested_serialization, is_null); +} void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -510,48 +648,56 @@ void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_ nested->serializeTextCSV(col.getNestedColumn(), row_num, ostr, settings); } -void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::serializeNullCSV(DB::WriteBuffer & ostr, const DB::FormatSettings & settings) { - deserializeTextCSVImpl(column, istr, settings, nested); + writeString(settings.csv.null_representation, ostr); +} + +bool SerializationNullable::tryDeserializeNullCSV(DB::ReadBuffer & istr, const DB::FormatSettings & settings) +{ + return checkString(settings.csv.null_representation, istr); } template -ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested_serialization) +ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null) { + static constexpr bool throw_exception = std::is_same_v; + + auto deserialize_nested = [&nested_serialization, &settings] (IColumn & nested_column, ReadBuffer & buf) + { + if constexpr (!throw_exception) + return nested_serialization->tryDeserializeTextCSV(nested_column, buf, settings); + nested_serialization->deserializeTextCSV(nested_column, buf, settings); + }; + const String & null_representation = settings.csv.null_representation; if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. - return safeDeserialize(column, *nested_serialization, - [] { return false; }, - [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested_serialization->deserializeTextCSV(nested_column, istr, settings); }); + return deserializeImpl(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null); } /// Check if we have enough data in buffer to check if it's a null. 
if (settings.csv.custom_delimiter.empty() && istr.available() > null_representation.size()) { - auto check_for_null = [&istr, &null_representation, &settings]() + auto check_for_null = [&null_representation, &settings](ReadBuffer & buf) { - auto * pos = istr.position(); - if (checkString(null_representation, istr) && (*istr.position() == settings.csv.delimiter || *istr.position() == '\r' || *istr.position() == '\n')) + auto * pos = buf.position(); + if (checkString(null_representation, buf) && (*buf.position() == settings.csv.delimiter || *buf.position() == '\r' || *buf.position() == '\n')) return true; - istr.position() = pos; + buf.position() = pos; return false; }; - auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) - { - nested_serialization->deserializeTextCSV(nested_column, istr, settings); - }; - return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } /// We don't have enough data in buffer to check if it's a null. /// Use PeekableReadBuffer to make a checkpoint before checking null /// representation and rollback if the check was failed. - PeekableReadBuffer buf(istr, true); - auto check_for_null = [&buf, &null_representation, &settings]() + PeekableReadBuffer peekable_buf(istr, true); + auto check_for_null = [&null_representation, &settings](ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); if (checkString(null_representation, buf)) @@ -574,13 +720,18 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB return false; }; - auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + auto deserialize_nested_with_check = [&deserialize_nested, &nested_serialization, &settings, &null_representation, &istr] (IColumn & nested_column, ReadBuffer & buf_) { + auto & buf = assert_cast(buf_); auto * pos = buf.position(); - nested_serialization->deserializeTextCSV(nested_column, buf, settings); + if constexpr (throw_exception) + deserialize_nested(nested_column, buf); + else if (!deserialize_nested(nested_column, buf)) + return ReturnType(false); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. if (likely(!buf.hasUnreadData())) - return; + return ReturnType(true); /// We have some unread data in PeekableReadBuffer own memory. /// It can happen only if there is an unquoted string instead of a number @@ -589,6 +740,9 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB /// We also should delete incorrectly deserialized value from nested column. 
nested_column.popBack(1); + if constexpr (!throw_exception) + return ReturnType(false); + if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "CSV custom null representation containing " @@ -604,7 +758,35 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB istr.count(), std::string(pos, buf.position() - pos), parsed_value.str()); }; - return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); + return deserializeImpl(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null); +} + +void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextCSVImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextCSVImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextCSVImpl(nested_column, istr, settings, nested_serialization, is_null); + return !is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextCSVImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -618,38 +800,86 @@ void SerializationNullable::serializeText(const IColumn & column, size_t row_num /// This assumes UTF-8 and proper font support. This is Ok, because Pretty formats are "presentational", not for data exchange. 
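(The escaped, quoted and CSV paths above share one more trick: a peekable buffer sets a checkpoint, tries to match the null representation, and rolls back on mismatch so the nested parser still sees untouched input. Below is a simplified sketch of that checkpoint/rollback idea over an in-memory reader; StringReader and checkForNull are invented stand-ins for PeekableReadBuffer and the check_for_null lambdas above.)

#include <iostream>
#include <string_view>

/// Minimal in-memory reader with an explicit position we can save and restore.
struct StringReader
{
    std::string_view data;
    size_t pos = 0;

    bool checkString(std::string_view s)
    {
        if (data.substr(pos, s.size()) != s)
            return false;
        pos += s.size();
        return true;
    }

    bool eof() const { return pos >= data.size(); }
    char peek() const { return data[pos]; }
};

/// Check whether the input starts with the null representation followed by a field
/// boundary; if not, restore the position (the "rollback") so a value parser can run.
bool checkForNull(StringReader & in, std::string_view null_representation, char delimiter)
{
    const size_t checkpoint = in.pos;   // save position ("setCheckpoint")
    if (in.checkString(null_representation) && (in.eof() || in.peek() == delimiter || in.peek() == '\n'))
        return true;
    in.pos = checkpoint;                // rollback on mismatch
    return false;
}

int main()
{
    StringReader a{"\\N\tnext"};
    StringReader b{"\\Nott\tnext"};
    std::cout << std::boolalpha << checkForNull(a, "\\N", '\t') << '\n';  // true
    std::cout << checkForNull(b, "\\N", '\t') << ' ' << b.pos << '\n';    // false 0 (rolled back)
}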
if (col.isNullAt(row_num)) - { - if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8) - writeCString("ᴺᵁᴸᴸ", ostr); - else - writeCString("NULL", ostr); - } + serializeNullText(ostr, settings); else nested->serializeText(col.getNestedColumn(), row_num, ostr, settings); } +void SerializationNullable::serializeNullText(DB::WriteBuffer & ostr, const DB::FormatSettings & settings) +{ + if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8) + writeCString("ᴺᵁᴸᴸ", ostr); + else + writeCString("NULL", ostr); +} + +bool SerializationNullable::tryDeserializeNullText(DB::ReadBuffer & istr) +{ + if (checkCharCaseInsensitive('N', istr)) + return checkStringCaseInsensitive("ULL", istr); + return checkStringCaseInsensitive("ᴺᵁᴸᴸ", istr); +} + void SerializationNullable::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { const ColumnNullable & col = assert_cast(column); if (col.isNullAt(row_num)) - writeCString("null", ostr); + serializeNullJSON(ostr); else nested->serializeTextJSON(col.getNestedColumn(), row_num, ostr, settings); } -void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +void SerializationNullable::serializeNullJSON(DB::WriteBuffer & ostr) { - deserializeTextJSONImpl(column, istr, settings, nested); + writeCString("null", ostr); +} + +bool SerializationNullable::tryDeserializeNullJSON(DB::ReadBuffer & istr) +{ + return checkString("null", istr); } template -ReturnType SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) +ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null) { - return safeDeserialize(column, *nested, - [&istr] { return checkStringByFirstCharacterAndAssertTheRest("null", istr); }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextJSON(nested_column, istr, settings); }); + auto check_for_null = [](ReadBuffer & buf){ return checkStringByFirstCharacterAndAssertTheRest("null", buf); }; + auto deserialize_nested = [&nested, &settings](IColumn & nested_column, ReadBuffer & buf) + { + if constexpr (std::is_same_v) + return nested->tryDeserializeTextJSON(nested_column, buf, settings); + nested->deserializeTextJSON(nested_column, buf, settings); + }; + + return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); +} + +void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null); + safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnNullable & col = assert_cast(column); + bool is_null; + return deserializeTextJSONImpl(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap(col, is_null); +} + +bool SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); + return 
!is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -662,11 +892,9 @@ void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_ nested->serializeTextXML(col.getNestedColumn(), row_num, ostr, settings); } -template bool SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +void SerializationNullable::serializeNullXML(DB::WriteBuffer & ostr) +{ + writeCString("\\N", ostr); +} } diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index 3ec01b46de5..37858ccdefd 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -51,9 +51,12 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -66,31 +69,49 @@ public: * In CSV, non-NULL string value, starting with \N characters, must be placed in quotes, to avoid ambiguity. 
*/ void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) - /// If ReturnType is void, deserialize Nullable(T) - template - static ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); - template - static ReturnType deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); - template - static ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); - template - static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); - template - static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); - template - static ReturnType deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); - template - static ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + /// If Check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) + static bool deserializeNullAsDefaultOrNestedWholeText(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextQuoted(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextCSV(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool deserializeNullAsDefaultOrNestedTextJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool 
deserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + + /// If Check for NULL and deserialize value into non-nullable column or insert default value of nested type. + /// Return true if parsing was successful and false in case of any error. + static bool tryDeserializeNullAsDefaultOrNestedWholeText(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextQuoted(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextCSV(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization); + static bool tryDeserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization); + + + static void serializeNullEscaped(WriteBuffer & ostr, const FormatSettings & settings); + static bool tryDeserializeNullEscaped(ReadBuffer & istr, const FormatSettings & settings); + static void serializeNullQuoted(WriteBuffer & ostr); + static bool tryDeserializeNullQuoted(ReadBuffer & istr); + static void serializeNullCSV(WriteBuffer & ostr, const FormatSettings & settings); + static bool tryDeserializeNullCSV(ReadBuffer & istr, const FormatSettings & settings); + static void serializeNullJSON(WriteBuffer & ostr); + static bool tryDeserializeNullJSON(ReadBuffer & istr); + static void serializeNullRaw(WriteBuffer & ostr, const FormatSettings & settings); + static bool tryDeserializeNullRaw(ReadBuffer & istr, const FormatSettings & settings); + static void serializeNullText(WriteBuffer & ostr, const FormatSettings & settings); + static bool tryDeserializeNullText(ReadBuffer & istr); + static void serializeNullXML(WriteBuffer & ostr); private: struct SubcolumnCreator : public ISubcolumnCreator diff --git a/src/DataTypes/Serializations/SerializationNumber.cpp b/src/DataTypes/Serializations/SerializationNumber.cpp index b6c7e4618b8..bdb4dfc6735 100644 --- a/src/DataTypes/Serializations/SerializationNumber.cpp +++ b/src/DataTypes/Serializations/SerializationNumber.cpp @@ -37,6 +37,18 @@ void SerializationNumber::deserializeText(IColumn & column, ReadBuffer & istr throwUnexpectedDataAfterParsedValue(column, istr, settings, "Number"); } +template +bool SerializationNumber::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const +{ + T x; + + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + + assert_cast &>(column).getData().push_back(x); + return true; +} + template void SerializationNumber::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -44,9 +56,10 @@ void SerializationNumber::serializeTextJSON(const IColumn & column, size_t ro writeJSONNumber(x, ostr, settings); } -template -void SerializationNumber::deserializeTextJSON(IColumn & column, 
ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) { + static constexpr bool throw_exception = std::is_same_v; bool has_quote = false; if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without. { @@ -54,13 +67,16 @@ void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & ++istr.position(); } - FieldType x; + T x; /// null if (!has_quote && !istr.eof() && *istr.position() == 'n') { ++istr.position(); - assertString("ull", istr); + if constexpr (throw_exception) + assertString("ull", istr); + else if (!checkString("ull", istr)) + return ReturnType(false); x = NaNOrZero(); } @@ -73,26 +89,62 @@ void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & { // extra conditions to parse true/false strings into 1/0 if (istr.eof()) - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + else + return false; + } + if (*istr.position() == 't' || *istr.position() == 'f') { bool tmp = false; - readBoolTextWord(tmp, istr); + if constexpr (throw_exception) + readBoolTextWord(tmp, istr); + else if (!readBoolTextWord(tmp, istr)) + return ReturnType(false); + x = tmp; } else - readText(x, istr); + { + if constexpr (throw_exception) + readText(x, istr); + else if (!tryReadText(x, istr)) + return ReturnType(false); + } } else { - readText(x, istr); + if constexpr (throw_exception) + readText(x, istr); + else if (!tryReadText(x, istr)) + return ReturnType(false); } if (has_quote) - assertChar('"', istr); + { + if constexpr (throw_exception) + assertChar('"', istr); + else if (!checkChar('"', istr)) + return ReturnType(false); + } } assert_cast &>(column).getData().push_back(x); + return ReturnType(true); +} + +template +void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextJSONImpl(column, istr, settings); +} + +template +bool SerializationNumber::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return deserializeTextJSONImpl(column, istr, settings); } template @@ -103,6 +155,16 @@ void SerializationNumber::deserializeTextCSV(IColumn & column, ReadBuffer & i assert_cast &>(column).getData().push_back(x); } +template +bool SerializationNumber::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & /*settings*/) const +{ + FieldType x; + if (!tryReadCSV(x, istr)) + return false; + assert_cast &>(column).getData().push_back(x); + return true; +} + template void SerializationNumber::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const { diff --git a/src/DataTypes/Serializations/SerializationNumber.h b/src/DataTypes/Serializations/SerializationNumber.h index 972c6c9a30f..9d53dc9c494 100644 --- a/src/DataTypes/Serializations/SerializationNumber.h +++ b/src/DataTypes/Serializations/SerializationNumber.h @@ -20,9 +20,12 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & 
settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; /** Format is platform-dependent. */ void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index b2b083fd466..fd46206e9ad 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -208,7 +208,7 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnSt data[offset - 1] = 0; } - data.resize(offset); + data.resize_exact(offset); } @@ -272,50 +272,85 @@ void SerializationString::serializeTextEscaped(const IColumn & column, size_t ro } -template -static inline void read(IColumn & column, Reader && reader) +template +static inline ReturnType read(IColumn & column, Reader && reader) { + static constexpr bool throw_exception = std::is_same_v; ColumnString & column_string = assert_cast(column); ColumnString::Chars & data = column_string.getChars(); ColumnString::Offsets & offsets = column_string.getOffsets(); size_t old_chars_size = data.size(); size_t old_offsets_size = offsets.size(); - try - { - reader(data); - data.push_back(0); - offsets.push_back(data.size()); - } - catch (...) + auto restore_column = [&]() { offsets.resize_assume_reserved(old_offsets_size); data.resize_assume_reserved(old_chars_size); - throw; + }; + + try + { + if constexpr (throw_exception) + { + reader(data); + } + else if (!reader(data)) + { + restore_column(); + return false; + } + + data.push_back(0); + offsets.push_back(data.size()); + return ReturnType(true); + } + catch (...) 
+ { + restore_column(); + if constexpr (throw_exception) + throw; + else + return false; } } void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); } +bool SerializationString::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); return true; }); +} void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); } - -void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +bool SerializationString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - writeQuotedString(assert_cast(column).getDataAt(row_num), ostr); + return read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); return true; }); +} + +void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + if (settings.values.escape_quote_with_quote) + writeQuotedStringPostgreSQL(assert_cast(column).getDataAt(row_num).toView(), ostr); + else + writeQuotedString(assert_cast(column).getDataAt(row_num), ostr); } void SerializationString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readQuotedStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readQuotedStringInto(data, istr); }); +} + +bool SerializationString::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return read(column, [&](ColumnString::Chars & data) { return tryReadQuotedStringInto(data, istr); }); } @@ -329,11 +364,11 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist { if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{') { - read(column, [&](ColumnString::Chars & data) { readJSONObjectPossiblyInvalid(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONObjectPossiblyInvalid(data, istr); }); } else if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[') { - read(column, [&](ColumnString::Chars & data) { readJSONArrayInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONArrayInto(data, istr); }); } else if (settings.json.read_bools_as_strings && !istr.eof() && (*istr.position() == 't' || *istr.position() == 'f')) { @@ -349,7 +384,7 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist str_value = "false"; } - read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); + read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); } else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') { @@ -358,12 +393,60 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist Float64 tmp; 
ReadBufferFromString buf(field); if (tryReadFloatText(tmp, buf) && buf.eof()) - read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); else throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON String value here: {}", field); } else - read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); +} + +bool SerializationString::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{') + return read(column, [&](ColumnString::Chars & data) { return readJSONObjectPossiblyInvalid(data, istr); }); + + if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[') + return read(column, [&](ColumnString::Chars & data) { return readJSONArrayInto(data, istr); }); + + if (settings.json.read_bools_as_strings && !istr.eof() && (*istr.position() == 't' || *istr.position() == 'f')) + { + String str_value; + if (*istr.position() == 't') + { + if (!checkString("true", istr)) + return false; + str_value = "true"; + } + else if (*istr.position() == 'f') + { + if (!checkString("false", istr)) + return false; + str_value = "false"; + } + + read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); + return true; + } + + if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') + { + String field; + if (!tryReadJSONField(field, istr)) + return false; + + Float64 tmp; + ReadBufferFromString buf(field); + if (tryReadFloatText(tmp, buf) && buf.eof()) + { + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); + return true; + } + + return false; + } + + return read(column, [&](ColumnString::Chars & data) { return tryReadJSONStringInto(data, istr); }); } @@ -381,7 +464,12 @@ void SerializationString::serializeTextCSV(const IColumn & column, size_t row_nu void SerializationString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); + read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); +} + +bool SerializationString::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); return true; }); } void SerializationString::serializeTextMarkdown( diff --git a/src/DataTypes/Serializations/SerializationString.h b/src/DataTypes/Serializations/SerializationString.h index cd4cdf79c11..89ab84f0d22 100644 --- a/src/DataTypes/Serializations/SerializationString.h +++ b/src/DataTypes/Serializations/SerializationString.h @@ -18,20 +18,25 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void 
deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; }; diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index cbbe97eb05c..399ad870d60 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -62,15 +62,34 @@ void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, } -template -static void addElementSafe(size_t num_elems, IColumn & column, F && impl) +template +static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) { + static constexpr bool throw_exception = std::is_same_v; + /// We use the assumption that tuples of zero size do not exist. size_t old_size = column.size(); + auto restore_elements = [&]() + { + for (size_t i = 0; i < num_elems; ++i) + { + auto & element_column = extractElementColumn(column, i); + if (element_column.size() > old_size) + { + chassert(element_column.size() - old_size == 1); + element_column.popBack(1); + } + } + }; + try { - impl(); + if (!impl()) + { + restore_elements(); + return ReturnType(false); + } // Check that all columns now have the same size. size_t new_size = column.size(); @@ -81,30 +100,32 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl) { // This is not a logical error because it may work with // user-supplied data. - throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, - "Cannot read a tuple because not all elements are present"); + if constexpr (throw_exception) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, + "Cannot read a tuple because not all elements are present"); + restore_elements(); + return ReturnType(false); } } } catch (...) 
{ - for (size_t i = 0; i < num_elems; ++i) - { - auto & element_column = extractElementColumn(column, i); - if (element_column.size() > old_size) - element_column.popBack(1); - } - - throw; + restore_elements(); + if constexpr (throw_exception) + throw; + return ReturnType(false); } + + return ReturnType(true); } void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - addElementSafe(elems.size(), column, [&] + addElementSafe(elems.size(), column, [&] { for (size_t i = 0; i < elems.size(); ++i) elems[i]->deserializeBinary(extractElementColumn(column, i), istr, settings); + return true; }); } @@ -120,25 +141,51 @@ void SerializationTuple::serializeText(const IColumn & column, size_t row_num, W writeChar(')', ostr); } -void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +template +ReturnType SerializationTuple::deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const { - const size_t size = elems.size(); - assertChar('(', istr); + static constexpr bool throw_exception = std::is_same_v; - addElementSafe(elems.size(), column, [&] + const size_t size = elems.size(); + if constexpr (throw_exception) + assertChar('(', istr); + else if (!checkChar('(', istr)) + return ReturnType(false); + + auto impl = [&]() { for (size_t i = 0; i < size; ++i) { skipWhitespaceIfAny(istr); if (i != 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return false; + skipWhitespaceIfAny(istr); } - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(extractElementColumn(column, i), istr, settings, elems[i]); + + auto & element_column = extractElementColumn(column, i); + if constexpr (throw_exception) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(element_column, istr, settings, elems[i]); + else + elems[i]->deserializeTextQuoted(element_column, istr, settings); + } else - elems[i]->deserializeTextQuoted(extractElementColumn(column, i), istr, settings); + { + bool ok; + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + ok = SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(element_column, istr, settings, elems[i]); + else + ok = elems[i]->tryDeserializeTextQuoted(element_column, istr, settings); + + if (!ok) + return false; + } } // Special format for one element tuple (1,) @@ -150,11 +197,32 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co } skipWhitespaceIfAny(istr); - assertChar(')', istr); + if constexpr (throw_exception) + assertChar(')', istr); + else if (!checkChar(')', istr)) + return false; if (whole && !istr.eof()) - throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); - }); + { + if constexpr (throw_exception) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); + return false; + } + + return true; + }; + + return addElementSafe(elems.size(), column, impl); +} + +void SerializationTuple::deserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + deserializeTextImpl(column, istr, settings, whole); +} + +bool SerializationTuple::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const 
DB::FormatSettings & settings, bool whole) const +{ + return deserializeTextImpl(column, istr, settings, whole); } void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -239,16 +307,40 @@ void SerializationTuple::serializeTextJSONPretty(const IColumn & column, size_t } } -void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { + static constexpr bool throw_exception = std::is_same_v; + + auto deserialize_element = [&](IColumn & element_column, size_t element_pos) + { + if constexpr (throw_exception) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); + else + elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + return true; + } + else + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); + return elems[element_pos]->tryDeserializeTextJSON(element_column, istr, settings); + } + }; + if (settings.json.read_named_tuples_as_objects && have_explicit_names) { skipWhitespaceIfAny(istr); - assertChar('{', istr); + if constexpr (throw_exception) + assertChar('{', istr); + else if (!checkChar('{', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); - addElementSafe(elems.size(), column, [&] + auto impl = [&]() { std::vector seen_elements(elems.size(), 0); size_t processed = 0; @@ -256,18 +348,32 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr while (!istr.eof() && *istr.position() != '}') { if (!settings.json.ignore_unknown_keys_in_named_tuple && processed == elems.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {} (consider enabling input_format_json_ignore_unknown_keys_in_named_tuple setting)", elems.size()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. 
Expected no more than {} (consider enabling input_format_json_ignore_unknown_keys_in_named_tuple setting)", elems.size()); + return false; + } if (processed + skipped > 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return false; skipWhitespaceIfAny(istr); } std::string name; - readDoubleQuotedString(name, istr); + if constexpr (throw_exception) + readDoubleQuotedString(name, istr); + else if (!tryReadDoubleQuotedString(name, istr)) + return false; + skipWhitespaceIfAny(istr); - assertChar(':', istr); + if constexpr (throw_exception) + assertChar(':', istr); + else if (!checkChar(':', istr)) + return false; skipWhitespaceIfAny(istr); const size_t element_pos = getPositionByName(name); @@ -275,36 +381,52 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr { if (settings.json.ignore_unknown_keys_in_named_tuple) { - skipJSONField(istr, name); + if constexpr (throw_exception) + skipJSONField(istr, name); + else if (!trySkipJSONField(istr, name)) + return false; + skipWhitespaceIfAny(istr); ++skipped; continue; } else - throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}', enable setting input_format_json_ignore_unknown_keys_in_named_tuple", name); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}', enable setting input_format_json_ignore_unknown_keys_in_named_tuple", name); + return false; + } } seen_elements[element_pos] = 1; auto & element_column = extractElementColumn(column, element_pos); - try + if constexpr (throw_exception) { - if (settings.null_as_default) - SerializationNullable::deserializeTextJSONImpl(element_column, istr, settings, elems[element_pos]); - else - elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + try + { + deserialize_element(element_column, element_pos); + } + catch (Exception & e) + { + e.addMessage("(while reading the value of nested key " + name + ")"); + throw; + } } - catch (Exception & e) + else { - e.addMessage("(while reading the value of nested key " + name + ")"); - throw; + if (!deserialize_element(element_column, element_pos)) + return false; } skipWhitespaceIfAny(istr); ++processed; } - assertChar('}', istr); + if constexpr (throw_exception) + assertChar('}', istr); + else if (!checkChar('}', istr)) + return false; /// Check if we have missing elements. if (processed != elems.size()) @@ -315,41 +437,81 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr continue; if (!settings.json.defaults_for_missing_elements_in_named_tuple) - throw Exception( - ErrorCodes::INCORRECT_DATA, - "JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, " - "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", - elems[element_pos]->getElementName()); + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON object doesn't contain tuple element {}. 
If you want to insert defaults in case of missing elements, " + "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", + elems[element_pos]->getElementName()); + return false; + } auto & element_column = extractElementColumn(column, element_pos); element_column.insertDefault(); } } - }); + + return true; + }; + + return addElementSafe(elems.size(), column, impl); } else { - assertChar('[', istr); + skipWhitespaceIfAny(istr); + if constexpr (throw_exception) + assertChar('[', istr); + else if (!checkChar('[', istr)) + return false; + skipWhitespaceIfAny(istr); - addElementSafe(elems.size(), column, [&] + auto impl = [&]() { for (size_t i = 0; i < elems.size(); ++i) { skipWhitespaceIfAny(istr); if (i != 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return false; skipWhitespaceIfAny(istr); } - elems[i]->deserializeTextJSON(extractElementColumn(column, i), istr, settings); + + auto & element_column = extractElementColumn(column, i); + + if constexpr (throw_exception) + deserialize_element(element_column, i); + else if (!deserialize_element(element_column, i)) + return false; } skipWhitespaceIfAny(istr); - assertChar(']', istr); - }); + if constexpr (throw_exception) + assertChar(']', istr); + else if (!checkChar(']', istr)) + return false; + + return true; + }; + + return addElementSafe(elems.size(), column, impl); } } +void SerializationTuple::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextJSONImpl(column, istr, settings); +} + +bool SerializationTuple::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + return deserializeTextJSONImpl(column, istr, settings); +} + + void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeCString("", ostr); @@ -374,7 +536,7 @@ void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - addElementSafe(elems.size(), column, [&] + addElementSafe(elems.size(), column, [&] { const size_t size = elems.size(); for (size_t i = 0; i < size; ++i) @@ -385,11 +547,46 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assertChar(settings.csv.tuple_delimiter, istr); skipWhitespaceIfAny(istr); } - if (settings.null_as_default) - SerializationNullable::deserializeTextCSVImpl(extractElementColumn(column, i), istr, settings, elems[i]); + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i]); else - elems[i]->deserializeTextCSV(extractElementColumn(column, i), istr, settings); + elems[i]->deserializeTextCSV(element_column, istr, settings); } + return true; + }); +} + +bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + if (!checkChar(settings.csv.tuple_delimiter, istr)) + return false; + skipWhitespaceIfAny(istr); + } + 
+ auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + { + if (!SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i])) + return false; + } + else + { + if (!elems[i]->tryDeserializeTextCSV(element_column, istr, settings)) + return false; + } + } + + return true; }); } diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index 7325259f440..d9c63a05217 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -23,14 +23,17 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; /// Tuples in CSV format will be serialized as separate columns (that is, losing their nesting in the tuple). void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Each sub-column in a tuple is serialized in separate stream. 
*/ @@ -73,6 +76,15 @@ private: bool have_explicit_names; size_t getPositionByName(const String & name) const; + + template + ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; + + template + ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + + template + ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; }; } diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp index 5cf17b4c0c8..5a7aeca67a0 100644 --- a/src/DataTypes/Serializations/SerializationUUID.cpp +++ b/src/DataTypes/Serializations/SerializationUUID.cpp @@ -25,15 +25,16 @@ void SerializationUUID::deserializeText(IColumn & column, ReadBuffer & istr, con throwUnexpectedDataAfterParsedValue(column, istr, settings, "UUID"); } -void SerializationUUID::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool SerializationUUID::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const { - deserializeText(column, istr, settings, false); + UUID x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + + assert_cast(column).getData().push_back(x); + return true; } -void SerializationUUID::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} void SerializationUUID::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -76,6 +77,17 @@ void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(std::move(uuid)); /// It's important to do this at the end - for exception safety. 
} +bool SerializationUUID::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID uuid; + String field; + if (!checkChar('\'', istr) || !tryReadText(uuid, istr) || !checkChar('\'', istr)) + return false; + + assert_cast(column).getData().push_back(std::move(uuid)); + return true; +} + void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -92,6 +104,15 @@ void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } +bool SerializationUUID::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationUUID::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -106,6 +127,14 @@ void SerializationUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(value); } +bool SerializationUUID::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID value; + if (!tryReadCSV(value, istr)) + return false; + assert_cast(column).getData().push_back(value); + return true; +} void SerializationUUID::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const { diff --git a/src/DataTypes/Serializations/SerializationUUID.h b/src/DataTypes/Serializations/SerializationUUID.h index da8c15f7279..458504f8f42 100644 --- a/src/DataTypes/Serializations/SerializationUUID.h +++ b/src/DataTypes/Serializations/SerializationUUID.h @@ -10,14 +10,16 @@ class SerializationUUID : public SimpleTextSerialization public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) 
const override; void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp new file mode 100644 index 00000000000..5af94364167 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -0,0 +1,840 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_DATA; +} + +void SerializationVariant::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_variant = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_variant = data.column ? &assert_cast(*data.column) : nullptr; + + auto discriminators_serialization = std::make_shared(std::make_shared>(), "discr", SubstreamType::NamedVariantDiscriminators); + auto local_discriminators = column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr; + + settings.path.push_back(Substream::VariantDiscriminators); + auto discriminators_data = SubstreamData(discriminators_serialization) + .withType(type_variant ? std::make_shared>() : nullptr) + .withColumn(column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + settings.path.back().data = discriminators_data; + callback(settings.path); + settings.path.pop_back(); + + settings.path.push_back(Substream::VariantElements); + settings.path.back().data = data; + + for (size_t i = 0; i < variants.size(); ++i) + { + settings.path.back().creator = std::make_shared(local_discriminators, variant_names[i], i, column_variant ? column_variant->localDiscriminatorByGlobal(i) : i); + + auto variant_data = SubstreamData(variants[i]) + .withType(type_variant ? type_variant->getVariant(i) : nullptr) + .withColumn(column_variant ? 
column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr) + .withSerializationInfo(data.serialization_info); + + addVariantElementToPath(settings.path, i); + settings.path.back().data = variant_data; + variants[i]->enumerateStreams(settings, callback, variant_data); + settings.path.pop_back(); + } + + settings.path.pop_back(); +} + +struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState +{ + std::vector states; +}; + +struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState +{ + std::vector states; +}; + +void SerializationVariant::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnVariant & col = assert_cast(column); + + auto variant_state = std::make_shared(); + variant_state->states.resize(variants.size()); + + settings.path.push_back(Substream::VariantElements); + + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkStatePrefix(col.getVariantByGlobalDiscriminator(i), settings, variant_state->states[i]); + settings.path.pop_back(); + } + + settings.path.pop_back(); + state = std::move(variant_state); +} + + +void SerializationVariant::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * variant_state = checkAndGetState(state); + + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkStateSuffix(settings, variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); +} + + +void SerializationVariant::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + auto variant_state = std::make_shared(); + variant_state->states.resize(variants.size()); + + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i]); + settings.path.pop_back(); + } + + settings.path.pop_back(); + state = std::move(variant_state); +} + + +void SerializationVariant::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnVariant & col = assert_cast(column); + if (const size_t size = col.size(); limit == 0 || offset + limit > size) + limit = size - offset; + + settings.path.push_back(Substream::VariantDiscriminators); + auto * discriminators_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!discriminators_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::serializeBinaryBulkWithMultipleStreams"); + + auto * variant_state = checkAndGetState(state); + + /// If offset = 0 and limit == col.size() or we have only NULLs, we don't need to calculate + /// offsets and limits for variants and need to just serialize whole columns. + if ((offset == 0 && limit == col.size()) || col.hasOnlyNulls()) + { + /// First, serialize discriminators. + /// If we have only NULLs or local and global discriminators are the same, just serialize the column as is. 
+ if (col.hasOnlyNulls() || col.hasGlobalVariantsOrder()) + { + SerializationNumber().serializeBinaryBulk(col.getLocalDiscriminatorsColumn(), *discriminators_stream, offset, limit); + } + /// If local and global discriminators are different, we should convert local to global before serializing (because we don't serialize the mapping). + else + { + const auto & local_discriminators = col.getLocalDiscriminators(); + for (size_t i = offset; i != offset + limit; ++i) + writeBinaryLittleEndian(col.globalDiscriminatorByLocal(local_discriminators[i]), *discriminators_stream); + } + + /// Second, serialize variants in global order. + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); + return; + } + + /// If we have only one non empty variant and no NULLs, we can use the same limit offset for this variant. + if (auto non_empty_local_discr = col.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + /// First, serialize discriminators. + /// We know that all discriminators are the same, so we just need to serialize this discriminator limit times. + auto non_empty_global_discr = col.globalDiscriminatorByLocal(*non_empty_local_discr); + for (size_t i = 0; i != limit; ++i) + writeBinaryLittleEndian(non_empty_global_discr, *discriminators_stream); + + /// Second, serialize non-empty variant (other variants are empty and we can skip their serialization). + settings.path.push_back(Substream::VariantElements); + addVariantElementToPath(settings.path, non_empty_global_discr); + /// We can use the same offset/limit as for whole Variant column + variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->states[non_empty_global_discr]); + settings.path.pop_back(); + settings.path.pop_back(); + return; + } + + /// In general case we should iterate through local discriminators in range [offset, offset + limit] to serialize global discriminators and calculate offset/limit pair for each variant. + const auto & local_discriminators = col.getLocalDiscriminators(); + const auto & offsets = col.getOffsets(); + std::vector> variant_offsets_and_limits(variants.size(), {0, 0}); + size_t end = offset + limit; + for (size_t i = offset; i < end; ++i) + { + auto global_discr = col.globalDiscriminatorByLocal(local_discriminators[i]); + writeBinaryLittleEndian(global_discr, *discriminators_stream); + + if (global_discr != ColumnVariant::NULL_DISCRIMINATOR) + { + /// If we see this discriminator for the first time, update offset + if (!variant_offsets_and_limits[global_discr].second) + variant_offsets_and_limits[global_discr].first = offsets[i]; + /// Update limit for this discriminator. + ++variant_offsets_and_limits[global_discr].second; + } + } + + /// Serialize variants in global order. + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + /// Serialize variant only if we have its discriminator in the range. 
+ if (variant_offsets_and_limits[i].second) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkWithMultipleStreams( + col.getVariantByGlobalDiscriminator(i), + variant_offsets_and_limits[i].first, + variant_offsets_and_limits[i].second, + settings, + variant_state->states[i]); + settings.path.pop_back(); + } + } + settings.path.pop_back(); +} + + +void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto mutable_column = column->assumeMutable(); + ColumnVariant & col = assert_cast(*mutable_column); + /// We always serialize Variant column with global variants order, + /// so while deserialization column should be always with global variants order. + if (!col.hasGlobalVariantsOrder()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to deserialize data into Variant column with not global variants order"); + + /// First, deserialize discriminators. + settings.path.push_back(Substream::VariantDiscriminators); + if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) + { + col.getLocalDiscriminatorsPtr() = cached_discriminators; + } + else + { + auto * discriminators_stream = settings.getter(settings.path); + if (!discriminators_stream) + return; + + SerializationNumber().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr()); + } + settings.path.pop_back(); + + /// Second, calculate limits for each variant by iterating through new discriminators. + std::vector variant_limits(variants.size(), 0); + auto & discriminators_data = col.getLocalDiscriminators(); + size_t discriminators_offset = discriminators_data.size() - limit; + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + { + ColumnVariant::Discriminator discr = discriminators_data[i]; + if (discr != ColumnVariant::NULL_DISCRIMINATOR) + ++variant_limits[discr]; + } + + /// Now we can deserialize variants according to their limits. + auto * variant_state = checkAndGetState(state); + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->deserializeBinaryBulkWithMultipleStreams(col.getVariantPtrByLocalDiscriminator(i), variant_limits[i], settings, variant_state->states[i], cache); + settings.path.pop_back(); + } + settings.path.pop_back(); + + /// Fill offsets column. + /// It's important to do it after deserialization of all variants, because to fill offsets we need + /// initial variants sizes without values in current range, but some variants can be shared with + /// other columns via substream cache and they can already contain values from this range even + /// before we call deserialize for them. So, before deserialize we cannot know for sure if + /// variant columns already contain values from current range or not. But after calling deserialize + /// we know for sure that they contain these values, so we can use valiant limits and their + /// new sizes to calculate correct offsets. 
+ settings.path.push_back(Substream::VariantOffsets); + if (auto cached_offsets = getFromSubstreamsCache(cache, settings.path)) + { + col.getOffsetsPtr() = cached_offsets; + } + else + { + auto & offsets = col.getOffsets(); + offsets.reserve(offsets.size() + limit); + std::vector variant_offsets; + variant_offsets.reserve(variants.size()); + for (size_t i = 0; i != variants.size(); ++i) + variant_offsets.push_back(col.getVariantByLocalDiscriminator(i).size() - variant_limits[i]); + + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + { + ColumnVariant::Discriminator discr = discriminators_data[i]; + if (discr == ColumnVariant::NULL_DISCRIMINATOR) + offsets.emplace_back(); + else + offsets.push_back(variant_offsets[discr]++); + } + + addToSubstreamsCache(cache, settings.path, col.getOffsetsPtr()); + } + settings.path.pop_back(); +} + +void SerializationVariant::addVariantElementToPath(DB::ISerialization::SubstreamPath & path, size_t i) const +{ + path.push_back(Substream::VariantElement); + path.back().variant_element_name = variant_names[i]; +} + +void SerializationVariant::serializeBinary(const Field & /*field*/, WriteBuffer & /*ostr*/, const FormatSettings & /*settings*/) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinary from a field is not implemented for SerializationVariant"); +} + +void SerializationVariant::deserializeBinary(Field & /*field*/, ReadBuffer & /*istr*/, const FormatSettings & /*settings*/) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method deserializeBinary to a field is not implemented for SerializationVariant"); +} + +void SerializationVariant::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + writeBinaryLittleEndian(global_discr, ostr); + if (global_discr != ColumnVariant::NULL_DISCRIMINATOR) + variants[global_discr]->serializeBinary(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +void SerializationVariant::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnVariant & col = assert_cast(column); + ColumnVariant::Discriminator global_discr; + readBinaryLittleEndian(global_discr, istr); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + col.insertDefault(); + } + else + { + auto & variant_column = col.getVariantByGlobalDiscriminator(global_discr); + variants[global_discr]->deserializeBinary(variant_column, istr, settings); + col.getLocalDiscriminators().push_back(col.localDiscriminatorByGlobal(global_discr)); + col.getOffsets().push_back(variant_column.size() - 1); + } +} + +namespace +{ + +const std::unordered_map & getTypesTextDeserializePriorityMap() +{ + static std::unordered_map priority_map = [] + { + static constexpr std::array priorities = { + /// Complex types have highest priority. + TypeIndex::Array, + TypeIndex::Tuple, + TypeIndex::Map, + TypeIndex::AggregateFunction, + + /// Enums can be parsed both from strings and numbers. + /// So they have high enough priority. + TypeIndex::Enum8, + TypeIndex::Enum16, + + /// Types that can be parsed from strings. + TypeIndex::UUID, + TypeIndex::IPv4, + TypeIndex::IPv6, + + /// Types that can be parsed from numbers. + /// The order: + /// 1) Integers + /// 2) Big Integers + /// 3) Decimals + /// 4) Floats + /// In each group small types have higher priority. 
+            TypeIndex::Int8,
+            TypeIndex::UInt8,
+            TypeIndex::Int16,
+            TypeIndex::UInt16,
+            TypeIndex::Int32,
+            TypeIndex::UInt32,
+            TypeIndex::Int64,
+            TypeIndex::UInt64,
+            TypeIndex::Int128,
+            TypeIndex::UInt128,
+            TypeIndex::Int256,
+            TypeIndex::UInt256,
+            TypeIndex::Decimal32,
+            TypeIndex::Decimal64,
+            TypeIndex::Decimal128,
+            TypeIndex::Decimal256,
+            TypeIndex::Float32,
+            TypeIndex::Float64,
+
+            /// Dates and DateTimes. More simple Date types have higher priority.
+            /// They have lower priority than numbers because some DateTimes can also
+            /// be parsed from numbers, and we usually don't want that.
+            TypeIndex::Date,
+            TypeIndex::Date32,
+            TypeIndex::DateTime,
+            TypeIndex::DateTime64,
+
+            /// String types have almost the lowest priority,
+            /// as in text formats almost all data can
+            /// be deserialized into String type.
+            TypeIndex::FixedString,
+            TypeIndex::String,
+        };
+
+        std::unordered_map<TypeIndex, size_t> pm;
+
+        pm.reserve(priorities.size());
+        for (size_t i = 0; i != priorities.size(); ++i)
+            pm[priorities[i]] = priorities.size() - i;
+        return pm;
+    }();
+
+    return priority_map;
+}
+
+/// We want to create a more or less optimal order of types in which we will try text deserializations.
+/// To do it, we calculate a priority for each type and then sort the types by this priority.
+/// Above we defined the priority of each data type, but types can be nested and we can also have LowCardinality and Nullable.
+/// To sort nested types we create a priority that is a tuple of 3 elements:
+/// 1) The maximum depth of nested types like Array/Map/Tuple.
+/// 2) The combined priority of the simple and complex types involved.
+/// 3) The nesting depth of LowCardinality/Nullable types.
+/// So, when we sort the types, we first sort by the maximum depth of nested types, so more nested types are deserialized first,
+/// then for types with the same depth we sort by the type priority, and finally we sort by the depth of LowCardinality/Nullable types,
+/// so if we have types with the same level of nesting and the same priority, we first try to deserialize LowCardinality/Nullable types
+/// (for example, if we have the types Array(Array(String)) and Array(Array(Nullable(String)))).
+/// These are just heuristics.
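For illustration only (an editor's sketch, not part of the patch): a minimal standalone program showing the ordering idea described in the comment above. Each candidate type gets a (nested depth, type priority, LowCardinality/Nullable depth) tuple and candidates are tried in descending tuple order, mirroring the std::sort call in getVariantsDeserializeTextOrder below. The type names and numeric priorities here are invented for the example; the real values come from getTypesTextDeserializePriorityMap().

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

int main()
{
    /// (nested depth, type priority, LowCardinality/Nullable depth)
    using Priority = std::tuple<size_t, size_t, size_t>;

    std::vector<std::string> names = {"String", "UInt64", "Array(UInt64)", "Nullable(UInt64)"};
    std::vector<Priority> priorities = {
        {0, 1, 0},       /// String: lowest type priority (illustrative value).
        {0, 18, 0},      /// UInt64 (illustrative value).
        {1, 18 + 40, 0}, /// Array(UInt64): deeper nesting plus an Array bonus (illustrative values).
        {0, 18, 1},      /// Nullable(UInt64): same as UInt64, but LowCardinality/Nullable depth 1.
    };

    std::vector<size_t> order(names.size());
    for (size_t i = 0; i != order.size(); ++i)
        order[i] = i;

    /// Descending sort by the priority tuple, like in getVariantsDeserializeTextOrder.
    std::sort(order.begin(), order.end(), [&](size_t left, size_t right) { return priorities[left] > priorities[right]; });

    /// Prints: Array(UInt64), Nullable(UInt64), UInt64, String.
    for (size_t i : order)
        std::cout << names[i] << '\n';
}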
+std::tuple<size_t, size_t, size_t> getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, const std::unordered_map<TypeIndex, size_t> & priority_map)
+{
+    if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(type.get()))
+        return getTypeTextDeserializePriority(nullable_type->getNestedType(), nested_depth, simple_nested_depth + 1, priority_map);
+
+    if (const auto * lc_type = typeid_cast<const DataTypeLowCardinality *>(type.get()))
+        return getTypeTextDeserializePriority(lc_type->getDictionaryType(), nested_depth, simple_nested_depth + 1, priority_map);
+
+    if (const auto * array_type = typeid_cast<const DataTypeArray *>(type.get()))
+    {
+        auto [elements_nested_depth, elements_priority, elements_simple_nested_depth] = getTypeTextDeserializePriority(array_type->getNestedType(), nested_depth + 1, simple_nested_depth, priority_map);
+        return {elements_nested_depth, elements_priority + priority_map.at(TypeIndex::Array), elements_simple_nested_depth};
+    }
+
+    if (const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type.get()))
+    {
+        size_t max_nested_depth = 0;
+        size_t sum_priority = 0;
+        size_t max_simple_nested_depth = 0;
+        for (const auto & elem : tuple_type->getElements())
+        {
+            auto [elem_nested_depth, elem_priority, elem_simple_nested_depth] = getTypeTextDeserializePriority(elem, nested_depth + 1, simple_nested_depth, priority_map);
+            sum_priority += elem_priority;
+            if (elem_nested_depth > max_nested_depth)
+                max_nested_depth = elem_nested_depth;
+            if (elem_simple_nested_depth > max_simple_nested_depth)
+                max_simple_nested_depth = elem_simple_nested_depth;
+        }
+
+        return {max_nested_depth, sum_priority + priority_map.at(TypeIndex::Tuple), max_simple_nested_depth};
+    }
+
+    if (const auto * map_type = typeid_cast<const DataTypeMap *>(type.get()))
+    {
+        auto [key_max_depth, key_priority, key_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getKeyType(), nested_depth + 1, simple_nested_depth, priority_map);
+        auto [value_max_depth, value_priority, value_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getValueType(), nested_depth + 1, simple_nested_depth, priority_map);
+        return {std::max(key_max_depth, value_max_depth), key_priority + value_priority + priority_map.at(TypeIndex::Map), std::max(key_simple_nested_depth, value_simple_nested_depth)};
+    }
+
+    if (const auto * variant_type = typeid_cast<const DataTypeVariant *>(type.get()))
+    {
+        size_t max_priority = 0;
+        size_t max_depth = 0;
+        size_t max_simple_nested_depth = 0;
+        for (const auto & variant : variant_type->getVariants())
+        {
+            auto [variant_max_depth, variant_priority, variant_simple_nested_depth] = getTypeTextDeserializePriority(variant, nested_depth, simple_nested_depth, priority_map);
+            if (variant_priority > max_priority)
+                max_priority = variant_priority;
+            if (variant_max_depth > max_depth)
+                max_depth = variant_max_depth;
+            if (variant_simple_nested_depth > max_simple_nested_depth)
+                max_simple_nested_depth = variant_simple_nested_depth;
+        }
+
+        return {max_depth, max_priority, max_simple_nested_depth};
+    }
+
+    /// Bool type should have priority higher than all integers.
+    if (isBool(type))
+        return {nested_depth, priority_map.at(TypeIndex::Int8) + 1, simple_nested_depth};
+
+    auto it = priority_map.find(type->getTypeId());
+    return {nested_depth, it == priority_map.end() ?
0 : it->second, simple_nested_depth}; +} + +} + +std::vector SerializationVariant::getVariantsDeserializeTextOrder(const DB::DataTypes & variant_types) +{ + std::vector> priorities; + priorities.reserve(variant_types.size()); + std::vector order; + order.reserve(variant_types.size()); + const auto & priority_map = getTypesTextDeserializePriorityMap(); + for (size_t i = 0; i != variant_types.size(); ++i) + { + priorities.push_back(getTypeTextDeserializePriority(variant_types[i], 0, 0, priority_map)); + order.push_back(i); + } + + std::sort(order.begin(), order.end(), [&](size_t left, size_t right) { return priorities[left] > priorities[right]; }); + return order; +} + + +bool SerializationVariant::tryDeserializeImpl( + IColumn & column, + const String & field, + std::function check_for_null, + std::function try_deserialize_nested) const +{ + auto & column_variant = assert_cast(column); + ReadBufferFromString null_buf(field); + if (check_for_null(null_buf) && null_buf.eof()) + { + column_variant.insertDefault(); + return true; + } + + for (size_t global_discr : deserialize_text_order) + { + ReadBufferFromString variant_buf(field); + auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); + size_t prev_size = variant_column.size(); + if (try_deserialize_nested(variant_column, variants[global_discr], variant_buf) && variant_buf.eof()) + { + column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(global_discr)); + column_variant.getOffsets().push_back(prev_size); + return true; + } + else if (variant_column.size() > prev_size) + { + variant_column.popBack(1); + } + } + + return false; +} + +void SerializationVariant::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullEscaped(ostr, settings); + else + variants[global_discr]->serializeTextEscaped(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readEscapedString(field, istr); + return tryDeserializeTextEscapedImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readEscapedString(field, istr); + if (!tryDeserializeTextEscapedImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse escaped value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextEscapedImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullEscaped(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextEscaped(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & 
settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullRaw(ostr, settings); + else + variants[global_discr]->serializeTextRaw(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readString(field, istr); + return tryDeserializeTextRawImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readString(field, istr); + if (!tryDeserializeTextRawImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse raw value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextRawImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullRaw(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextRaw(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullQuoted(ostr); + else + variants[global_discr]->serializeTextQuoted(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + if (!tryReadQuotedField(field, istr)) + return false; + return tryDeserializeTextQuotedImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readQuotedField(field, istr); + if (!tryDeserializeTextQuotedImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse quoted value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextQuotedImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullQuoted(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextQuoted(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if 
(global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullCSV(ostr, settings); + else + variants[global_discr]->serializeTextCSV(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readCSVStringInto(field, istr, settings.csv); + return tryDeserializeTextCSVImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readCSVField(field, istr, settings.csv); + if (!tryDeserializeTextCSVImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse CSV value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextCSVImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullCSV(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextCSV(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullText(ostr, settings); + else + variants[global_discr]->serializeText(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readStringUntilEOF(field, istr); + return tryDeserializeWholeTextImpl(column, field, settings); +} + +void SerializationVariant::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readStringUntilEOF(field, istr); + if (!tryDeserializeWholeTextImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse text value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeWholeTextImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullText(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeWholeText(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullJSON(ostr); + else + 
variants[global_discr]->serializeTextJSON(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + if (!tryReadJSONField(field, istr)) + return false; + return tryDeserializeTextJSONImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readJSONField(field, istr); + if (!tryDeserializeTextJSONImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextJSONImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullJSON(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextJSON(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullXML(ostr); + else + variants[global_discr]->serializeTextXML(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +} diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h new file mode 100644 index 00000000000..3f53dcf1339 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariant.h @@ -0,0 +1,139 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Class for serializing/deserializing column with Variant type. +/// It supports both text and binary bulk serializations/deserializations. +/// +/// During text serialization it checks discriminator of the current row and +/// uses corresponding text serialization of this variant. +/// +/// During text deserialization it tries all variants deserializations +/// (using tryDeserializeText* methods of ISerialization) in predefined order +/// and inserts data in the first variant with succeeded deserialization. +/// +/// During binary bulk serialization it transforms local discriminators +/// to global and serializes them into a separate stream VariantDiscriminators. +/// Each variant is serialized into a separate stream with path VariantElements/VariantElement +/// (VariantElements stream is needed for correct sub-columns creation). We store and serialize +/// variants in a sparse form (the size of a variant column equals to the number of its discriminator +/// in the discriminators column), so during deserialization the limit for each variant is +/// calculated according to discriminators column. +/// Offsets column is not serialized and stored only in memory. +/// +/// During binary bulk deserialization we first deserialize discriminators from corresponding stream +/// and use them to calculate the limit for each variant. 
Each variant is deserialized from +/// corresponding stream using calculated limit. Offsets column is not deserialized and constructed +/// according to discriminators. +class SerializationVariant : public ISerialization +{ +public: + using VariantSerializations = std::vector; + + explicit SerializationVariant( + const VariantSerializations & variants_, + const std::vector & variant_names_, + const std::vector & deserialize_text_order_, + const String & variant_name_) + : variants(variants_), variant_names(variant_names_), deserialize_text_order(deserialize_text_order_), variant_name(variant_name_) + { + } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) 
const override;
+
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+    void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+    void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+
+    /// Determine the order in which we should try to deserialize variants.
+    /// In some cases the text representation of a value can be deserialized
+    /// into several types (for example, almost all text values can be deserialized
+    /// into String type), so we use some heuristics to determine a more optimal order.
+    static std::vector<size_t> getVariantsDeserializeTextOrder(const DataTypes & variant_types);
+
+private:
+    void addVariantElementToPath(SubstreamPath & path, size_t i) const;
+
+    bool tryDeserializeTextEscapedImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+    bool tryDeserializeTextQuotedImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+    bool tryDeserializeWholeTextImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+    bool tryDeserializeTextCSVImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+    bool tryDeserializeTextJSONImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+    bool tryDeserializeTextRawImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+
+    bool tryDeserializeImpl(
+        IColumn & column,
+        const String & field,
+        std::function<bool(ReadBuffer &)> check_for_null,
+        std::function<bool(IColumn &, const SerializationPtr &, ReadBuffer &)> try_deserialize_nested) const;
+
+    VariantSerializations variants;
+    std::vector<String> variant_names;
+    std::vector<size_t> deserialize_text_order;
+    /// Name of the Variant data type for better exception messages.
+    String variant_name;
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp
new file mode 100644
index 00000000000..053f8d22d5a
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp
@@ -0,0 +1,271 @@
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
+void SerializationVariantElement::enumerateStreams(
+    DB::ISerialization::EnumerateStreamsSettings & settings,
+    const DB::ISerialization::StreamCallback & callback,
+    const DB::ISerialization::SubstreamData & data) const
+{
+    /// We will need the stream for discriminators during deserialization.
+ settings.path.push_back(Substream::VariantDiscriminators); + callback(settings.path); + settings.path.pop_back(); + + addVariantToPath(settings.path); + settings.path.back().data = data; + nested_serialization->enumerateStreams(settings, callback, data); + removeVariantFromPath(settings.path); +} + +void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement"); +} + +void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement"); +} + +struct DeserializeBinaryBulkStateVariantElement : public ISerialization::DeserializeBinaryBulkState +{ + /// During deserialization discriminators and variant streams can be shared. + /// For example we can read several variant elements together: "select v.UInt32, v.String from table", + /// or we can read the whole variant and some of variant elements: "select v, v.UInt32 from table". + /// To read the same column from the same stream more than once we use substream cache, + /// but this cache stores the whole column, not only the current range. + /// During deserialization of variant element discriminators and variant columns are not stored + /// in the result column, so we need to store them inside deserialization state, so we can use + /// substream cache correctly. + ColumnPtr discriminators; + ColumnPtr variant; + + ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; +}; + +void SerializationVariantElement::deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const +{ + auto variant_element_state = std::make_shared(); + + addVariantToPath(settings.path); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state); + removeVariantFromPath(settings.path); + + state = std::move(variant_element_state); +} + +void SerializationVariantElement::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationVariantElement"); +} + +void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & result_column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto * variant_element_state = checkAndGetState(state); + + /// First, deserialize discriminators from Variant column. + settings.path.push_back(Substream::VariantDiscriminators); + if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) + { + variant_element_state->discriminators = cached_discriminators; + } + else + { + auto * discriminators_stream = settings.getter(settings.path); + if (!discriminators_stream) + return; + + /// If we started to read a new column, reinitialize discriminators column in deserialization state. 
+ if (!variant_element_state->discriminators || result_column->empty()) + variant_element_state->discriminators = ColumnVariant::ColumnDiscriminators::create(); + + SerializationNumber().deserializeBinaryBulk(*variant_element_state->discriminators->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, variant_element_state->discriminators); + } + settings.path.pop_back(); + + /// Iterate through new discriminators to calculate the limit for our variant. + const auto & discriminators_data = assert_cast(*variant_element_state->discriminators).getData(); + size_t discriminators_offset = variant_element_state->discriminators->size() - limit; + size_t variant_limit = 0; + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + variant_limit += (discriminators_data[i] == variant_discriminator); + + /// Now we know the limit for our variant and can deserialize it. + + /// If result column is Nullable, fill null map and extract nested column. + MutableColumnPtr mutable_column = result_column->assumeMutable(); + if (isColumnNullable(*mutable_column)) + { + auto & nullable_column = assert_cast(*mutable_column); + NullMap & null_map = nullable_column.getNullMapData(); + /// If we have only our discriminator in range, fill null map with 0. + if (variant_limit == limit) + { + null_map.resize_fill(null_map.size() + limit, 0); + } + /// If no our discriminator in current range, fill null map with 1. + else if (variant_limit == 0) + { + null_map.resize_fill(null_map.size() + limit, 1); + } + /// Otherwise we should iterate through discriminators to fill null map. + else + { + null_map.reserve(null_map.size() + limit); + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + null_map.push_back(discriminators_data[i] != variant_discriminator); + } + + mutable_column = nullable_column.getNestedColumnPtr()->assumeMutable(); + } + + /// If we started to read a new column, reinitialize variant column in deserialization state. + if (!variant_element_state->variant || result_column->empty()) + { + variant_element_state->variant = mutable_column->cloneEmpty(); + + /// When result column is LowCardinality(Nullable(T)) we should + /// remove Nullable from variant column before deserialization. + if (isColumnLowCardinalityNullable(*mutable_column)) + assert_cast(*variant_element_state->variant->assumeMutable()).nestedRemoveNullable(); + } + + /// If nothing to deserialize, just insert defaults. + if (variant_limit == 0) + { + mutable_column->insertManyDefaults(limit); + return; + } + + addVariantToPath(settings.path); + nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache); + removeVariantFromPath(settings.path); + + /// If nothing was deserialized when variant_limit > 0 + /// it means that we don't have a stream for such sub-column. + /// It may happen during ALTER MODIFY column with Variant extension. + /// In this case we should just insert default values. + if (variant_element_state->variant->empty()) + { + mutable_column->insertManyDefaults(limit); + return; + } + + size_t variant_offset = variant_element_state->variant->size() - variant_limit; + + /// If we have only our discriminator in range, insert the whole range to result column. 
+ if (variant_limit == limit) + { + mutable_column->insertRangeFrom(*variant_element_state->variant, variant_offset, variant_limit); + } + /// Otherwise iterate through discriminators and insert value from variant or default value depending on the discriminator. + else + { + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + { + if (discriminators_data[i] == variant_discriminator) + mutable_column->insertFrom(*variant_element_state->variant, variant_offset++); + else + mutable_column->insertDefault(); + } + } +} + +void SerializationVariantElement::addVariantToPath(DB::ISerialization::SubstreamPath & path) const +{ + path.push_back(Substream::VariantElements); + path.push_back(Substream::VariantElement); + path.back().variant_element_name = variant_element_name; +} + +void SerializationVariantElement::removeVariantFromPath(DB::ISerialization::SubstreamPath & path) const +{ + path.pop_back(); + path.pop_back(); +} + +SerializationVariantElement::VariantSubcolumnCreator::VariantSubcolumnCreator( + const ColumnPtr & local_discriminators_, + const String & variant_element_name_, + const ColumnVariant::Discriminator global_variant_discriminator_, + const ColumnVariant::Discriminator local_variant_discriminator_) + : local_discriminators(local_discriminators_) + , variant_element_name(variant_element_name_) + , global_variant_discriminator(global_variant_discriminator_) + , local_variant_discriminator(local_variant_discriminator_) +{ +} + +DataTypePtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::DataTypePtr & prev) const +{ + return makeNullableOrLowCardinalityNullableSafe(prev); +} + +SerializationPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::SerializationPtr & prev) const +{ + return std::make_shared(prev, variant_element_name, global_variant_discriminator); +} + +ColumnPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::ColumnPtr & prev) const +{ + /// Case when original Variant column contained only one non-empty variant and no NULLs. + /// In this case just use this variant. + if (prev->size() == local_discriminators->size()) + return makeNullableOrLowCardinalityNullableSafe(prev); + + /// If this variant is empty, fill result column with default values. + if (prev->empty()) + { + auto res = makeNullableOrLowCardinalityNullableSafe(prev)->cloneEmpty(); + res->insertManyDefaults(local_discriminators->size()); + return res; + } + + /// In general case we should iterate through discriminators and create null-map for our variant. + NullMap null_map; + null_map.reserve(local_discriminators->size()); + const auto & local_discriminators_data = assert_cast(*local_discriminators).getData(); + for (auto local_discr : local_discriminators_data) + null_map.push_back(local_discr != local_variant_discriminator); + + /// Now we can create new column from null-map and variant column using IColumn::expand. + auto res_column = IColumn::mutate(prev); + + /// Special case for LowCardinality. We want the result to be LowCardinality(Nullable), + /// but we don't have a good way to apply null-mask for LowCardinality(), so, we first + /// convert our column to LowCardinality(Nullable()) and then use expand which will + /// fill rows with 0 in mask with default value (that is NULL). 
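+    /// For example, for local discriminators [0, 1, 0] and local_variant_discriminator == 1 the null-map
+    /// is [1, 0, 1]: after expand the single variant value ends up in row 1, while rows 0 and 2 are filled
+    /// with default values (which become NULL in the resulting column).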
+ if (prev->lowCardinality()) + res_column = assert_cast(*res_column).cloneNullable(); + + res_column->expand(null_map, /*inverted = */ true); + + if (res_column->canBeInsideNullable()) + { + auto null_map_col = ColumnUInt8::create(); + null_map_col->getData() = std::move(null_map); + return ColumnNullable::create(std::move(res_column), std::move(null_map_col)); + } + + return res_column; +} + +} diff --git a/src/DataTypes/Serializations/SerializationVariantElement.h b/src/DataTypes/Serializations/SerializationVariantElement.h new file mode 100644 index 00000000000..c343c219cf3 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariantElement.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +class SerializationVariant; + +/// Serialization for Variant element when we read it as a subcolumn. +class SerializationVariantElement final : public SerializationWrapper +{ +private: + /// To be able to deserialize Variant element as a subcolumn + /// we need its type name and global discriminator. + String variant_element_name; + ColumnVariant::Discriminator variant_discriminator; + +public: + SerializationVariantElement(const SerializationPtr & nested_, const String & variant_element_name_, ColumnVariant::Discriminator variant_discriminator_) + : SerializationWrapper(nested_) + , variant_element_name(variant_element_name_) + , variant_discriminator(variant_discriminator_) + { + } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + friend SerializationVariant; + + void addVariantToPath(SubstreamPath & path) const; + void removeVariantFromPath(SubstreamPath & path) const; + + struct VariantSubcolumnCreator : public ISubcolumnCreator + { + const ColumnPtr local_discriminators; + const String variant_element_name; + const ColumnVariant::Discriminator global_variant_discriminator; + const ColumnVariant::Discriminator local_variant_discriminator; + + VariantSubcolumnCreator( + const ColumnPtr & local_discriminators_, + const String & variant_element_name_, + const ColumnVariant::Discriminator global_variant_discriminator_, + const ColumnVariant::Discriminator local_variant_discriminator_); + + DataTypePtr create(const DataTypePtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + SerializationPtr create(const SerializationPtr & prev) const override; + }; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index 18e4891ee65..bde52bb8096 100644 --- 
a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -96,6 +96,11 @@ void SerializationWrapper::deserializeTextEscaped(IColumn & column, ReadBuffer & nested_serialization->deserializeTextEscaped(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextEscaped(column, istr, settings); +} + void SerializationWrapper::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextQuoted(column, row_num, ostr, settings); @@ -106,6 +111,11 @@ void SerializationWrapper::deserializeTextQuoted(IColumn & column, ReadBuffer & nested_serialization->deserializeTextQuoted(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextQuoted(column, istr, settings); +} + void SerializationWrapper::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextCSV(column, row_num, ostr, settings); @@ -116,6 +126,11 @@ void SerializationWrapper::deserializeTextCSV(IColumn & column, ReadBuffer & ist nested_serialization->deserializeTextCSV(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextCSV(column, istr, settings); +} + void SerializationWrapper::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeText(column, row_num, ostr, settings); @@ -126,6 +141,11 @@ void SerializationWrapper::deserializeWholeText(IColumn & column, ReadBuffer & i nested_serialization->deserializeWholeText(column, istr, settings); } +bool SerializationWrapper::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeWholeText(column, istr, settings); +} + void SerializationWrapper::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextJSON(column, row_num, ostr, settings); @@ -136,6 +156,11 @@ void SerializationWrapper::deserializeTextJSON(IColumn & column, ReadBuffer & is nested_serialization->deserializeTextJSON(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextJSON(column, istr, settings); +} + void SerializationWrapper::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const { nested_serialization->serializeTextJSONPretty(column, row_num, ostr, settings, indent); diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index 31900f93148..6c5e2046062 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -63,18 +63,23 @@ public: void serializeTextEscaped(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SimpleTextSerialization.h b/src/DataTypes/Serializations/SimpleTextSerialization.h index 0247f30b30a..11f56de73d1 100644 --- a/src/DataTypes/Serializations/SimpleTextSerialization.h +++ b/src/DataTypes/Serializations/SimpleTextSerialization.h @@ -36,29 +36,67 @@ protected: deserializeText(column, istr, settings, true); } + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, true); + } + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, 
const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + /// whole = true means that buffer contains only one value, so we should read until EOF. /// It's needed to check if there is garbage after parsed field. virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const = 0; + + virtual bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const + { + try + { + deserializeText(column, istr, settings, whole); + return true; + } + catch (...) + { + return false; + } + } }; } diff --git a/src/DataTypes/Utils.cpp b/src/DataTypes/Utils.cpp index e58331a8bcb..2f29d57d454 100644 --- a/src/DataTypes/Utils.cpp +++ b/src/DataTypes/Utils.cpp @@ -223,6 +223,7 @@ bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_typ case TypeIndex::AggregateFunction: case TypeIndex::Nothing: case TypeIndex::JSONPaths: + case TypeIndex::Variant: return false; } diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index e5bdb4b267f..09d44eeb160 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace DB @@ -383,6 +384,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) return throwOrReturn(types, "because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); auto keys_common_type = getLeastSupertype(key_types); + auto values_common_type = getLeastSupertype(value_types); /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype for keys or values, /// keys_common_type or values_common_type will be nullptr, we should return nullptr in this case. @@ -424,6 +426,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) else { auto nested_type = getLeastSupertype(nested_types); + /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, /// nested_type will be nullptr, we should return nullptr in this case. if (!nested_type) @@ -637,6 +640,32 @@ DataTypePtr getLeastSupertypeOrString(const DataTypes & types) return getLeastSupertype(types); } +DataTypePtr getLeastSupertypeOrVariant(const DataTypes & types) +{ + auto common_type = getLeastSupertype(types); + if (common_type) + return common_type; + + /// Create Variant with provided arguments as variants. + DataTypes variants; + for (const auto & type : types) + { + /// Nested Variant types are not supported. If we have Variant type + /// we use all its variants in the result Variant. 
+ if (isVariant(type)) + { + const DataTypes & nested_variants = assert_cast(*type).getVariants(); + variants.insert(variants.end(), nested_variants.begin(), nested_variants.end()); + } + else + { + variants.push_back(removeNullableOrLowCardinalityNullable(type)); + } + } + + return std::make_shared(variants); +} + DataTypePtr tryGetLeastSupertype(const DataTypes & types) { return getLeastSupertype(types); diff --git a/src/DataTypes/getLeastSupertype.h b/src/DataTypes/getLeastSupertype.h index 2ef4a0e6850..ceaffbdab7a 100644 --- a/src/DataTypes/getLeastSupertype.h +++ b/src/DataTypes/getLeastSupertype.h @@ -24,6 +24,17 @@ DataTypePtr getLeastSupertype(const DataTypes & types); /// All types can be casted to String, because they can be serialized to String. DataTypePtr getLeastSupertypeOrString(const DataTypes & types); +/// Same as getLeastSupertype but in case when there is no supertype for provided types +/// it uses Variant of these types as a supertype. Any type can be casted to a Variant +/// that contains this type. +/// As nested Variants are not allowed, if one of the types is Variant, it's variants +/// are used in the resulting Variant. +/// Examples: +/// (UInt64, String) -> Variant(UInt64, String) +/// (Array(UInt64), Array(String)) -> Variant(Array(UInt64), Array(String)) +/// (Variant(UInt64, String), Array(UInt32)) -> Variant(UInt64, String, Array(UInt32)) +DataTypePtr getLeastSupertypeOrVariant(const DataTypes & types); + /// Same as above but return nullptr instead of throwing exception. DataTypePtr tryGetLeastSupertype(const DataTypes & types); diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 55ae60469ed..bc6714a6471 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -146,9 +146,18 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont if (!checkTableFilePath(table_path, context_, throw_on_error)) return {}; - auto format = FormatFactory::instance().getFormatFromFileName(table_path, throw_on_error); - if (format.empty()) - return {}; + String format; + if (throw_on_error) + { + format = FormatFactory::instance().getFormatFromFileName(table_path); + } + else + { + auto format_maybe = FormatFactory::instance().tryGetFormatFromFileName(table_path); + if (!format_maybe) + return {}; + format = *format_maybe; + } auto ast_function_ptr = makeASTFunction("file", std::make_shared(table_path), std::make_shared(format)); diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 9f75bcb5529..9cf19a251f7 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -925,6 +926,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep query_context->setSetting("allow_experimental_nlp_functions", 1); query_context->setSetting("allow_experimental_hash_functions", 1); query_context->setSetting("allow_experimental_object_type", 1); + query_context->setSetting("allow_experimental_variant_type", 1); query_context->setSetting("allow_experimental_annoy_index", 1); query_context->setSetting("allow_experimental_usearch_index", 1); query_context->setSetting("allow_experimental_bigint_types", 1); @@ -1090,31 +1092,57 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep } tables_dependencies.checkNoCyclicDependencies(); - auto tables_to_create = 
tables_dependencies.getTablesSortedByDependency(); - for (const auto & table_id : tables_to_create) + auto allow_concurrent_table_creation = getContext()->getServerSettings().max_database_replicated_create_table_thread_pool_size > 1; + auto tables_to_create_by_level = tables_dependencies.getTablesSplitByDependencyLevel(); + + auto create_tables_runner = threadPoolCallbackRunner(getDatabaseReplicatedCreateTablesThreadPool().get(), "CreateTables"); + std::vector> create_table_futures; + + for (const auto & tables_to_create : tables_to_create_by_level) { - auto table_name = table_id.getTableName(); - auto metadata_it = table_name_to_metadata.find(table_name); - if (metadata_it == table_name_to_metadata.end()) + for (const auto & table_id : tables_to_create) { - /// getTablesSortedByDependency() may return some not existing tables or tables from other databases - LOG_WARNING(log, "Got table name {} when resolving table dependencies, " - "but database {} does not have metadata for that table. Ignoring it", table_id.getNameForLogs(), getDatabaseName()); - continue; + auto task = [&]() + { + auto table_name = table_id.getTableName(); + auto metadata_it = table_name_to_metadata.find(table_name); + if (metadata_it == table_name_to_metadata.end()) + { + /// getTablesSortedByDependency() may return some not existing tables or tables from other databases + LOG_WARNING(log, "Got table name {} when resolving table dependencies, " + "but database {} does not have metadata for that table. Ignoring it", table_id.getNameForLogs(), getDatabaseName()); + return; + } + + const auto & create_query_string = metadata_it->second; + if (isTableExist(table_name, getContext())) + { + assert(create_query_string == readMetadataFile(table_name) || getTableUUIDIfReplicated(create_query_string, getContext()) != UUIDHelpers::Nil); + return; + } + + auto query_ast = parseQueryFromMetadataInZooKeeper(table_name, create_query_string); + LOG_INFO(log, "Executing {}", serializeAST(*query_ast)); + auto create_query_context = make_query_context(); + InterpreterCreateQuery(query_ast, create_query_context).execute(); + }; + + if (allow_concurrent_table_creation) + create_table_futures.push_back(create_tables_runner(task, Priority{0})); + else + task(); } - const auto & create_query_string = metadata_it->second; - if (isTableExist(table_name, getContext())) - { - assert(create_query_string == readMetadataFile(table_name) || getTableUUIDIfReplicated(create_query_string, getContext()) != UUIDHelpers::Nil); - continue; - } + /// First wait for all tasks to finish. + for (auto & future : create_table_futures) + future.wait(); - auto query_ast = parseQueryFromMetadataInZooKeeper(table_name, create_query_string); - LOG_INFO(log, "Executing {}", serializeAST(*query_ast)); - auto create_query_context = make_query_context(); - InterpreterCreateQuery(query_ast, create_query_context).execute(); + /// Now rethrow the first exception if any. 
+ for (auto & future : create_table_futures) + future.get(); + + create_table_futures.clear(); } LOG_INFO(log, "All tables are created successfully"); diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 2f448cd7036..ec380fa759d 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -407,7 +407,7 @@ public: virtual void stopReplication() { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database engine {} does not run a replication thread!", getEngineName()); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Database engine {} does not run a replication thread", getEngineName()); } virtual bool shouldReplicateQuery(const ContextPtr & /*query_context*/, const ASTPtr & /*query_ptr*/) const { return false; } diff --git a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp index 2656835f912..20db8036942 100644 --- a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp +++ b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp @@ -779,7 +779,7 @@ static void writeFieldsToColumn( casted_int32_column->insertValue(num & 0x800000 ? num | 0xFF000000 : num); } else - throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: it is a bug."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "MaterializedMySQL is a bug."); } } } @@ -844,7 +844,7 @@ static inline bool differenceSortingKeys(const Tuple & row_old_data, const Tuple static inline size_t onUpdateData(const Row & rows_data, Block & buffer, size_t version, const std::vector & sorting_columns_index) { if (rows_data.size() % 2 != 0) - throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: It is a bug."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "MaterializedMySQL is a bug."); size_t prev_bytes = buffer.bytes(); std::vector writeable_rows_mask(rows_data.size()); diff --git a/src/Databases/TablesDependencyGraph.cpp b/src/Databases/TablesDependencyGraph.cpp index 6b9e202d900..4b05f19fe91 100644 --- a/src/Databases/TablesDependencyGraph.cpp +++ b/src/Databases/TablesDependencyGraph.cpp @@ -699,6 +699,22 @@ std::vector TablesDependencyGraph::getTablesSortedByDependency() cons } +std::vector> TablesDependencyGraph::getTablesSplitByDependencyLevel() const +{ + std::vector> tables_split_by_level; + auto sorted_nodes = getNodesSortedByLevel(); + if (sorted_nodes.empty()) + return tables_split_by_level; + + tables_split_by_level.resize(sorted_nodes.back()->level + 1); + for (const auto * node : sorted_nodes) + { + tables_split_by_level[node->level].emplace_back(node->storage_id); + } + return tables_split_by_level; +} + + void TablesDependencyGraph::log() const { if (nodes.empty()) diff --git a/src/Databases/TablesDependencyGraph.h b/src/Databases/TablesDependencyGraph.h index f0553cef321..eb13539b5b6 100644 --- a/src/Databases/TablesDependencyGraph.h +++ b/src/Databases/TablesDependencyGraph.h @@ -107,6 +107,12 @@ public: /// tables which depend on the tables which depend on the tables without dependencies, and so on. std::vector getTablesSortedByDependency() const; + /// Returns a list of lists of tables by the number of dependencies they have: + /// tables without dependencies are in the first list, then + /// tables which depend on the tables without dependencies are in the second list, then + /// tables which depend on the tables which depend on the tables without dependencies are in the third list, and so on. + std::vector> getTablesSplitByDependencyLevel() const; + /// Outputs information about this graph as a bunch of logging messages. 
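For readers skimming the recovery hunks above: tables are now created level by level, where getTablesSplitByDependencyLevel() guarantees that a level only depends on earlier levels, the tasks inside one level may run concurrently, and every future is waited on before any get() call so the first exception is rethrown only after the whole level has finished. A minimal, self-contained sketch of that wait-then-rethrow pattern, using plain std::async instead of the server's thread-pool runner (runByLevel and the table names below are illustrative, not part of this diff):

#include <functional>
#include <future>
#include <iostream>
#include <string>
#include <vector>

/// Run the tasks of one level concurrently; the levels themselves run strictly in order.
/// After each level, wait for every future first, then call get() so that the first
/// stored exception is rethrown only after all tasks of the level have finished.
void runByLevel(const std::vector<std::vector<std::function<void()>>> & levels)
{
    for (const auto & level : levels)
    {
        std::vector<std::future<void>> futures;
        for (const auto & task : level)
            futures.push_back(std::async(std::launch::async, task));

        for (auto & future : futures)
            future.wait();
        for (auto & future : futures)
            future.get();
    }
}

int main()
{
    auto make_task = [](std::string name) { return [name] { std::cout << "created " << name << '\n'; }; };
    runByLevel({
        {make_task("dictionary_source"), make_task("lookup_table")}, /// level 0: no dependencies
        {make_task("view_over_lookup_table")},                       /// level 1: depends on level 0
    });
}

Keeping wait() separate from get() is what lets a failed table creation surface as an exception without abandoning the other, still-running tasks of the same level.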
void log() const; diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 000f0ef5b4c..bf3d5a5cd12 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -3,9 +3,7 @@ #include #include -#include #include -#include #include #include #include diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index aae86a83f12..a52bcbc4ae4 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -78,22 +78,22 @@ public: double getLoadFactor() const override; - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(hit_count.load(std::memory_order_acquire)) / queries; + return static_cast(hit_count.load()) / queries; } bool supportUpdates() const override { return false; } diff --git a/src/Dictionaries/DirectDictionary.h b/src/Dictionaries/DirectDictionary.h index 214c8ef8a13..73340904684 100644 --- a/src/Dictionaries/DirectDictionary.h +++ b/src/Dictionaries/DirectDictionary.h @@ -34,14 +34,14 @@ public: size_t getBytesAllocated() const override { return 0; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index a54916c5cd1..aac55610351 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -41,14 +41,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/HashedArrayDictionary.h b/src/Dictionaries/HashedArrayDictionary.h index 86b21443e18..f18a8f4a474 100644 --- a/src/Dictionaries/HashedArrayDictionary.h +++ b/src/Dictionaries/HashedArrayDictionary.h @@ -57,14 +57,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return 
query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 0b8419dd242..ed80973fcf3 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -99,14 +99,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index f1834b4b129..d3e28682f35 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -109,6 +109,9 @@ public: virtual size_t getQueryCount() const = 0; + /// The percentage of time a lookup successfully found an entry. + /// When there were no lookups, it returns zero (instead of NaN). + /// The value is calculated non atomically and can be slightly off in the presence of concurrent lookups. virtual double getFoundRate() const = 0; virtual double getHitRate() const = 0; diff --git a/src/Dictionaries/IPAddressDictionary.h b/src/Dictionaries/IPAddressDictionary.h index d758e23043d..105bf7e340a 100644 --- a/src/Dictionaries/IPAddressDictionary.h +++ b/src/Dictionaries/IPAddressDictionary.h @@ -41,14 +41,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/PolygonDictionary.h b/src/Dictionaries/PolygonDictionary.h index a856d12b66c..48a1f0e56da 100644 --- a/src/Dictionaries/PolygonDictionary.h +++ b/src/Dictionaries/PolygonDictionary.h @@ -71,14 +71,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/RangeHashedDictionary.h 
b/src/Dictionaries/RangeHashedDictionary.h index c44bffe42e1..28db67038ca 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -85,14 +85,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/RegExpTreeDictionary.h b/src/Dictionaries/RegExpTreeDictionary.h index 78b7f441d34..68b6b603692 100644 --- a/src/Dictionaries/RegExpTreeDictionary.h +++ b/src/Dictionaries/RegExpTreeDictionary.h @@ -58,14 +58,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - const auto queries = query_count.load(std::memory_order_relaxed); + const auto queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/YAMLRegExpTreeDictionarySource.cpp b/src/Dictionaries/YAMLRegExpTreeDictionarySource.cpp index f1591943a12..b35e507b242 100644 --- a/src/Dictionaries/YAMLRegExpTreeDictionarySource.cpp +++ b/src/Dictionaries/YAMLRegExpTreeDictionarySource.cpp @@ -227,7 +227,7 @@ void parseMatchNode(UInt64 parent_id, UInt64 & id, const YAML::Node & node, Resu if (!match.contains(key_name)) { - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Yaml match rule must contain key {}", key_name); + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "YAML match rule must contain key {}", key_name); } for (const auto & [key, node_] : match) { diff --git a/src/Disks/DiskEncrypted.cpp b/src/Disks/DiskEncrypted.cpp index fe00fdd64d6..68fd9012857 100644 --- a/src/Disks/DiskEncrypted.cpp +++ b/src/Disks/DiskEncrypted.cpp @@ -4,9 +4,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -374,7 +374,7 @@ std::unique_ptr DiskEncrypted::readFile( { /// File is empty, that's a normal case, see DiskEncrypted::truncateFile(). /// There is no header so we just return `ReadBufferFromString("")`. 
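A note on the dictionary statistics hunks above: query_count and found_count are incremented independently, so a concurrent reader can briefly observe found_count ahead of query_count and compute a ratio above 1; that is what the added std::min(1.0, ...) guards against. A tiny standalone sketch of the clamped accessor (free-standing counters for illustration, not the dictionary classes themselves):

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <iostream>

std::atomic<size_t> query_count{0};
std::atomic<size_t> found_count{0};

/// The two counters are updated independently, without a common lock, so a reader may
/// see found_count slightly ahead of query_count; clamping keeps the rate inside [0, 1].
double getFoundRate()
{
    size_t queries = query_count.load();
    if (!queries)
        return 0;
    return std::min(1.0, static_cast<double>(found_count.load()) / queries);
}

int main()
{
    query_count += 10;
    found_count += 7;
    std::cout << getFoundRate() << '\n'; /// prints 0.7
}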
- return std::make_unique(wrapped_path); + return std::make_unique(std::make_unique(std::string_view{}), wrapped_path); } auto encryption_settings = current_settings.get(); FileEncryption::Header header = readHeader(*buffer); diff --git a/src/Disks/DiskEncryptedTransaction.cpp b/src/Disks/DiskEncryptedTransaction.cpp index 3da2e6f925a..daeab7aae6c 100644 --- a/src/Disks/DiskEncryptedTransaction.cpp +++ b/src/Disks/DiskEncryptedTransaction.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp index 8eecd0d99d1..2373640704b 100644 --- a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp +++ b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp @@ -48,11 +48,10 @@ AsynchronousBoundedReadBuffer::AsynchronousBoundedReadBuffer( const ReadSettings & settings_, AsyncReadCountersPtr async_read_counters_, FilesystemReadPrefetchesLogPtr prefetches_log_) - : ReadBufferFromFileBase(chooseBufferSizeForRemoteReading(settings_, impl_->getFileSize()), nullptr, 0) + : ReadBufferFromFileBase(0, nullptr, 0) , impl(std::move(impl_)) , read_settings(settings_) , reader(reader_) - , prefetch_buffer(chooseBufferSizeForRemoteReading(read_settings, impl->getFileSize())) , query_id(CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() != nullptr ? CurrentThread::getQueryId() : "") , current_reader_id(getRandomASCIIString(8)) , log(getLogger("AsynchronousBoundedReadBuffer")) @@ -70,12 +69,10 @@ bool AsynchronousBoundedReadBuffer::hasPendingDataToRead() return false; if (file_offset_of_buffer_end > *read_until_position) - { throw Exception( ErrorCodes::LOGICAL_ERROR, - "Read beyond last offset ({} > {}, info: {})", - file_offset_of_buffer_end, *read_until_position, impl->getInfoForLog()); - } + "Read beyond last offset ({} > {}): file size = {}, info: {}", + file_offset_of_buffer_end, *read_until_position, impl->getFileSize(), impl->getInfoForLog()); } return true; @@ -115,7 +112,7 @@ void AsynchronousBoundedReadBuffer::prefetch(Priority priority) last_prefetch_info.submit_time = std::chrono::system_clock::now(); last_prefetch_info.priority = priority; - chassert(prefetch_buffer.size() == chooseBufferSizeForRemoteReading(read_settings, impl->getFileSize())); + prefetch_buffer.resize(chooseBufferSizeForRemoteReading(read_settings, impl->getFileSize())); prefetch_future = readAsync(prefetch_buffer.data(), prefetch_buffer.size(), priority); ProfileEvents::increment(ProfileEvents::RemoteFSPrefetches); } @@ -127,14 +124,15 @@ void AsynchronousBoundedReadBuffer::setReadUntilPosition(size_t position) if (position < file_offset_of_buffer_end) { /// file has been read beyond new read until position already - if (working_buffer.size() >= file_offset_of_buffer_end - position) + if (available() >= file_offset_of_buffer_end - position) { - /// new read until position is inside working buffer + /// new read until position is after the current position in the working buffer file_offset_of_buffer_end = position; + working_buffer.resize(working_buffer.size() - (file_offset_of_buffer_end - position)); } else { - /// new read until position is before working buffer begin + /// new read until position is before the current position in the working buffer throw Exception( ErrorCodes::LOGICAL_ERROR, "Attempt to set read until position before already read data ({} > {}, info: {})", @@ -187,6 +185,7 @@ bool AsynchronousBoundedReadBuffer::nextImpl() return false; chassert(file_offset_of_buffer_end 
<= impl->getFileSize()); + size_t old_file_offset_of_buffer_end = file_offset_of_buffer_end; IAsynchronousReader::Result result; if (prefetch_future.valid()) @@ -211,7 +210,7 @@ bool AsynchronousBoundedReadBuffer::nextImpl() } else { - chassert(memory.size() == chooseBufferSizeForRemoteReading(read_settings, impl->getFileSize())); + memory.resize(chooseBufferSizeForRemoteReading(read_settings, impl->getFileSize())); { ProfileEventTimeIncrement watch(ProfileEvents::SynchronousRemoteReadWaitMicroseconds); @@ -222,6 +221,9 @@ bool AsynchronousBoundedReadBuffer::nextImpl() ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedBytes, result.size); } + bytes_to_ignore = 0; + resetWorkingBuffer(); + size_t bytes_read = result.size - result.offset; if (bytes_read) { @@ -232,14 +234,26 @@ bool AsynchronousBoundedReadBuffer::nextImpl() } file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd(); - bytes_to_ignore = 0; /// In case of multiple files for the same file in clickhouse (i.e. log family) /// file_offset_of_buffer_end will not match getImplementationBufferOffset() /// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()] chassert(file_offset_of_buffer_end <= impl->getFileSize()); - return bytes_read; + if (read_until_position && (file_offset_of_buffer_end > *read_until_position)) + { + size_t excessive_bytes_read = file_offset_of_buffer_end - *read_until_position; + + if (excessive_bytes_read > working_buffer.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "File offset moved too far: old_file_offset = {}, new_file_offset = {}, read_until_position = {}, bytes_read = {}", + old_file_offset_of_buffer_end, file_offset_of_buffer_end, *read_until_position, bytes_read); + + working_buffer.resize(working_buffer.size() - excessive_bytes_read); + file_offset_of_buffer_end = *read_until_position; + } + + return !working_buffer.empty(); } diff --git a/src/Disks/IO/AsynchronousBoundedReadBuffer.h b/src/Disks/IO/AsynchronousBoundedReadBuffer.h index e5030f37b1d..6dc76352aca 100644 --- a/src/Disks/IO/AsynchronousBoundedReadBuffer.h +++ b/src/Disks/IO/AsynchronousBoundedReadBuffer.h @@ -95,7 +95,6 @@ private: IAsynchronousReader::Result readSync(char * data, size_t size); void resetPrefetch(FilesystemPrefetchState state); - }; } diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 923aab5c343..0b3ecca3587 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -1,6 +1,6 @@ #include "ReadBufferFromRemoteFSGather.h" -#include +#include #include #include @@ -62,7 +62,7 @@ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather( current_object = blobs_to_read.front(); } -std::unique_ptr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object) +SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object) { if (current_buf && !with_cache) { @@ -79,7 +79,7 @@ std::unique_ptr ReadBufferFromRemoteFSGather::createImpl if (with_cache) { auto cache_key = settings.remote_fs_cache->createKeyForPath(object_path); - return std::make_unique( + return std::make_shared( object_path, cache_key, settings.remote_fs_cache, diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h index 93ded9fefb3..f6b7506a54f 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h @@ -53,7 +53,7 @@ public: bool 
isContentCached(size_t offset, size_t size) override; private: - std::unique_ptr createImplementationBuffer(const StoredObject & object); + SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object); bool nextImpl() override; @@ -80,7 +80,7 @@ private: StoredObject current_object; size_t current_buf_idx = 0; - std::unique_ptr current_buf; + SeekableReadBufferPtr current_buf; LoggerPtr log; }; diff --git a/src/Disks/IO/ThreadPoolReader.h b/src/Disks/IO/ThreadPoolReader.h index 42bc9bf8bb4..b8aff9f22a2 100644 --- a/src/Disks/IO/ThreadPoolReader.h +++ b/src/Disks/IO/ThreadPoolReader.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.h b/src/Disks/IO/ThreadPoolRemoteFSReader.h index cd2bf223f33..abc251b2b10 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.h +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp index d281c3dfdc2..905114f50e9 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp @@ -18,17 +18,17 @@ namespace ProfileEvents namespace DB { -static constexpr auto DEFAULT_RETRY_NUM = 3; - WriteBufferFromAzureBlobStorage::WriteBufferFromAzureBlobStorage( std::shared_ptr blob_container_client_, const String & blob_path_, size_t max_single_part_upload_size_, + size_t max_unexpected_write_error_retries_, size_t buf_size_, const WriteSettings & write_settings_) : WriteBufferFromFileBase(buf_size_, nullptr, 0) , log(getLogger("WriteBufferFromAzureBlobStorage")) , max_single_part_upload_size(max_single_part_upload_size_) + , max_unexpected_write_error_retries(max_unexpected_write_error_retries_) , blob_path(blob_path_) , write_settings(write_settings_) , blob_container_client(blob_container_client_) @@ -77,13 +77,13 @@ void WriteBufferFromAzureBlobStorage::execWithRetry(std::function func, void WriteBufferFromAzureBlobStorage::finalizeImpl() { - execWithRetry([this](){ next(); }, DEFAULT_RETRY_NUM); + execWithRetry([this](){ next(); }, max_unexpected_write_error_retries); if (tmp_buffer_write_offset > 0) uploadBlock(tmp_buffer->data(), tmp_buffer_write_offset); auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); - execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, DEFAULT_RETRY_NUM); + execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, max_unexpected_write_error_retries); LOG_TRACE(log, "Committed {} blocks for blob `{}`", block_ids.size(), blob_path); } @@ -94,7 +94,7 @@ void WriteBufferFromAzureBlobStorage::uploadBlock(const char * data, size_t size const std::string & block_id = block_ids.emplace_back(getRandomASCIIString(64)); Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast(data), size); - execWithRetry([&](){ block_blob_client.StageBlock(block_id, memory_stream); }, DEFAULT_RETRY_NUM, size); + execWithRetry([&](){ block_blob_client.StageBlock(block_id, memory_stream); }, max_unexpected_write_error_retries, size); tmp_buffer_write_offset = 0; LOG_TRACE(log, "Staged block (id: {}) of size {} (blob path: {}).", block_id, size, blob_path); diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h index 5e4f97b0a08..f105b35c121 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h +++ 
b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h @@ -30,6 +30,7 @@ public: AzureClientPtr blob_container_client_, const String & blob_path_, size_t max_single_part_upload_size_, + size_t max_unexpected_write_error_retries_, size_t buf_size_, const WriteSettings & write_settings_); @@ -48,6 +49,7 @@ private: LoggerPtr log; const size_t max_single_part_upload_size; + const size_t max_unexpected_write_error_retries; const std::string blob_path; const WriteSettings write_settings; diff --git a/src/Disks/IO/createReadBufferFromFileBase.cpp b/src/Disks/IO/createReadBufferFromFileBase.cpp index d4cb6b83223..a9d451496ff 100644 --- a/src/Disks/IO/createReadBufferFromFileBase.cpp +++ b/src/Disks/IO/createReadBufferFromFileBase.cpp @@ -39,7 +39,7 @@ std::unique_ptr createReadBufferFromFileBase( size_t alignment) { if (file_size.has_value() && !*file_size) - return std::make_unique(filename); + return std::make_unique(); size_t estimated_size = 0; if (read_hint.has_value()) diff --git a/src/Disks/IVolume.cpp b/src/Disks/IVolume.cpp index 0b072e6ba8b..d763c55c4aa 100644 --- a/src/Disks/IVolume.cpp +++ b/src/Disks/IVolume.cpp @@ -46,7 +46,7 @@ IVolume::IVolume( } if (disks.empty()) - throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Volume must contain at least one disk"); + throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Volume {} must contain at least one disk", name); } std::optional IVolume::getMaxUnreservedFreeSpace() const diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index a5c8afe0264..72c4abee5c9 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -7,6 +7,7 @@ #include #include #include +#include using namespace Azure::Storage::Blobs; @@ -157,14 +158,18 @@ std::unique_ptr getAzureBlobContainerClient( } } -std::unique_ptr getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr /*context*/) +std::unique_ptr getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { return std::make_unique( config.getUInt64(config_prefix + ".max_single_part_upload_size", 100 * 1024 * 1024), config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getInt(config_prefix + ".max_single_read_retries", 3), config.getInt(config_prefix + ".max_single_download_retries", 3), - config.getInt(config_prefix + ".list_object_keys_size", 1000) + config.getInt(config_prefix + ".list_object_keys_size", 1000), + config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), + config.getUInt64(config_prefix + ".max_single_part_copy_size", context->getSettings().azure_max_single_part_copy_size), + config.getBool(config_prefix + ".use_native_copy", false), + config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", context->getSettings().azure_max_unexpected_write_error_retries) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 05bf2281842..74389aedb64 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -92,10 +92,12 @@ private: AzureObjectStorage::AzureObjectStorage( const String & name_, AzureClientPtr && client_, - SettingsPtr && 
settings_) + SettingsPtr && settings_, + const String & container_) : name(name_) , client(std::move(client_)) , settings(std::move(settings_)) + , container(container_) , log(getLogger("AzureObjectStorage")) { } @@ -264,6 +266,7 @@ std::unique_ptr AzureObjectStorage::writeObject( /// NO client.get(), object.remote_path, settings.get()->max_single_part_upload_size, + settings.get()->max_unexpected_write_error_retries, buf_size, patchSettings(write_settings)); } @@ -375,7 +378,8 @@ std::unique_ptr AzureObjectStorage::cloneObjectStorage(const std return std::make_unique( name, getAzureBlobContainerClient(config, config_prefix), - getAzureBlobStorageSettings(config, config_prefix, context) + getAzureBlobStorageSettings(config, config_prefix, context), + container ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index a05eb824b91..f16c35fb52c 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -23,12 +23,20 @@ struct AzureObjectStorageSettings uint64_t min_bytes_for_seek_, int max_single_read_retries_, int max_single_download_retries_, - int list_object_keys_size_) + int list_object_keys_size_, + size_t max_upload_part_size_, + size_t max_single_part_copy_size_, + bool use_native_copy_, + size_t max_unexpected_write_error_retries_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) , max_single_download_retries(max_single_download_retries_) , list_object_keys_size(list_object_keys_size_) + , max_upload_part_size(max_upload_part_size_) + , max_single_part_copy_size(max_single_part_copy_size_) + , use_native_copy(use_native_copy_) + , max_unexpected_write_error_retries (max_unexpected_write_error_retries_) { } @@ -39,6 +47,11 @@ struct AzureObjectStorageSettings size_t max_single_read_retries = 3; size_t max_single_download_retries = 3; int list_object_keys_size = 1000; + size_t min_upload_part_size = 16 * 1024 * 1024; + size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; + size_t max_single_part_copy_size = 256 * 1024 * 1024; + bool use_native_copy = false; + size_t max_unexpected_write_error_retries = 4; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; @@ -53,7 +66,8 @@ public: AzureObjectStorage( const String & name_, AzureClientPtr && client_, - SettingsPtr && settings_); + SettingsPtr && settings_, + const String & container_); void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; @@ -116,7 +130,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - String getObjectsNamespace() const override { return ""; } + String getObjectsNamespace() const override { return container ; } std::unique_ptr cloneObjectStorage( const std::string & new_namespace, @@ -128,11 +142,19 @@ public: bool isRemote() const override { return true; } + std::shared_ptr getSettings() { return settings.get(); } + + std::shared_ptr getAzureBlobStorageClient() override + { + return client.get(); + } + private: const String name; /// client used to access the files in the Blob Storage cloud MultiVersion client; MultiVersion settings; + const String container; LoggerPtr log; }; diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 7b231b68805..437baead7be 
100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -3,6 +3,7 @@ #include #include #include +#include "config.h" namespace Poco { @@ -120,6 +121,13 @@ public: static bool canUseReadThroughCache(const ReadSettings & settings); +#if USE_AZURE_BLOB_STORAGE + std::shared_ptr getAzureBlobStorageClient() override + { + return object_storage->getAzureBlobStorageClient(); + } +#endif + private: FileCacheKey getCacheKey(const std::string & path) const; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 9c4132f433f..2a648f28f14 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -389,6 +389,7 @@ void DiskObjectStorage::shutdown() { LOG_INFO(log, "Shutting down disk {}", name); object_storage->shutdown(); + metadata_storage->shutdown(); LOG_INFO(log, "Disk {} shut down", name); } @@ -531,7 +532,7 @@ std::unique_ptr DiskObjectStorage::readFile( const bool file_can_be_empty = !file_size.has_value() || *file_size == 0; if (storage_objects.empty() && file_can_be_empty) - return std::make_unique(path); + return std::make_unique(); return object_storage->readObjects( storage_objects, diff --git a/src/Disks/ObjectStorages/IMetadataStorage.h b/src/Disks/ObjectStorages/IMetadataStorage.h index f08b0d594bd..f95db2e1eee 100644 --- a/src/Disks/ObjectStorages/IMetadataStorage.h +++ b/src/Disks/ObjectStorages/IMetadataStorage.h @@ -210,6 +210,11 @@ public: throwNotImplemented(); } + virtual void shutdown() + { + /// This method is overridden for specific metadata implementations in ClickHouse Cloud. + } + virtual ~IMetadataStorage() = default; /// ==== More specific methods. Previous were almost general purpose. 
==== diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index e5a393d3a59..56c269a3fc5 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -22,12 +22,23 @@ #include #include #include -#include +#include +#include +#include "config.h" +#if USE_AZURE_BLOB_STORAGE +#include +#include +#endif namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + class ReadBufferFromFileBase; class WriteBufferFromFileBase; @@ -214,6 +225,14 @@ public: virtual WriteSettings patchSettings(const WriteSettings & write_settings) const; +#if USE_AZURE_BLOB_STORAGE + virtual std::shared_ptr getAzureBlobStorageClient() + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "This function is only implemented for AzureBlobStorage"); + } +#endif + + private: mutable std::mutex throttlers_mutex; ThrottlerPtr remote_read_throttler; diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index f20cd67a39f..b03809f5b39 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -98,6 +98,8 @@ DirectoryIteratorPtr MetadataStorageFromPlainObjectStorage::iterateDirectory(con { /// Required for MergeTree auto paths = listDirectory(path); + // Prepend path, since iterateDirectory() includes path, unlike listDirectory() + std::for_each(paths.begin(), paths.end(), [&](auto & child) { child = fs::path(path) / child; }); std::vector fs_paths(paths.begin(), paths.end()); return std::make_unique(std::move(fs_paths)); } @@ -121,6 +123,12 @@ void MetadataStorageFromPlainObjectStorageTransaction::unlinkFile(const std::str metadata_storage.object_storage->removeObject(object); } +void MetadataStorageFromPlainObjectStorageTransaction::removeDirectory(const std::string & path) +{ + for (auto it = metadata_storage.iterateDirectory(path); it->isValid(); it->next()) + metadata_storage.object_storage->removeObject(StoredObject(it->path())); +} + void MetadataStorageFromPlainObjectStorageTransaction::createDirectory(const std::string &) { /// Noop. It is an Object Storage not a filesystem. diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h index 1bb008c0c19..8a8466c3fbe 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h @@ -101,12 +101,13 @@ public: void createDirectoryRecursive(const std::string & path) override; void unlinkFile(const std::string & path) override; + void removeDirectory(const std::string & path) override; UnlinkMetadataFileOperationOutcomePtr unlinkMetadata(const std::string & path) override; void commit() override { - /// Nothing to commit. 
+ /// TODO: rewrite with transactions } bool supportsChmod() const override { return false; } diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index ec6f7081c85..b3626135177 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -102,7 +102,7 @@ void checkS3Capabilities( if (s3_capabilities.support_batch_delete && !checkBatchRemove(storage, key_with_trailing_slash)) { LOG_WARNING( - &Poco::Logger::get("S3ObjectStorage"), + getLogger("S3ObjectStorage"), "Storage for disk {} does not support batch delete operations, " "so `s3_capabilities.support_batch_delete` was automatically turned off during the access check. " "To remove this message set `s3_capabilities.support_batch_delete` for the disk to `false`.", @@ -213,10 +213,12 @@ void registerAzureObjectStorage(ObjectStorageFactory & factory) const ContextPtr & context, bool /* skip_access_check */) -> ObjectStoragePtr { + String container_name = config.getString(config_prefix + ".container_name", "default-container"); return std::make_unique( name, getAzureBlobContainerClient(config, config_prefix), - getAzureBlobStorageSettings(config, config_prefix, context)); + getAzureBlobStorageSettings(config, config_prefix, context), + container_name); }); } diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h index a6abe03bac9..5f63e5f6e8a 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 4cc49288af6..5771eb1ebe0 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp index 0223c24973e..786b23caf48 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp @@ -82,7 +82,7 @@ WebObjectStorage::loadFiles(const String & path, const std::unique_lock + +#include +#include +#include +#include +#include +#include + + +using namespace DB; +namespace fs = std::filesystem; + +class AsynchronousBoundedReadBufferTest : public ::testing::TestWithParam +{ +public: + AsynchronousBoundedReadBufferTest() { fs::create_directories(temp_folder.path()); } + + String makeTempFile(const String & contents) + { + String path = fmt::format("{}/{}", temp_folder.path(), counter); + ++counter; + + WriteBufferFromFile out{path}; + out.write(contents.data(), contents.size()); + out.finalize(); + + return path; + } + +private: + Poco::TemporaryFile temp_folder; + size_t counter = 0; +}; + +String getAlphabetWithDigits() +{ + String contents; + for (char c = 'a'; c <= 'z'; ++c) + contents += c; + for (char c = '0'; c <= '9'; ++c) + contents += c; + return contents; +} + + +TEST_F(AsynchronousBoundedReadBufferTest, setReadUntilPosition) +{ + String file_path = makeTempFile(getAlphabetWithDigits()); + ThreadPoolRemoteFSReader remote_fs_reader(4, 0); + + for (bool with_prefetch : {false, true}) + { + AsynchronousBoundedReadBuffer read_buffer(createReadBufferFromFileBase(file_path, {}), remote_fs_reader, {}); + 
read_buffer.setReadUntilPosition(20); + + auto try_read = [&](size_t count) + { + if (with_prefetch) + read_buffer.prefetch(Priority{0}); + + String str; + str.resize(count); + str.resize(read_buffer.read(str.data(), str.size())); + return str; + }; + + EXPECT_EQ(try_read(15), "abcdefghijklmno"); + EXPECT_EQ(try_read(15), "pqrst"); + EXPECT_EQ(try_read(15), ""); + + read_buffer.setReadUntilPosition(25); + + EXPECT_EQ(try_read(15), "uvwxy"); + EXPECT_EQ(try_read(15), ""); + + read_buffer.setReadUntilEnd(); + + EXPECT_EQ(try_read(15), "z0123456789"); + EXPECT_EQ(try_read(15), ""); + } +} diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index a7e9fb8e99f..16f8a341e03 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -109,31 +109,31 @@ bool deserializeFieldByEscapingRule( { case FormatSettings::EscapingRule::Escaped: if (parse_as_nullable) - read = SerializationNullable::deserializeTextEscapedImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(column, buf, format_settings, serialization); else serialization->deserializeTextEscaped(column, buf, format_settings); break; case FormatSettings::EscapingRule::Quoted: if (parse_as_nullable) - read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, buf, format_settings); break; case FormatSettings::EscapingRule::CSV: if (parse_as_nullable) - read = SerializationNullable::deserializeTextCSVImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(column, buf, format_settings, serialization); else serialization->deserializeTextCSV(column, buf, format_settings); break; case FormatSettings::EscapingRule::JSON: if (parse_as_nullable) - read = SerializationNullable::deserializeTextJSONImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, buf, format_settings, serialization); else serialization->deserializeTextJSON(column, buf, format_settings); break; case FormatSettings::EscapingRule::Raw: if (parse_as_nullable) - read = SerializationNullable::deserializeTextRawImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(column, buf, format_settings, serialization); else serialization->deserializeTextRaw(column, buf, format_settings); break; diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 62cbadec4f4..0654dd01e49 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -39,7 +39,7 @@ const FormatFactory::Creators & FormatFactory::getCreators(const String & name) throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } -FormatSettings getFormatSettings(ContextPtr context) +FormatSettings getFormatSettings(const ContextPtr & context) { const auto & settings = context->getSettingsRef(); @@ -47,7 +47,7 @@ FormatSettings getFormatSettings(ContextPtr context) } template -FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) +FormatSettings getFormatSettings(const ContextPtr & context, const Settings & settings) { FormatSettings format_settings; @@ -166,6 +166,8 @@ FormatSettings 
getFormatSettings(ContextPtr context, const Settings & settings) format_settings.template_settings.resultset_format = settings.format_template_resultset; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.template_settings.row_format = settings.format_template_row; + format_settings.template_settings.row_format_template = settings.format_template_row_format; + format_settings.template_settings.resultset_format_template = settings.format_template_resultset_format; format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; @@ -179,6 +181,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.values.allow_data_after_semicolon = settings.input_format_values_allow_data_after_semicolon; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; + format_settings.values.escape_quote_with_quote = settings.output_format_values_escape_quote_with_quote; format_settings.with_names_use_header = settings.input_format_with_names_use_header; format_settings.with_types_use_header = settings.input_format_with_types_use_header; format_settings.write_statistics = settings.output_format_write_statistics; @@ -226,6 +229,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.try_infer_integers = settings.input_format_try_infer_integers; format_settings.try_infer_dates = settings.input_format_try_infer_dates; format_settings.try_infer_datetimes = settings.input_format_try_infer_datetimes; + format_settings.try_infer_exponent_floats = settings.input_format_try_infer_exponent_floats; format_settings.markdown.escape_special_characters = settings.output_format_markdown_escape_special_characters; format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string; format_settings.bson.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_bson_skip_fields_with_unsupported_types_in_schema_inference; @@ -253,16 +257,16 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) return format_settings; } -template FormatSettings getFormatSettings(ContextPtr context, const FormatFactorySettings & settings); +template FormatSettings getFormatSettings(const ContextPtr & context, const FormatFactorySettings & settings); -template FormatSettings getFormatSettings(ContextPtr context, const Settings & settings); +template FormatSettings getFormatSettings(const ContextPtr & context, const Settings & settings); InputFormatPtr FormatFactory::getInput( const String & name, ReadBuffer & _buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const std::optional & _format_settings, std::optional _max_parsing_threads, @@ -425,7 +429,7 @@ std::unique_ptr FormatFactory::wrapReadBufferIfNeeded( return res; } -static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context) +static void addExistingProgressToOutputFormat(OutputFormatPtr format, const ContextPtr & context) { auto element_id = context->getProcessListElementSafe(); if (element_id) @@ -444,7 +448,7 @@ OutputFormatPtr 
FormatFactory::getOutputFormatParallelIfPossible( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -482,7 +486,7 @@ OutputFormatPtr FormatFactory::getOutputFormat( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -516,7 +520,7 @@ OutputFormatPtr FormatFactory::getOutputFormat( String FormatFactory::getContentType( const String & name, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -535,7 +539,7 @@ String FormatFactory::getContentType( SchemaReaderPtr FormatFactory::getSchemaReader( const String & name, ReadBuffer & buf, - ContextPtr & context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & schema_reader_creator = dict.at(name).schema_reader_creator; @@ -551,7 +555,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader( ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( const String & name, - ContextPtr & context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator; @@ -605,7 +609,7 @@ void FormatFactory::markFormatHasNoAppendSupport(const String & name) registerAppendSupportChecker(name, [](const FormatSettings &){ return false; }); } -bool FormatFactory::checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional & format_settings_) +bool FormatFactory::checkIfFormatSupportAppend(const String & name, const ContextPtr & context, const std::optional & format_settings_) { auto format_settings = format_settings_ ? 
*format_settings_ : getFormatSettings(context); auto & append_support_checker = dict[name].append_support_checker; @@ -628,10 +632,10 @@ void FormatFactory::registerFileExtension(const String & extension, const String file_extension_formats[boost::to_lower_copy(extension)] = format_name; } -String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_found) +std::optional FormatFactory::tryGetFormatFromFileName(String file_name) { if (file_name == "stdin") - return getFormatFromFileDescriptor(STDIN_FILENO); + return tryGetFormatFromFileDescriptor(STDIN_FILENO); CompressionMethod compression_method = chooseCompressionMethod(file_name, ""); if (CompressionMethod::None != compression_method) @@ -643,43 +647,53 @@ String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_ auto pos = file_name.find_last_of('.'); if (pos == String::npos) - { - if (throw_if_not_found) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the file format by it's extension"); - return ""; - } + return std::nullopt; String file_extension = file_name.substr(pos + 1, String::npos); boost::algorithm::to_lower(file_extension); auto it = file_extension_formats.find(file_extension); if (it == file_extension_formats.end()) - { - if (throw_if_not_found) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the file format by it's extension"); - return ""; - } + return std::nullopt; + return it->second; } -String FormatFactory::getFormatFromFileDescriptor(int fd) +String FormatFactory::getFormatFromFileName(String file_name) +{ + if (auto format = tryGetFormatFromFileName(file_name)) + return *format; + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the format of the file {} by its extension", file_name); +} + +std::optional FormatFactory::tryGetFormatFromFileDescriptor(int fd) { #ifdef OS_LINUX std::string proc_path = fmt::format("/proc/self/fd/{}", fd); char file_path[PATH_MAX] = {'\0'}; if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1) - return getFormatFromFileName(file_path, false); - return ""; + return tryGetFormatFromFileName(file_path); + return std::nullopt; #elif defined(OS_DARWIN) char file_path[PATH_MAX] = {'\0'}; if (fcntl(fd, F_GETPATH, file_path) != -1) - return getFormatFromFileName(file_path, false); - return ""; + return tryGetFormatFromFileName(file_path); + return std::nullopt; #else (void)fd; - return ""; + return std::nullopt; #endif } +String FormatFactory::getFormatFromFileDescriptor(int fd) +{ + if (auto format = tryGetFormatFromFileDescriptor(fd)) + return *format; + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the format of the data by the file descriptor {}", fd); +} + + void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine) { auto & target = dict[name].file_segmentation_engine_creator; @@ -765,7 +779,7 @@ void FormatFactory::registerAdditionalInfoForSchemaCacheGetter( target = std::move(additional_info_for_schema_cache_getter); } -String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional & format_settings_) +String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, const ContextPtr & context, const std::optional & format_settings_) { const auto & additional_info_getter = getCreators(name).additional_info_for_schema_cache_getter; if (!additional_info_getter) @@ -810,7 +824,7 @@ bool FormatFactory::checkIfOutputFormatPrefersLargeBlocks(const
String & name) c return target.prefers_large_blocks; } -bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const +bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const { if (name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order) return false; @@ -825,6 +839,18 @@ void FormatFactory::checkFormatName(const String & name) const throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } +std::vector FormatFactory::getAllInputFormats() const +{ + std::vector input_formats; + for (const auto & [format_name, creators] : dict) + { + if (creators.input_creator || creators.random_access_input_creator) + input_formats.push_back(format_name); + } + + return input_formats; +} + FormatFactory & FormatFactory::instance() { static FormatFactory ret; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 9670c690456..165a20f7c4d 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -48,10 +48,10 @@ using RowOutputFormatPtr = std::shared_ptr; template struct Memory; -FormatSettings getFormatSettings(ContextPtr context); +FormatSettings getFormatSettings(const ContextPtr & context); template -FormatSettings getFormatSettings(ContextPtr context, const T & settings); +FormatSettings getFormatSettings(const ContextPtr & context, const T & settings); /** Allows to create an IInputFormat or IOutputFormat by the name of the format. * Note: format and compression are independent things. @@ -161,7 +161,7 @@ public: const String & name, ReadBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const std::optional & format_settings = std::nullopt, std::optional max_parsing_threads = std::nullopt, @@ -178,30 +178,30 @@ public: const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; OutputFormatPtr getOutputFormat( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings = std::nullopt) const; String getContentType( const String & name, - ContextPtr context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; SchemaReaderPtr getSchemaReader( const String & name, ReadBuffer & buf, - ContextPtr & context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; ExternalSchemaReaderPtr getExternalSchemaReader( const String & name, - ContextPtr & context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine); @@ -216,7 +216,7 @@ public: /// registerAppendSupportChecker with append_support_checker that always returns true. void markFormatHasNoAppendSupport(const String & name); - bool checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional & format_settings_ = std::nullopt); + bool checkIfFormatSupportAppend(const String & name, const ContextPtr & context, const std::optional & format_settings_ = std::nullopt); /// Register format by its name. 
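The FormatFactory changes above split extension-based detection into a throwing getFormatFromFileName() and a tryGetFormatFromFileName() that returns std::nullopt when the extension is unknown. A self-contained sketch of that calling convention with stand-in implementations (the extension table and messages below are illustrative, not FormatFactory's real registry):

#include <iostream>
#include <map>
#include <optional>
#include <stdexcept>
#include <string>

/// Stand-in for the non-throwing variant: extension lookup only, nullopt when unknown.
std::optional<std::string> tryGetFormatFromFileName(const std::string & file_name)
{
    static const std::map<std::string, std::string> by_extension = {{"csv", "CSV"}, {"parquet", "Parquet"}};
    auto pos = file_name.find_last_of('.');
    if (pos == std::string::npos)
        return std::nullopt;
    auto it = by_extension.find(file_name.substr(pos + 1));
    if (it == by_extension.end())
        return std::nullopt;
    return it->second;
}

/// Throwing counterpart, mirroring the shape of getFormatFromFileName().
std::string getFormatFromFileName(const std::string & file_name)
{
    if (auto format = tryGetFormatFromFileName(file_name))
        return *format;
    throw std::invalid_argument("Cannot determine the format of the file " + file_name + " by its extension");
}

int main()
{
    std::cout << getFormatFromFileName("data.csv") << '\n';                 /// CSV
    std::cout << tryGetFormatFromFileName("data.bin").has_value() << '\n';  /// 0
}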
void registerInputFormat(const String & name, InputCreator input_creator); @@ -225,8 +225,10 @@ public: /// Register file extension for format void registerFileExtension(const String & extension, const String & format_name); - String getFormatFromFileName(String file_name, bool throw_if_not_found = false); + String getFormatFromFileName(String file_name); + std::optional tryGetFormatFromFileName(String file_name); String getFormatFromFileDescriptor(int fd); + std::optional tryGetFormatFromFileDescriptor(int fd); /// Register schema readers for format its name. void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator); @@ -244,16 +246,18 @@ public: bool checkIfFormatHasAnySchemaReader(const String & name) const; bool checkIfOutputFormatPrefersLargeBlocks(const String & name) const; - bool checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const; + bool checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const; void registerAdditionalInfoForSchemaCacheGetter(const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter); - String getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional & format_settings_ = std::nullopt); + String getAdditionalInfoForSchemaCache(const String & name, const ContextPtr & context, const std::optional & format_settings_ = std::nullopt); const FormatsDictionary & getAllFormats() const { return dict; } + std::vector getAllInputFormats() const; + bool isInputFormat(const String & name) const; bool isOutputFormat(const String & name) const; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 30e4dd04513..aa37216d381 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -46,6 +46,7 @@ struct FormatSettings bool try_infer_integers = false; bool try_infer_dates = false; bool try_infer_datetimes = false; + bool try_infer_exponent_floats = false; enum class DateTimeInputFormat { @@ -338,6 +339,8 @@ struct FormatSettings String resultset_format; String row_format; String row_between_delimiter; + String row_format_template; + String resultset_format_template; } template_settings; struct @@ -359,6 +362,7 @@ struct FormatSettings bool deduce_templates_of_expressions = true; bool accurate_types_of_literals = true; bool allow_data_after_semicolon = false; + bool escape_quote_with_quote = false; } values; enum class ORCCompression diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 779f38032d8..7d494c1e96f 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -282,14 +282,14 @@ namespace JSONUtils ReadBufferFromString buf(str); if (as_nullable) - return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedWholeText(column, buf, format_settings, serialization); serialization->deserializeWholeText(column, buf, format_settings); return true; } if (as_nullable) - return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, in, format_settings, serialization); serialization->deserializeTextJSON(column, in, format_settings); return true; diff --git a/src/Formats/NativeWriter.cpp b/src/Formats/NativeWriter.cpp index 70d5b7914a7..b150561a5fc 100644 --- a/src/Formats/NativeWriter.cpp +++ b/src/Formats/NativeWriter.cpp @@ 
-49,8 +49,9 @@ static void writeData(const ISerialization & serialization, const ColumnPtr & co { /** If there are columns-constants - then we materialize them. * (Since the data type does not know how to serialize / deserialize constants.) + * The same for compressed columns in-memory. */ - ColumnPtr full_column = column->convertToFullColumnIfConst(); + ColumnPtr full_column = column->convertToFullColumnIfConst()->decompress(); ISerialization::SerializeBinaryBulkSettings settings; settings.getter = [&ostr](ISerialization::SubstreamPath) -> WriteBuffer * { return &ostr; }; diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 43931be3449..5badf4301bf 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -14,7 +15,9 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int ONLY_NULLS_WHILE_READING_SCHEMA; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; } static std::optional getOrderedColumnsList(const NamesAndTypesList & columns_list, const Names & columns_order_hint) @@ -43,50 +46,87 @@ bool isRetryableSchemaInferenceError(int code) return code == ErrorCodes::EMPTY_DATA_PASSED || code == ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA; } -ColumnsDescription readSchemaFromFormat( - const String & format_name, +/// Order of formats to try in automatic format detection. +/// If we can successfully detect some format, we won't try next ones. +static const std::vector & getFormatsOrderForDetection() +{ + static const std::vector formats_order = + { + "Parquet", + "ORC", + "Arrow", + "ArrowStream", + "Avro", + "AvroConfluent", + "Npy", + "Native", + "BSONEachRow", + "JSONCompact", + "Values", + "TSKV", + "JSONObjectEachRow", + "JSONColumns", + "JSONCompactColumns", + "JSONCompact", + "JSON", + }; + + return formats_order; +} + +/// The set of similar formats to try in automatic format detection. +/// We will try all formats from this set and then choose the best one +/// according to inferred schema. +static const std::vector & getSimilarFormatsSetForDetection() +{ + static const std::vector formats_order = + { + "TSV", + "CSV", + }; + + return formats_order; +} + +std::pair readSchemaFromFormatImpl( + std::optional format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context, - std::unique_ptr & buf) + const ContextPtr & context) try { NamesAndTypesList names_and_types; SchemaInferenceMode mode = context->getSettingsRef().schema_inference_mode; - if (mode == SchemaInferenceMode::UNION && !FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings)) + if (format_name && mode == SchemaInferenceMode::UNION && !FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(*format_name, context, format_settings)) { String additional_message; /// Better exception message for WithNames(AndTypes) formats. 
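The two static lists introduced above (getFormatsOrderForDetection and getSimilarFormatsSetForDetection) are consumed as a simple "first success wins" scan. A minimal stand-alone sketch of that idea, with hypothetical names and a callback standing in for the real schema readers:

#include <functional>
#include <optional>
#include <string>
#include <vector>

/// Hypothetical callback: returns true if schema inference succeeded for the given format.
using TryInferSchema = std::function<bool(const std::string & format)>;

/// Walk a fixed priority list and stop at the first format whose schema reader succeeds.
std::optional<std::string> detectByPriority(
    const std::vector<std::string> & ordered_candidates,
    const TryInferSchema & try_infer)
{
    for (const auto & format : ordered_candidates)
        if (try_infer(format))
            return format;
    return std::nullopt; /// nothing matched; the caller falls back to the "similar formats" set
}

Formats with self-describing metadata (Parquet, ORC, Avro and friends) come first because a successful parse there is a strong signal, while text formats such as TSV and CSV are deliberately left to the second, tie-breaking pass.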
- if (format_name.ends_with("WithNames") || format_name.ends_with("WithNamesAndTypes")) + if (format_name->ends_with("WithNames") || format_name->ends_with("WithNamesAndTypes")) additional_message = " (formats -WithNames(AndTypes) support reading subset of columns only when setting input_format_with_names_use_header is enabled)"; - throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading subset of columns{}", format_name, additional_message); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading subset of columns{}", *format_name, additional_message); } - if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) + if (format_name && FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format_name)) { - auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings); + auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(*format_name, context, format_settings); try { - names_and_types = external_schema_reader->readSchema(); + return {ColumnsDescription(external_schema_reader->readSchema()), *format_name}; } catch (Exception & e) { e.addMessage( - fmt::format("Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + fmt::format("The table structure cannot be extracted from a {} format file. You can specify the structure manually", *format_name)); throw; } } - else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name)) - { - if (mode == SchemaInferenceMode::UNION) - retry = false; + if (!format_name || FormatFactory::instance().checkIfFormatHasSchemaReader(*format_name)) + { + IReadBufferIterator::Data iterator_data; std::vector> schemas_for_union_mode; - std::optional cached_columns; std::string exception_messages; - SchemaReaderPtr schema_reader; size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference; size_t max_bytes_to_read = format_settings ? format_settings->max_bytes_to_read_for_schema_inference @@ -94,45 +134,71 @@ try size_t iterations = 0; while (true) { + /// When we finish working with current buffer we should put it back to iterator. + SCOPE_EXIT(if (iterator_data.buf) read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf))); bool is_eof = false; try { - read_buffer_iterator.setPreviousReadBuffer(std::move(buf)); - std::tie(buf, cached_columns) = read_buffer_iterator.next(); - if (cached_columns) + iterator_data = read_buffer_iterator.next(); + + /// Read buffer iterator can determine the data format if it's unknown. + /// For example by scanning schema cache or by finding new file with format extension. + if (!format_name && iterator_data.format_name) { + format_name = *iterator_data.format_name; + read_buffer_iterator.setFormatName(*iterator_data.format_name); + } + + if (iterator_data.cached_columns) + { + /// If we have schema in cache, we must also know the format. 
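As the comment above says, a schema taken from the cache is only usable together with a known format. A simplified, stand-alone sketch of that invariant (hypothetical IterationResult type; the real iterator returns a Data struct with buf, cached_columns and format_name):

#include <istream>
#include <memory>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

/// Simplified stand-in for what one iteration can yield: either a buffer to read,
/// or a schema taken from a cache (in which case the format must already be known).
struct IterationResult
{
    std::unique_ptr<std::istream> buf;                       /// data of the next file, if any
    std::optional<std::vector<std::string>> cached_columns;  /// schema found in a cache
    std::optional<std::string> format_name;                  /// format, if the iterator could detect it
};

/// `format` is the caller's current best guess (empty while unknown).
/// Prefer the cached schema, but only when the format is known as well.
bool useCachedSchema(const IterationResult & r, std::string & format, std::vector<std::string> & columns)
{
    if (!r.cached_columns)
        return false;
    if (r.format_name)
        format = *r.format_name;
    if (format.empty())
        throw std::runtime_error("schema came from cache but the format is still unknown");
    columns = *r.cached_columns;
    return true;
}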
+ if (!format_name) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Schema from cache was returned, but format name is unknown"); + if (mode == SchemaInferenceMode::DEFAULT) - return *cached_columns; - schemas_for_union_mode.emplace_back(cached_columns->getAll(), read_buffer_iterator.getLastFileName()); + { + read_buffer_iterator.setResultingSchema(*iterator_data.cached_columns); + return {*iterator_data.cached_columns, *format_name}; + } + + schemas_for_union_mode.emplace_back(iterator_data.cached_columns->getAll(), read_buffer_iterator.getLastFileName()); continue; } - if (!buf) + if (!iterator_data.buf) break; /// We just want to check for eof, but eof() can be pretty expensive. /// So we use getFileSize() when available, which has better worst case. /// (For remote files, typically eof() would read 1 MB from S3, which may be much /// more than what the schema reader and even data reader will read). - auto size = tryGetFileSizeFromReadBuffer(*buf); + auto size = tryGetFileSizeFromReadBuffer(*iterator_data.buf); if (size.has_value()) is_eof = *size == 0; else - is_eof = buf->eof(); + is_eof = iterator_data.buf->eof(); } catch (Exception & e) { - e.addMessage( - fmt::format("Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + if (format_name) + e.addMessage(fmt::format("The table structure cannot be extracted from a {} format file. You can specify the structure manually", *format_name)); + else + e.addMessage("The data format cannot be detected by the contents of the files. You can specify the format manually"); throw; } catch (...) { auto exception_message = getCurrentExceptionMessage(false); + if (format_name) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file:\n{}.\nYou can specify the structure manually", + *format_name, + exception_message); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file:\n{}\nYou can specify the structure manually", - format_name, + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files:\n{}.\nYou can specify the format manually", exception_message); } @@ -140,91 +206,224 @@ try if (is_eof) { - auto exception_message = fmt::format("Cannot extract table structure from {} format file, file is empty", format_name); + String exception_message; + if (format_name) + exception_message = fmt::format("The table structure cannot be extracted from a {} format file: the file is empty", *format_name); + else + exception_message = fmt::format("The data format cannot be detected by the contents of the files: the file is empty"); - if (!retry) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "{}. You can specify the structure manually", exception_message); + if (mode == SchemaInferenceMode::UNION) + { + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files: the file is empty. You can specify the format manually"); - exception_messages += "\n" + exception_message; + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "{}. 
You can specify the structure manually", exception_message); + } + + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; continue; } - try - { - schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf, context, format_settings); - schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); - names_and_types = schema_reader->readSchema(); - auto num_rows = schema_reader->readNumberOrRows(); - if (num_rows) - read_buffer_iterator.setNumRowsToLastFile(*num_rows); + std::unique_ptr peekable_buf; /// Can be used in format detection. Should be destroyed after schema reader. - /// In default mode, we finish when schema is inferred successfully from any file. - if (mode == SchemaInferenceMode::DEFAULT) - break; - - if (!names_and_types.empty()) - read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); - schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); - } - catch (...) + if (format_name) { - auto exception_message = getCurrentExceptionMessage(false); - if (schema_reader && mode == SchemaInferenceMode::DEFAULT) + SchemaReaderPtr schema_reader; + + try { - size_t rows_read = schema_reader->getNumRowsRead(); - assert(rows_read <= max_rows_to_read); - max_rows_to_read -= schema_reader->getNumRowsRead(); - size_t bytes_read = buf->count(); - /// We could exceed max_bytes_to_read a bit to complete row parsing. - max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read); - if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0)) - { - exception_message += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting " - "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference"; + schema_reader = FormatFactory::instance().getSchemaReader(*format_name, *iterator_data.buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + names_and_types = schema_reader->readSchema(); + auto num_rows = schema_reader->readNumberOrRows(); + if (num_rows) + read_buffer_iterator.setNumRowsToLastFile(*num_rows); - if (iterations > 1) + /// In default mode, we finish when schema is inferred successfully from any file. + if (mode == SchemaInferenceMode::DEFAULT) + break; + + if (!names_and_types.empty()) + read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); + schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); + } + catch (...) + { + auto exception_message = getCurrentExceptionMessage(false); + if (schema_reader && mode == SchemaInferenceMode::DEFAULT) + { + size_t rows_read = schema_reader->getNumRowsRead(); + assert(rows_read <= max_rows_to_read); + max_rows_to_read -= schema_reader->getNumRowsRead(); + size_t bytes_read = iterator_data.buf->count(); + /// We could exceed max_bytes_to_read a bit to complete row parsing. 
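The surrounding bookkeeping keeps one rows/bytes budget for the whole inference run, charges every failed attempt against it, and once the budget is spent the error points at the two relevant settings. A compact stand-alone sketch of that accounting (simplified, not the real code):

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <string>

/// Shared budget for schema inference across all candidate files.
struct InferenceBudget
{
    size_t max_rows;
    size_t max_bytes;

    /// Charge what one failed attempt consumed; report a hint once the budget is gone.
    void charge(size_t rows_read, size_t bytes_read)
    {
        max_rows -= std::min(rows_read, max_rows);
        /// Row parsing may overshoot the byte limit slightly, so clamp instead of asserting.
        max_bytes -= std::min(bytes_read, max_bytes);
        if (rows_read != 0 && (max_rows == 0 || max_bytes == 0))
            throw std::runtime_error(
                "Schema inference budget exhausted. Increase "
                "input_format_max_rows_to_read_for_schema_inference / "
                "input_format_max_bytes_to_read_for_schema_inference");
    }
};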
+ max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read); + if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0)) { - exception_messages += "\n" + exception_message; + exception_message + += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting " + "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference"; + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; break; } - retry = false; } - } - if (!retry || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) - { - try - { - throw; - } - catch (Exception & e) - { - e.addMessage(fmt::format( - "Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); - throw; - } - catch (...) + if (mode == SchemaInferenceMode::UNION || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) { throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file. " - "Error: {}. You can specify the structure manually", - format_name, + "The table structure cannot be extracted from a {} format file. " + "Error:\n{}.\nYou can specify the structure manually", + *format_name, exception_message); } + + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; + } + } + else + { + /// If the format is unknown we try some formats in order and try to apply their schema readers. + /// If we can successfully infer the schema in some format, most likely we can use this format to read this data. + + /// If read_buffer_iterator supports recreation of last buffer, we will recreate it for + /// each format. Otherwise we will use PeekableReadBuffer and will rollback to the + /// beginning of the file before each format. Using PeekableReadBuffer can lead + /// to high memory usage as it will save all the read data from the beginning of the file, + /// especially it will be noticeable for formats like Parquet/ORC/Arrow that do seeks to the + /// end of file. + bool support_buf_recreation = read_buffer_iterator.supportsLastReadBufferRecreation(); + if (!support_buf_recreation) + { + peekable_buf = std::make_unique(*iterator_data.buf); + peekable_buf->setCheckpoint(); + } + + /// First, try some formats in order. If we successfully inferred the schema for any format, + /// we will use this format. + for (const auto & format_to_detect : getFormatsOrderForDetection()) + { + try + { + SchemaReaderPtr schema_reader = FormatFactory::instance().getSchemaReader(format_to_detect, support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + names_and_types = schema_reader->readSchema(); + if (names_and_types.empty()) + continue; + + /// We successfully inferred schema from this file using current format. + format_name = format_to_detect; + read_buffer_iterator.setFormatName(format_to_detect); + + auto num_rows = schema_reader->readNumberOrRows(); + if (num_rows) + read_buffer_iterator.setNumRowsToLastFile(*num_rows); + + break; + } + catch (...) + { + /// We failed to infer the schema for this format. + /// Recreate read buffer or rollback to the beginning of the data + /// before trying next format. 
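The rollback path mentioned above can be pictured as a reader that keeps every byte it has already handed out and can rewind before the next format is tried; that retained copy is exactly the memory cost called out for formats that seek towards the end of the file. A stand-alone sketch (hypothetical class, not PeekableReadBuffer):

#include <cstddef>
#include <istream>
#include <string>

/// Minimal "checkpoint at the start, rollback before each retry" reader.
class ReplayableReader
{
public:
    explicit ReplayableReader(std::istream & in_) : in(in_) {}

    /// Read one character, remembering it so it can be replayed after a rollback.
    bool read(char & c)
    {
        if (pos < consumed.size())
        {
            c = consumed[pos++];
            return true;
        }
        if (!in.get(c))
            return false;
        consumed.push_back(c);
        ++pos;
        return true;
    }

    void rollbackToStart() { pos = 0; }

private:
    std::istream & in;
    std::string consumed; /// everything read so far, kept for replay
    size_t pos = 0;
};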
+ if (support_buf_recreation) + { + read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); + iterator_data.buf = read_buffer_iterator.recreateLastReadBuffer(); + } + else + { + peekable_buf->rollbackToCheckpoint(); + } + } } - exception_messages += "\n" + exception_message; + /// If no format was detected from first set of formats, we try second set. + /// In this set formats are similar and it can happen that data matches some of them. + /// We try to infer schema for all of the formats from this set and then choose the best + /// one according to the inferred schema. + if (!format_name) + { + std::unordered_map format_to_schema; + const auto & formats_set_to_detect = getSimilarFormatsSetForDetection(); + for (size_t i = 0; i != formats_set_to_detect.size(); ++i) + { + try + { + SchemaReaderPtr schema_reader = FormatFactory::instance().getSchemaReader( + formats_set_to_detect[i], support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + auto tmp_names_and_types = schema_reader->readSchema(); + /// If schema was inferred successfully for this format, remember it and try next format. + if (!tmp_names_and_types.empty()) + format_to_schema[formats_set_to_detect[i]] = tmp_names_and_types; + } + catch (...) // NOLINT(bugprone-empty-catch) + { + /// Try next format. + } + + if (i != formats_set_to_detect.size() - 1) + { + if (support_buf_recreation) + { + read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); + iterator_data.buf = read_buffer_iterator.recreateLastReadBuffer(); + } + else + { + peekable_buf->rollbackToCheckpoint(); + } + } + } + + /// We choose the format with larger number of columns in inferred schema. + size_t max_number_of_columns = 0; + for (const auto & [format_to_detect, schema] : format_to_schema) + { + if (schema.size() > max_number_of_columns) + { + names_and_types = schema; + format_name = format_to_detect; + max_number_of_columns = schema.size(); + } + } + + if (format_name) + read_buffer_iterator.setFormatName(*format_name); + } + + if (mode == SchemaInferenceMode::UNION) + { + /// For UNION mode we need to know the schema of each file, + /// if we failed to detect the format, we failed to detect the schema of this file + /// in any format. It doesn't make sense to continue. + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files. You can specify the format manually"); + + read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); + schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); + } + + if (format_name && mode == SchemaInferenceMode::DEFAULT) + break; } } - /// If we got all schemas from cache, schema_reader can be uninitialized. - /// But we still need some stateless methods of ISchemaReader, - /// let's initialize it with empty buffer. + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files. You can specify the format manually"); + + /// We need some stateless methods of ISchemaReader, but during reading schema we + /// could not even create a schema reader (for example when we got schema from cache). + /// Let's create stateless schema reader from empty read buffer. 
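The tie-break above prefers the candidate whose inferred schema has the most columns: a CSV file force-read as TSV typically collapses into a single column, so the richer schema is usually the better guess. A minimal stand-alone version of that choice:

#include <cstddef>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

using Schema = std::vector<std::string>; /// column names only, for illustration

/// Among formats that all "parsed", prefer the one that recognised the most columns.
std::optional<std::string> pickRicherSchema(const std::unordered_map<std::string, Schema> & format_to_schema)
{
    std::optional<std::string> best;
    size_t best_columns = 0;
    for (const auto & [format, schema] : format_to_schema)
    {
        if (schema.size() > best_columns)
        {
            best = format;
            best_columns = schema.size();
        }
    }
    return best;
}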
EmptyReadBuffer empty; - if (!schema_reader) - schema_reader = FormatFactory::instance().getSchemaReader(format_name, empty, context, format_settings); + SchemaReaderPtr stateless_schema_reader = FormatFactory::instance().getSchemaReader(*format_name, empty, context, format_settings); if (mode == SchemaInferenceMode::UNION) { @@ -251,7 +450,7 @@ try /// If types are not the same, try to transform them according /// to the format to find common type. auto new_type_copy = type; - schema_reader->transformTypesFromDifferentFilesIfNeeded(it->second, new_type_copy); + stateless_schema_reader->transformTypesFromDifferentFilesIfNeeded(it->second, new_type_copy); /// If types are not the same after transform, we cannot do anything, throw an exception. if (!it->second->equals(*new_type_copy)) @@ -273,11 +472,23 @@ try } if (names_and_types.empty()) + { + if (iterations <= 1) + { + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file. " + "Error:\n{}.\nYou can specify the structure manually", + *format_name, + exception_messages); + } + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "All attempts to extract table structure from files failed. " - "Errors:{}\nYou can specify the structure manually", + "Errors:\n{}\nYou can specify the structure manually", exception_messages); + } /// If we have "INSERT SELECT" query then try to order /// columns as they are ordered in table schema for formats @@ -285,7 +496,7 @@ try /// It will allow to execute simple data loading with query /// "INSERT INTO table SELECT * FROM ..." const auto & insertion_table = context->getInsertionTable(); - if (schema_reader && !schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) + if (!stateless_schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) { auto storage = DatabaseCatalog::instance().getTable(insertion_table, context); auto metadata = storage->getInMemoryMetadataPtr(); @@ -294,22 +505,22 @@ try if (ordered_list) names_and_types = *ordered_list; } + + /// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure. + names_and_types.erase( + std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }), + names_and_types.end()); + + auto columns = ColumnsDescription(names_and_types); + if (mode == SchemaInferenceMode::DEFAULT) + read_buffer_iterator.setResultingSchema(columns); + return {columns, *format_name}; } - else - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "{} file format doesn't support schema inference. You must specify the structure manually", - format_name); - /// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure. - names_and_types.erase( - std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }), - names_and_types.end()); - - auto columns = ColumnsDescription(names_and_types); - if (mode == SchemaInferenceMode::DEFAULT) - read_buffer_iterator.setResultingSchema(columns); - return columns; + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "{} file format doesn't support schema inference. 
You must specify the structure manually", + *format_name); } catch (Exception & e) { @@ -319,16 +530,21 @@ catch (Exception & e) throw; } - ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context) + const ContextPtr & context) { - std::unique_ptr buf_out; - return readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, retry, context, buf_out); + return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, context).first; +} + +std::pair detectFormatAndReadSchema( + const std::optional & format_settings, + IReadBufferIterator & read_buffer_iterator, + const ContextPtr & context) +{ + return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, context); } SchemaCache::Key getKeyForSchemaCache( diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h index 6aa8f3f9c4c..bb5e068f696 100644 --- a/src/Formats/ReadSchemaUtils.h +++ b/src/Formats/ReadSchemaUtils.h @@ -7,29 +7,68 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + struct IReadBufferIterator { virtual ~IReadBufferIterator() = default; - virtual void setPreviousReadBuffer(std::unique_ptr /* buffer */) {} - /// Return read buffer of the next file or cached schema. /// In DEFAULT schema inference mode cached schema can be from any file. /// In UNION mode cached schema can be only from current file. /// When there is no files to process, return pair (nullptr, nullopt) - virtual std::pair, std::optional> next() = 0; + struct Data + { + /// Read buffer of the next file. Can be nullptr if there are no more files + /// or when schema was found in cache. + std::unique_ptr buf; + + /// Schema from cache. + /// In DEFAULT schema inference mode cached schema can be from any file. + /// In UNION mode cached schema can be only from current file. + std::optional cached_columns; + + /// Format of the file if known. + std::optional format_name; + }; + + virtual Data next() = 0; + + /// Set read buffer returned in previous iteration. + virtual void setPreviousReadBuffer(std::unique_ptr /* buffer */) {} + + /// Set number of rows to last file extracted during schema inference. + /// Used for caching number of rows from files metadata during schema inference. virtual void setNumRowsToLastFile(size_t /*num_rows*/) {} /// Set schema inferred from last file. Used for UNION mode to cache schema /// per file. virtual void setSchemaToLastFile(const ColumnsDescription & /*columns*/) {} + /// Set resulting inferred schema. Used for DEFAULT mode to cache schema /// for all files. virtual void setResultingSchema(const ColumnsDescription & /*columns*/) {} + /// Set auto detected format name. + virtual void setFormatName(const String & /*format_name*/) {} + /// Get last processed file name for better exception messages. virtual String getLastFileName() const { return ""; } + + /// Return true if method recreateLastReadBuffer is implemented. + virtual bool supportsLastReadBufferRecreation() const { return false; } + + /// Recreate last read buffer to read data from the same file again. + /// Used to detect format from the file content to avoid + /// copying data. 
+ virtual std::unique_ptr recreateLastReadBuffer() + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method recreateLastReadBuffer is not implemented"); + } }; struct SingleReadBufferIterator : public IReadBufferIterator @@ -39,12 +78,22 @@ public: { } - std::pair, std::optional> next() override + Data next() override { if (done) - return {nullptr, {}}; + return {nullptr, {}, std::nullopt}; done = true; - return {std::move(buf), {}}; + return Data{std::move(buf), {}, std::nullopt}; + } + + void setPreviousReadBuffer(std::unique_ptr buf_) override + { + buf = std::move(buf_); + } + + std::unique_ptr releaseBuffer() + { + return std::move(buf); } private: @@ -73,17 +122,16 @@ ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context); + const ContextPtr & context); -/// If ReadBuffer is created, it will be written to buf_out. -ColumnsDescription readSchemaFromFormat( - const String & format_name, +/// Try to detect the format of the data and it's schema. +/// It runs schema inference for some set of formats on the same file. +/// If schema reader of some format successfully inferred the schema from +/// some file, we consider that the data is in this format. +std::pair detectFormatAndReadSchema( const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context, - std::unique_ptr & buf_out); + const ContextPtr & context); SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional & format_settings, const ContextPtr & context); SchemaCache::Keys getKeysForSchemaCache(const Strings & sources, const String & format, const std::optional & format_settings, const ContextPtr & context); diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index f065d2f0f4d..06b52e7a7a2 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -865,6 +866,13 @@ namespace return std::make_shared(nested_types); } + bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings) + { + if (settings.try_infer_exponent_floats) + return tryReadFloatText(value, buf); + return tryReadFloatTextNoExponent(value, buf); + } + DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings) { if (buf.eof()) @@ -903,7 +911,7 @@ namespace buf.position() = number_start; } - if (tryReadFloatText(tmp_float, buf)) + if (tryReadFloat(tmp_float, buf, settings)) { if (read_int && buf.position() == int_end) return std::make_shared(); @@ -937,7 +945,7 @@ namespace peekable_buf.rollbackToCheckpoint(true); } - if (tryReadFloatText(tmp_float, peekable_buf)) + if (tryReadFloat(tmp_float, peekable_buf, settings)) { /// Float parsing reads no fewer bytes than integer parsing, /// so position of the buffer is either the same, or further. 
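The tryReadFloat helper added above just switches between two parsers based on a format setting. The observable effect can be shown with a stand-alone approximation (using strtod instead of the real readers; the function name is made up):

#include <cstdlib>
#include <string>

/// Accept "1.5" always, but accept "1.5e3" only when exponent notation is allowed.
bool looksLikeFloat(const std::string & field, bool allow_exponent)
{
    if (field.empty())
        return false;
    if (!allow_exponent && field.find_first_of("eE") != std::string::npos)
        return false; /// reject exponent notation outright
    char * end = nullptr;
    std::strtod(field.c_str(), &end);
    return end == field.c_str() + field.size(); /// the whole field must be consumed
}

With allow_exponent set to false, a field such as "1e10" no longer parses as a number, so inference falls back to String for it, which is the point of try_infer_exponent_floats.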
@@ -949,7 +957,7 @@ namespace return std::make_shared(); } } - else if (tryReadFloatText(tmp_float, buf)) + else if (tryReadFloat(tmp_float, buf, settings)) { return std::make_shared(); } @@ -966,7 +974,7 @@ namespace if constexpr (is_json) ok = tryReadJSONStringInto(field, buf); else - ok = tryReadQuotedStringInto(field, buf); + ok = tryReadQuotedString(field, buf); if (!ok) return nullptr; @@ -1390,7 +1398,7 @@ DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSetting buf.position() = buf.buffer().begin(); Float64 tmp; - if (tryReadFloatText(tmp, buf) && buf.eof()) + if (tryReadFloat(tmp, buf, settings) && buf.eof()) return std::make_shared(); return nullptr; diff --git a/src/Functions/EmptyImpl.h b/src/Functions/EmptyImpl.h index 52484524e6a..d3b2dda024b 100644 --- a/src/Functions/EmptyImpl.h +++ b/src/Functions/EmptyImpl.h @@ -35,7 +35,7 @@ struct EmptyImpl /// Only make sense if is_fixed_to_constant. static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: 'vectorFixedToConstant method' is called"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "'vectorFixedToConstant method' is called"); } static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray & res) diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index 1b2519d1ec5..d253095ca01 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -147,11 +147,32 @@ private: /// it's not correct for Decimal public: static constexpr bool allow_decimal = IsOperation::allow_decimal; + using DecimalResultDataType = Switch< + Case, + Case && IsDataTypeDecimal && UseLeftDecimal, LeftDataType>, + Case && IsDataTypeDecimal, RightDataType>, + Case && IsIntegralOrExtended, LeftDataType>, + Case && IsIntegralOrExtended, RightDataType>, + + /// e.g Decimal +-*/ Float, least(Decimal, Float), greatest(Decimal, Float) = Float64 + Case && IsFloatingPoint, DataTypeFloat64>, + Case && IsFloatingPoint, DataTypeFloat64>, + + Case::bit_hamming_distance && IsIntegral && IsIntegral, DataTypeUInt8>, + Case::bit_hamming_distance && IsFixedString && IsFixedString, DataTypeUInt16>, + Case::bit_hamming_distance && IsString && IsString, DataTypeUInt64>, + + /// Decimal Real is not supported (traditional DBs convert Decimal Real to Real) + Case && !IsIntegralOrExtendedOrDecimal, InvalidType>, + Case && !IsIntegralOrExtendedOrDecimal, InvalidType>>; + /// Appropriate result type for binary operator on numeric types. "Date" can also mean /// DateTime, but if both operands are Dates, their type must be the same (e.g. Date - DateTime is invalid). 
using ResultDataType = Switch< + /// Result must be Integer + Case::div_int || IsOperation::div_int_or_zero, DataTypeFromFieldType>, /// Decimal cases - Case || IsDataTypeDecimal), InvalidType>, + Case || IsDataTypeDecimal, DecimalResultDataType>, Case< IsDataTypeDecimal && IsDataTypeDecimal && UseLeftDecimal, LeftDataType>, @@ -622,7 +643,11 @@ private: if constexpr (op_case == OpCase::RightConstant) { if ((*right_nullmap)[0]) + { + for (size_t i = 0; i < size; ++i) + c[i] = ResultType(); return; + } for (size_t i = 0; i < size; ++i) c[i] = apply_func(undec(a[i]), undec(b)); @@ -1665,7 +1690,9 @@ public: if constexpr (!std::is_same_v) { - if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) + if constexpr (is_div_int || is_div_int_or_zero) + type_res = std::make_shared(); + else if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) { if constexpr (is_division) { @@ -1685,13 +1712,19 @@ public: ResultDataType result_type = decimalResultType(left, right); type_res = std::make_shared(result_type.getPrecision(), result_type.getScale()); } - else if constexpr ((IsDataTypeDecimal && IsFloatingPoint) || - (IsDataTypeDecimal && IsFloatingPoint)) + else if constexpr (((IsDataTypeDecimal && IsFloatingPoint) || + (IsDataTypeDecimal && IsFloatingPoint))) + { type_res = std::make_shared(); + } else if constexpr (IsDataTypeDecimal) + { type_res = std::make_shared(left.getPrecision(), left.getScale()); + } else if constexpr (IsDataTypeDecimal) + { type_res = std::make_shared(right.getPrecision(), right.getScale()); + } else if constexpr (std::is_same_v) { // Special case for DateTime: binary OPS should reuse timezone @@ -2000,6 +2033,7 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using ResultDataType = typename BinaryOperationTraits::ResultDataType; + using DecimalResultType = typename BinaryOperationTraits::DecimalResultDataType; if constexpr (std::is_same_v) return nullptr; @@ -2051,6 +2085,35 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A col_left_size, right_nullmap); } + /// Here we check if we have `intDiv` or `intDivOrZero` and at least one of the arguments is decimal, because in this case originally we had result as decimal, so we need to convert result into integer after calculations + else if constexpr (!decimal_with_float && (is_div_int || is_div_int_or_zero) && (IsDataTypeDecimal || IsDataTypeDecimal)) + { + + if constexpr (!std::is_same_v) + { + DataTypePtr type_res; + if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) + { + DecimalResultType result_type = decimalResultType(left, right); + type_res = std::make_shared(result_type.getPrecision(), result_type.getScale()); + } + else if constexpr (IsDataTypeDecimal) + type_res = std::make_shared(left.getPrecision(), left.getScale()); + else + type_res = std::make_shared(right.getPrecision(), right.getScale()); + + auto res = executeNumericWithDecimal( + left, right, + col_left_const, col_right_const, + col_left, col_right, + col_left_size, + right_nullmap); + + auto col = ColumnWithTypeAndName(res, type_res, name); + return castColumn(col, std::make_shared()); + } + return nullptr; + } else // can't avoid else and another indentation level, otherwise the compiler would try to instantiate // ColVecResult for Decimals which would lead to a compile error. 
{ diff --git a/src/Functions/FunctionsComparison.h b/src/Functions/FunctionsComparison.h index 446d6c008f0..3be675f39b3 100644 --- a/src/Functions/FunctionsComparison.h +++ b/src/Functions/FunctionsComparison.h @@ -643,13 +643,12 @@ class FunctionComparison : public IFunction { public: static constexpr auto name = Name::name; - static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + static FunctionPtr create(ContextPtr context) { return std::make_shared(decimalCheckComparisonOverflow(context)); } - explicit FunctionComparison(ContextPtr context_) - : context(context_), check_decimal_overflow(decimalCheckComparisonOverflow(context)) {} + explicit FunctionComparison(bool check_decimal_overflow_) + : check_decimal_overflow(check_decimal_overflow_) {} private: - ContextPtr context; bool check_decimal_overflow = true; template @@ -812,7 +811,7 @@ private: c0_const_size = c0_const_fixed_string->getN(); } else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Logical error: ColumnConst contains not String nor FixedString column"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnConst contains not String nor FixedString column"); } if (c1_const) @@ -831,7 +830,7 @@ private: c1_const_size = c1_const_fixed_string->getN(); } else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Logical error: ColumnConst contains not String nor FixedString column"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnConst contains not String nor FixedString column"); } using StringImpl = StringComparisonImpl>; @@ -1115,7 +1114,7 @@ private: /// This is a paranoid check to protect from a broken query analysis. if (c0->isNullable() != c1->isNullable()) throw Exception(ErrorCodes::LOGICAL_ERROR, - "Logical error: columns are assumed to be of identical types, but they are different in Nullable"); + "Columns are assumed to be of identical types, but they are different in Nullable"); if (c0_const && c1_const) { @@ -1190,7 +1189,7 @@ public: if (left_tuple && right_tuple) { - auto func = FunctionToOverloadResolverAdaptor(FunctionComparison::create(context)); + auto func = FunctionToOverloadResolverAdaptor(std::make_shared>(check_decimal_overflow)); bool has_nullable = false; bool has_null = false; diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 11ce68eba21..62148fa8022 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -2558,7 +2560,7 @@ public: if constexpr (std::is_same_v) res = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, false)); else if constexpr (std::is_same_v) - throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: It is a bug."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "MaterializedMySQL is a bug."); else if constexpr (to_decimal) { UInt64 scale = extractToDecimalScale(arguments[1]); @@ -4066,6 +4068,259 @@ arguments, result_type, input_rows_count); \ "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); } + WrapperType createVariantToVariantWrapper(const DataTypeVariant & from_variant, const DataTypeVariant & to_variant) const + { + /// We support only extension of variant type, so, only new types can be added. + /// For example: Variant(T1, T2) -> Variant(T1, T2, T3) is supported, but Variant(T1, T2) -> Variant(T1, T3) is not supported. 
+ /// We want to extend Variant type for free without rewriting the data, but we sort data types inside Variant during type creation + /// (we do it because we want Variant(T1, T2) to be the same as Variant(T2, T1)), but after extension the order of variant types + /// (and so their discriminators) can be different. For example: Variant(T1, T3) -> Variant(T1, T2, T3). + /// To avoid full rewrite of discriminators column, ColumnVariant supports it's local order of variant columns (and so local + /// discriminators) and stores mapping global order -> local order. + /// So, to extend Variant with new types for free, we should keep old local order for old variants, append new variants and change + /// mapping global order -> local order according to the new global order. + + /// Create map (new variant type) -> (it's global discriminator in new order). + const auto & new_variants = to_variant.getVariants(); + std::unordered_map new_variant_types_to_new_global_discriminator; + new_variant_types_to_new_global_discriminator.reserve(new_variants.size()); + for (size_t i = 0; i != new_variants.size(); ++i) + new_variant_types_to_new_global_discriminator[new_variants[i]->getName()] = i; + + /// Create set of old variant types. + const auto & old_variants = from_variant.getVariants(); + std::unordered_map old_variant_types_to_old_global_discriminator; + old_variant_types_to_old_global_discriminator.reserve(old_variants.size()); + for (size_t i = 0; i != old_variants.size(); ++i) + old_variant_types_to_old_global_discriminator[old_variants[i]->getName()] = i; + + /// Check that the set of old variants types is a subset of new variant types and collect new global discriminator for each old global discriminator. + std::unordered_map old_global_discriminator_to_new; + old_global_discriminator_to_new.reserve(old_variants.size()); + for (const auto & [old_variant_type, old_discriminator] : old_variant_types_to_old_global_discriminator) + { + auto it = new_variant_types_to_new_global_discriminator.find(old_variant_type); + if (it == new_variant_types_to_new_global_discriminator.end()) + throw Exception( + ErrorCodes::CANNOT_CONVERT_TYPE, + "Cannot convert type {} to {}. Conversion between Variant types is allowed only when new Variant type is an extension " + "of an initial one", from_variant.getName(), to_variant.getName()); + old_global_discriminator_to_new[old_discriminator] = it->second; + } + + /// Collect variant types and their global discriminators that should be added to the old Variant to get the new Variant. 
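The remapping described in the comments above reduces to "look up each old variant type's position in the new, larger variant list and refuse anything that is missing". A small stand-alone sketch with plain type-name strings standing in for data types:

#include <cstddef>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

using Discriminator = unsigned char; /// assumes fewer than 256 variants, as in the real column

/// For each old variant type, find its global discriminator in the new Variant;
/// refuse the conversion if any old type is missing from the new one.
std::unordered_map<Discriminator, Discriminator> remapDiscriminators(
    const std::vector<std::string> & old_types, const std::vector<std::string> & new_types)
{
    std::unordered_map<std::string, Discriminator> new_positions;
    for (size_t i = 0; i != new_types.size(); ++i)
        new_positions[new_types[i]] = static_cast<Discriminator>(i);

    std::unordered_map<Discriminator, Discriminator> old_to_new;
    for (size_t i = 0; i != old_types.size(); ++i)
    {
        auto it = new_positions.find(old_types[i]);
        if (it == new_positions.end())
            throw std::runtime_error("new Variant is not an extension of the old one");
        old_to_new[static_cast<Discriminator>(i)] = it->second;
    }
    return old_to_new;
}

For example, extending {"String", "UInt64"} to {"Int64", "String", "UInt64"} yields the mapping {0 -> 1, 1 -> 2}, and the per-column local order never has to be rewritten.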
+ std::vector> variant_types_and_discriminators_to_add; + variant_types_and_discriminators_to_add.reserve(new_variants.size() - old_variants.size()); + for (size_t i = 0; i != new_variants.size(); ++i) + { + if (!old_variant_types_to_old_global_discriminator.contains(new_variants[i]->getName())) + variant_types_and_discriminators_to_add.emplace_back(new_variants[i], i); + } + + return [old_global_discriminator_to_new, variant_types_and_discriminators_to_add] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & column_variant = assert_cast(*arguments.front().column.get()); + size_t num_old_variants = column_variant.getNumVariants(); + Columns new_variant_columns; + new_variant_columns.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); + std::vector new_local_to_global_discriminators; + new_local_to_global_discriminators.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); + for (size_t i = 0; i != num_old_variants; ++i) + { + new_variant_columns.push_back(column_variant.getVariantPtrByLocalDiscriminator(i)); + new_local_to_global_discriminators.push_back(old_global_discriminator_to_new.at(column_variant.globalDiscriminatorByLocal(i))); + } + + for (const auto & [new_variant_type, new_global_discriminator] : variant_types_and_discriminators_to_add) + { + new_variant_columns.push_back(new_variant_type->createColumn()); + new_local_to_global_discriminators.push_back(new_global_discriminator); + } + + return ColumnVariant::create(column_variant.getLocalDiscriminatorsPtr(), column_variant.getOffsetsPtr(), new_variant_columns, new_local_to_global_discriminators); + }; + } + + WrapperType createVariantToColumnWrapper(const DataTypeVariant & from_variant, const DataTypePtr & to_type) const + { + const auto & variant_types = from_variant.getVariants(); + std::vector variant_wrappers; + variant_wrappers.reserve(variant_types.size()); + + /// Create conversion wrapper for each variant. + for (const auto & variant_type : variant_types) + variant_wrappers.push_back(prepareUnpackDictionaries(variant_type, to_type)); + + return [variant_wrappers, variant_types, to_type] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + const auto & column_variant = assert_cast(*arguments.front().column.get()); + + /// First, cast each variant to the result type. + std::vector casted_variant_columns; + casted_variant_columns.reserve(variant_types.size()); + for (size_t i = 0; i != variant_types.size(); ++i) + { + auto variant_col = column_variant.getVariantPtrByLocalDiscriminator(i); + ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], "" }}; + const auto & variant_wrapper = variant_wrappers[column_variant.globalDiscriminatorByLocal(i)]; + casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size())); + } + + /// Second, construct resulting column from casted variant columns according to discriminators. 
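Once every variant has been cast to the target type, the result is assembled row by row from the discriminators, which is what the code below does on real columns. A simplified model (strings as values, hypothetical names):

#include <cstddef>
#include <string>
#include <vector>

constexpr unsigned char NULL_DISCRIMINATOR = 255;

/// For each row, take the value from the variant column selected by the row's
/// discriminator (already cast to the target type), or a default for NULL rows.
std::vector<std::string> assembleByDiscriminators(
    const std::vector<std::vector<std::string>> & cast_variants, /// one vector per variant, already cast
    const std::vector<unsigned char> & discriminators,           /// which variant each row belongs to
    const std::vector<size_t> & offsets)                         /// row's position inside its variant
{
    std::vector<std::string> result;
    result.reserve(discriminators.size());
    for (size_t row = 0; row != discriminators.size(); ++row)
    {
        if (discriminators[row] == NULL_DISCRIMINATOR)
            result.emplace_back(); /// default value for NULL rows
        else
            result.push_back(cast_variants[discriminators[row]][offsets[row]]);
    }
    return result;
}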
+ const auto & local_discriminators = column_variant.getLocalDiscriminators(); + auto res = result_type->createColumn(); + res->reserve(input_rows_count); + for (size_t i = 0; i != input_rows_count; ++i) + { + auto local_discr = local_discriminators[i]; + if (local_discr == ColumnVariant::NULL_DISCRIMINATOR) + res->insertDefault(); + else + res->insertFrom(*casted_variant_columns[local_discr], column_variant.offsetAt(i)); + } + + return res; + }; + } + + static ColumnPtr createVariantFromDescriptorsAndOneNonEmptyVariant(const DataTypes & variant_types, const ColumnPtr & discriminators, const ColumnPtr & variant, ColumnVariant::Discriminator variant_discr) + { + Columns variants; + variants.reserve(variant_types.size()); + for (size_t i = 0; i != variant_types.size(); ++i) + { + if (i == variant_discr) + variants.emplace_back(variant); + else + variants.push_back(variant_types[i]->createColumn()); + } + + return ColumnVariant::create(discriminators, variants); + } + + WrapperType createColumnToVariantWrapper(const DataTypePtr & from_type, const DataTypeVariant & to_variant) const + { + /// We allow converting NULL to Variant(...) as Variant can store NULLs. + if (from_type->onlyNull()) + { + return [](ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + auto result_column = result_type->createColumn(); + result_column->insertManyDefaults(input_rows_count); + return result_column; + }; + } + + auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(removeNullableOrLowCardinalityNullable(from_type)); + if (!variant_discr_opt) + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert type {} to {}. Conversion to Variant allowed only for types from this Variant", from_type->getName(), to_variant.getName()); + + return [variant_discr = *variant_discr_opt] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & result_variant_type = assert_cast(*result_type); + const auto & variant_types = result_variant_type.getVariants(); + if (const ColumnNullable * col_nullable = typeid_cast(arguments.front().column.get())) + { + const auto & column = col_nullable->getNestedColumnPtr(); + const auto & null_map = col_nullable->getNullMapData(); + IColumn::Filter filter; + filter.reserve(column->size()); + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + auto & discriminators_data = discriminators->getData(); + discriminators_data.reserve(column->size()); + size_t variant_size_hint = 0; + for (size_t i = 0; i != column->size(); ++i) + { + if (null_map[i]) + { + discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + filter.push_back(0); + } + else + { + discriminators_data.push_back(variant_discr); + filter.push_back(1); + ++variant_size_hint; + } + } + + ColumnPtr variant_column; + /// If there were no NULLs, just use the column. + if (variant_size_hint == column->size()) + variant_column = column; + /// Otherwise we should use filtered column. + else + variant_column = column->filter(filter, variant_size_hint); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), variant_column, variant_discr); + } + else if (isColumnLowCardinalityNullable(*arguments.front().column)) + { + const auto & column = arguments.front().column; + + /// Variant column cannot have LowCardinality(Nullable(...)) variant, as Variant column stores NULLs itself. 
+ /// We should create a null-map, insert NULL_DISCRIMINATOR on NULL values and filter initial column. + const auto & col_lc = assert_cast(*column); + const auto & indexes = col_lc.getIndexes(); + auto null_index = col_lc.getDictionary().getNullValueIndex(); + IColumn::Filter filter; + filter.reserve(col_lc.size()); + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + auto & discriminators_data = discriminators->getData(); + discriminators_data.reserve(col_lc.size()); + size_t variant_size_hint = 0; + for (size_t i = 0; i != col_lc.size(); ++i) + { + if (indexes.getUInt(i) == null_index) + { + discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + filter.push_back(0); + } + else + { + discriminators_data.push_back(variant_discr); + filter.push_back(1); + ++variant_size_hint; + } + } + + MutableColumnPtr variant_column; + /// If there were no NULLs, we can just clone the column. + if (variant_size_hint == col_lc.size()) + variant_column = IColumn::mutate(column); + /// Otherwise we should filter column. + else + variant_column = column->filter(filter, variant_size_hint)->assumeMutable(); + + assert_cast(*variant_column).nestedRemoveNullable(); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), std::move(variant_column), variant_discr); + } + else + { + const auto & column = arguments.front().column; + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + discriminators->getData().resize_fill(column->size(), variant_discr); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), column, variant_discr); + } + }; + } + + /// Wrapper for conversion to/from Variant type + WrapperType createVariantWrapper(const DataTypePtr & from_type, const DataTypePtr & to_type) const + { + if (const auto * from_variant = checkAndGetDataType(from_type.get())) + { + if (const auto * to_variant = checkAndGetDataType(to_type.get())) + return createVariantToVariantWrapper(*from_variant, *to_variant); + + return createVariantToColumnWrapper(*from_variant, to_type); + } + + return createColumnToVariantWrapper(from_type, assert_cast(*to_type)); + } + template WrapperType createEnumWrapper(const DataTypePtr & from_type, const DataTypeEnum * to_type) const { @@ -4245,6 +4500,11 @@ arguments, result_type, input_rows_count); \ WrapperType prepareUnpackDictionaries(const DataTypePtr & from_type, const DataTypePtr & to_type) const { + /// Conversion from/to Variant data type is processed in a special way. + /// We don't need to remove LowCardinality/Nullable. + if (isVariant(to_type) || isVariant(from_type)) + return createVariantWrapper(from_type, to_type); + const auto * from_low_cardinality = typeid_cast(from_type.get()); const auto * to_low_cardinality = typeid_cast(to_type.get()); const auto & from_nested = from_low_cardinality ? 
from_low_cardinality->getDictionaryType() : from_type; @@ -4252,7 +4512,7 @@ arguments, result_type, input_rows_count); \ if (from_type->onlyNull()) { - if (!to_nested->isNullable()) + if (!to_nested->isNullable() && !isVariant(to_type)) { if (cast_type == CastType::accurateOrNull) { diff --git a/src/Functions/FunctionsJSON.h b/src/Functions/FunctionsJSON.h index 31a99475b63..2539fa1aeb4 100644 --- a/src/Functions/FunctionsJSON.h +++ b/src/Functions/FunctionsJSON.h @@ -5,7 +5,7 @@ #include -#include +#include #include #include diff --git a/src/Functions/FunctionsLogical.cpp b/src/Functions/FunctionsLogical.cpp index d01fdc99076..2e0f4cd3038 100644 --- a/src/Functions/FunctionsLogical.cpp +++ b/src/Functions/FunctionsLogical.cpp @@ -530,7 +530,7 @@ DataTypePtr FunctionAnyArityLogical::getReturnTypeImpl(const DataTyp { has_nullable_arguments = arg_type->isNullable(); if (has_nullable_arguments && !Impl::specialImplementationForNulls()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: Unexpected type of argument for function \"{}\": " + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected type of argument for function \"{}\": " " argument {} is of type {}", getName(), i + 1, arg_type->getName()); } diff --git a/src/Functions/FunctionsStringSimilarity.cpp b/src/Functions/FunctionsStringSimilarity.cpp index df068531655..aadf5c246fc 100644 --- a/src/Functions/FunctionsStringSimilarity.cpp +++ b/src/Functions/FunctionsStringSimilarity.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -108,10 +109,8 @@ struct NgramDistanceImpl if constexpr (case_insensitive) { -#if defined(MEMORY_SANITIZER) /// Due to PODArray padding accessing more elements should be OK __msan_unpoison(code_points + (N - 1), padding_offset * sizeof(CodePoint)); -#endif /// We really need template lambdas with C++20 to do it inline unrollLowering(code_points, std::make_index_sequence()); } diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index a46f4d2a11d..d4c6b8f4ba6 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -313,7 +313,7 @@ ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments, { bool use_default_implementation_for_sparse_columns = useDefaultImplementationForSparseColumns(); /// DataTypeFunction does not support obtaining default (isDefaultAt()) - /// ColumnFunction does not support getting specific values + /// ColumnFunction does not support getting specific values. 
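The sparse-column path that follows runs the function on a shortened column (the default value first, then only the non-default rows) and scatters the results back by offsets. A rough stand-alone model of that expansion, assuming the first element of the dense result corresponds to the default value (ints for brevity, hypothetical helper name):

#include <cstddef>
#include <vector>

/// Expand values computed only for the non-default rows back to full size.
/// `dense.size()` must be `offsets.size() + 1`: dense[0] is the result for the default value,
/// and dense[i + 1] is the result for the row at offsets[i].
std::vector<int> expandWithOffsets(const std::vector<int> & dense, const std::vector<size_t> & offsets, size_t total_rows)
{
    std::vector<int> full(total_rows, dense[0]); /// fill with the result computed for the default value
    for (size_t i = 0; i != offsets.size(); ++i)
        full[offsets[i]] = dense[i + 1];
    return full;
}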
if (result_type->getTypeId() != TypeIndex::Function && use_default_implementation_for_sparse_columns) { size_t num_sparse_columns = 0; @@ -368,7 +368,7 @@ ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments, if (!result_type->canBeInsideSparseColumns() || !res->isDefaultAt(0) || res->getNumberOfDefaultRows() != 1) { const auto & offsets_data = assert_cast &>(*sparse_offsets).getData(); - return res->createWithOffsets(offsets_data, (*res)[0], input_rows_count, /*shift=*/ 1); + return res->createWithOffsets(offsets_data, *createColumnConst(res, 0), input_rows_count, /*shift=*/ 1); } return ColumnSparse::create(res, sparse_offsets, input_rows_count); diff --git a/src/Functions/IsOperation.h b/src/Functions/IsOperation.h index 8ea53c865ce..b2c7a27d375 100644 --- a/src/Functions/IsOperation.h +++ b/src/Functions/IsOperation.h @@ -61,7 +61,7 @@ struct IsOperation static constexpr bool bit_hamming_distance = IsSameOperation::value; static constexpr bool division = div_floating || div_int || div_int_or_zero || modulo; - + // NOTE: allow_decimal should not fully contain `division` because of divInt static constexpr bool allow_decimal = plus || minus || multiply || division || least || greatest; }; diff --git a/src/Functions/addressToLine.cpp b/src/Functions/addressToLine.cpp index 771c85cabf6..bb5edf2a07a 100644 --- a/src/Functions/addressToLine.cpp +++ b/src/Functions/addressToLine.cpp @@ -17,7 +17,7 @@ namespace DB namespace { -class FunctionAddressToLine: public FunctionAddressToLineBase +class FunctionAddressToLine : public FunctionAddressToLineBase { public: static constexpr auto name = "addressToLine"; diff --git a/src/Functions/array/array.cpp b/src/Functions/array/array.cpp index 551f0a6625b..03b51808799 100644 --- a/src/Functions/array/array.cpp +++ b/src/Functions/array/array.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -14,9 +15,12 @@ class FunctionArray : public IFunction { public: static constexpr auto name = "array"; - static FunctionPtr create(ContextPtr) + + explicit FunctionArray(bool use_variant_as_common_type_ = false) : use_variant_as_common_type(use_variant_as_common_type_) {} + + static FunctionPtr create(ContextPtr context) { - return std::make_shared(); + return std::make_shared(context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_as_common_type); } bool useDefaultImplementationForNulls() const override { return false; } @@ -31,6 +35,9 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { + if (use_variant_as_common_type) + return std::make_shared(getLeastSupertypeOrVariant(arguments)); + return std::make_shared(getLeastSupertype(arguments)); } @@ -97,6 +104,8 @@ private: } bool addField(DataTypePtr type_res, const Field & f, Array & arr) const; + + bool use_variant_as_common_type = false; }; diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index c68c89ee0d5..670442c0c79 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -9,6 +10,10 @@ #include #include +#if USE_MULTITARGET_CODE +#include +#endif + namespace DB { namespace ErrorCodes @@ -75,6 +80,49 @@ struct L2Distance state.sum += other_state.sum; } +#if USE_MULTITARGET_CODE + template + AVX512_FUNCTION_SPECIFIC_ATTRIBUTE static void accumulateCombine( + const ResultType * __restrict data_x, + const ResultType * __restrict 
data_y, + size_t i_max, + size_t & i_x, + size_t & i_y, + State & state) + { + __m512 sums; + if constexpr (std::is_same_v) + sums = _mm512_setzero_ps(); + else + sums = _mm512_setzero_pd(); + + const size_t n = (std::is_same_v) ? 16 : 8; + + for (; i_x + n < i_max; i_x += n, i_y += n) + { + if constexpr (std::is_same_v) + { + __m512 x = _mm512_loadu_ps(data_x + i_x); + __m512 y = _mm512_loadu_ps(data_y + i_y); + __m512 differences = _mm512_sub_ps(x, y); + sums = _mm512_fmadd_ps(differences, differences, sums); + } + else + { + __m512 x = _mm512_loadu_pd(data_x + i_x); + __m512 y = _mm512_loadu_pd(data_y + i_y); + __m512 differences = _mm512_sub_pd(x, y); + sums = _mm512_fmadd_pd(differences, differences, sums); + } + } + + if constexpr (std::is_same_v) + state.sum = _mm512_reduce_add_ps(sums); + else + state.sum = _mm512_reduce_add_pd(sums); + } +#endif + template static ResultType finalize(const State & state, const ConstParams &) { @@ -189,6 +237,70 @@ struct CosineDistance state.y_squared += other_state.y_squared; } +#if USE_MULTITARGET_CODE + template + AVX512_FUNCTION_SPECIFIC_ATTRIBUTE static void accumulateCombine( + const ResultType * __restrict data_x, + const ResultType * __restrict data_y, + size_t i_max, + size_t & i_x, + size_t & i_y, + State & state) + { + __m512 dot_products; + __m512 x_squareds; + __m512 y_squareds; + + if constexpr (std::is_same_v) + { + dot_products = _mm512_setzero_ps(); + x_squareds = _mm512_setzero_ps(); + y_squareds = _mm512_setzero_ps(); + } + else + { + dot_products = _mm512_setzero_pd(); + x_squareds = _mm512_setzero_pd(); + y_squareds = _mm512_setzero_pd(); + } + + const size_t n = (std::is_same_v) ? 16 : 8; + + for (; i_x + n < i_max; i_x += n, i_y += n) + { + if constexpr (std::is_same_v) + { + __m512 x = _mm512_loadu_ps(data_x + i_x); + __m512 y = _mm512_loadu_ps(data_y + i_y); + dot_products = _mm512_fmadd_ps(x, y, dot_products); + x_squareds = _mm512_fmadd_ps(x, x, x_squareds); + y_squareds = _mm512_fmadd_ps(y, y, y_squareds); + } + else + { + __m512 x = _mm512_loadu_pd(data_x + i_x); + __m512 y = _mm512_loadu_pd(data_y + i_y); + dot_products = _mm512_fmadd_pd(x, y, dot_products); + x_squareds = _mm512_fmadd_pd(x, x, x_squareds); + y_squareds = _mm512_fmadd_pd(y, y, y_squareds); + } + } + + if constexpr (std::is_same_v) + { + state.dot_prod = _mm512_reduce_add_ps(dot_products); + state.x_squared = _mm512_reduce_add_ps(x_squareds); + state.y_squared = _mm512_reduce_add_ps(y_squareds); + } + else + { + state.dot_prod = _mm512_reduce_add_pd(dot_products); + state.x_squared = _mm512_reduce_add_pd(x_squareds); + state.y_squared = _mm512_reduce_add_pd(y_squareds); + } + } +#endif + template static ResultType finalize(const State & state, const ConstParams &) { @@ -352,7 +464,7 @@ private: /// Check that arrays in both columns are the sames size for (size_t row = 0; row < offsets_x.size(); ++row) { - if (unlikely(offsets_x[row] != offsets_y[row])) + if (offsets_x[row] != offsets_y[row]) [[unlikely]] { ColumnArray::Offset prev_offset = row > 0 ? 
offsets_x[row] : 0; throw Exception( @@ -420,7 +532,7 @@ private: ColumnArray::Offset prev_offset = 0; for (size_t row : collections::range(0, offsets_y.size())) { - if (unlikely(offsets_x[0] != offsets_y[row] - prev_offset)) + if (offsets_x[0] != offsets_y[row] - prev_offset) [[unlikely]] { throw Exception( ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, @@ -438,14 +550,35 @@ private: auto & result_data = result->getData(); /// Do the actual computation - ColumnArray::Offset prev = 0; + size_t prev = 0; size_t row = 0; + for (auto off : offsets_y) { + size_t i = 0; + typename Kernel::template State state; + + /// SIMD optimization: process multiple elements in both input arrays at once. + /// To avoid combinatorial explosion of SIMD kernels, focus on + /// - the two most common input/output types (Float32 x Float32) --> Float32 and (Float64 x Float64) --> Float64 instead of 10 x + /// 10 input types x 2 output types, + /// - const/non-const inputs instead of non-const/non-const inputs + /// - the two most common metrics L2 and cosine distance, + /// - the most powerful SIMD instruction set (AVX-512F). +#if USE_MULTITARGET_CODE + if constexpr (std::is_same_v && std::is_same_v) /// ResultType is Float32 or Float64 + { + if constexpr (std::is_same_v + || std::is_same_v) + { + if (isArchSupported(TargetArch::AVX512F)) + Kernel::template accumulateCombine(data_x.data(), data_y.data(), i + offsets_x[0], i, prev, state); + } + } +#else /// Process chunks in vectorized manner static constexpr size_t VEC_SIZE = 4; typename Kernel::template State states[VEC_SIZE]; - size_t i = 0; for (; prev + VEC_SIZE < off; i += VEC_SIZE, prev += VEC_SIZE) { for (size_t s = 0; s < VEC_SIZE; ++s) @@ -453,10 +586,9 @@ private: states[s], static_cast(data_x[i + s]), static_cast(data_y[prev + s]), kernel_params); } - typename Kernel::template State state; for (const auto & other_state : states) Kernel::template combine(state, other_state, kernel_params); - +#endif /// Process the tail for (; prev < off; ++i, ++prev) { @@ -466,6 +598,7 @@ private: result_data[row] = Kernel::finalize(state, kernel_params); row++; } + return result; } diff --git a/src/Functions/array/arrayElement.cpp b/src/Functions/array/arrayElement.cpp index cea407aee02..8669fd1f3a7 100644 --- a/src/Functions/array/arrayElement.cpp +++ b/src/Functions/array/arrayElement.cpp @@ -670,8 +670,7 @@ struct ArrayElementStringImpl ColumnArray::Offset current_offset = 0; /// get the total result bytes at first, and reduce the cost of result_data.resize. size_t total_result_bytes = 0; - ColumnString::Chars zero_buf(1); - zero_buf.push_back(0); + ColumnString::Chars zero_buf(16, '\0'); /// Needs 15 extra bytes for memcpySmallAllowReadWriteOverflow15 std::vector> selected_bufs; selected_bufs.reserve(size); for (size_t i = 0; i < size; ++i) @@ -737,8 +736,7 @@ struct ArrayElementStringImpl size_t size = offsets.size(); result_offsets.resize(size); - ColumnString::Chars zero_buf(1); - zero_buf.push_back(0); + ColumnString::Chars zero_buf(16, '\0'); /// Needs 15 extra bytes for memcpySmallAllowReadWriteOverflow15 ColumnArray::Offset current_offset = 0; /// get the total result bytes at first, and reduce the cost of result_data.resize. 
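// For reference, a scalar sketch of what the AVX-512 accumulateCombine kernels added to
// arrayDistance.cpp above compute for L2Distance: a partial sum of squared differences over
// full chunks of n elements, leaving the tail to the existing scalar loop. Standalone
// illustration only; the single index `i` stands in for the separate i_x/i_y cursors of the
// real kernel.
#include <cstddef>

template <typename T>
T l2PartialSumOfSquares(const T * x, const T * y, size_t i_max, size_t & i, size_t n)
{
    T sum = 0;
    for (; i + n < i_max; i += n)              // same bound as the `i_x + n < i_max` SIMD loop
        for (size_t j = 0; j < n; ++j)
        {
            T diff = x[i + j] - y[i + j];
            sum += diff * diff;                // fused multiply-add in the vectorized kernel
        }
    return sum;                                // the caller finishes the remaining tail elements
}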
size_t total_result_bytes = 0; diff --git a/src/Functions/array/arrayFold.cpp b/src/Functions/array/arrayFold.cpp index 44fe95624a6..63c14f475fc 100644 --- a/src/Functions/array/arrayFold.cpp +++ b/src/Functions/array/arrayFold.cpp @@ -32,6 +32,12 @@ public: size_t getNumberOfArguments() const override { return 0; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + /// Avoid the default adaptors since they modify the inputs and that makes knowing the lambda argument types + /// (getLambdaArgumentTypes) more complex, as it requires knowing what the adaptors will do + /// It's much simpler to avoid the adapters + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + void getLambdaArgumentTypes(DataTypes & arguments) const override { if (arguments.size() < 3) diff --git a/src/Functions/coalesce.cpp b/src/Functions/coalesce.cpp index befebd1ff52..4ae90a9db13 100644 --- a/src/Functions/coalesce.cpp +++ b/src/Functions/coalesce.cpp @@ -29,7 +29,14 @@ public: return std::make_shared(context); } - explicit FunctionCoalesce(ContextPtr context_) : context(context_) {} + explicit FunctionCoalesce(ContextPtr context_) + : context(context_) + , is_not_null(FunctionFactory::instance().get("isNotNull", context)) + , assume_not_null(FunctionFactory::instance().get("assumeNotNull", context)) + , if_function(FunctionFactory::instance().get("if", context)) + , multi_if_function(FunctionFactory::instance().get("multiIf", context)) + { + } std::string getName() const override { @@ -110,8 +117,6 @@ public: break; } - auto is_not_null = FunctionFactory::instance().get("isNotNull", context); - auto assume_not_null = FunctionFactory::instance().get("assumeNotNull", context); ColumnsWithTypeAndName multi_if_args; ColumnsWithTypeAndName tmp_args(1); @@ -146,13 +151,8 @@ public: /// If there was only two arguments (3 arguments passed to multiIf) /// use function "if" instead, because it's implemented more efficient. /// TODO: make "multiIf" the same efficient. - FunctionOverloadResolverPtr if_function; - if (multi_if_args.size() == 3) - if_function = FunctionFactory::instance().get("if", context); - else - if_function = FunctionFactory::instance().get("multiIf", context); - - ColumnPtr res = if_function->build(multi_if_args)->execute(multi_if_args, result_type, input_rows_count); + FunctionOverloadResolverPtr if_or_multi_if = multi_if_args.size() == 3 ? 
if_function : multi_if_function; + ColumnPtr res = if_or_multi_if->build(multi_if_args)->execute(multi_if_args, result_type, input_rows_count); /// if last argument is not nullable, result should be also not nullable if (!multi_if_args.back().column->isNullable() && res->isNullable()) @@ -170,6 +170,10 @@ public: private: ContextPtr context; + FunctionOverloadResolverPtr is_not_null; + FunctionOverloadResolverPtr assume_not_null; + FunctionOverloadResolverPtr if_function; + FunctionOverloadResolverPtr multi_if_function; }; } diff --git a/src/Functions/concat.cpp b/src/Functions/concat.cpp index b057e7fede5..d68f5256f6d 100644 --- a/src/Functions/concat.cpp +++ b/src/Functions/concat.cpp @@ -80,21 +80,21 @@ private: const ColumnConst * c0_const_string = checkAndGetColumnConst(c0); const ColumnConst * c1_const_string = checkAndGetColumnConst(c1); - auto c_res = ColumnString::create(); + auto col_res = ColumnString::create(); if (c0_string && c1_string) - concat(StringSource(*c0_string), StringSource(*c1_string), StringSink(*c_res, c0->size())); + concat(StringSource(*c0_string), StringSource(*c1_string), StringSink(*col_res, c0->size())); else if (c0_string && c1_const_string) - concat(StringSource(*c0_string), ConstSource(*c1_const_string), StringSink(*c_res, c0->size())); + concat(StringSource(*c0_string), ConstSource(*c1_const_string), StringSink(*col_res, c0->size())); else if (c0_const_string && c1_string) - concat(ConstSource(*c0_const_string), StringSource(*c1_string), StringSink(*c_res, c0->size())); + concat(ConstSource(*c0_const_string), StringSource(*c1_string), StringSink(*col_res, c0->size())); else { /// Fallback: use generic implementation for not very important cases. return executeFormatImpl(arguments, input_rows_count); } - return c_res; + return col_res; } ColumnPtr executeFormatImpl(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const @@ -102,7 +102,7 @@ private: const size_t num_arguments = arguments.size(); assert(num_arguments >= 2); - auto c_res = ColumnString::create(); + auto col_res = ColumnString::create(); std::vector data(num_arguments); std::vector offsets(num_arguments); std::vector fixed_string_sizes(num_arguments); @@ -169,11 +169,11 @@ private: offsets, fixed_string_sizes, constant_strings, - c_res->getChars(), - c_res->getOffsets(), + col_res->getChars(), + col_res->getOffsets(), input_rows_count); - return c_res; + return col_res; } }; diff --git a/src/Functions/concatWithSeparator.cpp b/src/Functions/concatWithSeparator.cpp index b4f3732710f..ed02f331192 100644 --- a/src/Functions/concatWithSeparator.cpp +++ b/src/Functions/concatWithSeparator.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -27,7 +28,6 @@ class ConcatWithSeparatorImpl : public IFunction public: static constexpr auto name = Name::name; explicit ConcatWithSeparatorImpl(ContextPtr context_) : context(context_) { } - static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } String getName() const override { return name; } @@ -49,17 +49,13 @@ public: getName(), arguments.size()); - for (const auto arg_idx : collections::range(0, arguments.size())) - { - const auto * arg = arguments[arg_idx].get(); - if (!isStringOrFixedString(arg)) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of argument {} of function {}", - arg->getName(), - arg_idx + 1, - getName()); - } + const auto * separator_arg = arguments[0].get(); + if (!isStringOrFixedString(separator_arg)) + throw Exception( + 
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of first argument of function {}", + separator_arg->getName(), + getName()); return std::make_shared(); } @@ -70,8 +66,9 @@ public: if (arguments.size() == 1) return result_type->createColumnConstWithDefaultValue(input_rows_count); - auto c_res = ColumnString::create(); - c_res->reserve(input_rows_count); + auto col_res = ColumnString::create(); + col_res->reserve(input_rows_count); + const ColumnConst * col_sep = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); if (!col_sep) throw Exception( @@ -88,6 +85,7 @@ public: std::vector offsets(num_args); std::vector fixed_string_sizes(num_args); std::vector> constant_strings(num_args); + std::vector converted_col_ptrs(num_args); bool has_column_string = false; bool has_column_fixed_string = false; @@ -111,9 +109,33 @@ public: fixed_string_sizes[2 * i] = fixed_col->getN(); } else if (const ColumnConst * const_col = checkAndGetColumnConstStringOrFixedString(column.get())) + { constant_strings[2 * i] = const_col->getValue(); + } else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", column->getName(), getName()); + { + /// A non-String/non-FixedString-type argument: use the default serialization to convert it to String + auto full_column = column->convertToFullIfNeeded(); + auto serialization = arguments[i +1].type->getDefaultSerialization(); + auto converted_col_str = ColumnString::create(); + ColumnStringHelpers::WriteHelper write_helper(*converted_col_str, column->size()); + auto & write_buffer = write_helper.getWriteBuffer(); + FormatSettings format_settings; + for (size_t row = 0; row < column->size(); ++row) + { + serialization->serializeText(*full_column, row, write_buffer, format_settings); + write_helper.rowWritten(); + } + write_helper.finalize(); + + /// Keep the pointer alive + converted_col_ptrs[i] = std::move(converted_col_str); + + /// Same as the normal `ColumnString` branch + has_column_string = true; + data[2 * i] = &converted_col_ptrs[i]->getChars(); + offsets[2 * i] = &converted_col_ptrs[i]->getOffsets(); + } } String pattern; @@ -129,10 +151,10 @@ public: offsets, fixed_string_sizes, constant_strings, - c_res->getChars(), - c_res->getOffsets(), + col_res->getChars(), + col_res->getOffsets(), input_rows_count); - return std::move(c_res); + return std::move(col_res); } private: diff --git a/src/Functions/countMatches.h b/src/Functions/countMatches.h index 5e02915de56..e9880e6e93f 100644 --- a/src/Functions/countMatches.h +++ b/src/Functions/countMatches.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -15,9 +16,7 @@ namespace DB namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; - extern const int LOGICAL_ERROR; } using Pos = const char *; @@ -35,45 +34,46 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (!isStringOrFixedString(arguments[1].type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of second argument (pattern) of function {}. Must be String/FixedString.", - arguments[1].type->getName(), getName()); - if (!isStringOrFixedString(arguments[0].type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of first argument (haystack) of function {}. 
Must be String/FixedString.", - arguments[0].type->getName(), getName()); - const auto * column = arguments[1].column.get(); - if (!column || !checkAndGetColumnConstStringOrFixedString(column)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "The second argument of function {} should be a constant string with the pattern", - getName()); + FunctionArgumentDescriptors args{ + {"haystack", &isStringOrFixedString, nullptr, "String or FixedString"}, + {"pattern", &isString, isColumnConst, "constant String"} + }; + validateFunctionArgumentTypes(*this, arguments, args); return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { - const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - const OptimizedRegularExpression re = Regexps::createRegexp(column_pattern->getValue()); + const IColumn * col_pattern = arguments[1].column.get(); + const ColumnConst * col_pattern_const = checkAndGetColumnConst(col_pattern); + if (col_pattern_const == nullptr) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Pattern argument is not const"); + + const OptimizedRegularExpression re = Regexps::createRegexp(col_pattern_const->getValue()); + + const IColumn * col_haystack = arguments[0].column.get(); OptimizedRegularExpression::MatchVec matches; - const IColumn * column_haystack = arguments[0].column.get(); - - if (const ColumnString * col_str = checkAndGetColumn(column_haystack)) + if (const ColumnConst * col_haystack_const = checkAndGetColumnConstStringOrFixedString(col_haystack)) { - auto result_column = ColumnUInt64::create(); + std::string_view str = col_haystack_const->getDataColumn().getDataAt(0).toView(); + uint64_t matches_count = countMatches(str, re, matches); + return result_type->createColumnConst(input_rows_count, matches_count); + } + else if (const ColumnString * col_haystack_string = checkAndGetColumn(col_haystack)) + { + auto col_res = ColumnUInt64::create(); - const ColumnString::Chars & src_chars = col_str->getChars(); - const ColumnString::Offsets & src_offsets = col_str->getOffsets(); + const ColumnString::Chars & src_chars = col_haystack_string->getChars(); + const ColumnString::Offsets & src_offsets = col_haystack_string->getOffsets(); - ColumnUInt64::Container & vec_res = result_column->getData(); + ColumnUInt64::Container & vec_res = col_res->getData(); vec_res.resize(input_rows_count); - size_t size = src_offsets.size(); ColumnString::Offset current_src_offset = 0; - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < input_rows_count; ++i) { Pos pos = reinterpret_cast(&src_chars[current_src_offset]); current_src_offset = src_offsets[i]; @@ -83,16 +83,25 @@ public: vec_res[i] = countMatches(str, re, matches); } - return result_column; + return col_res; } - else if (const ColumnConst * col_const_str = checkAndGetColumnConstStringOrFixedString(column_haystack)) + else if (const ColumnFixedString * col_haystack_fixedstring = checkAndGetColumn(col_haystack)) { - std::string_view str = col_const_str->getDataColumn().getDataAt(0).toView(); - uint64_t matches_count = countMatches(str, re, matches); - return result_type->createColumnConst(input_rows_count, matches_count); + auto col_res = ColumnUInt64::create(); + + ColumnUInt64::Container & vec_res = col_res->getData(); + vec_res.resize(input_rows_count); + + for (size_t i = 0; i < input_rows_count; ++i) + { + std::string_view str = col_haystack_fixedstring->getDataAt(i).toView(); + 
vec_res[i] = countMatches(str, re, matches); + } + + return col_res; } else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Could not cast haystack argument to String or FixedString"); } static uint64_t countMatches(std::string_view src, const OptimizedRegularExpression & re, OptimizedRegularExpression::MatchVec & matches) @@ -116,7 +125,7 @@ public: if (!matches[0].length) break; pos += matches[0].offset + matches[0].length; - match_count++; + ++match_count; } return match_count; diff --git a/src/Functions/coverage.cpp b/src/Functions/coverage.cpp index 8a62469fa54..f4cac26df78 100644 --- a/src/Functions/coverage.cpp +++ b/src/Functions/coverage.cpp @@ -21,11 +21,14 @@ namespace enum class Kind { Current, + Cumulative, All }; /** If ClickHouse is build with coverage instrumentation, returns an array - * of currently accumulated (`coverage`) / all possible (`coverageAll`) unique code addresses. + * of currently accumulated (`coverageCurrent`) + * or accumulated since the startup (`coverageCumulative`) + * or all possible (`coverageAll`) unique code addresses. */ class FunctionCoverage : public IFunction { @@ -64,7 +67,11 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override { - auto coverage_table = kind == Kind::Current ? getCoverage() : getAllInstrumentedAddresses(); + auto coverage_table = kind == Kind::Current + ? getCurrentCoverage() + : (kind == Kind::Cumulative + ? getCumulativeCoverage() + : getAllInstrumentedAddresses()); auto column_addresses = ColumnUInt64::create(); auto & data = column_addresses->getData(); @@ -85,8 +92,68 @@ public: REGISTER_FUNCTION(Coverage) { - factory.registerFunction("coverage", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }); - factory.registerFunction("coverageAll", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::All)); }); + factory.registerFunction("coverageCurrent", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of unique addresses (a subset of the instrumented points in code) in the code +encountered at runtime after the previous coverage reset (with the `SYSTEM RESET COVERAGE` query) or after server startup. + +[example:functions] + +The order of array elements is undetermined. + +You can use another function, `coverageAll`, to find all instrumented addresses in the code to compare and calculate the percentage. + +You can process the addresses with the `addressToSymbol` (possibly with `demangle`) and `addressToLine` functions +to calculate symbol-level, file-level, or line-level coverage. + +If you run multiple tests sequentially and reset the coverage with the `SYSTEM RESET COVERAGE` query between the tests, +you can obtain coverage information for every test in isolation, and find which functions are covered by which tests and vice versa. + +By default, every *basic block* in the code is covered, which roughly means a sequence of instructions without jumps, +e.g. the body of a for loop without ifs, or a single branch of an if. + +See https://clang.llvm.org/docs/SanitizerCoverage.html for more information.
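For intuition only, this is roughly the callback shape that -fsanitize-coverage=trace-pc-guard instrumentation drives, based on the clang documentation linked above (a sketch, not ClickHouse's collector): the compiler calls the guard callback in every instrumented basic block, and the callback records the unique return addresses it observes.

#include <cstdint>
#include <unordered_set>

static std::unordered_set<void *> seen_addresses;    // illustrative global store

extern "C" void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop)
{
    static uint32_t counter = 0;
    if (start == stop || *start)
        return;                                      // guards already initialized
    for (uint32_t * guard = start; guard < stop; ++guard)
        *guard = ++counter;                          // give each instrumented block a nonzero id
}

extern "C" void __sanitizer_cov_trace_pc_guard(uint32_t * guard)
{
    if (!*guard)
        return;                                      // block already recorded
    *guard = 0;                                      // record each basic block only once
    seen_addresses.insert(__builtin_return_address(0));
}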
+)", + .examples{ + {"functions", "SELECT DISTINCT demangle(addressToSymbol(arrayJoin(coverageCurrent())))", ""}}, + .categories{"Introspection"} + }); + + factory.registerFunction("coverageCumulative", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Cumulative)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of unique addresses (a subset of the instrumented points in code) in the code +encountered at runtime after server startup. + +In contrast to `coverageCurrent`, it cannot be reset with the `SYSTEM RESET COVERAGE` query. + +See the `coverageCurrent` function for details. +)", + .categories{"Introspection"} + }); + + factory.registerFunction("coverageAll", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::All)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of all unique addresses in the code instrumented for coverage +- all possible addresses that can appear in the result of the `coverageCurrent` function. + +You can use this function together with the `coverageCurrent` function to compare and calculate the coverage percentage. + +See the `coverageCurrent` function for details. +)", + .categories{"Introspection"} + }); } } diff --git a/src/Functions/divide/divide.cpp b/src/Functions/divide/divide.cpp index cf2cd354a7d..0708964c7d4 100644 --- a/src/Functions/divide/divide.cpp +++ b/src/Functions/divide/divide.cpp @@ -1,5 +1,5 @@ #include "divide.h" -#include +#include #if defined(__x86_64__) namespace SSE2 @@ -26,9 +26,9 @@ template void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size) { #if defined(__x86_64__) - if (DB::Cpu::CpuFlagsCache::have_AVX2) + if (DB::CPU::CPUFlagsCache::have_AVX2) AVX2::divideImpl(a_pos, b, c_pos, size); - else if (DB::Cpu::CpuFlagsCache::have_SSE2) + else if (DB::CPU::CPUFlagsCache::have_SSE2) SSE2::divideImpl(a_pos, b, c_pos, size); #else Generic::divideImpl(a_pos, b, c_pos, size); @@ -49,9 +49,9 @@ template void divideImpl(const uint32_t * __restric template void divideImpl(const int64_t * __restrict, int64_t, int64_t * __restrict, size_t); template void divideImpl(const int64_t * __restrict, int32_t, int64_t * __restrict, size_t); template void divideImpl(const int64_t * __restrict, int16_t, int64_t * __restrict, size_t); -template void divideImpl(const int64_t * __restrict, int8_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, Int8, int64_t * __restrict, size_t); template void divideImpl(const int32_t * __restrict, int64_t, int32_t * __restrict, size_t); template void divideImpl(const int32_t * __restrict, int32_t, int32_t * __restrict, size_t); template void divideImpl(const int32_t * __restrict, int16_t, int32_t * __restrict, size_t); -template void divideImpl(const int32_t * __restrict, int8_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, Int8, int32_t * __restrict, size_t); diff --git a/src/Functions/divide/divideImpl.cpp b/src/Functions/divide/divideImpl.cpp index 966d5777c1d..6d44b427582 100644 --- a/src/Functions/divide/divideImpl.cpp +++ b/src/Functions/divide/divideImpl.cpp @@ -12,6 +12,10 @@ #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wbit-int-extension" +using Int8 = signed _BitInt(8); +#pragma clang diagnostic pop namespace NAMESPACE { @@ -62,11 +66,11
@@ template void divideImpl(const uint32_t * __restric template void divideImpl(const int64_t * __restrict, int64_t, int64_t * __restrict, size_t); template void divideImpl(const int64_t * __restrict, int32_t, int64_t * __restrict, size_t); template void divideImpl(const int64_t * __restrict, int16_t, int64_t * __restrict, size_t); -template void divideImpl(const int64_t * __restrict, int8_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, Int8, int64_t * __restrict, size_t); template void divideImpl(const int32_t * __restrict, int64_t, int32_t * __restrict, size_t); template void divideImpl(const int32_t * __restrict, int32_t, int32_t * __restrict, size_t); template void divideImpl(const int32_t * __restrict, int16_t, int32_t * __restrict, size_t); -template void divideImpl(const int32_t * __restrict, int8_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, Int8, int32_t * __restrict, size_t); } diff --git a/src/Functions/equals.cpp b/src/Functions/equals.cpp index de1cf623ea6..5c59daf0537 100644 --- a/src/Functions/equals.cpp +++ b/src/Functions/equals.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace DB @@ -16,9 +17,16 @@ template <> ColumnPtr FunctionComparison::executeTupleImpl( const ColumnsWithTypeAndName & x, const ColumnsWithTypeAndName & y, size_t tuple_size, size_t input_rows_count) const { + FunctionOverloadResolverPtr func_builder_equals + = std::make_unique(std::make_shared(check_decimal_overflow)); + + + FunctionOverloadResolverPtr func_builder_and + = std::make_unique(std::make_shared()); + return executeTupleEqualityImpl( - FunctionFactory::instance().get("equals", context), - FunctionFactory::instance().get("and", context), + func_builder_equals, + func_builder_and, x, y, tuple_size, input_rows_count); } diff --git a/src/Functions/formatString.h b/src/Functions/formatString.h index 315e5c06227..bdd36f4aa17 100644 --- a/src/Functions/formatString.h +++ b/src/Functions/formatString.h @@ -18,7 +18,7 @@ struct FormatStringImpl static constexpr size_t right_padding = 15; template - static inline void formatExecute(bool possibly_has_column_string, bool possibly_has_column_fixed_string, Args &&... args) + static void formatExecute(bool possibly_has_column_string, bool possibly_has_column_fixed_string, Args &&... args) { if (possibly_has_column_string && possibly_has_column_fixed_string) format(std::forward(args)...); @@ -38,7 +38,7 @@ struct FormatStringImpl /// input_rows_count is the number of rows processed. /// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size(). 
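// Aside on the Int8 switch in the divideImpl instantiations above: clang's _BitInt(8) is a
// distinct type from int8_t/signed char, so explicit instantiations for it produce different
// symbols, presumably matching how the wider change defines ClickHouse's Int8 alias. A minimal
// clang-only illustration (the pragma silences -Wbit-int-extension in C++, as in the patch):
#include <cstdint>
#include <type_traits>

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wbit-int-extension"
using IllustrativeInt8 = signed _BitInt(8);
#pragma clang diagnostic pop

static_assert(!std::is_same_v<IllustrativeInt8, int8_t>);
static_assert(!std::is_same_v<IllustrativeInt8, signed char>);
static_assert(sizeof(IllustrativeInt8) == 1);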
template - static inline void format( + static void format( String pattern, const std::vector & data, const std::vector & offsets, diff --git a/src/Functions/greater.cpp b/src/Functions/greater.cpp index c36f8d7acca..2b87b376ce0 100644 --- a/src/Functions/greater.cpp +++ b/src/Functions/greater.cpp @@ -1,11 +1,12 @@ #include #include - +#include namespace DB { using FunctionGreater = FunctionComparison; +using FunctionEquals = FunctionComparison; REGISTER_FUNCTION(Greater) { @@ -16,14 +17,24 @@ template <> ColumnPtr FunctionComparison::executeTupleImpl( const ColumnsWithTypeAndName & x, const ColumnsWithTypeAndName & y, size_t tuple_size, size_t input_rows_count) const { - auto greater = FunctionFactory::instance().get("greater", context); + FunctionOverloadResolverPtr greater + = std::make_unique(std::make_shared(check_decimal_overflow)); + + FunctionOverloadResolverPtr func_builder_or + = std::make_unique(std::make_shared()); + + FunctionOverloadResolverPtr func_builder_and + = std::make_unique(std::make_shared()); + + FunctionOverloadResolverPtr func_builder_equals + = std::make_unique(std::make_shared(check_decimal_overflow)); return executeTupleLessGreaterImpl( greater, greater, - FunctionFactory::instance().get("and", context), - FunctionFactory::instance().get("or", context), - FunctionFactory::instance().get("equals", context), + func_builder_and, + func_builder_or, + func_builder_equals, x, y, tuple_size, input_rows_count); } diff --git a/src/Functions/greaterOrEquals.cpp b/src/Functions/greaterOrEquals.cpp index 089ac4d5466..c77ca585c76 100644 --- a/src/Functions/greaterOrEquals.cpp +++ b/src/Functions/greaterOrEquals.cpp @@ -1,11 +1,14 @@ #include #include +#include namespace DB { using FunctionGreaterOrEquals = FunctionComparison; +using FunctionGreater = FunctionComparison; +using FunctionEquals = FunctionComparison; REGISTER_FUNCTION(GreaterOrEquals) { @@ -16,12 +19,28 @@ template <> ColumnPtr FunctionComparison::executeTupleImpl( const ColumnsWithTypeAndName & x, const ColumnsWithTypeAndName & y, size_t tuple_size, size_t input_rows_count) const { + + FunctionOverloadResolverPtr greater + = std::make_unique(std::make_shared(check_decimal_overflow)); + + FunctionOverloadResolverPtr greater_or_equals + = std::make_unique(std::make_shared(check_decimal_overflow)); + + FunctionOverloadResolverPtr func_builder_or + = std::make_unique(std::make_shared()); + + FunctionOverloadResolverPtr func_builder_and + = std::make_unique(std::make_shared()); + + FunctionOverloadResolverPtr func_builder_equals + = std::make_unique(std::make_shared(check_decimal_overflow)); + return executeTupleLessGreaterImpl( - FunctionFactory::instance().get("greater", context), - FunctionFactory::instance().get("greaterOrEquals", context), - FunctionFactory::instance().get("and", context), - FunctionFactory::instance().get("or", context), - FunctionFactory::instance().get("equals", context), + greater, + greater_or_equals, + func_builder_and, + func_builder_or, + func_builder_equals, x, y, tuple_size, input_rows_count); } diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index cae3b720d8b..7306dc4173e 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -1,28 +1,34 @@ -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include +#include #include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include 
-#include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include + #include namespace DB @@ -32,6 +38,7 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; } namespace @@ -42,12 +49,32 @@ using namespace GatherUtils; /** Selection function by condition: if(cond, then, else). * cond - UInt8 * then, else - numeric types for which there is a general type, or dates, datetimes, or strings, or arrays of these types. - * For better performance, try to use branch free code for numeric types(i.e. cond ? a : b --> !!cond * a + !cond * b), except floating point types because of Inf or NaN. + * For better performance, try to use branch free code for numeric types(i.e. cond ? a : b --> !!cond * a + !cond * b) */ +template +concept is_native_int_or_decimal_v + = std::is_integral_v || (is_decimal && sizeof(ResultType) <= 8); + +// This macro performs a branch-free conditional assignment for floating point types. +// It uses bitwise operations to avoid branching, which can be beneficial for performance. +#define BRANCHFREE_IF_FLOAT(TYPE, vc, va, vb, vr) \ + using UIntType = typename NumberTraits::Construct::Type; \ + using IntType = typename NumberTraits::Construct::Type; \ + auto mask = static_cast(static_cast(vc) - 1); \ + auto new_a = static_cast(va); \ + auto new_b = static_cast(vb); \ + UIntType uint_a; \ + std::memcpy(&uint_a, &new_a, sizeof(UIntType)); \ + UIntType uint_b; \ + std::memcpy(&uint_b, &new_b, sizeof(UIntType)); \ + UIntType tmp = (~mask & uint_a) | (mask & uint_b); \ + (vr) = *(reinterpret_cast(&tmp)); + template inline void fillVectorVector(const ArrayCond & cond, const ArrayA & a, const ArrayB & b, ArrayResult & res) { + size_t size = cond.size(); bool a_is_short = a.size() < size; bool b_is_short = b.size() < size; @@ -57,47 +84,68 @@ inline void fillVectorVector(const ArrayCond & cond, const ArrayA & a, const Arr size_t a_index = 0, b_index = 0; for (size_t i = 0; i < size; ++i) { - if constexpr (std::is_integral_v) - { + if constexpr (is_native_int_or_decimal_v) res[i] = !!cond[i] * static_cast(a[a_index]) + (!cond[i]) * static_cast(b[b_index]); - a_index += !!cond[i]; - b_index += !cond[i]; + else if constexpr (std::is_floating_point_v) + { + BRANCHFREE_IF_FLOAT(ResultType, cond[i], a[a_index], b[b_index], res[i]) } else - res[i] = cond[i] ? static_cast(a[a_index++]) : static_cast(b[b_index++]); + res[i] = cond[i] ? static_cast(a[a_index]) : static_cast(b[b_index]); + + a_index += !!cond[i]; + b_index += !cond[i]; } } else if (a_is_short) { size_t a_index = 0; for (size_t i = 0; i < size; ++i) - if constexpr (std::is_integral_v) - { + { + if constexpr (is_native_int_or_decimal_v) res[i] = !!cond[i] * static_cast(a[a_index]) + (!cond[i]) * static_cast(b[i]); - a_index += !!cond[i]; + else if constexpr (std::is_floating_point_v) + { + BRANCHFREE_IF_FLOAT(ResultType, cond[i], a[a_index], b[i], res[i]) } else - res[i] = cond[i] ? static_cast(a[a_index++]) : static_cast(b[i]); + res[i] = cond[i] ? 
static_cast(a[a_index]) : static_cast(b[i]); + + a_index += !!cond[i]; + } } else if (b_is_short) { size_t b_index = 0; for (size_t i = 0; i < size; ++i) - if constexpr (std::is_integral_v) - { + { + if constexpr (is_native_int_or_decimal_v) res[i] = !!cond[i] * static_cast(a[i]) + (!cond[i]) * static_cast(b[b_index]); - b_index += !cond[i]; + else if constexpr (std::is_floating_point_v) + { + BRANCHFREE_IF_FLOAT(ResultType, cond[i], a[i], b[b_index], res[i]) } else - res[i] = cond[i] ? static_cast(a[i]) : static_cast(b[b_index++]); + res[i] = cond[i] ? static_cast(a[i]) : static_cast(b[b_index]); + + b_index += !cond[i]; + } } else { for (size_t i = 0; i < size; ++i) - if constexpr (std::is_integral_v) + { + if constexpr (is_native_int_or_decimal_v) res[i] = !!cond[i] * static_cast(a[i]) + (!cond[i]) * static_cast(b[i]); + else if constexpr (std::is_floating_point_v) + { + BRANCHFREE_IF_FLOAT(ResultType, cond[i], a[i], b[i], res[i]) + } else + { res[i] = cond[i] ? static_cast(a[i]) : static_cast(b[i]); + } + } } } @@ -110,21 +158,32 @@ inline void fillVectorConstant(const ArrayCond & cond, const ArrayA & a, B b, Ar { size_t a_index = 0; for (size_t i = 0; i < size; ++i) - if constexpr (std::is_integral_v) - { + { + if constexpr (is_native_int_or_decimal_v) res[i] = !!cond[i] * static_cast(a[a_index]) + (!cond[i]) * static_cast(b); - a_index += !!cond[i]; + else if constexpr (std::is_floating_point_v) + { + BRANCHFREE_IF_FLOAT(ResultType, cond[i], a[a_index], b, res[i]) } else - res[i] = cond[i] ? static_cast(a[a_index++]) : static_cast(b); + res[i] = cond[i] ? static_cast(a[a_index]) : static_cast(b); + + a_index += !!cond[i]; + } } else { for (size_t i = 0; i < size; ++i) - if constexpr (std::is_integral_v) + { + if constexpr (is_native_int_or_decimal_v) res[i] = !!cond[i] * static_cast(a[i]) + (!cond[i]) * static_cast(b); + else if constexpr (std::is_floating_point_v) + { + BRANCHFREE_IF_FLOAT(ResultType, cond[i], a[i], b, res[i]) + } else res[i] = cond[i] ? static_cast(a[i]) : static_cast(b); + } } } @@ -137,21 +196,63 @@ inline void fillConstantVector(const ArrayCond & cond, A a, const ArrayB & b, Ar { size_t b_index = 0; for (size_t i = 0; i < size; ++i) - if constexpr (std::is_integral_v) - { + { + if constexpr (is_native_int_or_decimal_v) res[i] = !!cond[i] * static_cast(a) + (!cond[i]) * static_cast(b[b_index]); - b_index += !cond[i]; + else if constexpr (std::is_floating_point_v) + { + BRANCHFREE_IF_FLOAT(ResultType, cond[i], a, b[b_index], res[i]) } else - res[i] = cond[i] ? static_cast(a) : static_cast(b[b_index++]); + res[i] = cond[i] ? static_cast(a) : static_cast(b[b_index]); + + b_index += !cond[i]; + } } else { for (size_t i = 0; i < size; ++i) - if constexpr (std::is_integral_v) + { + if constexpr (is_native_int_or_decimal_v) res[i] = !!cond[i] * static_cast(a) + (!cond[i]) * static_cast(b[i]); + else if constexpr (std::is_floating_point_v) + { + BRANCHFREE_IF_FLOAT(ResultType, cond[i], a, b[i], res[i]) + } else res[i] = cond[i] ? 
static_cast(a) : static_cast(b[i]); + } + } +} + +template +inline void fillConstantConstant(const ArrayCond & cond, A a, B b, ArrayResult & res) +{ + size_t size = cond.size(); + + /// We manually optimize the loop for types like (U)Int128|256 or Decimal128/256 to avoid branches + if constexpr (is_over_big_int) + { + alignas(64) const ResultType ab[2] = {static_cast(a), static_cast(b)}; + for (size_t i = 0; i < size; ++i) + { + res[i] = ab[!cond[i]]; + } + } + else if constexpr (std::is_same_v || std::is_same_v) + { + ResultType new_a = static_cast(a); + ResultType new_b = static_cast(b); + for (size_t i = 0; i < size; ++i) + { + /// Reuse new_a and new_b to achieve auto-vectorization + res[i] = cond[i] ? new_a : new_b; + } + } + else + { + for (size_t i = 0; i < size; ++i) + res[i] = cond[i] ? static_cast(a) : static_cast(b); } } @@ -197,8 +298,7 @@ struct NumIfImpl auto col_res = ColVecResult::create(size); ArrayResult & res = col_res->getData(); - for (size_t i = 0; i < size; ++i) - res[i] = cond[i] ? static_cast(a) : static_cast(b); + fillConstantConstant(cond, a, b, res); return col_res; } }; @@ -247,8 +347,7 @@ struct NumIfImpl, Decimal, Decimal> auto col_res = ColVecResult::create(size, scale); ArrayResult & res = col_res->getData(); - for (size_t i = 0; i < size; ++i) - res[i] = cond[i] ? static_cast(a) : static_cast(b); + fillConstantConstant(cond, a, b, res); return col_res; } }; @@ -258,9 +357,16 @@ class FunctionIf : public FunctionIfBase { public: static constexpr auto name = "if"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_as_common_type); + } + + explicit FunctionIf(bool use_variant_when_no_common_type_ = false) : FunctionIfBase(), use_variant_when_no_common_type(use_variant_when_no_common_type_) {} private: + bool use_variant_when_no_common_type = false; + template static UInt32 decimalScale(const ColumnsWithTypeAndName & arguments [[maybe_unused]]) { @@ -616,7 +722,6 @@ private: conditional(ConstSource(*col_arr_then_const), ConstSource(*col_arr_else_const), GenericArraySink(col_res->getData(), col_res->getOffsets(), rows), cond_data); else return nullptr; - return res; } @@ -668,14 +773,102 @@ private: return ColumnTuple::create(tuple_columns); } + ColumnPtr executeMap(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const + { + auto extract_kv_from_map = [](const ColumnMap * map) + { + const ColumnTuple & tuple = map->getNestedData(); + const auto & keys = tuple.getColumnPtr(0); + const auto & values = tuple.getColumnPtr(1); + const auto & offsets = map->getNestedColumn().getOffsetsPtr(); + return std::make_pair(ColumnArray::create(keys, offsets), ColumnArray::create(values, offsets)); + }; + + /// Extract keys and values from both arguments + Columns key_cols(2); + Columns value_cols(2); + for (size_t i = 0; i < 2; ++i) + { + const auto & arg = arguments[i + 1]; + if (const ColumnMap * map = checkAndGetColumn(arg.column.get())) + { + auto [key_col, value_col] = extract_kv_from_map(map); + key_cols[i] = std::move(key_col); + value_cols[i] = std::move(value_col); + } + else if (const ColumnConst * const_map = checkAndGetColumnConst(arg.column.get())) + { + const ColumnMap * map_data = assert_cast(&const_map->getDataColumn()); + auto [key_col, value_col] = extract_kv_from_map(map_data); + + size_t size = 
const_map->size(); + key_cols[i] = ColumnConst::create(std::move(key_col), size); + value_cols[i] = ColumnConst::create(std::move(value_col), size); + } + else + return nullptr; + } + + /// Compose temporary columns for keys and values + ColumnsWithTypeAndName key_columns(3); + key_columns[0] = arguments[0]; + ColumnsWithTypeAndName value_columns(3); + value_columns[0] = arguments[0]; + for (size_t i = 0; i < 2; ++i) + { + const auto & arg = arguments[i + 1]; + const DataTypeMap & type = static_cast(*arg.type); + const auto & key_type = type.getKeyType(); + const auto & value_type = type.getValueType(); + key_columns[i + 1] = {key_cols[i], std::make_shared(key_type), {}}; + value_columns[i + 1] = {value_cols[i], std::make_shared(value_type), {}}; + } + + /// Calculate function corresponding keys and values in map + const DataTypeMap & map_result_type = static_cast(*result_type); + auto key_result_type = std::make_shared(map_result_type.getKeyType()); + auto value_result_type = std::make_shared(map_result_type.getValueType()); + ColumnPtr key_result = executeImpl(key_columns, key_result_type, input_rows_count); + ColumnPtr value_result = executeImpl(value_columns, value_result_type, input_rows_count); + + /// key_result and value_result are not constant columns otherwise we won't reach here in executeMap + const auto * key_array = assert_cast(key_result.get()); + const auto * value_array = assert_cast(value_result.get()); + if (!key_array) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Key result column should be {} instead of {} in executeMap of function {}", + key_result_type->getName(), + key_result->getName(), + getName()); + if (!value_array) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Value result column should be {} instead of {} in executeMap of function {}", + key_result_type->getName(), + value_result->getName(), + getName()); + if (!key_array->hasEqualOffsets(*value_array)) + throw Exception( + ErrorCodes::LOGICAL_ERROR, "Key array and value array must have equal sizes in executeMap of function {}", getName()); + + auto nested_column = ColumnArray::create( + ColumnTuple::create(Columns{key_array->getDataPtr(), value_array->getDataPtr()}), key_array->getOffsetsPtr()); + return ColumnMap::create(std::move(nested_column)); + } + static ColumnPtr executeGeneric( - const ColumnUInt8 * cond_col, const ColumnsWithTypeAndName & arguments, size_t input_rows_count) + const ColumnUInt8 * cond_col, const ColumnsWithTypeAndName & arguments, size_t input_rows_count, bool use_variant_when_no_common_type) { /// Convert both columns to the common type (if needed). const ColumnWithTypeAndName & arg1 = arguments[1]; const ColumnWithTypeAndName & arg2 = arguments[2]; - DataTypePtr common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); + DataTypePtr common_type; + if (use_variant_when_no_common_type) + common_type = getLeastSupertypeOrVariant(DataTypes{arg1.type, arg2.type}); + else + common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); ColumnPtr col_then = castColumn(arg1, common_type); ColumnPtr col_else = castColumn(arg2, common_type); @@ -850,6 +1043,10 @@ private: ColumnPtr executeForNullableThenElse(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const { + /// If result type is Variant, we don't need to remove Nullable. 
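// A standalone sketch of the two branch-free selection tricks used earlier in this file's diff
// (the BRANCHFREE_IF_FLOAT macro and the fillConstantConstant lookup); illustrative only, the
// production macro derives its integer types via NumberTraits.
#include <cstdint>
#include <cstring>

inline float selectFloatBranchFree(uint8_t cond, float a, float b)
{
    /// mask is all zero bits when cond != 0 and all one bits when cond == 0.
    uint32_t mask = static_cast<uint32_t>(static_cast<uint32_t>(!!cond) - 1);
    uint32_t ua;
    uint32_t ub;
    std::memcpy(&ua, &a, sizeof(ua));
    std::memcpy(&ub, &b, sizeof(ub));
    uint32_t ur = (~mask & ua) | (mask & ub);
    float r;
    std::memcpy(&r, &ur, sizeof(r));
    return r;                                  /// equals cond ? a : b, NaN/Inf bit patterns included
}

inline double selectByTable(uint8_t cond, double a, double b)
{
    /// Same idea as the `ab[!cond[i]]` loop used for big integers and decimals.
    const double ab[2] = {a, b};
    return ab[!cond];
}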
+ if (isVariant(result_type)) + return nullptr; + const ColumnWithTypeAndName & arg_cond = arguments[0]; const ColumnWithTypeAndName & arg_then = arguments[1]; const ColumnWithTypeAndName & arg_else = arguments[2]; @@ -955,6 +1152,11 @@ private: assert_cast(*result_column).applyNullMap(assert_cast(*arg_cond.column)); return result_column; } + else if (auto * variant_column = typeid_cast(result_column.get())) + { + variant_column->applyNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else return ColumnNullable::create(materializeColumnIfConst(result_column), arg_cond.column); } @@ -993,6 +1195,11 @@ private: assert_cast(*result_column).applyNegatedNullMap(assert_cast(*arg_cond.column)); return result_column; } + else if (auto * variant_column = typeid_cast(result_column.get())) + { + variant_column->applyNegatedNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else { size_t size = input_rows_count; @@ -1082,6 +1289,9 @@ public: throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument (condition) of function if. " "Must be UInt8.", arguments[0]->getName()); + if (use_variant_when_no_common_type) + return getLeastSupertypeOrVariant(DataTypes{arguments[1], arguments[2]}); + return getLeastSupertype(DataTypes{arguments[1], arguments[2]}); } @@ -1112,17 +1322,12 @@ public: if (cond_const_col) { - if (arg_then.type->equals(*arg_else.type)) - { - return cond_const_col->getValue() - ? arg_then.column - : arg_else.column; - } + UInt8 value = cond_const_col->getValue(); + const ColumnWithTypeAndName & arg = value ? arg_then : arg_else; + if (arg.type->equals(*result_type)) + return arg.column; else - { - materialized_cond_col = cond_const_col->convertToFullColumn(); - cond_col = typeid_cast(&*materialized_cond_col); - } + return castColumn(arg, result_type); } if (!cond_col) @@ -1159,13 +1364,16 @@ public: TypeIndex left_id = left_type->getTypeId(); TypeIndex right_id = right_type->getTypeId(); + /// TODO optimize for map type + /// TODO optimize for nullable type if (!(callOnBasicTypes(left_id, right_id, call) || (res = executeTyped(cond_col, arguments, result_type, input_rows_count)) || (res = executeString(cond_col, arguments, result_type)) || (res = executeGenericArray(cond_col, arguments, result_type)) - || (res = executeTuple(arguments, result_type, input_rows_count)))) + || (res = executeTuple(arguments, result_type, input_rows_count)) + || (res = executeMap(arguments, result_type, input_rows_count)))) { - return executeGeneric(cond_col, arguments, input_rows_count); + return executeGeneric(cond_col, arguments, input_rows_count, use_variant_when_no_common_type); } return res; diff --git a/src/Functions/isNotNull.cpp b/src/Functions/isNotNull.cpp index cbdc08c2fab..dd5182aeade 100644 --- a/src/Functions/isNotNull.cpp +++ b/src/Functions/isNotNull.cpp @@ -1,13 +1,14 @@ -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include #include - namespace DB { namespace @@ -20,10 +21,7 @@ class FunctionIsNotNull : public IFunction public: static constexpr auto name = "isNotNull"; - static FunctionPtr create(ContextPtr) - { - return std::make_shared(); - } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } std::string getName() const override { @@ -45,15 +43,27 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const 
ColumnWithTypeAndName & elem = arguments[0]; + + if (isVariant(elem.type)) + { + const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + auto res = DataTypeUInt8().createColumn(); + auto & data = typeid_cast(*res).getData(); + data.resize(discriminators.size()); + for (size_t i = 0; i < discriminators.size(); ++i) + data[i] = discriminators[i] != ColumnVariant::NULL_DISCRIMINATOR; + return res; + } + if (elem.type->isLowCardinalityNullable()) { const auto * low_cardinality_column = checkAndGetColumn(*elem.column); const size_t null_index = low_cardinality_column->getDictionary().getNullValueIndex(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); - data.reserve(low_cardinality_column->size()); + data.resize(low_cardinality_column->size()); for (size_t i = 0; i != low_cardinality_column->size(); ++i) - data.push_back(low_cardinality_column->getIndexAt(i) != null_index); + data[i] = (low_cardinality_column->getIndexAt(i) != null_index); return res; } @@ -63,10 +73,7 @@ public: auto res_column = ColumnUInt8::create(input_rows_count); const auto & src_data = nullable->getNullMapData(); auto & res_data = assert_cast(*res_column).getData(); - - for (size_t i = 0; i < input_rows_count; ++i) - res_data[i] = !src_data[i]; - + vector(src_data, res_data); return res_column; } else @@ -75,8 +82,34 @@ public: return DataTypeUInt8().createColumnConst(elem.column->size(), 1u); } } -}; +private: + MULTITARGET_FUNCTION_AVX2_SSE42( + MULTITARGET_FUNCTION_HEADER(static void NO_INLINE), vectorImpl, MULTITARGET_FUNCTION_BODY((const PaddedPODArray & null_map, PaddedPODArray & res) /// NOLINT + { + size_t size = null_map.size(); + for (size_t i = 0; i < size; ++i) + res[i] = !null_map[i]; + })) + + static void NO_INLINE vector(const PaddedPODArray & null_map, PaddedPODArray & res) + { +#if USE_MULTITARGET_CODE + if (isArchSupported(TargetArch::AVX2)) + { + vectorImplAVX2(null_map, res); + return; + } + + if (isArchSupported(TargetArch::SSE42)) + { + vectorImplSSE42(null_map, res); + return; + } +#endif + vectorImpl(null_map, res); + } +}; } REGISTER_FUNCTION(IsNotNull) diff --git a/src/Functions/isNull.cpp b/src/Functions/isNull.cpp index cdce037088d..4bf4e44f866 100644 --- a/src/Functions/isNull.cpp +++ b/src/Functions/isNull.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -44,6 +45,18 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override { const ColumnWithTypeAndName & elem = arguments[0]; + + if (isVariant(elem.type)) + { + const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + auto res = DataTypeUInt8().createColumn(); + auto & data = typeid_cast(*res).getData(); + data.reserve(discriminators.size()); + for (auto discr : discriminators) + data.push_back(discr == ColumnVariant::NULL_DISCRIMINATOR); + return res; + } + if (elem.type->isLowCardinalityNullable()) { const auto * low_cardinality_column = checkAndGetColumn(*elem.column); diff --git a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp index 34081cddb92..94f02861af0 100644 --- a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp @@ -43,11 +43,11 @@ class ExtractKeyValuePairs : public IFunction builder.withQuotingCharacter(parsed_arguments.quoting_character.value()); } - bool is_number_of_pairs_unlimited = 
context->getSettingsRef().extract_kvp_max_pairs_per_row == 0; + bool is_number_of_pairs_unlimited = context->getSettingsRef().extract_key_value_pairs_max_pairs_per_row == 0; if (!is_number_of_pairs_unlimited) { - builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row); + builder.withMaxNumberOfPairs(context->getSettingsRef().extract_key_value_pairs_max_pairs_per_row); } return builder.build(); diff --git a/src/Functions/less.cpp b/src/Functions/less.cpp index 63bfcfc9f40..0998dc60292 100644 --- a/src/Functions/less.cpp +++ b/src/Functions/less.cpp @@ -1,11 +1,13 @@ #include #include +#include namespace DB { using FunctionLess = FunctionComparison; +using FunctionEquals = FunctionComparison; REGISTER_FUNCTION(Less) { @@ -16,14 +18,24 @@ template <> ColumnPtr FunctionComparison::executeTupleImpl( const ColumnsWithTypeAndName & x, const ColumnsWithTypeAndName & y, size_t tuple_size, size_t input_rows_count) const { - auto less = FunctionFactory::instance().get("less", context); + FunctionOverloadResolverPtr less + = std::make_unique(std::make_shared(check_decimal_overflow)); + + FunctionOverloadResolverPtr func_builder_or + = std::make_unique(std::make_shared()); + + FunctionOverloadResolverPtr func_builder_and + = std::make_unique(std::make_shared()); + + FunctionOverloadResolverPtr func_builder_equals + = std::make_unique(std::make_shared(check_decimal_overflow)); return executeTupleLessGreaterImpl( less, less, - FunctionFactory::instance().get("and", context), - FunctionFactory::instance().get("or", context), - FunctionFactory::instance().get("equals", context), + func_builder_and, + func_builder_or, + func_builder_equals, x, y, tuple_size, input_rows_count); } diff --git a/src/Functions/lessOrEquals.cpp b/src/Functions/lessOrEquals.cpp index a91afabe226..e88ae34da75 100644 --- a/src/Functions/lessOrEquals.cpp +++ b/src/Functions/lessOrEquals.cpp @@ -1,11 +1,14 @@ #include #include +#include namespace DB { using FunctionLessOrEquals = FunctionComparison; +using FunctionLess = FunctionComparison; +using FunctionEquals = FunctionComparison; REGISTER_FUNCTION(LessOrEquals) { @@ -16,12 +19,27 @@ template <> ColumnPtr FunctionComparison::executeTupleImpl( const ColumnsWithTypeAndName & x, const ColumnsWithTypeAndName & y, size_t tuple_size, size_t input_rows_count) const { + FunctionOverloadResolverPtr less_or_equals + = std::make_unique(std::make_shared(check_decimal_overflow)); + + FunctionOverloadResolverPtr less + = std::make_unique(std::make_shared(check_decimal_overflow)); + + FunctionOverloadResolverPtr func_builder_or + = std::make_unique(std::make_shared()); + + FunctionOverloadResolverPtr func_builder_and + = std::make_unique(std::make_shared()); + + FunctionOverloadResolverPtr func_builder_equals + = std::make_unique(std::make_shared(check_decimal_overflow)); + return executeTupleLessGreaterImpl( - FunctionFactory::instance().get("less", context), - FunctionFactory::instance().get("lessOrEquals", context), - FunctionFactory::instance().get("and", context), - FunctionFactory::instance().get("or", context), - FunctionFactory::instance().get("equals", context), + less, + less_or_equals, + func_builder_and, + func_builder_or, + func_builder_equals, x, y, tuple_size, input_rows_count); } diff --git a/src/Functions/map.cpp b/src/Functions/map.cpp index c950a0491a5..66cd10a3f0b 100644 --- a/src/Functions/map.cpp +++ b/src/Functions/map.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -30,9 +31,11 @@ class FunctionMap : public IFunction 
public: static constexpr auto name = "map"; - static FunctionPtr create(ContextPtr) + explicit FunctionMap(bool use_variant_as_common_type_) : use_variant_as_common_type(use_variant_as_common_type_) {} + + static FunctionPtr create(ContextPtr context) { - return std::make_shared(); + return std::make_shared(context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_as_common_type); } String getName() const override @@ -77,8 +80,16 @@ public: } DataTypes tmp; - tmp.emplace_back(getLeastSupertype(keys)); - tmp.emplace_back(getLeastSupertype(values)); + if (use_variant_as_common_type) + { + tmp.emplace_back(getLeastSupertypeOrVariant(keys)); + tmp.emplace_back(getLeastSupertypeOrVariant(values)); + } + else + { + tmp.emplace_back(getLeastSupertype(keys)); + tmp.emplace_back(getLeastSupertype(values)); + } return std::make_shared(tmp); } @@ -138,6 +149,9 @@ public: return ColumnMap::create(nested_column); } + +private: + bool use_variant_as_common_type = false; }; /// mapFromArrays(keys, values) is a function that allows you to make key-value pair from a pair of arrays diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index d0f5a1ce439..cb946b55c73 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -117,6 +118,9 @@ public: types_of_branches.emplace_back(arg); }); + if (context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_as_common_type) + return getLeastSupertypeOrVariant(types_of_branches); + return getLeastSupertype(types_of_branches); } diff --git a/src/Functions/notEquals.cpp b/src/Functions/notEquals.cpp index 08bedff399e..3a63db46711 100644 --- a/src/Functions/notEquals.cpp +++ b/src/Functions/notEquals.cpp @@ -1,6 +1,6 @@ #include #include - +#include namespace DB { @@ -16,9 +16,15 @@ template <> ColumnPtr FunctionComparison::executeTupleImpl( const ColumnsWithTypeAndName & x, const ColumnsWithTypeAndName & y, size_t tuple_size, size_t input_rows_count) const { + FunctionOverloadResolverPtr func_builder_not_equals + = std::make_unique(std::make_shared(check_decimal_overflow)); + + FunctionOverloadResolverPtr func_builder_or + = std::make_unique(std::make_shared()); + return executeTupleEqualityImpl( - FunctionFactory::instance().get("notEquals", context), - FunctionFactory::instance().get("or", context), + func_builder_not_equals, + func_builder_or, x, y, tuple_size, input_rows_count); } diff --git a/src/Functions/padString.cpp b/src/Functions/padString.cpp index d0f22aeeb3b..b26a4ec3d6a 100644 --- a/src/Functions/padString.cpp +++ b/src/Functions/padString.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -188,7 +189,7 @@ namespace arguments[2]->getName(), getName()); - return arguments[0]; + return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override diff --git a/src/Functions/regexpExtract.cpp b/src/Functions/regexpExtract.cpp index 0502d2fbfdc..f6bbd2f96f2 100644 --- a/src/Functions/regexpExtract.cpp +++ b/src/Functions/regexpExtract.cpp @@ -124,21 +124,23 @@ private: res_offsets.push_back(res_offset); } - static void vectorConstant( + void vectorConstant( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, ssize_t index, ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets) + ColumnString::Offsets & res_offsets) 
const { const OptimizedRegularExpression regexp = Regexps::createRegexp(pattern); unsigned capture = regexp.getNumberOfSubpatterns(); if (index < 0 || index >= capture + 1) throw Exception( ErrorCodes::INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE, - "Index value {} is out of range, should be in [0, {})", + "Index value {} for regexp pattern `{}` in function {} is out-of-range, should be in [0, {})", index, + pattern, + getName(), capture + 1); OptimizedRegularExpression::MatchVec matches; @@ -161,13 +163,13 @@ private: } } - static void vectorVector( + void vectorVector( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, const ColumnPtr & column_index, ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets) + ColumnString::Offsets & res_offsets) const { res_data.reserve(data.size() / 5); res_offsets.reserve(offsets.size()); @@ -187,8 +189,10 @@ private: if (index < 0 || index >= capture + 1) throw Exception( ErrorCodes::INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE, - "Index value {} is out of range, should be in [0, {})", + "Index value {} for regexp pattern `{}` in function {} is out-of-range, should be in [0, {})", index, + pattern, + getName(), capture + 1); regexp.match( @@ -202,12 +206,12 @@ private: } } - static void constantVector( + void constantVector( const std::string & str, const std::string & pattern, const ColumnPtr & column_index, ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets) + ColumnString::Offsets & res_offsets) const { size_t rows = column_index->size(); res_data.reserve(str.size() / 5); @@ -230,8 +234,10 @@ private: if (index < 0 || index >= capture + 1) throw Exception( ErrorCodes::INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE, - "Index value {} is out of range, should be in [0, {})", + "Index value {} for regexp pattern `{}` in function {} is out-of-range, should be in [0, {})", index, + pattern, + getName(), capture + 1); saveMatch(matches, index, padded_str, 0, res_data, res_offsets, res_offset); diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 21e36761213..e9276c4aefb 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -128,6 +128,10 @@ public: res_data.insert(residue.begin(), residue.end()); res_col_offsets_data.push_back(res_data.size()); + // Create Baseline = seasonal + trend + std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus<>()); + res_col_offsets_data.push_back(res_data.size()); + root_offsets_data.push_back(res_col_offsets->size()); prev_src_offset = curr_offset; @@ -201,7 +205,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of three arrays where the first array include seasonal components, the second array - trend, and the third array - residue component. +- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline(seasonal + trend) component. Type: [Array](../../sql-reference/data-types/array.md). 
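The baseline component added above is nothing more than the element-wise sum of the seasonal and trend arrays, produced by the `std::transform` call with `std::plus<>`. A minimal stand-alone sketch of that step, using illustrative values rather than real STL decomposition output:

``` cpp
#include <algorithm>
#include <functional>
#include <iostream>
#include <iterator>
#include <vector>

int main()
{
    /// Hypothetical decomposition outputs for one series (values are illustrative only).
    std::vector<float> seasonal = {10.1f, 20.45f, 40.34f};
    std::vector<float> trend    = {0.0f,  0.0f,   0.0f};

    /// The baseline component is the element-wise sum seasonal[i] + trend[i],
    /// appended after the seasonal, trend and residue arrays.
    std::vector<float> baseline;
    std::transform(seasonal.begin(), seasonal.end(), trend.begin(),
                   std::back_inserter(baseline), std::plus<>());

    for (float v : baseline)
        std::cout << v << ' ';
    std::cout << '\n';
}
```

Appending these sums after the residue array is what turns the documented result into an outer array of four arrays.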
@@ -230,6 +234,10 @@ Result: [ 0, 0.0000019073486, -0.0000019073486, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.0000019073486, 0, 0 + ], + [ + 10.1, 20.449999, 40.340004, 10.100001, 20.45, 40.34, 10.100001, 20.45, 40.34, 10.1, 20.45, 40.34, + 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.100002, 20.45, 40.34 ]] │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ```)", diff --git a/src/Functions/seriesOutliersDetectTukey.cpp b/src/Functions/seriesOutliersDetectTukey.cpp new file mode 100644 index 00000000000..8a2e276c74a --- /dev/null +++ b/src/Functions/seriesOutliersDetectTukey.cpp @@ -0,0 +1,262 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +extern const int ILLEGAL_COLUMN; +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +/// Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) +class FunctionSeriesOutliersDetectTukey : public IFunction +{ +public: + static constexpr auto name = "seriesOutliersDetectTukey"; + + static constexpr Float64 min_quartile = 2.0; + static constexpr Float64 max_quartile = 98.0; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + bool isVariadic() const override { return true; } + + size_t getNumberOfArguments() const override { return 0; } + + bool useDefaultImplementationForConstants() const override { return true; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.size() != 1 && arguments.size() != 4) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Function {} needs either 1 or 4 arguments; passed {}.", + getName(), + arguments.size()); + + FunctionArgumentDescriptors mandatory_args{{"time_series", &isArray, nullptr, "Array"}}; + FunctionArgumentDescriptors optional_args{ + {"min_percentile", &isNativeNumber, isColumnConst, "Number"}, + {"max_percentile", &isNativeNumber, isColumnConst, "Number"}, + {"k", &isNativeNumber, isColumnConst, "Number"}}; + + validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); + + return std::make_shared(std::make_shared()); + } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + ColumnPtr col = arguments[0].column; + const ColumnArray * col_arr = checkAndGetColumn(col.get()); + + const IColumn & arr_data = col_arr->getData(); + const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets(); + + ColumnPtr col_res; + if (input_rows_count == 0) + return ColumnArray::create(ColumnFloat64::create()); + + + Float64 min_percentile = 0.25; /// default 25th percentile + Float64 max_percentile = 0.75; /// default 75th percentile + Float64 k = 1.50; + + if (arguments.size() > 1) + { + Float64 p_min = arguments[1].column->getFloat64(0); + if (isnan(p_min) || !isFinite(p_min) || p_min < min_quartile|| p_min > max_quartile) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName()); + + 
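The scoring performed further below is classic Tukey fencing. As a stand-alone sketch, here is the same computation with the default 25th/75th percentiles and `k = 1.5`, applied to the sample series from the documentation example and using the same percentile interpolation rule (average of the two neighbouring order statistics when `len * p` is an integer, otherwise the element at index `ceil(len * p) - 1` of the sorted data):

``` cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

/// Percentile with the interpolation rule used above; assumes 0 < p < 1 and enough data.
static double percentile(const std::vector<double> & sorted, double p)
{
    double pos = sorted.size() * p;
    if (pos == std::floor(pos))
    {
        size_t index = static_cast<size_t>(pos) - 1;
        return (sorted[index] + sorted[index + 1]) / 2;
    }
    return sorted[static_cast<size_t>(std::ceil(pos)) - 1];
}

int main()
{
    /// Same sample series as in the documentation example below.
    std::vector<double> series = {-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6};

    std::vector<double> sorted = series;
    std::sort(sorted.begin(), sorted.end());

    double q1 = percentile(sorted, 0.25);   /// default min percentile
    double q3 = percentile(sorted, 0.75);   /// default max percentile
    double k = 1.5;
    double iqr = q3 - q1;

    double lower_fence = q1 - k * iqr;
    double upper_fence = q3 + k * iqr;

    /// Score is how far a point falls outside the fences; 0 means "not an outlier".
    for (double x : series)
    {
        double score = std::min(x - lower_fence, 0.0) + std::max(x - upper_fence, 0.0);
        std::cout << score << ' ';
    }
    std::cout << '\n';
}
```

For this input only the value 45 falls outside the fences, giving the score 27 shown in the documented output; every other point scores 0.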
min_percentile = p_min / 100; + + Float64 p_max = arguments[2].column->getFloat64(0); + if (isnan(p_max) || !isFinite(p_max) || p_max < min_quartile || p_max > max_quartile || p_max < min_percentile * 100) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName()); + + max_percentile = p_max / 100; + + auto k_val = arguments[3].column->getFloat64(0); + if (k_val < 0.0 || isnan(k_val) || !isFinite(k_val)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a positive number", getName()); + + k = k_val; + } + + if (executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)) + { + return col_res; + } + else + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}", + arguments[0].column->getName(), + getName()); + } + +private: + template + bool executeNumber( + const IColumn & arr_data, + const ColumnArray::Offsets & arr_offsets, + Float64 min_percentile, + Float64 max_percentile, + Float64 k, + ColumnPtr & res_ptr) const + { + const ColumnVector * src_data_concrete = checkAndGetColumn>(&arr_data); + if (!src_data_concrete) + return false; + + const PaddedPODArray & src_vec = src_data_concrete->getData(); + + auto outliers = ColumnFloat64::create(); + auto & outlier_data = outliers->getData(); + + ColumnArray::ColumnOffsets::MutablePtr res_offsets = ColumnArray::ColumnOffsets::create(); + auto & res_offsets_data = res_offsets->getData(); + + std::vector src_sorted; + + ColumnArray::Offset prev_src_offset = 0; + for (auto src_offset : arr_offsets) + { + chassert(prev_src_offset <= src_offset); + size_t len = src_offset - prev_src_offset; + if (len < 4) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "At least four data points are needed for function {}", getName()); + + src_sorted.assign(src_vec.begin() + prev_src_offset, src_vec.begin() + src_offset); + std::sort(src_sorted.begin(), src_sorted.end()); + + Float64 q1, q2; + + Float64 p1 = len * min_percentile; + if (p1 == static_cast(p1)) + { + size_t index = static_cast(p1) - 1; + q1 = (src_sorted[index] + src_sorted[index + 1]) / 2; + } + else + { + size_t index = static_cast(std::ceil(p1)) - 1; + q1 = src_sorted[index]; + } + + Float64 p2 = len * max_percentile; + if (p2 == static_cast(p2)) + { + size_t index = static_cast(p2) - 1; + q2 = (src_sorted[index] + src_sorted[index + 1]) / 2; + } + else + { + size_t index = static_cast(std::ceil(p2)) - 1; + q2 = src_sorted[index]; + } + + Float64 iqr = q2 - q1; /// interquantile range + + Float64 lower_fence = q1 - k * iqr; + Float64 upper_fence = q2 + k * iqr; + + for (ColumnArray::Offset j = prev_src_offset; j < src_offset; ++j) + { + auto score = std::min((src_vec[j] - 
lower_fence), 0.0) + std::max((src_vec[j] - upper_fence), 0.0); + outlier_data.push_back(score); + } + res_offsets_data.push_back(outlier_data.size()); + prev_src_offset = src_offset; + } + + res_ptr = ColumnArray::create(std::move(outliers), std::move(res_offsets)); + return true; + } +}; + +REGISTER_FUNCTION(SeriesOutliersDetectTukey) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). + +**Syntax** + +``` sql +seriesOutliersDetectTukey(series); +seriesOutliersDetectTukey(series, min_percentile, max_percentile, k); +``` + +**Arguments** + +- `series` - An array of numeric values. +- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25. +- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75. +- `k` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5 + +At least four data points are required in `series` to detect outliers. + +**Returned value** + +- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly. + +Type: [Array](../../sql-reference/data-types/array.md). + +**Examples** + +Query: + +``` sql +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0; +``` + +Result: + +``` text +┌───────────print_0─────────────────┐ +│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │ +└───────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0; +``` + +Result: + +``` text +┌─print_0──────────────────────────────┐ +│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │ +└──────────────────────────────────────┘ +```)", + .categories{"Time series analysis"}}); +} +} diff --git a/src/Functions/serverConstants.cpp b/src/Functions/serverConstants.cpp index 9f1a3584df8..fd8fb22455b 100644 --- a/src/Functions/serverConstants.cpp +++ b/src/Functions/serverConstants.cpp @@ -51,12 +51,12 @@ namespace }; - class FunctionTcpPort : public FunctionConstantBase + class FunctionTCPPort : public FunctionConstantBase { public: static constexpr auto name = "tcpPort"; - static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } - explicit FunctionTcpPort(ContextPtr context) : FunctionConstantBase(context->getTCPPort(), context->isDistributed()) {} + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + explicit FunctionTCPPort(ContextPtr context) : FunctionConstantBase(context->getTCPPort(), context->isDistributed()) {} }; @@ -153,9 +153,9 @@ REGISTER_FUNCTION(ServerUUID) factory.registerFunction(); } -REGISTER_FUNCTION(TcpPort) +REGISTER_FUNCTION(TCPPort) { - factory.registerFunction(); + factory.registerFunction(); } REGISTER_FUNCTION(Timezone) diff --git a/src/Functions/translate.cpp b/src/Functions/translate.cpp index ad5be7d9dfd..c7173909029 100644 --- a/src/Functions/translate.cpp +++ b/src/Functions/translate.cpp @@ -1,12 +1,15 @@ -#include -#include #include +#include +#include +#include #include #include -#include +#include +#include #include #include -#include +#include + 
#include @@ -298,7 +301,14 @@ public: throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of third argument of function {}", arguments[2]->getName(), getName()); - return std::make_shared(); + if (isString(arguments[0])) + return std::make_shared(); + else + { + const auto * ptr = checkAndGetDataType(arguments[0].get()); + chassert(ptr); + return std::make_shared(ptr->getN()); + } } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override diff --git a/src/Functions/trap.cpp b/src/Functions/trap.cpp index 99430f039a4..6ce696fedb5 100644 --- a/src/Functions/trap.cpp +++ b/src/Functions/trap.cpp @@ -177,7 +177,7 @@ public: } else if (mode == "logical error") { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: trap"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trap"); } else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown trap mode"); diff --git a/src/Functions/variantElement.cpp b/src/Functions/variantElement.cpp new file mode 100644 index 00000000000..2744a0dabb8 --- /dev/null +++ b/src/Functions/variantElement.cpp @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +namespace +{ + +/** Extract element of Variant by variant type name. + * Also the function looks through Arrays: you can get Array of Variant elements from Array of Variants. + */ +class FunctionVariantElement : public IFunction +{ +public: + static constexpr auto name = "variantElement"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } + bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + const size_t number_of_arguments = arguments.size(); + + if (number_of_arguments < 2 || number_of_arguments > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 2 or 3", + getName(), number_of_arguments); + + size_t count_arrays = 0; + const IDataType * input_type = arguments[0].type.get(); + while (const DataTypeArray * array = checkAndGetDataType(input_type)) + { + input_type = array->getNestedType().get(); + ++count_arrays; + } + + const DataTypeVariant * variant_type = checkAndGetDataType(input_type); + if (!variant_type) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant or Array of Variant. 
Actual {}", + getName(), + arguments[0].type->getName()); + + std::optional variant_global_discr = getVariantGlobalDiscriminator(arguments[1].column, *variant_type, number_of_arguments); + if (variant_global_discr.has_value()) + { + DataTypePtr return_type = makeNullableOrLowCardinalityNullableSafe(variant_type->getVariant(variant_global_discr.value())); + + for (; count_arrays; --count_arrays) + return_type = std::make_shared(return_type); + + return return_type; + } + else + return arguments[2].type; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto & input_arg = arguments[0]; + const IDataType * input_type = input_arg.type.get(); + const IColumn * input_col = input_arg.column.get(); + + bool input_arg_is_const = false; + if (typeid_cast(input_col)) + { + input_col = assert_cast(input_col)->getDataColumnPtr().get(); + input_arg_is_const = true; + } + + Columns array_offsets; + while (const DataTypeArray * array_type = checkAndGetDataType(input_type)) + { + const ColumnArray * array_col = assert_cast(input_col); + + input_type = array_type->getNestedType().get(); + input_col = &array_col->getData(); + array_offsets.push_back(array_col->getOffsetsPtr()); + } + + const DataTypeVariant * input_type_as_variant = checkAndGetDataType(input_type); + const ColumnVariant * input_col_as_variant = checkAndGetColumn(input_col); + if (!input_type_as_variant || !input_col_as_variant) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant or array of Variants. Actual {}", getName(), input_arg.type->getName()); + + std::optional variant_global_discr = getVariantGlobalDiscriminator(arguments[1].column, *input_type_as_variant, arguments.size()); + + if (!variant_global_discr.has_value()) + return arguments[2].column; + + const auto & variant_type = input_type_as_variant->getVariant(*variant_global_discr); + const auto & variant_column = input_col_as_variant->getVariantPtrByGlobalDiscriminator(*variant_global_discr); + + /// If Variant has only NULLs or our variant doesn't have any real values, + /// just create column with default values and create null mask with 1. + if (input_col_as_variant->hasOnlyNulls() || variant_column->empty()) + { + auto res = variant_type->createColumn(); + + if (variant_type->lowCardinality()) + assert_cast(*res).nestedToNullable(); + + res->insertManyDefaults(input_col_as_variant->size()); + if (!variant_type->canBeInsideNullable()) + return wrapInArraysAndConstIfNeeded(std::move(res), array_offsets, input_arg_is_const, input_rows_count); + + auto null_map = ColumnUInt8::create(); + auto & null_map_data = null_map->getData(); + null_map_data.resize_fill(input_col_as_variant->size(), 1); + return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(res), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count); + } + + /// If we extract single non-empty column and have no NULLs, then just return this variant. + if (auto non_empty_local_discr = input_col_as_variant->getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + /// If we were trying to extract some other variant, + /// it would be empty and we would already processed this case above. 
+ chassert(input_col_as_variant->globalDiscriminatorByLocal(*non_empty_local_discr) == variant_global_discr); + return wrapInArraysAndConstIfNeeded(makeNullableOrLowCardinalityNullableSafe(variant_column), array_offsets, input_arg_is_const, input_rows_count); + } + + /// In general case we should calculate null-mask for variant + /// according to the discriminators column and expand + /// variant column by this mask to get a full column (with default values on NULLs) + const auto & local_discriminators = input_col_as_variant->getLocalDiscriminators(); + auto null_map = ColumnUInt8::create(); + auto & null_map_data = null_map->getData(); + null_map_data.reserve(local_discriminators.size()); + auto variant_local_discr = input_col_as_variant->localDiscriminatorByGlobal(*variant_global_discr); + for (auto local_discr : local_discriminators) + null_map_data.push_back(local_discr != variant_local_discr); + + auto expanded_variant_column = IColumn::mutate(variant_column); + if (variant_type->lowCardinality()) + expanded_variant_column = assert_cast(*expanded_variant_column).cloneNullable(); + expanded_variant_column->expand(null_map_data, /*inverted = */ true); + if (variant_type->canBeInsideNullable()) + return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(expanded_variant_column), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count); + return wrapInArraysAndConstIfNeeded(std::move(expanded_variant_column), array_offsets, input_arg_is_const, input_rows_count); + } +private: + std::optional getVariantGlobalDiscriminator(const ColumnPtr & index_column, const DataTypeVariant & variant_type, size_t argument_size) const + { + const auto * name_col = checkAndGetColumnConst(index_column.get()); + if (!name_col) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Second argument to {} with Variant argument must be a constant String", + getName()); + + String variant_element_name = name_col->getValue(); + auto variant_element_type = DataTypeFactory::instance().tryGet(variant_element_name); + if (variant_element_type) + { + const auto & variants = variant_type.getVariants(); + for (size_t i = 0; i != variants.size(); ++i) + { + if (variants[i]->getName() == variant_element_type->getName()) + return i; + } + } + + if (argument_size == 2) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} doesn't contain variant with type {}", variant_type.getName(), variant_element_name); + return std::nullopt; + } + + ColumnPtr wrapInArraysAndConstIfNeeded(ColumnPtr res, const Columns & array_offsets, bool input_arg_is_const, size_t input_rows_count) const + { + for (auto it = array_offsets.rbegin(); it != array_offsets.rend(); ++it) + res = ColumnArray::create(res, *it); + + if (input_arg_is_const) + res = ColumnConst::create(res, input_rows_count); + + return res; + } +}; + +} + +REGISTER_FUNCTION(VariantElement) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Extracts a column with specified type from a `Variant` column. +)", + .syntax{"variantElement(variant, type_name, [, default_value])"}, + .arguments{{ + {"variant", "Variant column"}, + {"type_name", "The name of the variant type to extract"}, + {"default_value", "The default value that will be used if variant doesn't have variant with specified type. Can be any type. 
Optional"}}}, + .examples{{{ + "Example", + R"( +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test;)", + R"( +┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │ +│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘ +)"}}}, + .categories{"Variant"}, + }); +} + +} diff --git a/src/Functions/variantType.cpp b/src/Functions/variantType.cpp new file mode 100644 index 00000000000..e867cb03a23 --- /dev/null +++ b/src/Functions/variantType.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +namespace +{ + +/// Return enum with type name for each row in Variant column. +class FunctionVariantType : public IFunction +{ +public: + static constexpr auto name = "variantType"; + static constexpr auto enum_name_for_null = "None"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.empty() || arguments.size() > 1) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1", + getName(), arguments.empty()); + + const DataTypeVariant * variant_type = checkAndGetDataType(arguments[0].type.get()); + + if (!variant_type) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant, got {} instead", + getName(), arguments[0].type->getName()); + + const auto & variants = variant_type->getVariants(); + std::vector> enum_values; + enum_values.reserve(variants.size() + 1); + for (ColumnVariant::Discriminator i = 0; i != variants.size(); ++i) + enum_values.emplace_back(variants[i]->getName(), i); + enum_values.emplace_back(enum_name_for_null, ColumnVariant::NULL_DISCRIMINATOR); + return std::make_shared>(enum_values); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + const ColumnVariant * variant_column = checkAndGetColumn(arguments[0].column.get()); + if (!variant_column) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant, got {} instead", + getName(), arguments[0].type->getName()); + + auto res = result_type->createColumn(); + auto & res_data = typeid_cast(res.get())->getData(); + res_data.reserve(input_rows_count); + for (size_t i = 0; i != 
input_rows_count; ++i) + res_data.push_back(variant_column->globalDiscriminatorAt(i)); + + return res; + } +}; + +} + +REGISTER_FUNCTION(VariantType) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Returns the variant type name for each row of `Variant` column. If row contains NULL, it returns 'None' for it. +)", + .syntax = {"variantType(variant)"}, + .arguments = {{"variant", "Variant column"}}, + .examples = {{{ + "Example", + R"( +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT variantType(v) FROM test;)", + R"( +┌─variantType(v)─┐ +│ None │ +│ UInt64 │ +│ String │ +│ Array(UInt64) │ +└────────────────┘ +)"}}}, + .categories{"Variant"}, + }); +} + +} diff --git a/src/Functions/vectorFunctions.cpp b/src/Functions/vectorFunctions.cpp index 33b0e9f6039..de4a6fb0a5c 100644 --- a/src/Functions/vectorFunctions.cpp +++ b/src/Functions/vectorFunctions.cpp @@ -1,9 +1,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -1364,11 +1364,11 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - if (getReturnTypeImpl(arguments)->isNullable()) - { - return DataTypeNullable(std::make_shared()) - .createColumnConstWithDefaultValue(input_rows_count); - } + /// TODO: cosineDistance does not support nullable arguments + /// https://github.com/ClickHouse/ClickHouse/pull/27933#issuecomment-916670286 + auto return_type = getReturnTypeImpl(arguments); + if (return_type->isNullable()) + return return_type->createColumnConstWithDefaultValue(input_rows_count); FunctionDotProduct dot(context); ColumnWithTypeAndName dot_result{dot.executeImpl(arguments, DataTypePtr(), input_rows_count), diff --git a/src/Functions/widthBucket.cpp b/src/Functions/widthBucket.cpp index e95f7c05756..62ed460ca9d 100644 --- a/src/Functions/widthBucket.cpp +++ b/src/Functions/widthBucket.cpp @@ -44,7 +44,7 @@ class FunctionWidthBucket : public IFunction { throw Exception( ErrorCodes::LOGICAL_ERROR, - "Logical error in function {}: argument {} has unexpected type or size!", + "Logical error in function {}: argument {} has unexpected type or size.", getName(), argument_index); } @@ -157,7 +157,7 @@ class FunctionWidthBucket : public IFunction if (are_all_const_cols) { throw Exception( - ErrorCodes::LOGICAL_ERROR, "Logical error in function {}: unexpected combination of argument types!", getName()); + ErrorCodes::LOGICAL_ERROR, "Logical error in function {}: unexpected combination of argument types.", getName()); } auto result_column = ColumnVector::create(); diff --git a/src/IO/Archives/IArchiveReader.h b/src/IO/Archives/IArchiveReader.h index 84a1dc21f5b..ee516d2655b 100644 --- a/src/IO/Archives/IArchiveReader.h +++ b/src/IO/Archives/IArchiveReader.h @@ -56,6 +56,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. 
virtual std::unique_ptr readFile(std::unique_ptr enumerator) = 0; virtual std::unique_ptr nextFile(std::unique_ptr read_buffer) = 0; + virtual std::unique_ptr currentFile(std::unique_ptr read_buffer) = 0; virtual std::vector getAllFiles() = 0; virtual std::vector getAllFiles(NameFilter filter) = 0; diff --git a/src/IO/Archives/LibArchiveReader.cpp b/src/IO/Archives/LibArchiveReader.cpp index 763cd3b171b..a9ce401138b 100644 --- a/src/IO/Archives/LibArchiveReader.cpp +++ b/src/IO/Archives/LibArchiveReader.cpp @@ -228,12 +228,7 @@ public: off_t getPosition() override { - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "getPosition is not supported when reading from archive"); - } - - size_t getFileOffsetOfBufferEnd() const override - { - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "getFileOffsetOfBufferEnd is not supported when reading from archive"); + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "getPosition not supported when reading from archive"); } String getFileName() const override { return handle.getFileName(); } @@ -340,6 +335,15 @@ std::unique_ptr LibArchiveReader::nextFile(std return std::make_unique(std::move(handle)); } +std::unique_ptr LibArchiveReader::currentFile(std::unique_ptr read_buffer) +{ + if (!dynamic_cast(read_buffer.get())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()"); + auto read_buffer_from_libarchive = std::unique_ptr(static_cast(read_buffer.release())); + auto handle = std::move(*read_buffer_from_libarchive).releaseHandle(); + return std::make_unique(std::move(handle)); +} + std::vector LibArchiveReader::getAllFiles() { return getAllFiles({}); diff --git a/src/IO/Archives/LibArchiveReader.h b/src/IO/Archives/LibArchiveReader.h index 3dadd710089..c4b08d8ddf7 100644 --- a/src/IO/Archives/LibArchiveReader.h +++ b/src/IO/Archives/LibArchiveReader.h @@ -40,6 +40,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. 
std::unique_ptr readFile(std::unique_ptr enumerator) override; std::unique_ptr nextFile(std::unique_ptr read_buffer) override; + std::unique_ptr currentFile(std::unique_ptr read_buffer) override; std::vector getAllFiles() override; std::vector getAllFiles(NameFilter filter) override; diff --git a/src/IO/Archives/ZipArchiveReader.cpp b/src/IO/Archives/ZipArchiveReader.cpp index 636042ec586..2a9b7a43519 100644 --- a/src/IO/Archives/ZipArchiveReader.cpp +++ b/src/IO/Archives/ZipArchiveReader.cpp @@ -15,7 +15,6 @@ namespace ErrorCodes extern const int CANNOT_UNPACK_ARCHIVE; extern const int LOGICAL_ERROR; extern const int SEEK_POSITION_OUT_OF_BOUND; - extern const int UNSUPPORTED_METHOD; extern const int CANNOT_SEEK_THROUGH_FILE; } @@ -253,11 +252,6 @@ public: checkResult(err); } - size_t getFileOffsetOfBufferEnd() const override - { - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "getFileOffsetOfBufferEnd is not supported when reading from zip archive"); - } - off_t seek(off_t off, int whence) override { off_t current_pos = getPosition(); @@ -589,6 +583,15 @@ std::unique_ptr ZipArchiveReader::nextFile(std return std::make_unique(std::move(handle)); } +std::unique_ptr ZipArchiveReader::currentFile(std::unique_ptr read_buffer) +{ + if (!dynamic_cast(read_buffer.get())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()"); + auto read_buffer_from_zip = std::unique_ptr(static_cast(read_buffer.release())); + auto handle = std::move(*read_buffer_from_zip).releaseHandle(); + return std::make_unique(std::move(handle)); +} + std::vector ZipArchiveReader::getAllFiles() { return getAllFiles({}); diff --git a/src/IO/Archives/ZipArchiveReader.h b/src/IO/Archives/ZipArchiveReader.h index a8788064fec..4b1910839eb 100644 --- a/src/IO/Archives/ZipArchiveReader.h +++ b/src/IO/Archives/ZipArchiveReader.h @@ -47,6 +47,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. 
std::unique_ptr readFile(std::unique_ptr enumerator) override; std::unique_ptr nextFile(std::unique_ptr read_buffer) override; + std::unique_ptr currentFile(std::unique_ptr read_buffer) override; std::vector getAllFiles() override; std::vector getAllFiles(NameFilter filter) override; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp new file mode 100644 index 00000000000..4714c795927 --- /dev/null +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -0,0 +1,340 @@ +#include + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event AzureCopyObject; + extern const Event AzureUploadPart; + + extern const Event DiskAzureCopyObject; + extern const Event DiskAzureUploadPart; +} + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_CONFIG_PARAMETER; + extern const int AZURE_BLOB_STORAGE_ERROR; +} + +namespace +{ + class UploadHelper + { + public: + UploadHelper( + const CreateReadBuffer & create_read_buffer_, + std::shared_ptr client_, + size_t offset_, + size_t total_size_, + const String & dest_container_for_logging_, + const String & dest_blob_, + std::shared_ptr settings_, + ThreadPoolCallbackRunner schedule_, + bool for_disk_azure_blob_storage_, + const Poco::Logger * log_) + : create_read_buffer(create_read_buffer_) + , client(client_) + , offset (offset_) + , total_size (total_size_) + , dest_container_for_logging(dest_container_for_logging_) + , dest_blob(dest_blob_) + , settings(settings_) + , schedule(schedule_) + , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) + , log(log_) + , max_single_part_upload_size(settings_->max_single_part_upload_size) + { + } + + virtual ~UploadHelper() = default; + + protected: + std::function()> create_read_buffer; + std::shared_ptr client; + size_t offset; + size_t total_size; + const String & dest_container_for_logging; + const String & dest_blob; + std::shared_ptr settings; + ThreadPoolCallbackRunner schedule; + bool for_disk_azure_blob_storage; + const Poco::Logger * log; + size_t max_single_part_upload_size; + + struct UploadPartTask + { + size_t part_offset; + size_t part_size; + std::vector block_ids; + bool is_finished = false; + std::exception_ptr exception; + }; + + size_t normal_part_size; + std::vector block_ids; + + std::list TSA_GUARDED_BY(bg_tasks_mutex) bg_tasks; + int num_added_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + int num_finished_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + std::mutex bg_tasks_mutex; + std::condition_variable bg_tasks_condvar; + + void calculatePartSize() + { + auto max_upload_part_size = settings->max_upload_part_size; + if (!max_upload_part_size) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_upload_part_size must not be 0"); + /// We've calculated the size of a normal part (the final part can be smaller). 
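`performMultipartUpload()` below walks the byte range `[offset, offset + total_size)` in steps of `normal_part_size`, so only the final part may be smaller. The splitting logic in isolation, with made-up sizes:

``` cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    /// Illustrative values only: a 10 MiB source copied in 4 MiB parts.
    size_t offset = 0;
    size_t total_size = 10 * 1024 * 1024;
    size_t normal_part_size = 4 * 1024 * 1024;

    size_t position = offset;
    size_t end_position = offset + total_size;
    while (position < end_position)
    {
        size_t next_position = std::min(position + normal_part_size, end_position);
        size_t part_size = next_position - position;   /// the last part here is 2 MiB
        std::cout << "part at " << position << ", size " << part_size << '\n';
        position = next_position;
    }
}
```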
+ normal_part_size = max_upload_part_size; + } + + public: + void performCopy() + { + performMultipartUpload(); + } + + void completeMultipartUpload() + { + auto block_blob_client = client->GetBlockBlobClient(dest_blob); + block_blob_client.CommitBlockList(block_ids); + } + + void performMultipartUpload() + { + calculatePartSize(); + + size_t position = offset; + size_t end_position = offset + total_size; + + try + { + while (position < end_position) + { + size_t next_position = std::min(position + normal_part_size, end_position); + size_t part_size = next_position - position; /// `part_size` is either `normal_part_size` or smaller if it's the final part. + + uploadPart(position, part_size); + + position = next_position; + } + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + waitForAllBackgroundTasks(); + throw; + } + + waitForAllBackgroundTasks(); + completeMultipartUpload(); + } + + + void uploadPart(size_t part_offset, size_t part_size) + { + LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, Size: {}", dest_container_for_logging, dest_blob, part_size); + + if (!part_size) + { + LOG_TRACE(log, "Skipping writing an empty part."); + return; + } + + if (schedule) + { + UploadPartTask * task = nullptr; + + { + std::lock_guard lock(bg_tasks_mutex); + task = &bg_tasks.emplace_back(); + ++num_added_bg_tasks; + } + + /// Notify waiting thread when task finished + auto task_finish_notify = [this, task]() + { + std::lock_guard lock(bg_tasks_mutex); + task->is_finished = true; + ++num_finished_bg_tasks; + + /// Notification under mutex is important here. + /// Otherwise, WriteBuffer could be destroyed in between + /// Releasing lock and condvar notification. + bg_tasks_condvar.notify_one(); + }; + + try + { + task->part_offset = part_offset; + task->part_size = part_size; + + schedule([this, task, task_finish_notify]() + { + try + { + processUploadPartRequest(*task); + } + catch (...) + { + task->exception = std::current_exception(); + } + task_finish_notify(); + }, Priority{}); + } + catch (...) + { + task_finish_notify(); + throw; + } + } + else + { + UploadPartTask task; + task.part_offset = part_offset; + task.part_size = part_size; + processUploadPartRequest(task); + block_ids.insert(block_ids.end(),task.block_ids.begin(), task.block_ids.end()); + } + } + + void processUploadPartRequest(UploadPartTask & task) + { + ProfileEvents::increment(ProfileEvents::AzureUploadPart); + if (for_disk_azure_blob_storage) + ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); + + auto block_blob_client = client->GetBlockBlobClient(dest_blob); + auto read_buffer = std::make_unique(create_read_buffer(), task.part_offset, task.part_size); + while (!read_buffer->eof()) + { + auto size = read_buffer->available(); + if (size > 0) + { + auto block_id = getRandomASCIIString(64); + Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(read_buffer->position()), size); + block_blob_client.StageBlock(block_id, memory); + task.block_ids.emplace_back(block_id); + read_buffer->ignore(size); + LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, block_id: {}", dest_container_for_logging, dest_blob, block_id); + } + } + std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race + LOG_TRACE(log, "Writing part finished. 
Container: {}, Blob: {}, Parts: {}", dest_container_for_logging, dest_blob, bg_tasks.size()); + } + + + void waitForAllBackgroundTasks() + { + if (!schedule) + return; + + std::unique_lock lock(bg_tasks_mutex); + /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock + bg_tasks_condvar.wait(lock, [this]() {return TSA_SUPPRESS_WARNING_FOR_READ(num_added_bg_tasks) == TSA_SUPPRESS_WARNING_FOR_READ(num_finished_bg_tasks); }); + + auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(bg_tasks); + for (auto & task : tasks) + { + if (task.exception) + std::rethrow_exception(task.exception); + block_ids.insert(block_ids.end(),task.block_ids.begin(), task.block_ids.end()); + } + } + }; +} + + +void copyDataToAzureBlobStorageFile( + const std::function()> & create_read_buffer, + size_t offset, + size_t size, + std::shared_ptr dest_client, + const String & dest_container_for_logging, + const String & dest_blob, + std::shared_ptr settings, + ThreadPoolCallbackRunner schedule, + bool for_disk_azure_blob_storage) +{ + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container_for_logging, dest_blob, settings, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; + helper.performCopy(); +} + + +void copyAzureBlobStorageFile( + std::shared_ptr src_client, + std::shared_ptr dest_client, + const String & src_container_for_logging, + const String & src_blob, + size_t offset, + size_t size, + const String & dest_container_for_logging, + const String & dest_blob, + std::shared_ptr settings, + const ReadSettings & read_settings, + ThreadPoolCallbackRunner schedule, + bool for_disk_azure_blob_storage) +{ + + if (settings->use_native_copy) + { + ProfileEvents::increment(ProfileEvents::AzureCopyObject); + if (for_disk_azure_blob_storage) + ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); + + auto block_blob_client_src = src_client->GetBlockBlobClient(src_blob); + auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); + auto source_uri = block_blob_client_src.GetUrl(); + + if (size < settings->max_single_part_copy_size) + { + block_blob_client_dest.CopyFromUri(source_uri); + } + else + { + Azure::Storage::Blobs::StartBlobCopyOperation operation = block_blob_client_dest.StartCopyFromUri(source_uri); + + // Wait for the operation to finish, checking for status every 100 second. 
+ auto copy_response = operation.PollUntilDone(std::chrono::milliseconds(100)); + auto properties_model = copy_response.Value; + + if (properties_model.CopySource.HasValue()) + { + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Copy failed"); + } + + } + } + else + { + LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container_for_logging, src_blob); + auto create_read_buffer = [&] + { + return std::make_unique(src_client, src_blob, read_settings, settings->max_single_read_retries, + settings->max_single_download_retries); + }; + + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container_for_logging, dest_blob, settings, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; + helper.performCopy(); + } +} + +} + +#endif diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h new file mode 100644 index 00000000000..1433f8d18ba --- /dev/null +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -0,0 +1,56 @@ +#pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +class SeekableReadBuffer; + +using CreateReadBuffer = std::function()>; + +/// Copies a file from AzureBlobStorage to AzureBlobStorage. +/// The parameters `src_offset` and `src_size` specify a part in the source to copy. +void copyAzureBlobStorageFile( + std::shared_ptr src_client, + std::shared_ptr dest_client, + const String & src_container_for_logging, + const String & src_blob, + size_t src_offset, + size_t src_size, + const String & dest_container_for_logging, + const String & dest_blob, + std::shared_ptr settings, + const ReadSettings & read_settings, + ThreadPoolCallbackRunner schedule_ = {}, + bool for_disk_azure_blob_storage = false); + + +/// Copies data from any seekable source to AzureBlobStorage. +/// The same functionality can be done by using the function copyData() and the class WriteBufferFromS3 +/// however copyDataToS3File() is faster and spends less memory. +/// The callback `create_read_buffer` can be called from multiple threads in parallel, so that should be thread-safe. +/// The parameters `offset` and `size` specify a part in the source to copy. +void copyDataToAzureBlobStorageFile( + const std::function()> & create_read_buffer, + size_t offset, + size_t size, + std::shared_ptr client, + const String & dest_container_for_logging, + const String & dest_blob, + std::shared_ptr settings, + ThreadPoolCallbackRunner schedule_ = {}, + bool for_disk_azure_blob_storage = false); + +} + +#endif diff --git a/src/IO/BoundedReadBuffer.cpp b/src/IO/BoundedReadBuffer.cpp index f3b176a963c..bda79d82ad3 100644 --- a/src/IO/BoundedReadBuffer.cpp +++ b/src/IO/BoundedReadBuffer.cpp @@ -4,7 +4,8 @@ namespace DB { -BoundedReadBuffer::BoundedReadBuffer(std::unique_ptr impl_) : impl(std::move(impl_)) +BoundedReadBuffer::BoundedReadBuffer(std::unique_ptr impl_) + : ReadBufferFromFileDecorator(std::move(impl_)) { } diff --git a/src/IO/BoundedReadBuffer.h b/src/IO/BoundedReadBuffer.h index 22a6471a9a1..eb65857e83a 100644 --- a/src/IO/BoundedReadBuffer.h +++ b/src/IO/BoundedReadBuffer.h @@ -1,5 +1,5 @@ #pragma once -#include +#include namespace DB @@ -7,10 +7,10 @@ namespace DB /// A buffer which allows to make an underlying buffer as right bounded, /// e.g. the buffer cannot return data beyond offset specified in `setReadUntilPosition`. 
-class BoundedReadBuffer : public ReadBufferFromFileBase +class BoundedReadBuffer : public ReadBufferFromFileDecorator { public: - explicit BoundedReadBuffer(std::unique_ptr impl_); + explicit BoundedReadBuffer(std::unique_ptr impl_); bool supportsRightBoundedReads() const override { return true; } @@ -23,8 +23,6 @@ public: off_t seek(off_t off, int whence) override; size_t getFileOffsetOfBufferEnd() const override { return file_offset_of_buffer_end; } - String getFileName() const override { return impl->getFileName(); } - size_t getFileSize() override { return impl->getFileSize(); } /// file_offset_of_buffer_end can differ from impl's file_offset_of_buffer_end /// because of resizing of the tail. => Need to also override getPosition() as @@ -32,8 +30,6 @@ public: off_t getPosition() override; private: - std::unique_ptr impl; - std::optional read_until_position; /// atomic because can be used in log or exception messages while being updated. std::atomic file_offset_of_buffer_end = 0; diff --git a/src/IO/CompressedReadBufferWrapper.h b/src/IO/CompressedReadBufferWrapper.h index bb58a7bfeb3..66e57488434 100644 --- a/src/IO/CompressedReadBufferWrapper.h +++ b/src/IO/CompressedReadBufferWrapper.h @@ -1,11 +1,12 @@ #pragma once #include #include +#include namespace DB { -class CompressedReadBufferWrapper : public BufferWithOwnMemory +class CompressedReadBufferWrapper : public BufferWithOwnMemory, public ReadBufferWrapperBase { public: CompressedReadBufferWrapper( @@ -16,7 +17,7 @@ public: : BufferWithOwnMemory(buf_size, existing_memory, alignment) , in(std::move(in_)) {} - const ReadBuffer & getWrappedReadBuffer() const { return *in; } + const ReadBuffer & getWrappedReadBuffer() const override { return *in; } ReadBuffer & getWrappedReadBuffer() { return *in; } void prefetch(Priority priority) override { in->prefetch(priority); } diff --git a/src/IO/ConnectionTimeouts.cpp b/src/IO/ConnectionTimeouts.cpp index ecc0d64580b..f2db3169400 100644 --- a/src/IO/ConnectionTimeouts.cpp +++ b/src/IO/ConnectionTimeouts.cpp @@ -20,7 +20,7 @@ ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithoutFailover(const Setti .withConnectionTimeout(settings.connect_timeout) .withSendTimeout(settings.send_timeout) .withReceiveTimeout(settings.receive_timeout) - .withTcpKeepAliveTimeout(settings.tcp_keep_alive_timeout) + .withTCPKeepAliveTimeout(settings.tcp_keep_alive_timeout) .withHandshakeTimeout(settings.handshake_timeout_ms) .withHedgedConnectionTimeout(settings.hedged_connection_timeout_ms) .withReceiveDataTimeout(settings.receive_data_timeout_ms); @@ -40,8 +40,8 @@ ConnectionTimeouts ConnectionTimeouts::getHTTPTimeouts(const Settings & settings .withConnectionTimeout(settings.http_connection_timeout) .withSendTimeout(settings.http_send_timeout) .withReceiveTimeout(settings.http_receive_timeout) - .withHttpKeepAliveTimeout(http_keep_alive_timeout) - .withTcpKeepAliveTimeout(settings.tcp_keep_alive_timeout) + .withHTTPKeepAliveTimeout(http_keep_alive_timeout) + .withTCPKeepAliveTimeout(settings.tcp_keep_alive_timeout) .withHandshakeTimeout(settings.handshake_timeout_ms); } diff --git a/src/IO/ConnectionTimeouts.h b/src/IO/ConnectionTimeouts.h index 6967af08204..7fe97b5ec36 100644 --- a/src/IO/ConnectionTimeouts.h +++ b/src/IO/ConnectionTimeouts.h @@ -16,8 +16,8 @@ struct Settings; M(secure_connection_timeout, withSecureConnectionTimeout) \ M(send_timeout, withSendTimeout) \ M(receive_timeout, withReceiveTimeout) \ - M(tcp_keep_alive_timeout, withTcpKeepAliveTimeout) \ - M(http_keep_alive_timeout, 
withHttpKeepAliveTimeout) \ + M(tcp_keep_alive_timeout, withTCPKeepAliveTimeout) \ + M(http_keep_alive_timeout, withHTTPKeepAliveTimeout) \ M(hedged_connection_timeout, withHedgedConnectionTimeout) \ M(receive_data_timeout, withReceiveDataTimeout) \ M(handshake_timeout, withHandshakeTimeout) \ diff --git a/src/IO/LimitSeekableReadBuffer.h b/src/IO/LimitSeekableReadBuffer.h index 5624388dd7e..61b307c522c 100644 --- a/src/IO/LimitSeekableReadBuffer.h +++ b/src/IO/LimitSeekableReadBuffer.h @@ -18,6 +18,7 @@ public: /// Returns adjusted position, i.e. returns `3` if the position in the nested buffer is `start_offset + 3`. off_t getPosition() override; + off_t seek(off_t off, int whence) override; private: diff --git a/src/IO/MMapReadBufferFromFileDescriptor.cpp b/src/IO/MMapReadBufferFromFileDescriptor.cpp index 56a094bb1a3..9b1c132cc01 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.cpp +++ b/src/IO/MMapReadBufferFromFileDescriptor.cpp @@ -92,11 +92,6 @@ size_t MMapReadBufferFromFileDescriptor::getFileSize() return getSizeFromFileDescriptor(getFD(), getFileName()); } -size_t MMapReadBufferFromFileDescriptor::getFileOffsetOfBufferEnd() const -{ - return mapped.getOffset() + mapped.getLength(); -} - size_t MMapReadBufferFromFileDescriptor::readBigAt(char * to, size_t n, size_t offset, const std::function &) { if (offset >= mapped.getLength()) diff --git a/src/IO/MMapReadBufferFromFileDescriptor.h b/src/IO/MMapReadBufferFromFileDescriptor.h index 97d8bbe224d..2a039e04971 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.h +++ b/src/IO/MMapReadBufferFromFileDescriptor.h @@ -36,8 +36,6 @@ public: std::string getFileName() const override; - size_t getFileOffsetOfBufferEnd() const override; - int getFD() const; size_t getFileSize() override; diff --git a/src/IO/MMapReadBufferFromFileWithCache.cpp b/src/IO/MMapReadBufferFromFileWithCache.cpp index f3c4d6f4e01..d53f3bc325d 100644 --- a/src/IO/MMapReadBufferFromFileWithCache.cpp +++ b/src/IO/MMapReadBufferFromFileWithCache.cpp @@ -76,9 +76,4 @@ off_t MMapReadBufferFromFileWithCache::seek(off_t offset, int whence) return new_pos; } -size_t MMapReadBufferFromFileWithCache::getFileOffsetOfBufferEnd() const -{ - return mapped->getOffset() + mapped->getLength(); -} - } diff --git a/src/IO/MMapReadBufferFromFileWithCache.h b/src/IO/MMapReadBufferFromFileWithCache.h index ce5da29831e..cb87b03df8d 100644 --- a/src/IO/MMapReadBufferFromFileWithCache.h +++ b/src/IO/MMapReadBufferFromFileWithCache.h @@ -19,7 +19,7 @@ public: off_t getPosition() override; std::string getFileName() const override; off_t seek(off_t offset, int whence) override; - size_t getFileOffsetOfBufferEnd() const override; + bool isRegularLocalFile(size_t * /* out_view_offset */) override { return true; } private: diff --git a/src/IO/OpenedFile.h b/src/IO/OpenedFile.h index 10c36d9e1d3..4c4de2265bc 100644 --- a/src/IO/OpenedFile.h +++ b/src/IO/OpenedFile.h @@ -21,7 +21,7 @@ public: OpenedFile(const std::string & file_name_, int flags_); ~OpenedFile(); - /// Close prematurally. + /// Close prematurely. 
void close(); int getFD() const; @@ -40,4 +40,3 @@ private: }; } - diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index e76b40f77b7..daac1190399 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/IO/ReadBuffer.cpp b/src/IO/ReadBuffer.cpp index bf054d08425..0d1cd322fdd 100644 --- a/src/IO/ReadBuffer.cpp +++ b/src/IO/ReadBuffer.cpp @@ -1,4 +1,5 @@ #include +#include namespace DB @@ -7,7 +8,7 @@ namespace DB namespace { template - class ReadBufferWrapper : public ReadBuffer + class ReadBufferWrapper : public ReadBuffer, public ReadBufferWrapperBase { public: ReadBufferWrapper(ReadBuffer & in_, CustomData && custom_data_) @@ -15,6 +16,8 @@ namespace { } + const ReadBuffer & getWrappedReadBuffer() const override { return in; } + private: ReadBuffer & in; CustomData custom_data; diff --git a/src/IO/ReadBufferFromEmptyFile.h b/src/IO/ReadBufferFromEmptyFile.h index e2765765c47..f21f2f507dc 100644 --- a/src/IO/ReadBufferFromEmptyFile.h +++ b/src/IO/ReadBufferFromEmptyFile.h @@ -14,18 +14,12 @@ namespace DB /// - ThreadPoolReader class ReadBufferFromEmptyFile : public ReadBufferFromFileBase { -public: - explicit ReadBufferFromEmptyFile(const String & file_name_) : file_name(file_name_) {} - private: - String file_name; - bool nextImpl() override { return false; } - std::string getFileName() const override { return file_name; } + std::string getFileName() const override { return ""; } off_t seek(off_t /*off*/, int /*whence*/) override { return 0; } off_t getPosition() override { return 0; } size_t getFileSize() override { return 0; } - size_t getFileOffsetOfBufferEnd() const override { return 0; } }; } diff --git a/src/IO/ReadBufferFromEncryptedFile.cpp b/src/IO/ReadBufferFromEncryptedFile.cpp index 6861ae06dd8..f9cf1597153 100644 --- a/src/IO/ReadBufferFromEncryptedFile.cpp +++ b/src/IO/ReadBufferFromEncryptedFile.cpp @@ -101,18 +101,6 @@ bool ReadBufferFromEncryptedFile::nextImpl() return true; } -size_t ReadBufferFromEncryptedFile::getFileSize() -{ - size_t size = in->getFileSize(); - return size > FileEncryption::Header::kSize ? size - FileEncryption::Header::kSize : size; -} - -size_t ReadBufferFromEncryptedFile::getFileOffsetOfBufferEnd() const -{ - size_t file_offset = in->getFileOffsetOfBufferEnd(); - return file_offset > FileEncryption::Header::kSize ? file_offset - FileEncryption::Header::kSize : file_offset; -} - } #endif diff --git a/src/IO/ReadBufferFromEncryptedFile.h b/src/IO/ReadBufferFromEncryptedFile.h index 2f5093153ea..3626daccb3e 100644 --- a/src/IO/ReadBufferFromEncryptedFile.h +++ b/src/IO/ReadBufferFromEncryptedFile.h @@ -27,10 +27,10 @@ public: std::string getFileName() const override { return in->getFileName(); } void setReadUntilPosition(size_t position) override { in->setReadUntilPosition(position + FileEncryption::Header::kSize); } + void setReadUntilEnd() override { in->setReadUntilEnd(); } - size_t getFileSize() override; - size_t getFileOffsetOfBufferEnd() const override; + size_t getFileSize() override { return in->getFileSize(); } private: bool nextImpl() override; diff --git a/src/IO/ReadBufferFromFileBase.h b/src/IO/ReadBufferFromFileBase.h index b9288ce6636..296edf9c689 100644 --- a/src/IO/ReadBufferFromFileBase.h +++ b/src/IO/ReadBufferFromFileBase.h @@ -60,12 +60,6 @@ public: /// file offset and what getPosition() returns. 
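The `ReadBufferWrapperBase` interface introduced in this change gives wrappers such as `CompressedReadBufferWrapper` and the ad-hoc wrapper in `ReadBuffer.cpp` a common `getWrappedReadBuffer()`, presumably so helpers like `getFileNameFromReadBuffer()` can walk a chain of wrapping buffers down to the innermost one. A toy illustration of that unwrapping pattern, independent of the real ClickHouse classes:

``` cpp
#include <iostream>
#include <memory>
#include <string>

/// Toy stand-ins for ReadBuffer and ReadBufferWrapperBase, just to show the unwrapping loop.
struct ToyReadBuffer
{
    virtual ~ToyReadBuffer() = default;
    virtual std::string describe() const { return "plain buffer"; }
};

struct ToyWrapperBase
{
    virtual ~ToyWrapperBase() = default;
    virtual const ToyReadBuffer & getWrapped() const = 0;
};

struct ToyCompressedWrapper : ToyReadBuffer, ToyWrapperBase
{
    explicit ToyCompressedWrapper(std::unique_ptr<ToyReadBuffer> in_) : in(std::move(in_)) {}
    const ToyReadBuffer & getWrapped() const override { return *in; }
    std::string describe() const override { return "compressed wrapper"; }
    std::unique_ptr<ToyReadBuffer> in;
};

/// Walk the wrapper chain until we reach a buffer that is not itself a wrapper.
const ToyReadBuffer & innermost(const ToyReadBuffer & buf)
{
    const ToyReadBuffer * current = &buf;
    while (const auto * wrapper = dynamic_cast<const ToyWrapperBase *>(current))
        current = &wrapper->getWrapped();
    return *current;
}

int main()
{
    ToyCompressedWrapper outer(std::make_unique<ToyCompressedWrapper>(std::make_unique<ToyReadBuffer>()));
    std::cout << innermost(outer).describe() << '\n';   /// prints "plain buffer"
}
```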
virtual bool isRegularLocalFile(size_t * /* out_view_offset */ = nullptr) { return false; } - /// NOTE: This method should be thread-safe against seek(), since it can be - /// used in CachedOnDiskReadBufferFromFile from multiple threads (because - /// it first releases the buffer, and then do logging, and so other thread - /// can already call seek() which will lead to data-race). - virtual size_t getFileOffsetOfBufferEnd() const = 0; - protected: std::optional file_size; ProfileCallback profile_callback; diff --git a/src/IO/ReadBufferFromFileDecorator.cpp b/src/IO/ReadBufferFromFileDecorator.cpp new file mode 100644 index 00000000000..9ac0fb4e475 --- /dev/null +++ b/src/IO/ReadBufferFromFileDecorator.cpp @@ -0,0 +1,60 @@ +#include + + +namespace DB +{ + +ReadBufferFromFileDecorator::ReadBufferFromFileDecorator(std::unique_ptr impl_) + : ReadBufferFromFileDecorator(std::move(impl_), "") +{ +} + + +ReadBufferFromFileDecorator::ReadBufferFromFileDecorator(std::unique_ptr impl_, const String & file_name_) + : impl(std::move(impl_)), file_name(file_name_) +{ + swap(*impl); +} + + +std::string ReadBufferFromFileDecorator::getFileName() const +{ + if (!file_name.empty()) + return file_name; + + return getFileNameFromReadBuffer(*impl); +} + + +off_t ReadBufferFromFileDecorator::getPosition() +{ + swap(*impl); + auto position = impl->getPosition(); + swap(*impl); + return position; +} + + +off_t ReadBufferFromFileDecorator::seek(off_t off, int whence) +{ + swap(*impl); + auto result = impl->seek(off, whence); + swap(*impl); + return result; +} + + +bool ReadBufferFromFileDecorator::nextImpl() +{ + swap(*impl); + auto result = impl->next(); + swap(*impl); + return result; +} + +size_t ReadBufferFromFileDecorator::getFileSize() +{ + return getFileSizeFromReadBuffer(*impl); +} + +} diff --git a/src/IO/ReadBufferFromFileDecorator.h b/src/IO/ReadBufferFromFileDecorator.h new file mode 100644 index 00000000000..6e62c7f741b --- /dev/null +++ b/src/IO/ReadBufferFromFileDecorator.h @@ -0,0 +1,37 @@ +#pragma once + +#include + + +namespace DB +{ + +/// Delegates all reads to underlying buffer. Doesn't have own memory. 
+class ReadBufferFromFileDecorator : public ReadBufferFromFileBase +{ +public: + explicit ReadBufferFromFileDecorator(std::unique_ptr impl_); + ReadBufferFromFileDecorator(std::unique_ptr impl_, const String & file_name_); + + std::string getFileName() const override; + + off_t getPosition() override; + + off_t seek(off_t off, int whence) override; + + bool nextImpl() override; + + bool isWithFileSize() const { return dynamic_cast(impl.get()) != nullptr; } + + const ReadBuffer & getWrappedReadBuffer() const { return *impl; } + + ReadBuffer & getWrappedReadBuffer() { return *impl; } + + size_t getFileSize() override; + +protected: + std::unique_ptr impl; + String file_name; +}; + +} diff --git a/src/IO/ReadBufferFromMemory.h b/src/IO/ReadBufferFromMemory.h index 6d3f1a2c6e5..ad96e4bfa28 100644 --- a/src/IO/ReadBufferFromMemory.h +++ b/src/IO/ReadBufferFromMemory.h @@ -20,6 +20,7 @@ public: : SeekableReadBuffer(const_cast(str.data()), str.size(), 0) {} off_t seek(off_t off, int whence) override; + off_t getPosition() override; }; diff --git a/src/IO/ReadBufferWrapperBase.h b/src/IO/ReadBufferWrapperBase.h new file mode 100644 index 00000000000..1c594e8018a --- /dev/null +++ b/src/IO/ReadBufferWrapperBase.h @@ -0,0 +1,15 @@ +#pragma once + +#include + +namespace DB +{ + +class ReadBufferWrapperBase +{ +public: + virtual const ReadBuffer & getWrappedReadBuffer() const = 0; + virtual ~ReadBufferWrapperBase() = default; +}; + +} diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 05d35a57b12..bcfe5fd5230 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -619,13 +620,16 @@ void readQuotedStringInto(Vector & s, ReadBuffer & buf) readAnyQuotedStringInto<'\'', enable_sql_style_quoting>(s, buf); } -template +template bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf) { - return readAnyQuotedStringInto<'\'', false, Vector, bool>(s, buf); + return readAnyQuotedStringInto<'\'', enable_sql_style_quoting, Vector, bool>(s, buf); } -template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto>(PaddedPODArray & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto>(PaddedPODArray & s, ReadBuffer & buf); template void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) @@ -633,6 +637,16 @@ void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) readAnyQuotedStringInto<'"', enable_sql_style_quoting>(s, buf); } +template +bool tryReadDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) +{ + return readAnyQuotedStringInto<'"', enable_sql_style_quoting, Vector, bool>(s, buf); +} + +template bool tryReadDoubleQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadDoubleQuotedStringInto(String & s, ReadBuffer & buf); + + template void readBackQuotedStringInto(Vector & s, ReadBuffer & buf) { @@ -652,6 +666,18 @@ void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) readQuotedStringInto(s, buf); } +bool tryReadQuotedString(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadQuotedStringInto(s, buf); +} + +bool tryReadQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadQuotedStringInto(s, buf); +} + template void readQuotedStringInto(PaddedPODArray & s, ReadBuffer & buf); template void readQuotedStringInto(String & s, ReadBuffer 
& buf); @@ -672,6 +698,18 @@ void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) readDoubleQuotedStringInto(s, buf); } +bool tryReadDoubleQuotedString(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadDoubleQuotedStringInto(s, buf); +} + +bool tryReadDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadDoubleQuotedStringInto(s, buf); +} + void readBackQuotedString(String & s, ReadBuffer & buf) { s.clear(); @@ -691,7 +729,7 @@ concept WithResize = requires (T value) { value.size() } -> std::integral<>; }; -template +template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings) { /// Empty string @@ -754,12 +792,20 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & { PeekableReadBuffer * peekable_buf = dynamic_cast(&buf); if (!peekable_buf) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer"); + { + if constexpr (allow_throw) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer"); + return; + } while (true) { if (peekable_buf->eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter); + { + if constexpr (allow_throw) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter); + return; + } char * next_pos = reinterpret_cast(memchr(peekable_buf->position(), custom_delimiter[0], peekable_buf->available())); if (!next_pos) @@ -948,6 +994,9 @@ String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const For template void readCSVStringInto>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template void readCSVStringInto(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto, false, false>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template @@ -1069,15 +1118,18 @@ ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf) } template void readJSONObjectPossiblyInvalid(String & s, ReadBuffer & buf); +template bool readJSONObjectPossiblyInvalid(String & s, ReadBuffer & buf); template void readJSONObjectPossiblyInvalid>(PaddedPODArray & s, ReadBuffer & buf); +template bool readJSONObjectPossiblyInvalid, bool>(PaddedPODArray & s, ReadBuffer & buf); -template -void readJSONArrayInto(Vector & s, ReadBuffer & buf) +template +ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf) { - readJSONObjectOrArrayPossiblyInvalid(s, buf); + return readJSONObjectOrArrayPossiblyInvalid(s, buf); } -template void readJSONArrayInto>(PaddedPODArray & s, ReadBuffer & buf); +template void readJSONArrayInto, void>(PaddedPODArray & s, ReadBuffer & buf); +template bool readJSONArrayInto, bool>(PaddedPODArray & s, ReadBuffer & buf); template ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf) @@ -1217,6 +1269,13 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D return false; } + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || 
!isNumericASCII(s[2]) || !isNumericASCII(s[3]) + || !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9])) + return false; + } + UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -1240,6 +1299,13 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D return false; } + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[3]) || !isNumericASCII(s[4]) + || !isNumericASCII(s[6]) || !isNumericASCII(s[7])) + return false; + } + hour = (s[0] - '0') * 10 + (s[1] - '0'); minute = (s[3] - '0') * 10 + (s[4] - '0'); second = (s[6] - '0') * 10 + (s[7] - '0'); @@ -1259,7 +1325,14 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D { /// Not very efficient. for (const char * digit_pos = s; digit_pos < s_pos; ++digit_pos) + { + if constexpr (!throw_exception) + { + if (!isNumericASCII(*digit_pos)) + return false; + } datetime = datetime * 10 + *digit_pos - '0'; + } } datetime *= negative_multiplier; @@ -1282,14 +1355,24 @@ template bool readDateTimeTextFallback(time_t &, ReadBuffer &, cons template bool readDateTimeTextFallback(time_t &, ReadBuffer &, const DateLUTImpl &); -void skipJSONField(ReadBuffer & buf, StringRef name_of_field) +template +ReturnType skipJSONFieldImpl(ReadBuffer & buf, StringRef name_of_field) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + return ReturnType(false); + } else if (*buf.position() == '"') /// skip double-quoted string { NullOutput sink; - readJSONStringInto(sink, buf); + if constexpr (throw_exception) + readJSONStringInto(sink, buf); + else if (!tryReadJSONStringInto(sink, buf)) + return ReturnType(false); } else if (isNumericASCII(*buf.position()) || *buf.position() == '-' || *buf.position() == '+' || *buf.position() == '.') /// skip number { @@ -1298,19 +1381,32 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) double v; if (!tryReadFloatText(v, buf)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Expected a number field for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Expected a number field for key '{}'", name_of_field.toString()); + return ReturnType(false); + } } else if (*buf.position() == 'n') /// skip null { - assertString("null", buf); + if constexpr (throw_exception) + assertString("null", buf); + else if (!checkString("null", buf)) + return ReturnType(false); } else if (*buf.position() == 't') /// skip true { - assertString("true", buf); + if constexpr (throw_exception) + assertString("true", buf); + else if (!checkString("true", buf)) + return ReturnType(false); } else if (*buf.position() == 'f') /// skip false { - assertString("false", buf); + if constexpr (throw_exception) + assertString("false", buf); + else if (!checkString("false", buf)) + return ReturnType(false); } else if (*buf.position() == '[') { @@ -1320,12 +1416,16 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) if (!buf.eof() && *buf.position() == ']') /// skip empty array { ++buf.position(); - return; + return 
ReturnType(true); } while (true) { - skipJSONField(buf, name_of_field); + if constexpr (throw_exception) + skipJSONFieldImpl(buf, name_of_field); + else if (!skipJSONFieldImpl(buf, name_of_field)) + return ReturnType(false); + skipWhitespaceIfAny(buf); if (!buf.eof() && *buf.position() == ',') @@ -1339,7 +1439,11 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) break; } else - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } } } else if (*buf.position() == '{') /// skip whole object @@ -1353,19 +1457,34 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) if (*buf.position() == '"') { NullOutput sink; - readJSONStringInto(sink, buf); + if constexpr (throw_exception) + readJSONStringInto(sink, buf); + else if (!tryReadJSONStringInto(sink, buf)) + return ReturnType(false); } else - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } // ':' skipWhitespaceIfAny(buf); if (buf.eof() || !(*buf.position() == ':')) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } ++buf.position(); skipWhitespaceIfAny(buf); - skipJSONField(buf, name_of_field); + if constexpr (throw_exception) + skipJSONFieldImpl(buf, name_of_field); + else if (!skipJSONFieldImpl(buf, name_of_field)) + return ReturnType(false); + skipWhitespaceIfAny(buf); // optional ',' @@ -1377,18 +1496,37 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) } if (buf.eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + return ReturnType(false); + } ++buf.position(); } else { - throw Exception( - ErrorCodes::INCORRECT_DATA, - "Cannot read JSON field here: '{}'. Unexpected symbol '{}'{}", - String(buf.position(), std::min(buf.available(), size_t(10))), - std::string(1, *buf.position()), - name_of_field.empty() ? "" : " for key " + name_of_field.toString()); + if constexpr (throw_exception) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Cannot read JSON field here: '{}'. Unexpected symbol '{}'{}", + String(buf.position(), std::min(buf.available(), size_t(10))), + std::string(1, *buf.position()), + name_of_field.empty() ? "" : " for key " + name_of_field.toString()); + + return ReturnType(false); } + + return ReturnType(true); +} + +void skipJSONField(ReadBuffer & buf, StringRef name_of_field) +{ + skipJSONFieldImpl(buf, name_of_field); +} + +bool trySkipJSONField(ReadBuffer & buf, StringRef name_of_field) +{ + return skipJSONFieldImpl(buf, name_of_field); } @@ -1601,23 +1739,31 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim } // Use PeekableReadBuffer to copy field to string after parsing. 
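// --- Editorial aside (not part of this diff): the throw/try pattern used above and below. ---
// skipJSONFieldImpl above and readParsedValueInto below share one body between a throwing
// entry point and a bool-returning "try" entry point by templating on ReturnType (void or
// bool). A minimal self-contained sketch of that pattern; parseMarkerImpl and its callers
// are hypothetical stand-ins, not ClickHouse functions.
#include <stdexcept>
#include <type_traits>

template <typename ReturnType = void>
static ReturnType parseMarkerImpl(const char *& pos, const char * end, char marker)
{
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    if (pos == end || *pos != marker)
    {
        if constexpr (throw_exception)
            throw std::runtime_error("Expected marker");   /// assertChar-style behaviour
        return ReturnType(false);                          /// checkChar-style behaviour
    }

    ++pos;
    return ReturnType(true);                               /// valid for both void and bool
}

static void parseMarker(const char *& pos, const char * end, char marker) { parseMarkerImpl(pos, end, marker); }
static bool tryParseMarker(const char *& pos, const char * end, char marker) { return parseMarkerImpl<bool>(pos, end, marker); }
// --- End of aside. ---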
-template -static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func) +template +static ReturnType readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func) { PeekableReadBuffer peekable_buf(buf); peekable_buf.setCheckpoint(); - parse_func(peekable_buf); + if constexpr (std::is_same_v) + parse_func(peekable_buf); + else if (!parse_func(peekable_buf)) + return ReturnType(false); peekable_buf.makeContinuousMemoryFromCheckpointToPos(); auto * end = peekable_buf.position(); peekable_buf.rollbackToCheckpoint(); s.append(peekable_buf.position(), end); peekable_buf.position() = end; + return ReturnType(true); } -template -static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) +template +static ReturnType readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) { - assertChar('\'', buf); + if constexpr (std::is_same_v) + assertChar('\'', buf); + else if (!checkChar('\'', buf)) + return ReturnType(false); + s.push_back('\''); while (!buf.eof()) { @@ -1645,16 +1791,23 @@ static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) } if (buf.eof()) - return; + return ReturnType(false); ++buf.position(); s.push_back('\''); + return ReturnType(true); } -template -static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) +template +static ReturnType readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) { - assertChar(opening_bracket, buf); + static constexpr bool throw_exception = std::is_same_v; + + if constexpr (throw_exception) + assertChar(opening_bracket, buf); + else if (!checkChar(opening_bracket, buf)) + return ReturnType(false); + s.push_back(opening_bracket); size_t balance = 1; @@ -1670,7 +1823,10 @@ static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) if (*buf.position() == '\'') { - readQuotedStringFieldInto(s, buf); + if constexpr (throw_exception) + readQuotedStringFieldInto(s, buf); + else if (!readQuotedStringFieldInto(s, buf)) + return ReturnType(false); } else if (*buf.position() == opening_bracket) { @@ -1685,13 +1841,20 @@ static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) ++buf.position(); } } + + if (balance) + return ReturnType(false); + + return ReturnType(true); } -template -void readQuotedFieldInto(Vector & s, ReadBuffer & buf) +template +ReturnType readQuotedFieldInto(Vector & s, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) - return; + return ReturnType(false); /// Possible values in 'Quoted' field: /// - Strings: '...' @@ -1703,35 +1866,47 @@ void readQuotedFieldInto(Vector & s, ReadBuffer & buf) /// - Number: integer, float, decimal. 
if (*buf.position() == '\'') - readQuotedStringFieldInto(s, buf); + return readQuotedStringFieldInto(s, buf); else if (*buf.position() == '[') - readQuotedFieldInBracketsInto<'[', ']'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (*buf.position() == '(') - readQuotedFieldInBracketsInto<'(', ')'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (*buf.position() == '{') - readQuotedFieldInBracketsInto<'{', '}'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (checkCharCaseInsensitive('n', buf)) { /// NULL or NaN if (checkCharCaseInsensitive('u', buf)) { - assertStringCaseInsensitive("ll", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("ll", buf); + else if (!checkStringCaseInsensitive("ll", buf)) + return ReturnType(false); s.append("NULL"); } else { - assertStringCaseInsensitive("an", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("an", buf); + else if (!checkStringCaseInsensitive("an", buf)) + return ReturnType(false); s.append("NaN"); } } else if (checkCharCaseInsensitive('t', buf)) { - assertStringCaseInsensitive("rue", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("rue", buf); + else if (!checkStringCaseInsensitive("rue", buf)) + return ReturnType(false); s.append("true"); } else if (checkCharCaseInsensitive('f', buf)) { - assertStringCaseInsensitive("alse", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("alse", buf); + else if (!checkStringCaseInsensitive("alse", buf)) + return ReturnType(false); s.append("false"); } else @@ -1740,13 +1915,19 @@ void readQuotedFieldInto(Vector & s, ReadBuffer & buf) auto parse_func = [](ReadBuffer & in) { Float64 tmp; - readFloatText(tmp, in); + if constexpr (throw_exception) + readFloatText(tmp, in); + else + return tryReadFloatText(tmp, in); }; - readParsedValueInto(s, buf, parse_func); + + return readParsedValueInto(s, buf, parse_func); } + + return ReturnType(true); } -template void readQuotedFieldInto(NullOutput & s, ReadBuffer & buf); +template void readQuotedFieldInto(NullOutput & s, ReadBuffer & buf); void readQuotedField(String & s, ReadBuffer & buf) { @@ -1754,11 +1935,24 @@ void readQuotedField(String & s, ReadBuffer & buf) readQuotedFieldInto(s, buf); } +bool tryReadQuotedField(String & s, ReadBuffer & buf) +{ + s.clear(); + return readQuotedFieldInto(s, buf); +} + void readJSONField(String & s, ReadBuffer & buf) { s.clear(); auto parse_func = [](ReadBuffer & in) { skipJSONField(in, ""); }; - readParsedValueInto(s, buf, parse_func); + readParsedValueInto(s, buf, parse_func); +} + +bool tryReadJSONField(String & s, ReadBuffer & buf) +{ + s.clear(); + auto parse_func = [](ReadBuffer & in) { return trySkipJSONField(in, ""); }; + return readParsedValueInto(s, buf, parse_func); } void readTSVField(String & s, ReadBuffer & buf) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 85584d63ee8..49530f4787a 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -51,6 +50,7 @@ namespace DB template struct Memory; +class PeekableReadBuffer; namespace ErrorCodes { @@ -258,26 +258,43 @@ inline void readBoolText(bool & x, ReadBuffer & buf) x = tmp != '0'; } -inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case = false) +template +inline ReturnType readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case = false) { + static constexpr bool throw_exception = std::is_same_v; + if 
(buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + else + return ReturnType(false); + } switch (*buf.position()) { case 't': - assertString("true", buf); + if constexpr (throw_exception) + assertString("true", buf); + else if (!checkString("true", buf)) + return ReturnType(false); x = true; break; case 'f': - assertString("false", buf); + if constexpr (throw_exception) + assertString("false", buf); + else if (!checkString("false", buf)) + return ReturnType(false); x = false; break; case 'T': { if (support_upper_case) { - assertString("TRUE", buf); + if constexpr (throw_exception) + assertString("TRUE", buf); + else if (!checkString("TRUE", buf)) + return ReturnType(false); x = true; break; } @@ -288,7 +305,10 @@ inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case { if (support_upper_case) { - assertString("FALSE", buf); + if constexpr (throw_exception) + assertString("FALSE", buf); + else if (!checkString("FALSE", buf)) + return ReturnType(false); x = false; break; } @@ -296,8 +316,15 @@ inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case [[fallthrough]]; } default: - throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value"); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value"); + else + return ReturnType(false); + } } + + return ReturnType(true); } enum class ReadIntTextCheckOverflow @@ -469,7 +496,10 @@ void readIntText(T & x, ReadBuffer & buf) template bool tryReadIntText(T & x, ReadBuffer & buf) { - return readIntTextImpl(x, buf); + if constexpr (is_decimal) + return tryReadIntText(x.value, buf); + else + return readIntTextImpl(x, buf); } @@ -478,16 +508,18 @@ bool tryReadIntText(T & x, ReadBuffer & buf) * - for numbers starting with zero, parsed only zero; * - symbol '+' before number is not supported; */ -template -void readIntTextUnsafe(T & x, ReadBuffer & buf) +template +ReturnType readIntTextUnsafe(T & x, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; bool negative = false; make_unsigned_t res = 0; auto on_error = [] { - if (throw_on_error) + if constexpr (throw_exception) throwReadAfterEOF(); + return ReturnType(false); }; if (buf.eof()) [[unlikely]] @@ -505,7 +537,7 @@ void readIntTextUnsafe(T & x, ReadBuffer & buf) { ++buf.position(); x = 0; - return; + return ReturnType(true); } while (!buf.eof()) @@ -524,12 +556,13 @@ void readIntTextUnsafe(T & x, ReadBuffer & buf) /// See note about undefined behaviour above. x = is_signed_v && negative ? 
-res : res; + return ReturnType(true); } template -void tryReadIntTextUnsafe(T & x, ReadBuffer & buf) +bool tryReadIntTextUnsafe(T & x, ReadBuffer & buf) { - return readIntTextUnsafe(x, buf); + return readIntTextUnsafe(x, buf); } @@ -551,9 +584,15 @@ void readEscapedString(String & s, ReadBuffer & buf); void readQuotedString(String & s, ReadBuffer & buf); void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); +bool tryReadQuotedString(String & s, ReadBuffer & buf); +bool tryReadQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); + void readDoubleQuotedString(String & s, ReadBuffer & buf); void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); +bool tryReadDoubleQuotedString(String & s, ReadBuffer & buf); +bool tryReadDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); + void readJSONString(String & s, ReadBuffer & buf); void readBackQuotedString(String & s, ReadBuffer & buf); @@ -616,7 +655,7 @@ void readBackQuotedStringInto(Vector & s, ReadBuffer & buf); template void readStringUntilEOFInto(Vector & s, ReadBuffer & buf); -template +template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings); /// ReturnType is either bool or void. If bool, the function will return false instead of throwing an exception. @@ -629,7 +668,7 @@ bool tryReadJSONStringInto(Vector & s, ReadBuffer & buf) return readJSONStringInto(s, buf); } -template +template bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf); /// Reads chunk of data between {} in that way, @@ -638,8 +677,8 @@ bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf); template ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf); -template -void readJSONArrayInto(Vector & s, ReadBuffer & buf); +template +ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf); template void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf); @@ -963,6 +1002,13 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons { if (s[4] < '0' || s[4] > '9') { + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3]) + || !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9])) + return ReturnType(false); + } + UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -975,6 +1021,13 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons bool dt_long = (s[10] == ' ' || s[10] == 'T'); if (dt_long) { + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[11]) || !isNumericASCII(s[12]) || !isNumericASCII(s[14]) || !isNumericASCII(s[15]) + || !isNumericASCII(s[17]) || !isNumericASCII(s[18])) + return ReturnType(false); + } + hour = (s[11] - '0') * 10 + (s[12] - '0'); minute = (s[14] - '0') * 10 + (s[15] - '0'); second = (s[17] - '0') * 10 + (s[18] - '0'); @@ -1312,6 +1365,11 @@ inline bool tryReadText(is_integer auto & x, ReadBuffer & buf) return tryReadIntText(x, buf); } +inline bool tryReadText(is_floating_point auto & x, ReadBuffer & buf) +{ + return tryReadFloatText(x, buf); +} + inline bool tryReadText(UUID & x, ReadBuffer & buf) { return tryReadUUIDText(x, buf); } inline bool tryReadText(IPv4 & x, ReadBuffer & buf) { return tryReadIPv4Text(x, buf); } inline bool tryReadText(IPv6 & x, ReadBuffer & buf) { return tryReadIPv6Text(x, buf); } @@ 
-1321,9 +1379,20 @@ inline void readText(is_floating_point auto & x, ReadBuffer & buf) { readFloatTe inline void readText(String & x, ReadBuffer & buf) { readEscapedString(x, buf); } inline void readText(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) { readDateText(x, buf, time_zone); } +inline bool tryReadText(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) { return tryReadDateText(x, buf, time_zone); } inline void readText(LocalDate & x, ReadBuffer & buf) { readDateText(x, buf); } +inline bool tryReadText(LocalDate & x, ReadBuffer & buf) { return tryReadDateText(x, buf); } inline void readText(LocalDateTime & x, ReadBuffer & buf) { readDateTimeText(x, buf); } +inline bool tryReadText(LocalDateTime & x, ReadBuffer & buf) +{ + time_t time; + if (!tryReadDateTimeText(time, buf)) + return false; + x = LocalDateTime(time, DateLUT::instance()); + return true; +} + inline void readText(UUID & x, ReadBuffer & buf) { readUUIDText(x, buf); } inline void readText(IPv4 & x, ReadBuffer & buf) { readIPv4Text(x, buf); } inline void readText(IPv6 & x, ReadBuffer & buf) { readIPv6Text(x, buf); } @@ -1401,39 +1470,71 @@ inline void readDoubleQuoted(LocalDateTime & x, ReadBuffer & buf) } /// CSV for numbers: quotes are optional, no special escaping rules. -template -inline void readCSVSimple(T & x, ReadBuffer & buf) +template +inline ReturnType readCSVSimple(T & x, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + return ReturnType(false); + } char maybe_quote = *buf.position(); if (maybe_quote == '\'' || maybe_quote == '\"') ++buf.position(); - readText(x, buf); + if constexpr (throw_exception) + readText(x, buf); + else if (!tryReadText(x, buf)) + return ReturnType(false); if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, buf); + { + if constexpr (throw_exception) + assertChar(maybe_quote, buf); + else if (!checkChar(maybe_quote, buf)) + return ReturnType(false); + } + + return ReturnType(true); } // standalone overload for dates: to avoid instantiating DateLUTs while parsing other types -template -inline void readCSVSimple(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone) +template +inline ReturnType readCSVSimple(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + return ReturnType(false); + } char maybe_quote = *buf.position(); if (maybe_quote == '\'' || maybe_quote == '\"') ++buf.position(); - readText(x, buf, time_zone); + if constexpr (throw_exception) + readText(x, buf, time_zone); + else if (!tryReadText(x, buf, time_zone)) + return ReturnType(false); if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, buf); + { + if constexpr (throw_exception) + assertChar(maybe_quote, buf); + else if (!checkChar(maybe_quote, buf)) + return ReturnType(false); + } + + return ReturnType(true); } template @@ -1443,18 +1544,52 @@ inline void readCSV(T & x, ReadBuffer & buf) readCSVSimple(x, buf); } +template +requires is_arithmetic_v +inline bool tryReadCSV(T & x, ReadBuffer & buf) +{ + return readCSVSimple(x, buf); +} + inline void readCSV(String & x, ReadBuffer & buf, const FormatSettings::CSV & settings) { readCSVString(x, buf, settings); } +inline bool tryReadCSV(String & x, 
ReadBuffer & buf, const FormatSettings::CSV & settings) +{ + x.clear(); + readCSVStringInto(x, buf, settings); + return true; +} + inline void readCSV(LocalDate & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(LocalDate & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(DayNum & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(DayNum & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } inline void readCSV(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { readCSVSimple(x, buf, time_zone); } +inline bool tryReadCSV(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { return readCSVSimple(x, buf, time_zone); } + inline void readCSV(LocalDateTime & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(LocalDateTime & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UUID & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UUID & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(IPv4 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(IPv4 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(IPv6 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(IPv6 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UInt128 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UInt128 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(Int128 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(Int128 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UInt256 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UInt256 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(Int256 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(Int256 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } template void readBinary(std::vector & x, ReadBuffer & buf) @@ -1536,6 +1671,7 @@ inline void skipWhitespaceIfAny(ReadBuffer & buf, bool one_line = false) /// Skips json value. void skipJSONField(ReadBuffer & buf, StringRef name_of_field); +bool trySkipJSONField(ReadBuffer & buf, StringRef name_of_field); /** Read serialized exception. @@ -1750,12 +1886,14 @@ struct PcgDeserializer } }; -template -void readQuotedFieldInto(Vector & s, ReadBuffer & buf); +template +ReturnType readQuotedFieldInto(Vector & s, ReadBuffer & buf); void readQuotedField(String & s, ReadBuffer & buf); +bool tryReadQuotedField(String & s, ReadBuffer & buf); void readJSONField(String & s, ReadBuffer & buf); +bool tryReadJSONField(String & s, ReadBuffer & buf); void readTSVField(String & s, ReadBuffer & buf); diff --git a/src/IO/ReadWriteBufferFromHTTP.cpp b/src/IO/ReadWriteBufferFromHTTP.cpp index bf5c426f803..a95d42ec7f3 100644 --- a/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/src/IO/ReadWriteBufferFromHTTP.cpp @@ -552,7 +552,7 @@ bool ReadWriteBufferFromHTTPBase::nextImpl() if (!can_retry_request) throw; - LOG_ERROR( + LOG_WARNING( log, "HTTP request to `{}` failed at try {}/{} with bytes read: {}/{}. " "Error: {}. 
(Current backoff wait is {}/{} ms)", diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 7f0ede72740..182e7ad18cd 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -27,7 +27,6 @@ #include -#include namespace ProfileEvents { @@ -48,7 +47,6 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int TOO_MANY_REDIRECTS; - extern const int BAD_ARGUMENTS; } namespace S3 @@ -106,19 +104,6 @@ void verifyClientConfiguration(const Aws::Client::ClientConfiguration & client_c assert_cast(*client_config.retryStrategy); } -void validateCredentials(const Aws::Auth::AWSCredentials& auth_credentials) -{ - if (auth_credentials.GetAWSAccessKeyId().empty()) - { - return; - } - /// Follow https://docs.aws.amazon.com/IAM/latest/APIReference/API_AccessKey.html - if (!std::all_of(auth_credentials.GetAWSAccessKeyId().begin(), auth_credentials.GetAWSAccessKeyId().end(), isWordCharASCII)) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Access key id has an invalid character"); - } -} - void addAdditionalAMZHeadersToCanonicalHeadersList( Aws::AmazonWebServiceRequest & request, const HTTPHeaderEntries & extra_headers @@ -144,7 +129,6 @@ std::unique_ptr Client::create( const ClientSettings & client_settings) { verifyClientConfiguration(client_configuration); - validateCredentials(credentials_provider->GetAWSCredentials()); return std::unique_ptr( new Client(max_redirects_, std::move(sse_kms_config_), credentials_provider, client_configuration, sign_payloads, client_settings)); } diff --git a/src/IO/S3/Credentials.cpp b/src/IO/S3/Credentials.cpp index e64f54b99ad..80366510b53 100644 --- a/src/IO/S3/Credentials.cpp +++ b/src/IO/S3/Credentials.cpp @@ -22,7 +22,6 @@ namespace ErrorCodes # include # include -# include # include # include @@ -31,9 +30,7 @@ namespace ErrorCodes # include # include -# include -# include # include # include # include @@ -755,7 +752,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain( configuration.put_request_throttler, Aws::Http::SchemeMapper::ToString(Aws::Http::Scheme::HTTP)); - /// See MakeDefaultHttpResourceClientConfiguration(). + /// See MakeDefaultHTTPResourceClientConfiguration(). /// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside /// of contrib/aws/aws-cpp-sdk-core/source/internal/AWSHttpResourceClient.cpp aws_client_configuration.maxConnections = 2; diff --git a/src/IO/S3/PocoHTTPClient.cpp b/src/IO/S3/PocoHTTPClient.cpp index 21acdfd69f2..dbb93e63143 100644 --- a/src/IO/S3/PocoHTTPClient.cpp +++ b/src/IO/S3/PocoHTTPClient.cpp @@ -146,9 +146,9 @@ ConnectionTimeouts getTimeoutsFromConfiguration(const PocoHTTPClientConfiguratio .withConnectionTimeout(Poco::Timespan(client_configuration.connectTimeoutMs * 1000)) .withSendTimeout(Poco::Timespan(client_configuration.requestTimeoutMs * 1000)) .withReceiveTimeout(Poco::Timespan(client_configuration.requestTimeoutMs * 1000)) - .withTcpKeepAliveTimeout(Poco::Timespan( + .withTCPKeepAliveTimeout(Poco::Timespan( client_configuration.enableTcpKeepAlive ? 
client_configuration.tcpKeepAliveIntervalMs * 1000 : 0)) - .withHttpKeepAliveTimeout(Poco::Timespan( + .withHTTPKeepAliveTimeout(Poco::Timespan( client_configuration.http_keep_alive_timeout_ms * 1000)); /// flag indicating whether keep-alive is enabled is set to each session upon creation } diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index 607be51ed25..093d26ba7bb 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -5,7 +5,7 @@ #if USE_AWS_S3 #include -#include +#include #include #include #include diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 5039059f522..56e3e0df21b 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -1,7 +1,9 @@ #include #include +#include #include + #include "config.h" #if USE_AWS_S3 @@ -124,6 +126,15 @@ AuthSettings AuthSettings::loadFromConfig(const std::string & config_elem, const HTTPHeaderEntries headers = getHTTPHeaders(config_elem, config); ServerSideEncryptionKMSConfig sse_kms_config = getSSEKMSConfig(config_elem, config); + std::unordered_set users; + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_elem, keys); + for (const auto & key : keys) + { + if (startsWith(key, "user")) + users.insert(config.getString(config_elem + "." + key)); + } + return AuthSettings { std::move(access_key_id), std::move(secret_access_key), std::move(session_token), @@ -134,10 +145,16 @@ AuthSettings AuthSettings::loadFromConfig(const std::string & config_elem, const use_environment_credentials, use_insecure_imds_request, expiration_window_seconds, - no_sign_request + no_sign_request, + std::move(users) }; } +bool AuthSettings::canBeUsedByUser(const String & user) const +{ + return users.empty() || users.contains(user); +} + bool AuthSettings::hasUpdates(const AuthSettings & other) const { AuthSettings copy = *this; @@ -173,6 +190,8 @@ void AuthSettings::updateFrom(const AuthSettings & from) if (from.no_sign_request.has_value()) no_sign_request = from.no_sign_request; + + users.insert(from.users.begin(), from.users.end()); } } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 6ee8d96ed09..b3e01bd6132 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -6,6 +6,7 @@ #include #include +#include #include "config.h" @@ -92,9 +93,13 @@ struct AuthSettings std::optional expiration_window_seconds; std::optional no_sign_request; + std::unordered_set users; + bool hasUpdates(const AuthSettings & other) const; void updateFrom(const AuthSettings & from); + bool canBeUsedByUser(const String & user) const; + private: bool operator==(const AuthSettings & other) const = default; }; diff --git a/src/IO/SeekableReadBuffer.h b/src/IO/SeekableReadBuffer.h index 1fb66a5aa9f..c002d30e633 100644 --- a/src/IO/SeekableReadBuffer.h +++ b/src/IO/SeekableReadBuffer.h @@ -44,6 +44,12 @@ public: virtual String getInfoForLog() { return ""; } + /// NOTE: This method should be thread-safe against seek(), since it can be + /// used in CachedOnDiskReadBufferFromFile from multiple threads (because + /// it first releases the buffer, and then do logging, and so other thread + /// can already call seek() which will lead to data-race). + virtual size_t getFileOffsetOfBufferEnd() const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFileOffsetOfBufferEnd() not implemented"); } + /// If true, setReadUntilPosition() guarantees that eof will be reported at the given position. 
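// --- Editorial aside (not part of this diff): unwrapping nested buffer wrappers. ---
// The WithFileName/WithFileSize hunks a little further down resolve the file name (or size)
// of a wrapped read buffer by peeling wrapper layers through a common base class
// (ReadBufferWrapperBase / ReadBufferFromFileDecorator). A self-contained toy version of
// that lookup; Buffer, WrapperBase, Source, Wrapper and fileNameOf are hypothetical names,
// not the real ClickHouse classes.
#include <string>

struct Buffer { virtual ~Buffer() = default; };

struct WrapperBase
{
    virtual const Buffer & wrapped() const = 0;
    virtual ~WrapperBase() = default;
};

struct Source : Buffer
{
    std::string name;
    explicit Source(std::string name_) : name(std::move(name_)) {}
};

struct Wrapper : Buffer, WrapperBase
{
    const Buffer & inner;
    explicit Wrapper(const Buffer & inner_) : inner(inner_) {}
    const Buffer & wrapped() const override { return inner; }
};

/// Walks down through any number of wrapper layers to the buffer that knows its file name.
std::string fileNameOf(const Buffer & buf)
{
    if (const auto * wrapper = dynamic_cast<const WrapperBase *>(&buf))
        return fileNameOf(wrapper->wrapped());
    if (const auto * source = dynamic_cast<const Source *>(&buf))
        return source->name;
    return {};
}

/// E.g. fileNameOf(Wrapper{Wrapper{Source{"data.bin"}}}) == "data.bin": the name survives
/// any number of wrapping layers.
// --- End of aside. ---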
virtual bool supportsRightBoundedReads() const { return false; } diff --git a/src/IO/SharedThreadPools.cpp b/src/IO/SharedThreadPools.cpp index c8506663bc8..2ea30400ad9 100644 --- a/src/IO/SharedThreadPools.cpp +++ b/src/IO/SharedThreadPools.cpp @@ -20,6 +20,9 @@ namespace CurrentMetrics extern const Metric MergeTreeOutdatedPartsLoaderThreads; extern const Metric MergeTreeOutdatedPartsLoaderThreadsActive; extern const Metric MergeTreeOutdatedPartsLoaderThreadsScheduled; + extern const Metric DatabaseReplicatedCreateTablesThreads; + extern const Metric DatabaseReplicatedCreateTablesThreadsActive; + extern const Metric DatabaseReplicatedCreateTablesThreadsScheduled; } namespace DB @@ -148,4 +151,10 @@ StaticThreadPool & getOutdatedPartsLoadingThreadPool() return instance; } +StaticThreadPool & getDatabaseReplicatedCreateTablesThreadPool() +{ + static StaticThreadPool instance("CreateTablesThreadPool", CurrentMetrics::DatabaseReplicatedCreateTablesThreads, CurrentMetrics::DatabaseReplicatedCreateTablesThreadsActive, CurrentMetrics::DatabaseReplicatedCreateTablesThreadsScheduled); + return instance; +} + } diff --git a/src/IO/SharedThreadPools.h b/src/IO/SharedThreadPools.h index f37f3acefe7..acc5368f8ac 100644 --- a/src/IO/SharedThreadPools.h +++ b/src/IO/SharedThreadPools.h @@ -64,4 +64,7 @@ StaticThreadPool & getPartsCleaningThreadPool(); /// the number of threads by calling enableTurboMode() :-) StaticThreadPool & getOutdatedPartsLoadingThreadPool(); +/// ThreadPool used for creating tables in DatabaseReplicated. +StaticThreadPool & getDatabaseReplicatedCreateTablesThreadPool(); + } diff --git a/src/IO/WithFileName.cpp b/src/IO/WithFileName.cpp index 2383182f7e7..7b50b205935 100644 --- a/src/IO/WithFileName.cpp +++ b/src/IO/WithFileName.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -16,10 +16,10 @@ static String getFileName(const T & entry) String getFileNameFromReadBuffer(const ReadBuffer & in) { - if (const auto * compressed = dynamic_cast(&in)) - return getFileName(compressed->getWrappedReadBuffer()); + if (const auto * wrapper = dynamic_cast(&in)) + return getFileNameFromReadBuffer(wrapper->getWrappedReadBuffer()); else if (const auto * parallel = dynamic_cast(&in)) - return getFileName(parallel->getReadBuffer()); + return getFileNameFromReadBuffer(parallel->getReadBuffer()); else if (const auto * peekable = dynamic_cast(&in)) return getFileNameFromReadBuffer(peekable->getSubBuffer()); else diff --git a/src/IO/WithFileSize.cpp b/src/IO/WithFileSize.cpp index 435789652dc..3660d962c08 100644 --- a/src/IO/WithFileSize.cpp +++ b/src/IO/WithFileSize.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace DB @@ -16,15 +17,23 @@ template static size_t getFileSize(T & in) { if (auto * with_file_size = dynamic_cast(&in)) + { return with_file_size->getFileSize(); + } throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size"); } size_t getFileSizeFromReadBuffer(ReadBuffer & in) { - if (auto * compressed = dynamic_cast(&in)) + if (auto * delegate = dynamic_cast(&in)) + { + return getFileSize(delegate->getWrappedReadBuffer()); + } + else if (auto * compressed = dynamic_cast(&in)) + { return getFileSize(compressed->getWrappedReadBuffer()); + } return getFileSize(in); } @@ -43,7 +52,11 @@ std::optional tryGetFileSizeFromReadBuffer(ReadBuffer & in) bool isBufferWithFileSize(const ReadBuffer & in) { - if (const auto * compressed = dynamic_cast(&in)) + if (const auto * delegate = dynamic_cast(&in)) + { + return delegate->isWithFileSize(); + } 
+ else if (const auto * compressed = dynamic_cast(&in)) { return isBufferWithFileSize(compressed->getWrappedReadBuffer()); } @@ -53,7 +66,11 @@ bool isBufferWithFileSize(const ReadBuffer & in) size_t getDataOffsetMaybeCompressed(const ReadBuffer & in) { - if (const auto * compressed = dynamic_cast(&in)) + if (const auto * delegate = dynamic_cast(&in)) + { + return getDataOffsetMaybeCompressed(delegate->getWrappedReadBuffer()); + } + else if (const auto * compressed = dynamic_cast(&in)) { return getDataOffsetMaybeCompressed(compressed->getWrappedReadBuffer()); } diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 230f39b074e..5dc269990a1 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/IO/readDecimalText.h b/src/IO/readDecimalText.h index 3417310a990..8b4405ee2e9 100644 --- a/src/IO/readDecimalText.h +++ b/src/IO/readDecimalText.h @@ -224,4 +224,24 @@ inline void readCSVDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint assertChar(maybe_quote, buf); } +template +inline bool tryReadCSVDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint32_t & scale) +{ + if (buf.eof()) + return false; + + char maybe_quote = *buf.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + ++buf.position(); + + if (!tryReadDecimalText(buf, x, precision, scale)) + return false; + + if ((maybe_quote == '\'' || maybe_quote == '\"') && !checkChar(maybe_quote, buf)) + return false; + + return true; +} + } diff --git a/src/IO/readFloatText.cpp b/src/IO/readFloatText.cpp index d1143f7c62c..17ccc1b25b7 100644 --- a/src/IO/readFloatText.cpp +++ b/src/IO/readFloatText.cpp @@ -67,4 +67,7 @@ template void readFloatText(Float64 &, ReadBuffer &); template bool tryReadFloatText(Float32 &, ReadBuffer &); template bool tryReadFloatText(Float64 &, ReadBuffer &); +template bool tryReadFloatTextNoExponent(Float32 &, ReadBuffer &); +template bool tryReadFloatTextNoExponent(Float64 &, ReadBuffer &); + } diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index 23e904f305a..51964636389 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -324,7 +324,7 @@ static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf) } -template +template ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) { static_assert(std::is_same_v || std::is_same_v, "Argument for readFloatTextImpl must be float or double"); @@ -395,30 +395,33 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) after_point_exponent = (read_digits > significant_digits ? 
-significant_digits : static_cast(-read_digits)) - after_point_num_leading_zeros; } - if (checkChar('e', in) || checkChar('E', in)) + if constexpr (allow_exponent) { - if (in.eof()) + if (checkChar('e', in) || checkChar('E', in)) { - if constexpr (throw_exception) - throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent"); - else - return false; - } + if (in.eof()) + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent"); + else + return false; + } - bool exponent_negative = false; - if (*in.position() == '-') - { - exponent_negative = true; - ++in.position(); - } - else if (*in.position() == '+') - { - ++in.position(); - } + bool exponent_negative = false; + if (*in.position() == '-') + { + exponent_negative = true; + ++in.position(); + } + else if (*in.position() == '+') + { + ++in.position(); + } - readUIntTextUpToNSignificantDigits<4>(exponent, in); - if (exponent_negative) - exponent = -exponent; + readUIntTextUpToNSignificantDigits<4>(exponent, in); + if (exponent_negative) + exponent = -exponent; + } } if (after_point) @@ -604,4 +607,7 @@ template bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { retu template void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); } template bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } +/// Don't read exponent part of the number. +template bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } + } diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 1124ba94bc1..7240679abb7 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -282,6 +282,13 @@ const ActionsDAG::Node & ActionsDAG::addFunctionImpl( { size_t num_rows = arguments.empty() ? 0 : arguments.front().column->size(); column = node.function->execute(arguments, node.result_type, num_rows, true); + if (column->getDataType() != node.result_type->getColumnType()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Unexpected return type from {}. Expected {}. 
Got {}", + node.function->getName(), + node.result_type->getColumnType(), + column->getDataType()); } else { @@ -598,7 +605,7 @@ ActionsDAGPtr ActionsDAG::cloneSubDAG(const NodeRawConstPtrs & outputs, bool rem return actions; } -static ColumnWithTypeAndName executeActionForHeader(const ActionsDAG::Node * node, ColumnsWithTypeAndName arguments) +static ColumnWithTypeAndName executeActionForPartialResult(const ActionsDAG::Node * node, ColumnsWithTypeAndName arguments, size_t input_rows_count) { ColumnWithTypeAndName res_column; res_column.type = node->result_type; @@ -608,7 +615,7 @@ static ColumnWithTypeAndName executeActionForHeader(const ActionsDAG::Node * nod { case ActionsDAG::ActionType::FUNCTION: { - res_column.column = node->function->execute(arguments, res_column.type, 0, true); + res_column.column = node->function->execute(arguments, res_column.type, input_rows_count, true); break; } @@ -621,13 +628,24 @@ static ColumnWithTypeAndName executeActionForHeader(const ActionsDAG::Node * nod if (!array) throw Exception(ErrorCodes::TYPE_MISMATCH, "ARRAY JOIN of not array nor map: {}", node->result_name); - res_column.column = array->getDataPtr()->cloneEmpty(); + + ColumnPtr data; + if (input_rows_count < array->size()) + data = array->getDataInRange(0, input_rows_count); + else + data = array->getDataPtr(); + + res_column.column = data; break; } case ActionsDAG::ActionType::COLUMN: { - res_column.column = node->column->cloneResized(0); + auto column = node->column; + if (input_rows_count < column->size()) + column = column->cloneResized(input_rows_count); + + res_column.column = column; break; } @@ -674,7 +692,7 @@ Block ActionsDAG::updateHeader(Block header) const ColumnsWithTypeAndName result_columns; try { - result_columns = evaluatePartialResult(node_to_column, outputs, true); + result_columns = evaluatePartialResult(node_to_column, outputs, /* input_rows_count= */ 0, /* throw_on_error= */ true); } catch (Exception & e) { @@ -703,8 +721,11 @@ Block ActionsDAG::updateHeader(Block header) const ColumnsWithTypeAndName ActionsDAG::evaluatePartialResult( IntermediateExecutionResult & node_to_column, const NodeRawConstPtrs & outputs, + size_t input_rows_count, bool throw_on_error) { + chassert(input_rows_count <= 1); /// evaluatePartialResult() should be used only to evaluate headers or constants + ColumnsWithTypeAndName result_columns; result_columns.reserve(outputs.size()); @@ -761,7 +782,7 @@ ColumnsWithTypeAndName ActionsDAG::evaluatePartialResult( node->result_name); if (node->type != ActionsDAG::ActionType::INPUT && has_all_arguments) - node_to_column[node] = executeActionForHeader(node, std::move(arguments)); + node_to_column[node] = executeActionForPartialResult(node, std::move(arguments), input_rows_count); } } @@ -1624,7 +1645,7 @@ void ActionsDAG::mergeNodes(ActionsDAG && second) } } -ActionsDAG::SplitResult ActionsDAG::split(std::unordered_set split_nodes) const +ActionsDAG::SplitResult ActionsDAG::split(std::unordered_set split_nodes, bool create_split_nodes_mapping) const { /// Split DAG into two parts. /// (first_nodes, first_outputs) is a part which will have split_list in result. @@ -1756,15 +1777,6 @@ ActionsDAG::SplitResult ActionsDAG::split(std::unordered_set split child = child_data.to_second; } - - /// Input from second DAG should also be in the first. 
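// --- Editorial aside (not part of this diff): the invariant behind ActionsDAG::split(). ---
// The split being reworked here cuts one expression DAG into two parts such that running
// the first part and feeding its outputs into the second reproduces the original result
// (the surrounding hunks rework how inputs are shared between the two halves). A toy model
// of that contract; full/stage_one/stage_two are hypothetical functions, not ClickHouse code.
#include <cassert>
#include <initializer_list>

static int full(int x)      { return x * 2 + 3; }   /// original "DAG": x -> *2 -> +3
static int stage_one(int x) { return x * 2; }       /// everything up to the split node
static int stage_two(int m) { return m + 3; }       /// consumes stage one's output as its input

static void checkSplitInvariant()
{
    for (int x : {0, 1, 7, -5})
        assert(stage_two(stage_one(x)) == full(x)); /// first-then-second == initial DAG
}
// --- End of aside. ---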
- if (copy.type == ActionType::INPUT) - { - auto & input_copy = first_nodes.emplace_back(*cur.node); - assert(cur_data.to_first == nullptr); - cur_data.to_first = &input_copy; - new_inputs.push_back(cur.node); - } } else { @@ -1783,11 +1795,12 @@ ActionsDAG::SplitResult ActionsDAG::split(std::unordered_set split /// If this node is needed in result, add it as input. Node input_node; input_node.type = ActionType::INPUT; - input_node.result_type = node.result_type; - input_node.result_name = node.result_name; + input_node.result_type = cur.node->result_type; + input_node.result_name = cur.node->result_name; cur_data.to_second = &second_nodes.emplace_back(std::move(input_node)); - new_inputs.push_back(cur.node); + if (cur.node->type != ActionType::INPUT) + new_inputs.push_back(cur.node); } } } @@ -1803,7 +1816,13 @@ ActionsDAG::SplitResult ActionsDAG::split(std::unordered_set split for (const auto * input_node : inputs) { const auto & cur = data[input_node]; - first_inputs.push_back(cur.to_first); + if (cur.to_first) + { + first_inputs.push_back(cur.to_first); + + if (cur.to_second) + first_outputs.push_back(cur.to_first); + } } for (const auto * input : new_inputs) @@ -1813,6 +1832,13 @@ ActionsDAG::SplitResult ActionsDAG::split(std::unordered_set split first_outputs.push_back(cur.to_first); } + for (const auto * input_node : inputs) + { + const auto & cur = data[input_node]; + if (cur.to_second) + second_inputs.push_back(cur.to_second); + } + auto first_actions = std::make_shared(); first_actions->nodes.swap(first_nodes); first_actions->outputs.swap(first_outputs); @@ -1823,7 +1849,14 @@ ActionsDAG::SplitResult ActionsDAG::split(std::unordered_set split second_actions->outputs.swap(second_outputs); second_actions->inputs.swap(second_inputs); - return {std::move(first_actions), std::move(second_actions)}; + std::unordered_map split_nodes_mapping; + if (create_split_nodes_mapping) + { + for (const auto * node : split_nodes) + split_nodes_mapping[node] = data[node].to_first; + } + + return {std::move(first_actions), std::move(second_actions), std::move(split_nodes_mapping)}; } ActionsDAG::SplitResult ActionsDAG::splitActionsBeforeArrayJoin(const NameSet & array_joined_columns) const diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index 45f6e5cc717..bd56d20d0e8 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -278,6 +278,7 @@ public: static ColumnsWithTypeAndName evaluatePartialResult( IntermediateExecutionResult & node_to_column, const NodeRawConstPtrs & outputs, + size_t input_rows_count, bool throw_on_error); /// For apply materialize() function for every output. @@ -326,13 +327,18 @@ public: /// Merge current nodes with specified dag nodes void mergeNodes(ActionsDAG && second); - using SplitResult = std::pair; + struct SplitResult + { + ActionsDAGPtr first; + ActionsDAGPtr second; + std::unordered_map split_nodes_mapping; + }; /// Split ActionsDAG into two DAGs, where first part contains all nodes from split_nodes and their children. /// Execution of first then second parts on block is equivalent to execution of initial DAG. /// First DAG and initial DAG have equal inputs, second DAG and initial DAG has equal outputs. /// Second DAG inputs may contain less inputs then first DAG (but also include other columns). - SplitResult split(std::unordered_set split_nodes) const; + SplitResult split(std::unordered_set split_nodes, bool create_split_nodes_mapping = false) const; /// Splits actions into two parts. 
Returned first half may be swapped with ARRAY JOIN. SplitResult splitActionsBeforeArrayJoin(const NameSet & array_joined_columns) const; diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index 1789cc6c4b1..78e125146d4 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -1414,10 +1414,7 @@ FutureSetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool set_key = right_in_operand->getTreeHash(/*ignore_aliases=*/ true); if (auto set = data.prepared_sets->findSubquery(set_key)) - { - set->markAsINSubquery(); return set; - } FutureSetFromSubqueryPtr external_table_set; @@ -1464,7 +1461,7 @@ FutureSetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool } return data.prepared_sets->addFromSubquery( - set_key, std::move(source), nullptr, std::move(external_table_set), data.getContext()->getSettingsRef(), /*in_subquery=*/true); + set_key, std::move(source), nullptr, std::move(external_table_set), data.getContext()->getSettingsRef()); } else { diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index 331cd991ea1..50fab486568 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -624,7 +624,7 @@ Aggregator::Aggregator(const Block & header_, const Params & params_) { size_t alignment_of_next_state = params.aggregates[i + 1].function->alignOfData(); if ((alignment_of_next_state & (alignment_of_next_state - 1)) != 0) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: alignOfData is not 2^N"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "`alignOfData` is not 2^N"); /// Extend total_size to next alignment requirement /// Add padding by rounding up 'total_size_of_aggregate_states' to be a multiplier of alignment_of_next_state. 
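// --- Editorial aside (not part of this diff): the padding rule described just above. ---
// Each aggregate state is laid out at an offset rounded up to the alignment of the next
// state, and that alignment must be a power of two (hence the "alignOfData is not 2^N"
// check). A minimal sketch of the round-up; alignUp is a hypothetical helper, not
// ClickHouse code.
#include <cstddef>

constexpr std::size_t alignUp(std::size_t offset, std::size_t alignment)
{
    /// Valid only for power-of-two alignments, exactly what the check above enforces.
    return (offset + alignment - 1) & ~(alignment - 1);
}

static_assert(alignUp(13, 8) == 16);
static_assert(alignUp(16, 8) == 16);
static_assert(alignUp(0, 4) == 0);
// --- End of aside. ---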
@@ -857,7 +857,7 @@ AggregatedDataVariants::Type Aggregator::chooseAggregationMethod() return AggregatedDataVariants::Type::low_cardinality_keys128; if (size_of_field == 32) return AggregatedDataVariants::Type::low_cardinality_keys256; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: low cardinality numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "LowCardinality numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32."); } if (size_of_field == 1) @@ -872,7 +872,7 @@ AggregatedDataVariants::Type Aggregator::chooseAggregationMethod() return AggregatedDataVariants::Type::keys128; if (size_of_field == 32) return AggregatedDataVariants::Type::keys256; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32."); } if (params.keys_size == 1 && isFixedString(types_removed_nullable[0])) diff --git a/src/Interpreters/ArrayJoinedColumnsVisitor.h b/src/Interpreters/ArrayJoinedColumnsVisitor.h index 3bbd6982213..f16751c4561 100644 --- a/src/Interpreters/ArrayJoinedColumnsVisitor.h +++ b/src/Interpreters/ArrayJoinedColumnsVisitor.h @@ -62,7 +62,7 @@ private: { auto [array_join_expression_list, _] = node.arrayJoinExpressionList(); if (!array_join_expression_list) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no ARRAY JOIN"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No ARRAY JOIN"); std::vector out; out.reserve(array_join_expression_list->children.size()); diff --git a/src/Interpreters/AsynchronousInsertLog.cpp b/src/Interpreters/AsynchronousInsertLog.cpp index 9034f582869..5d851f6b47d 100644 --- a/src/Interpreters/AsynchronousInsertLog.cpp +++ b/src/Interpreters/AsynchronousInsertLog.cpp @@ -32,8 +32,7 @@ ColumnsDescription AsynchronousInsertLogElement::getColumnsDescription() {"Preprocessed", static_cast(DataKind::Preprocessed)}, }); - return ColumnsDescription - { + return ColumnsDescription{ {"hostname", std::make_shared(std::make_shared())}, {"event_date", std::make_shared()}, {"event_time", std::make_shared()}, @@ -53,6 +52,7 @@ ColumnsDescription AsynchronousInsertLogElement::getColumnsDescription() {"flush_time", std::make_shared()}, {"flush_time_microseconds", std::make_shared(6)}, {"flush_query_id", std::make_shared()}, + {"timeout_milliseconds", std::make_shared()}, }; } @@ -80,6 +80,7 @@ void AsynchronousInsertLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(flush_time); columns[i++]->insert(flush_time_microseconds); columns[i++]->insert(flush_query_id); + columns[i++]->insert(timeout_milliseconds); } } diff --git a/src/Interpreters/AsynchronousInsertLog.h b/src/Interpreters/AsynchronousInsertLog.h index d05375002ad..70b56a273ad 100644 --- a/src/Interpreters/AsynchronousInsertLog.h +++ b/src/Interpreters/AsynchronousInsertLog.h @@ -38,6 +38,7 @@ struct AsynchronousInsertLogElement time_t flush_time{}; Decimal64 flush_time_microseconds{}; String flush_query_id; + UInt64 timeout_milliseconds = 0; static std::string name() { return "AsynchronousInsertLog"; } static ColumnsDescription getColumnsDescription(); diff --git a/src/Interpreters/AsynchronousInsertQueue.cpp b/src/Interpreters/AsynchronousInsertQueue.cpp index 8206c31624c..44cc58cec84 100644 --- a/src/Interpreters/AsynchronousInsertQueue.cpp +++ b/src/Interpreters/AsynchronousInsertQueue.cpp @@ -33,13 +33,14 @@ #include #include - namespace CurrentMetrics 
{ extern const Metric PendingAsyncInsert; extern const Metric AsynchronousInsertThreads; extern const Metric AsynchronousInsertThreadsActive; extern const Metric AsynchronousInsertThreadsScheduled; + extern const Metric AsynchronousInsertQueueSize; + extern const Metric AsynchronousInsertQueueBytes; } namespace ProfileEvents @@ -60,6 +61,7 @@ namespace ErrorCodes extern const int UNKNOWN_FORMAT; extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; + extern const int INVALID_SETTING_VALUE; } static const NameSet settings_to_skip @@ -171,16 +173,41 @@ void AsynchronousInsertQueue::InsertData::Entry::finish(std::exception_ptr excep } } +AsynchronousInsertQueue::QueueShardFlushTimeHistory::TimePoints +AsynchronousInsertQueue::QueueShardFlushTimeHistory::getRecentTimePoints() const +{ + std::shared_lock lock(mutex); + return time_points; +} + +void AsynchronousInsertQueue::QueueShardFlushTimeHistory::updateWithCurrentTime() +{ + std::unique_lock lock(mutex); + time_points.first = time_points.second; + time_points.second = std::chrono::steady_clock::now(); +} + AsynchronousInsertQueue::AsynchronousInsertQueue(ContextPtr context_, size_t pool_size_, bool flush_on_shutdown_) : WithContext(context_) , pool_size(pool_size_) , flush_on_shutdown(flush_on_shutdown_) , queue_shards(pool_size) - , pool(CurrentMetrics::AsynchronousInsertThreads, CurrentMetrics::AsynchronousInsertThreadsActive, CurrentMetrics::AsynchronousInsertThreadsScheduled, pool_size) + , flush_time_history_per_queue_shard(pool_size) + , pool( + CurrentMetrics::AsynchronousInsertThreads, + CurrentMetrics::AsynchronousInsertThreadsActive, + CurrentMetrics::AsynchronousInsertThreadsScheduled, + pool_size) { if (!pool_size) throw Exception(ErrorCodes::BAD_ARGUMENTS, "pool_size cannot be zero"); + const auto & settings = getContext()->getSettingsRef(); + + for (size_t i = 0; i < pool_size; ++i) + queue_shards[i].busy_timeout_ms + = std::min(Milliseconds(settings.async_insert_busy_timeout_min_ms), Milliseconds(settings.async_insert_busy_timeout_max_ms)); + for (size_t i = 0; i < pool_size; ++i) dump_by_first_update_threads.emplace_back([this, i] { processBatchDeadlines(i); }); } @@ -201,7 +228,7 @@ AsynchronousInsertQueue::~AsynchronousInsertQueue() if (flush_on_shutdown) { for (auto & [_, elem] : shard.queue) - scheduleDataProcessingJob(elem.key, std::move(elem.data), getContext()); + scheduleDataProcessingJob(elem.key, std::move(elem.data), getContext(), i); } else { @@ -217,14 +244,14 @@ AsynchronousInsertQueue::~AsynchronousInsertQueue() LOG_TRACE(log, "Asynchronous insertion queue finished"); } -void AsynchronousInsertQueue::scheduleDataProcessingJob(const InsertQuery & key, InsertDataPtr data, ContextPtr global_context) +void AsynchronousInsertQueue::scheduleDataProcessingJob( + const InsertQuery & key, InsertDataPtr data, ContextPtr global_context, size_t shard_num) { /// Wrap 'unique_ptr' with 'shared_ptr' to make this /// lambda copyable and allow to save it to the thread pool. 
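The comment above explains why the closure wraps its unique_ptr payload in a shared_ptr: pool scheduling APIs generally take a std::function-like callable, which must be copyable, while a lambda with a move-only capture is not. A minimal standalone sketch of the pattern follows; the Payload/process names are hypothetical and only the standard library is assumed, not the actual ClickHouse thread-pool types.

#include <functional>
#include <iostream>
#include <memory>

struct Payload { int value = 42; };

void process(Payload & payload) { std::cout << payload.value << '\n'; }

int main()
{
    auto data = std::make_unique<Payload>();

    /// A lambda that captures the unique_ptr by move is itself move-only,
    /// so it cannot be stored in std::function (which requires a copyable callable):
    ///   auto bad = [d = std::move(data)] { process(*d); };
    ///   std::function<void()> f = std::move(bad);   /// would not compile

    /// Wrapping the unique_ptr in a shared_ptr makes the closure copyable
    /// while the payload itself still has a single owner.
    auto good = [my_data = std::make_shared<std::unique_ptr<Payload>>(std::move(data))]() mutable
    {
        process(**my_data);
    };

    std::function<void()> task = good; /// copyable, safe to hand to a thread pool
    task();
}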
- pool.scheduleOrThrowOnError([key, global_context, my_data = std::make_shared(std::move(data))]() mutable - { - processData(key, std::move(*my_data), std::move(global_context)); - }); + pool.scheduleOrThrowOnError( + [this, key, global_context, shard_num, my_data = std::make_shared(std::move(data))]() mutable + { processData(key, std::move(*my_data), std::move(global_context), flush_time_history_per_queue_shard[shard_num]); }); } void AsynchronousInsertQueue::preprocessInsertQuery(const ASTPtr & query, const ContextPtr & query_context) @@ -300,6 +327,7 @@ AsynchronousInsertQueue::PushResult AsynchronousInsertQueue::pushDataChunk(ASTPtr query, DataChunk chunk, ContextPtr query_context) { const auto & settings = query_context->getSettingsRef(); + validateSettings(settings, log); auto & insert_query = query->as(); auto data_kind = chunk.getDataKind(); @@ -319,23 +347,33 @@ AsynchronousInsertQueue::pushDataChunk(ASTPtr query, DataChunk chunk, ContextPtr auto shard_num = key.hash % pool_size; auto & shard = queue_shards[shard_num]; - + const auto flush_time_points = flush_time_history_per_queue_shard[shard_num].getRecentTimePoints(); { std::lock_guard lock(shard.mutex); auto [it, inserted] = shard.iterators.try_emplace(key.hash); - if (inserted) + auto now = std::chrono::steady_clock::now(); + auto timeout_ms = getBusyWaitTimeoutMs(settings, shard, flush_time_points, now); + if (timeout_ms != shard.busy_timeout_ms) { - auto now = std::chrono::steady_clock::now(); - auto timeout = now + Milliseconds{key.settings.async_insert_busy_timeout_ms}; - it->second = shard.queue.emplace(timeout, Container{key, std::make_unique()}).first; + LOG_TRACE( + log, + "Asynchronous timeout {} from {} to {} for queue shard {}.", + timeout_ms < shard.busy_timeout_ms ? "decreased" : "increased", + shard.busy_timeout_ms.count(), + timeout_ms.count(), + size_t(shard_num)); } + if (inserted) + it->second = shard.queue.emplace(now + timeout_ms, Container{key, std::make_unique(timeout_ms)}).first; + auto queue_it = it->second; auto & data = queue_it->second.data; size_t entry_data_size = entry->chunk.byteSize(); assert(data); + auto size_in_bytes = data->size_in_bytes; data->size_in_bytes += entry_data_size; data->entries.emplace_back(entry); insert_future = entry->getFuture(); @@ -346,23 +384,50 @@ AsynchronousInsertQueue::pushDataChunk(ASTPtr query, DataChunk chunk, ContextPtr bool has_enough_bytes = data->size_in_bytes >= key.settings.async_insert_max_data_size; bool has_enough_queries = data->entries.size() >= key.settings.async_insert_max_query_number && key.settings.async_insert_deduplicate; - /// Here we check whether we hit the limit on maximum data size in the buffer. - /// And use setting from query context. - /// It works, because queries with the same set of settings are already grouped together. - if (!flush_stopped && (has_enough_bytes || has_enough_queries)) + auto max_busy_timeout_exceeded = [&shard, &settings, &now, &flush_time_points]() -> bool { + if (!settings.async_insert_use_adaptive_busy_timeout || !shard.last_insert_time || !flush_time_points.first) + return false; + + auto max_ms = Milliseconds(settings.async_insert_busy_timeout_max_ms); + return *shard.last_insert_time + max_ms < now && *flush_time_points.first + max_ms < *flush_time_points.second; + }; + + /// Here we check whether we have hit the limit on the maximum data size in the buffer or + /// if the elapsed time since the last insert exceeds the maximum busy wait timeout. + /// We also use the limit settings from the query context. 
+ /// This works because queries with the same set of settings are already grouped together. + if (!flush_stopped && (has_enough_bytes || has_enough_queries || max_busy_timeout_exceeded())) + { + data->timeout_ms = Milliseconds::zero(); data_to_process = std::move(data); shard.iterators.erase(it); shard.queue.erase(queue_it); } + shard.last_insert_time = now; + shard.busy_timeout_ms = timeout_ms; + CurrentMetrics::add(CurrentMetrics::PendingAsyncInsert); ProfileEvents::increment(ProfileEvents::AsyncInsertQuery); ProfileEvents::increment(ProfileEvents::AsyncInsertBytes, entry_data_size); + + if (data_to_process) + { + if (!inserted) + CurrentMetrics::sub(CurrentMetrics::AsynchronousInsertQueueSize); + CurrentMetrics::sub(CurrentMetrics::AsynchronousInsertQueueBytes, size_in_bytes); + } + else + { + if (inserted) + CurrentMetrics::add(CurrentMetrics::AsynchronousInsertQueueSize); + CurrentMetrics::add(CurrentMetrics::AsynchronousInsertQueueBytes, entry_data_size); + } } if (data_to_process) - scheduleDataProcessingJob(key, std::move(data_to_process), getContext()); + scheduleDataProcessingJob(key, std::move(data_to_process), getContext(), shard_num); else shard.are_tasks_available.notify_one(); @@ -374,6 +439,79 @@ AsynchronousInsertQueue::pushDataChunk(ASTPtr query, DataChunk chunk, ContextPtr }; } +AsynchronousInsertQueue::Milliseconds AsynchronousInsertQueue::getBusyWaitTimeoutMs( + const Settings & settings, + const QueueShard & shard, + const QueueShardFlushTimeHistory::TimePoints & flush_time_points, + std::chrono::steady_clock::time_point now) const +{ + if (!settings.async_insert_use_adaptive_busy_timeout) + return settings.async_insert_busy_timeout_max_ms; + + const auto max_ms = Milliseconds(settings.async_insert_busy_timeout_max_ms); + const auto min_ms = std::min(std::max(Milliseconds(settings.async_insert_busy_timeout_min_ms), Milliseconds(1)), max_ms); + + auto normalize = [&min_ms, &max_ms](const auto & t_ms) { return std::min(std::max(t_ms, min_ms), max_ms); }; + + if (!shard.last_insert_time || !flush_time_points.first) + return normalize(shard.busy_timeout_ms); + + const auto & last_insert_time = *shard.last_insert_time; + const auto & [t1, t2] = std::tie(*flush_time_points.first, *flush_time_points.second); + const double increase_rate = settings.async_insert_busy_timeout_increase_rate; + const double decrease_rate = settings.async_insert_busy_timeout_decrease_rate; + + const auto decreased_timeout_ms = std::min( + std::chrono::duration_cast(shard.busy_timeout_ms / (1.0 + decrease_rate)), shard.busy_timeout_ms - Milliseconds(1)); + + /// Increase the timeout for frequent inserts. + if (last_insert_time + min_ms > now) + { + auto timeout_ms = std::max( + std::chrono::duration_cast(shard.busy_timeout_ms * (1.0 + increase_rate)), + shard.busy_timeout_ms + Milliseconds(1)); + + return normalize(timeout_ms); + } + /// Decrease the timeout if inserts are not frequent, + /// that is, if the time since the last insert and the difference between the last two queue flushes were both + /// long enough (exceeding the adjusted timeout). + /// This ensures the timeout value converges to the minimum over time for non-frequent inserts. 
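For readers skimming the diff, the adjustment rule in this function boils down to multiplicative increase while inserts keep arriving, multiplicative decrease once both the insert stream and the flush history show a quiet period, and clamping to [min, max]. The condensed standalone sketch below restates that rule; the two boolean predicates stand in for the last_insert_time and flush-time-point comparisons and are a simplification, not the actual implementation.

#include <algorithm>
#include <chrono>

using Ms = std::chrono::milliseconds;

/// Illustrative restatement of the adaptive busy-timeout rule.
Ms adaptTimeout(
    Ms current, Ms min_ms, Ms max_ms,
    bool inserts_are_frequent, bool queue_went_quiet,
    double increase_rate, double decrease_rate)
{
    auto clamp = [&](Ms t) { return std::clamp(t, min_ms, max_ms); };

    if (inserts_are_frequent)
    {
        /// Grow multiplicatively, but by at least 1 ms so the value always moves.
        auto increased = std::chrono::duration_cast<Ms>(current * (1.0 + increase_rate));
        return clamp(std::max(increased, current + Ms(1)));
    }

    if (queue_went_quiet)
    {
        /// Shrink multiplicatively, again forcing progress by at least 1 ms.
        auto decreased = std::chrono::duration_cast<Ms>(current / (1.0 + decrease_rate));
        return clamp(std::min(decreased, current - Ms(1)));
    }

    return clamp(current);
}

The one-millisecond floor in the sketch mirrors the Milliseconds(1) terms in the diff and guarantees the timeout keeps converging even when the configured rates round to a no-op at small values.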
+ else if (last_insert_time + decreased_timeout_ms < now && t1 + decreased_timeout_ms < t2) + return normalize(decreased_timeout_ms); + + return normalize(shard.busy_timeout_ms); +} + +void AsynchronousInsertQueue::validateSettings(const Settings & settings, LoggerPtr log) +{ + const auto max_ms = std::chrono::milliseconds(settings.async_insert_busy_timeout_max_ms); + + if (max_ms == std::chrono::milliseconds::zero()) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'async_insert_busy_timeout_max_ms' can't be zero"); + + if (!settings.async_insert_use_adaptive_busy_timeout) + return; + + /// Adaptive timeout settings. + const auto min_ms = std::chrono::milliseconds(settings.async_insert_busy_timeout_min_ms); + + if (min_ms > max_ms) + if (log) + LOG_WARNING( + log, + "Setting 'async_insert_busy_timeout_min_ms'={} is greater than 'async_insert_busy_timeout_max_ms'={}. Ignoring " + "'async_insert_busy_timeout_min_ms'", + min_ms.count(), + max_ms.count()); + + if (settings.async_insert_busy_timeout_increase_rate <= 0) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'async_insert_busy_timeout_increase_rate' must be greater than zero"); + + if (settings.async_insert_busy_timeout_decrease_rate <= 0) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'async_insert_busy_timeout_decrease_rate' must be greater than zero"); +} + void AsynchronousInsertQueue::flushAll() { std::lock_guard flush_lock(flush_mutex); @@ -395,14 +533,15 @@ void AsynchronousInsertQueue::flushAll() size_t total_bytes = 0; size_t total_entries = 0; - for (auto & queue : queues_to_flush) + for (size_t i = 0; i < pool_size; ++i) { + auto & queue = queues_to_flush[i]; total_queries += queue.size(); for (auto & [_, entry] : queue) { total_bytes += entry.data->size_in_bytes; total_entries += entry.data->entries.size(); - scheduleDataProcessingJob(entry.key, std::move(entry.data), getContext()); + scheduleDataProcessingJob(entry.key, std::move(entry.data), getContext(), i); } } @@ -429,17 +568,21 @@ void AsynchronousInsertQueue::processBatchDeadlines(size_t shard_num) { std::unique_lock lock(shard.mutex); - shard.are_tasks_available.wait_for(lock, - Milliseconds(getContext()->getSettingsRef().async_insert_busy_timeout_ms), [&shard, this] - { - if (shutdown) - return true; + const auto rel_time + = std::min(shard.busy_timeout_ms, Milliseconds(getContext()->getSettingsRef().async_insert_poll_timeout_ms)); + shard.are_tasks_available.wait_for( + lock, + rel_time, + [&shard, this] + { + if (shutdown) + return true; - if (!shard.queue.empty() && shard.queue.begin()->first < std::chrono::steady_clock::now()) - return true; + if (!shard.queue.empty() && shard.queue.begin()->first < std::chrono::steady_clock::now()) + return true; - return false; - }); + return false; + }); if (shutdown) return; @@ -449,21 +592,30 @@ void AsynchronousInsertQueue::processBatchDeadlines(size_t shard_num) const auto now = std::chrono::steady_clock::now(); + size_t size_in_bytes = 0; while (true) { if (shard.queue.empty() || shard.queue.begin()->first > now) break; auto it = shard.queue.begin(); + size_in_bytes += it->second.data->size_in_bytes; + shard.iterators.erase(it->second.key.hash); entries_to_flush.emplace_back(std::move(it->second)); shard.queue.erase(it); } + + if (!entries_to_flush.empty()) + { + CurrentMetrics::sub(CurrentMetrics::AsynchronousInsertQueueSize, entries_to_flush.size()); + CurrentMetrics::sub(CurrentMetrics::AsynchronousInsertQueueBytes, size_in_bytes); + } } for (auto & entry : entries_to_flush) 
- scheduleDataProcessingJob(entry.key, std::move(entry.data), getContext()); + scheduleDataProcessingJob(entry.key, std::move(entry.data), getContext(), shard_num); } } @@ -507,7 +659,8 @@ String serializeQuery(const IAST & query, size_t max_length) } // static -void AsynchronousInsertQueue::processData(InsertQuery key, InsertDataPtr data, ContextPtr global_context) +void AsynchronousInsertQueue::processData( + InsertQuery key, InsertDataPtr data, ContextPtr global_context, QueueShardFlushTimeHistory & queue_shard_flush_time_history) try { if (!data) @@ -613,9 +766,12 @@ try throw; } - auto add_entry_to_log = [&]( - const auto & entry, const auto & entry_query_for_logging, - const auto & exception, size_t num_rows, size_t num_bytes) + auto add_entry_to_log = [&](const auto & entry, + const auto & entry_query_for_logging, + const auto & exception, + size_t num_rows, + size_t num_bytes, + Milliseconds timeout_ms) { if (!async_insert_log) return; @@ -632,6 +788,7 @@ try elem.rows = num_rows; elem.exception = exception; elem.data_kind = entry->chunk.getDataKind(); + elem.timeout_milliseconds = timeout_ms.count(); /// If there was a parsing error, /// the entry won't be flushed anyway, @@ -666,9 +823,9 @@ try auto header = pipeline.getHeader(); if (key.data_kind == DataKind::Parsed) - chunk = processEntriesWithParsing(key, data->entries, header, insert_context, log, add_entry_to_log); + chunk = processEntriesWithParsing(key, data, header, insert_context, log, add_entry_to_log); else - chunk = processPreprocessedEntries(key, data->entries, header, insert_context, add_entry_to_log); + chunk = processPreprocessedEntries(key, data, header, insert_context, add_entry_to_log); ProfileEvents::increment(ProfileEvents::AsyncInsertRows, chunk.getNumRows()); @@ -691,6 +848,8 @@ try LOG_INFO(log, "Flushed {} rows, {} bytes for query '{}'", num_rows, num_bytes, key.query_str); + queue_shard_flush_time_history.updateWithCurrentTime(); + bool pulling_pipeline = false; logQueryFinish(query_log_elem, insert_context, key.query, pipeline, pulling_pipeline, query_span, QueryCache::Usage::None, internal); } @@ -729,7 +888,7 @@ catch (...) 
template Chunk AsynchronousInsertQueue::processEntriesWithParsing( const InsertQuery & key, - const std::list & entries, + const InsertDataPtr & data, const Block & header, const ContextPtr & insert_context, const LoggerPtr logger, @@ -770,7 +929,7 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing( auto chunk_info = std::make_shared(); auto query_for_logging = serializeQuery(*key.query, insert_context->getSettingsRef().log_queries_cut_to_length); - for (const auto & entry : entries) + for (const auto & entry : data->entries) { current_entry = entry; @@ -786,7 +945,7 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing( chunk_info->offsets.push_back(total_rows); chunk_info->tokens.push_back(entry->async_dedup_token); - add_to_async_insert_log(entry, query_for_logging, current_exception, num_rows, num_bytes); + add_to_async_insert_log(entry, query_for_logging, current_exception, num_rows, num_bytes, data->timeout_ms); current_exception.clear(); } @@ -798,7 +957,7 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing( template Chunk AsynchronousInsertQueue::processPreprocessedEntries( const InsertQuery & key, - const std::list & entries, + const InsertDataPtr & data, const Block & header, const ContextPtr & insert_context, LogFunc && add_to_async_insert_log) @@ -821,7 +980,7 @@ Chunk AsynchronousInsertQueue::processPreprocessedEntries( return it->second; }; - for (const auto & entry : entries) + for (const auto & entry : data->entries) { const auto * block = entry->chunk.asBlock(); if (!block) @@ -837,7 +996,7 @@ Chunk AsynchronousInsertQueue::processPreprocessedEntries( chunk_info->tokens.push_back(entry->async_dedup_token); const auto & query_for_logging = get_query_by_format(entry->format); - add_to_async_insert_log(entry, query_for_logging, "", block->rows(), block->bytes()); + add_to_async_insert_log(entry, query_for_logging, "", block->rows(), block->bytes(), data->timeout_ms); } Chunk chunk(std::move(result_columns), total_rows); diff --git a/src/Interpreters/AsynchronousInsertQueue.h b/src/Interpreters/AsynchronousInsertQueue.h index f4bfdbd38a5..17140030766 100644 --- a/src/Interpreters/AsynchronousInsertQueue.h +++ b/src/Interpreters/AsynchronousInsertQueue.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace DB @@ -53,6 +54,8 @@ public: Preprocessed = 1, }; + static void validateSettings(const Settings & settings, LoggerPtr log); + /// Force flush the whole queue. void flushAll(); @@ -146,6 +149,9 @@ private: std::atomic_bool finished = false; }; + InsertData() = default; + explicit InsertData(Milliseconds timeout_ms_) : timeout_ms(timeout_ms_) { } + ~InsertData() { auto it = entries.begin(); @@ -163,6 +169,7 @@ private: std::list entries; size_t size_in_bytes = 0; + Milliseconds timeout_ms = Milliseconds::zero(); }; using InsertDataPtr = std::unique_ptr; @@ -180,6 +187,8 @@ private: using QueueIterator = Queue::iterator; using QueueIteratorByKey = std::unordered_map; + using OptionalTimePoint = std::optional; + struct QueueShard { mutable std::mutex mutex; @@ -187,12 +196,30 @@ private: Queue queue; QueueIteratorByKey iterators; + + OptionalTimePoint last_insert_time; + std::chrono::milliseconds busy_timeout_ms; + }; + + /// Times of the two most recent queue flushes. + /// Used to calculate adaptive timeout. 
+ struct QueueShardFlushTimeHistory + { + public: + using TimePoints = std::pair; + TimePoints getRecentTimePoints() const; + void updateWithCurrentTime(); + + private: + mutable std::shared_mutex mutex; + TimePoints time_points; }; const size_t pool_size; const bool flush_on_shutdown; std::vector queue_shards; + std::vector flush_time_history_per_queue_shard; /// Logic and events behind queue are as follows: /// - async_insert_busy_timeout_ms: @@ -217,17 +244,25 @@ private: LoggerPtr log = getLogger("AsynchronousInsertQueue"); PushResult pushDataChunk(ASTPtr query, DataChunk chunk, ContextPtr query_context); + + Milliseconds getBusyWaitTimeoutMs( + const Settings & settings, + const QueueShard & shard, + const QueueShardFlushTimeHistory::TimePoints & flush_time_points, + std::chrono::steady_clock::time_point now) const; + void preprocessInsertQuery(const ASTPtr & query, const ContextPtr & query_context); void processBatchDeadlines(size_t shard_num); - void scheduleDataProcessingJob(const InsertQuery & key, InsertDataPtr data, ContextPtr global_context); + void scheduleDataProcessingJob(const InsertQuery & key, InsertDataPtr data, ContextPtr global_context, size_t shard_num); - static void processData(InsertQuery key, InsertDataPtr data, ContextPtr global_context); + static void processData( + InsertQuery key, InsertDataPtr data, ContextPtr global_context, QueueShardFlushTimeHistory & queue_shard_flush_time_history); template static Chunk processEntriesWithParsing( const InsertQuery & key, - const std::list & entries, + const InsertDataPtr & data, const Block & header, const ContextPtr & insert_context, const LoggerPtr logger, @@ -236,7 +271,7 @@ private: template static Chunk processPreprocessedEntries( const InsertQuery & key, - const std::list & entries, + const InsertDataPtr & data, const Block & header, const ContextPtr & insert_context, LogFunc && add_to_async_insert_log); diff --git a/src/Interpreters/ClientInfo.cpp b/src/Interpreters/ClientInfo.cpp index 347ec115aba..e4778edeb9c 100644 --- a/src/Interpreters/ClientInfo.cpp +++ b/src/Interpreters/ClientInfo.cpp @@ -23,7 +23,7 @@ namespace ErrorCodes void ClientInfo::write(WriteBuffer & out, UInt64 server_protocol_revision) const { if (server_protocol_revision < DBMS_MIN_REVISION_WITH_CLIENT_INFO) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: method ClientInfo::write is called for unsupported server revision"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Method ClientInfo::write is called for unsupported server revision"); writeBinary(static_cast(query_kind), out); if (empty()) @@ -103,7 +103,7 @@ void ClientInfo::write(WriteBuffer & out, UInt64 server_protocol_revision) const void ClientInfo::read(ReadBuffer & in, UInt64 client_protocol_revision) { if (client_protocol_revision < DBMS_MIN_REVISION_WITH_CLIENT_INFO) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: method ClientInfo::read is called for unsupported client revision"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Method ClientInfo::read is called for unsupported client revision"); UInt8 read_query_kind = 0; readBinary(read_query_kind, in); diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp index 52b74597c4b..d432488964d 100644 --- a/src/Interpreters/ClusterDiscovery.cpp +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -319,7 +319,7 @@ bool ClusterDiscovery::updateCluster(ClusterInfo & cluster_info) if (cluster_info.current_cluster_is_invisible) { - LOG_DEBUG(log, "cluster '{}' is invisible!", 
cluster_info.name); + LOG_DEBUG(log, "Cluster '{}' is invisible.", cluster_info.name); return true; } diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 35451e1d774..33b86854ba9 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -32,6 +32,7 @@ namespace ErrorCodes extern const int TOO_LARGE_DISTRIBUTED_DEPTH; extern const int LOGICAL_ERROR; extern const int CLUSTER_DOESNT_EXIST; + extern const int UNEXPECTED_CLUSTER; } namespace ClusterProxy @@ -374,12 +375,12 @@ void executeQueryWithParallelReplicas( shard_num = column->getUInt(0); } - ClusterPtr new_cluster; + const auto shard_count = not_optimized_cluster->getShardCount(); + ClusterPtr new_cluster = not_optimized_cluster; /// if got valid shard_num from query initiator, then parallel replicas scope is the specified shard /// shards are numbered in order of appearance in the cluster config if (shard_num > 0) { - const auto shard_count = not_optimized_cluster->getShardCount(); if (shard_num > shard_count) throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -395,17 +396,18 @@ void executeQueryWithParallelReplicas( // get cluster for shard specified by shard_num // shard_num is 1-based, but getClusterWithSingleShard expects 0-based index - auto single_shard_cluster = not_optimized_cluster->getClusterWithSingleShard(shard_num - 1); - // convert cluster to representation expected by parallel replicas - new_cluster = single_shard_cluster->getClusterWithReplicasAsShards(settings, settings.max_parallel_replicas); + new_cluster = not_optimized_cluster->getClusterWithSingleShard(shard_num - 1); } else { - new_cluster = not_optimized_cluster->getClusterWithReplicasAsShards(settings, settings.max_parallel_replicas); + if (not_optimized_cluster->getShardCount() > 1) + throw DB::Exception( + ErrorCodes::UNEXPECTED_CLUSTER, + "`cluster_for_parallel_replicas` setting refers to cluster with several shards. 
Expected a cluster with one shard"); } - auto coordinator - = std::make_shared(new_cluster->getShardCount(), settings.parallel_replicas_mark_segment_size); + auto coordinator = std::make_shared( + new_cluster->getShardsInfo().begin()->getAllNodeCount(), settings.parallel_replicas_mark_segment_size); auto external_tables = new_context->getExternalTables(); auto read_from_remote = std::make_unique( query_ast, diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 75c20b0a520..55a4df10206 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -94,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -907,7 +908,7 @@ Strings Context::getWarnings() const if (CurrentMetrics::get(CurrentMetrics::AttachedTable) > static_cast(shared->max_table_num_to_warn)) common_warnings.emplace_back(fmt::format("The number of attached tables is more than {}", shared->max_table_num_to_warn)); if (CurrentMetrics::get(CurrentMetrics::AttachedDatabase) > static_cast(shared->max_database_num_to_warn)) - common_warnings.emplace_back(fmt::format("The number of attached databases is more than {}", shared->max_table_num_to_warn)); + common_warnings.emplace_back(fmt::format("The number of attached databases is more than {}", shared->max_database_num_to_warn)); if (CurrentMetrics::get(CurrentMetrics::PartsActive) > static_cast(shared->max_part_num_to_warn)) common_warnings.emplace_back(fmt::format("The number of active parts is more than {}", shared->max_part_num_to_warn)); } @@ -1533,7 +1534,7 @@ void Context::addExternalTable(const String & table_name, TemporaryTableHolder & std::lock_guard lock(mutex); if (external_tables_mapping.end() != external_tables_mapping.find(table_name)) - throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Temporary table {} already exists.", backQuoteIfNeed(table_name)); + throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Temporary table {} already exists", backQuoteIfNeed(table_name)); external_tables_mapping.emplace(table_name, std::make_shared(std::move(temporary_table))); } @@ -1931,6 +1932,35 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const } +StoragePtr Context::buildParametrizedViewStorage(const ASTPtr & table_expression, const String & database_name, const String & table_name) +{ + if (table_name.empty()) + return nullptr; + + StoragePtr original_view = DatabaseCatalog::instance().tryGetTable({database_name, table_name}, getQueryContext()); + if (!original_view || !original_view->isView()) + return nullptr; + auto * storage_view = original_view->as(); + if (!storage_view || !storage_view->isParameterizedView()) + return nullptr; + + auto query = original_view->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression); + StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); + + ASTCreateQuery create; + create.select = query->as(); + auto sample_block = InterpreterSelectQueryAnalyzer::getSampleBlock(query, shared_from_this()); + auto res = std::make_shared(StorageID(database_name, table_name), + create, + ColumnsDescription(sample_block.getNamesAndTypesList()), + /* comment */ "", + /* is_parameterized_view */ true); + res->startup(); + return res; +} + + void Context::addViewSource(const StoragePtr & storage) { if (view_source) @@ -4154,12 +4184,12 @@ void Context::setMaxTableSizeToDrop(size_t max_size) size_t Context::getMaxTableSizeToDrop() const { - 
return shared->max_table_size_to_drop.load(std::memory_order_relaxed); + return shared->max_table_size_to_drop.load(); } void Context::checkTableCanBeDropped(const String & database, const String & table, const size_t & table_size) const { - size_t max_table_size_to_drop = shared->max_table_size_to_drop.load(std::memory_order_relaxed); + size_t max_table_size_to_drop = shared->max_table_size_to_drop.load(); checkCanBeDropped(database, table, table_size, max_table_size_to_drop); } @@ -4177,12 +4207,12 @@ void Context::setMaxPartitionSizeToDrop(size_t max_size) size_t Context::getMaxPartitionSizeToDrop() const { - return shared->max_partition_size_to_drop.load(std::memory_order_relaxed); + return shared->max_partition_size_to_drop.load(); } void Context::checkPartitionCanBeDropped(const String & database, const String & table, const size_t & partition_size) const { - size_t max_partition_size_to_drop = shared->max_partition_size_to_drop.load(std::memory_order_relaxed); + size_t max_partition_size_to_drop = shared->max_partition_size_to_drop.load(); checkCanBeDropped(database, table, partition_size, max_partition_size_to_drop); } @@ -4483,7 +4513,7 @@ void Context::setClientConnectionId(uint32_t connection_id_) client_info.connection_id = connection_id_; } -void Context::setHttpClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer) +void Context::setHTTPClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer) { client_info.http_method = http_method; client_info.http_user_agent = http_user_agent; @@ -4862,10 +4892,10 @@ AsynchronousInsertQueue * Context::getAsynchronousInsertQueue() const void Context::setAsynchronousInsertQueue(const std::shared_ptr & ptr) { - using namespace std::chrono; + AsynchronousInsertQueue::validateSettings(settings, getLogger("Context")); - if (std::chrono::milliseconds(settings.async_insert_busy_timeout_ms) == 0ms) - throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting async_insert_busy_timeout_ms can't be zero"); + if (std::chrono::milliseconds(settings.async_insert_poll_timeout_ms) == std::chrono::milliseconds::zero()) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting async_insert_poll_timeout_ms can't be zero"); shared->async_insert_queue = ptr; } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 8d40ccb301b..a7908d45a9b 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -630,7 +630,7 @@ public: void setClientInterface(ClientInfo::Interface interface); void setClientVersion(UInt64 client_version_major, UInt64 client_version_minor, UInt64 client_version_patch, unsigned client_tcp_protocol_version); void setClientConnectionId(uint32_t connection_id); - void setHttpClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer); + void setHTTPClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer); void setForwardedFor(const String & forwarded_for); void setQueryKind(ClientInfo::QueryKind query_kind); void setQueryKindInitial(); @@ -718,6 +718,8 @@ public: /// Overload for the new analyzer. Structure inference is performed in QueryAnalysisPass. 
StoragePtr executeTableFunction(const ASTPtr & table_expression, const TableFunctionPtr & table_function_ptr); + StoragePtr buildParametrizedViewStorage(const ASTPtr & table_expression, const String & database_name, const String & table_name); + void addViewSource(const StoragePtr & storage); StoragePtr getViewSource() const; diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index 42af164f4ad..e3e8b80e437 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -173,7 +173,7 @@ std::vector getTables(const ASTSelectQuery & select) { const auto * table_element = child->as(); if (!table_element) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: TablesInSelectQueryElement expected"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "TablesInSelectQueryElement expected"); JoinedElement & t = joined_tables.emplace_back(*table_element); t.rewriteCommaToCross(); @@ -224,7 +224,7 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr &, Data & da { if (joined_tables.size() != data.tables_with_columns.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, - "Logical error: inconsistent number of tables: {} != {}", + "Inconsistent number of tables: {} != {}", joined_tables.size(), data.tables_with_columns.size()); for (size_t i = 0; i < joined_tables.size(); ++i) diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 90eec421abf..543d8b16791 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -148,9 +148,8 @@ void DDLLogEntry::parse(const String & data) String settings_str; rb >> "settings: " >> settings_str >> "\n"; ParserSetQuery parser{true}; - constexpr UInt64 max_size = 4096; constexpr UInt64 max_depth = 16; - ASTPtr settings_ast = parseQuery(parser, settings_str, max_size, max_depth); + ASTPtr settings_ast = parseQuery(parser, settings_str, Context::getGlobalContextInstance()->getSettingsRef().max_query_size, max_depth); settings.emplace(std::move(settings_ast->as()->changes)); } } diff --git a/src/Interpreters/DatabaseAndTableWithAlias.cpp b/src/Interpreters/DatabaseAndTableWithAlias.cpp index db020cb9166..329391b45d7 100644 --- a/src/Interpreters/DatabaseAndTableWithAlias.cpp +++ b/src/Interpreters/DatabaseAndTableWithAlias.cpp @@ -71,7 +71,7 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableExpression & alias = table_expression.subquery->tryGetAlias(); } else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no known elements in ASTTableExpression"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No known elements in ASTTableExpression"); } bool DatabaseAndTableWithAlias::satisfies(const DatabaseAndTableWithAlias & db_table, bool table_may_be_an_alias) const diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp index f1c577948eb..1bd1e2c318f 100644 --- a/src/Interpreters/ExpressionActions.cpp +++ b/src/Interpreters/ExpressionActions.cpp @@ -611,6 +611,13 @@ static void executeAction(const ExpressionActions::Action & action, ExecutionCon ProfileEvents::increment(ProfileEvents::CompiledFunctionExecute); res_column.column = action.node->function->execute(arguments, res_column.type, num_rows, dry_run); + if (res_column.column->getDataType() != res_column.type->getColumnType()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Unexpected return type from {}. Expected {}. 
Got {}", + action.node->function->getName(), + res_column.type->getColumnType(), + res_column.column->getDataType()); } break; } diff --git a/src/Interpreters/GatherFunctionQuantileVisitor.cpp b/src/Interpreters/GatherFunctionQuantileVisitor.cpp index 664bb9e9383..6b6dc362771 100644 --- a/src/Interpreters/GatherFunctionQuantileVisitor.cpp +++ b/src/Interpreters/GatherFunctionQuantileVisitor.cpp @@ -30,6 +30,7 @@ static const std::unordered_map quantile_fuse_name_mapping = {"quantileTDigestWeighted", "quantilesTDigestWeighted"}, {"quantileTiming", "quantilesTiming"}, {"quantileTimingWeighted", "quantilesTimingWeighted"}, + {"quantileGK", "quantilesGK"}, }; String GatherFunctionQuantileData::toFusedNameOrSelf(const String & func_name) diff --git a/src/Interpreters/GlobalSubqueriesVisitor.h b/src/Interpreters/GlobalSubqueriesVisitor.h index 384b562c80c..5f029395df9 100644 --- a/src/Interpreters/GlobalSubqueriesVisitor.h +++ b/src/Interpreters/GlobalSubqueriesVisitor.h @@ -32,6 +32,7 @@ namespace ErrorCodes { extern const int WRONG_GLOBAL_SUBQUERY; extern const int LOGICAL_ERROR; + extern const int SUPPORT_IS_DISABLED; } class GlobalSubqueriesMatcher @@ -200,23 +201,33 @@ public: } private: - static bool shouldBeExecutedGlobally(const Data & data) - { - const Settings & settings = data.getContext()->getSettingsRef(); - /// For parallel replicas we reinterpret JOIN as GLOBAL JOIN as a way to broadcast data - const bool enable_parallel_processing_of_joins = data.getContext()->canUseParallelReplicasOnInitiator(); - return settings.prefer_global_in_and_join || enable_parallel_processing_of_joins; - } - - /// GLOBAL IN static void visit(ASTFunction & func, ASTPtr &, Data & data) { - if ((shouldBeExecutedGlobally(data) + const Settings & settings = data.getContext()->getSettingsRef(); + const bool prefer_global = settings.prefer_global_in_and_join; + const bool enable_parallel_processing_of_joins = data.getContext()->canUseParallelReplicasOnInitiator(); + + if (((prefer_global || enable_parallel_processing_of_joins) && (func.name == "in" || func.name == "notIn" || func.name == "nullIn" || func.name == "notNullIn")) || func.name == "globalIn" || func.name == "globalNotIn" || func.name == "globalNullIn" || func.name == "globalNotNullIn") { ASTPtr & ast = func.arguments->children[1]; + if (enable_parallel_processing_of_joins) + { + /// We don't enable parallel replicas for IN (subquery) + if (ast->as()) + { + if (settings.allow_experimental_parallel_reading_from_replicas == 1) + { + LOG_DEBUG(getLogger("GlobalSubqueriesMatcher"), "IN with subquery is not supported with parallel replicas"); + data.getContext()->getQueryContext()->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); + return; + } + else if (settings.allow_experimental_parallel_reading_from_replicas == 2) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "IN with subquery is not supported with parallel replicas"); + } + } /// Literal or function can use regular IN. /// NOTE: We don't support passing table functions to IN. 
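Both this IN hunk and the JOIN hunk that follows apply the same convention: when a query shape is unsupported under parallel replicas, a setting value of 1 means log and silently fall back to normal execution, while 2 means fail hard. A minimal sketch of that convention, with hypothetical names rather than the real ClickHouse settings API:

#include <iostream>
#include <stdexcept>
#include <string>

/// 0 = disabled, 1 = best effort (fall back when unsupported), 2 = strict (throw).
void handleUnsupportedForParallelReplicas(int & allow_parallel_replicas, const std::string & feature)
{
    if (allow_parallel_replicas == 1)
    {
        std::cerr << feature << " is not supported with parallel replicas; disabling them for this query\n";
        allow_parallel_replicas = 0;   /// mirrors setSetting("allow_experimental_parallel_reading_from_replicas", Field(0))
    }
    else if (allow_parallel_replicas == 2)
    {
        throw std::runtime_error(feature + " is not supported with parallel replicas");
    }
}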
@@ -241,9 +252,41 @@ private: /// GLOBAL JOIN static void visit(ASTTablesInSelectQueryElement & table_elem, ASTPtr &, Data & data) { + const Settings & settings = data.getContext()->getSettingsRef(); + const bool prefer_global = settings.prefer_global_in_and_join; + const bool enable_parallel_processing_of_joins = data.getContext()->canUseParallelReplicasOnInitiator(); + if (table_elem.table_join - && (table_elem.table_join->as().locality == JoinLocality::Global || shouldBeExecutedGlobally(data))) + && (table_elem.table_join->as().locality == JoinLocality::Global || prefer_global + || enable_parallel_processing_of_joins)) { + if (enable_parallel_processing_of_joins) + { + /// For parallel replicas we currently only support JOIN with subqueries + /// Note that tableA join tableB is previously converted into tableA JOIN (Select * FROM tableB) so that's ok + /// We don't support WITH cte as (subquery) Select table JOIN cte because we don't do conversion in AST + bool is_subquery = false; + if (const auto * ast_table_expr = table_elem.table_expression->as()) + { + is_subquery = ast_table_expr->subquery && ast_table_expr->subquery->as() != nullptr + && ast_table_expr->subquery->as()->cte_name.empty(); + } + else if (table_elem.table_expression->as()) + is_subquery = true; + + if (!is_subquery) + { + if (settings.allow_experimental_parallel_reading_from_replicas == 1) + { + LOG_DEBUG(getLogger("GlobalSubqueriesMatcher"), "JOIN with parallel replicas is only supported with subqueries"); + data.getContext()->getQueryContext()->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); + return; + } + else if (settings.allow_experimental_parallel_reading_from_replicas == 2) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "JOIN with parallel replicas is only supported with subqueries"); + } + } + Names required_columns; /// Fill required columns for GLOBAL JOIN. diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 33dc178ca00..73487a0914a 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -368,7 +368,7 @@ HashJoin::Type HashJoin::chooseMethod(JoinKind kind, const ColumnRawPtrs & key_c return Type::keys128; if (size_of_field == 32) return Type::keys256; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32."); } /// If the keys fit in N bits, we will use a hash table for N-bit-packed keys diff --git a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp index 3858830a43b..ec4241a2740 100644 --- a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp +++ b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp @@ -103,12 +103,12 @@ private: /// Already processed. 
} else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: unexpected function name {}", concrete->name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected function name {}", concrete->name); } else if (table_join) table_join->locality = JoinLocality::Global; else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: unexpected AST node"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected AST node"); } else if (distributed_product_mode == DistributedProductMode::DENY) { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 5b04ffb2b17..a9942ac7cde 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -82,7 +82,8 @@ #include #include - +#include +#include namespace DB { @@ -692,6 +693,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (!attach && !is_restore_from_backup && context_->getSettingsRef().flatten_nested) res.flattenNested(); + if (res.getAllPhysical().empty()) throw Exception(ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED, "Cannot CREATE table without physical columns"); @@ -796,6 +798,9 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti } else if (create.select) { + if (create.isParameterizedView()) + return properties; + Block as_select_sample; if (getContext()->getSettingsRef().allow_experimental_analyzer) @@ -820,11 +825,7 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti * for example: LIMIT, OFFSET, functions parameters, functions constant only arguments. */ - SelectQueryOptions options; - if (create.isParameterizedView()) - options = options.createParameterizedView(); - - InterpreterSelectWithUnionQuery interpreter(create.select->clone(), getContext(), options); + InterpreterSelectWithUnionQuery interpreter(create.select->clone(), getContext(), SelectQueryOptions()); as_select_sample = interpreter.getSampleBlock(); } @@ -957,6 +958,20 @@ void InterpreterCreateQuery::validateTableStructure(const ASTCreateQuery & creat } } } + if (!create.attach && !settings.allow_experimental_variant_type) + { + for (const auto & [name, type] : properties.columns.getAllPhysical()) + { + if (isVariant(type)) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Cannot create table with column '{}' which type is '{}' " + "because experimental Variant type is not allowed. " + "Set setting allow_experimental_variant_type = 1 in order to allow it", + name, type->getName()); + } + } + } } namespace @@ -1062,15 +1077,22 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data { const auto * kind = create.is_dictionary ? "Dictionary" : "Table"; const auto * kind_upper = create.is_dictionary ? 
"DICTIONARY" : "TABLE"; + bool is_replicated_database_internal = database->getEngineName() == "Replicated" && getContext()->getClientInfo().is_replicated_database_internal; + bool from_path = create.attach_from_path.has_value(); + bool is_on_cluster = getContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; - if (database->getEngineName() == "Replicated" && getContext()->getClientInfo().is_replicated_database_internal - && !internal) + if (is_replicated_database_internal && !internal) { if (create.uuid == UUIDHelpers::Nil) throw Exception(ErrorCodes::LOGICAL_ERROR, "Table UUID is not specified in DDL log"); } - bool from_path = create.attach_from_path.has_value(); + if (create.refresh_strategy && database->getEngineName() != "Atomic") + throw Exception(ErrorCodes::INCORRECT_QUERY, + "Refreshable materialized view requires Atomic database engine, but database {} has engine {}", create.getDatabase(), database->getEngineName()); + /// TODO: Support Replicated databases, only with Shared/ReplicatedMergeTree. + /// Figure out how to make the refreshed data appear all at once on other + /// replicas; maybe a replicated SYSTEM SYNC REPLICA query before the rename? if (database->getUUID() != UUIDHelpers::Nil) { @@ -1094,7 +1116,6 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data } else { - bool is_on_cluster = getContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; bool has_uuid = create.uuid != UUIDHelpers::Nil || create.to_inner_uuid != UUIDHelpers::Nil; if (has_uuid && !is_on_cluster && !internal) { @@ -1107,13 +1128,6 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data "{} UUID specified, but engine of database {} is not Atomic", kind, create.getDatabase()); } - if (create.refresh_strategy && database->getEngineName() != "Atomic") - throw Exception(ErrorCodes::INCORRECT_QUERY, - "Refreshable materialized view requires Atomic database engine, but database {} has engine {}", create.getDatabase(), database->getEngineName()); - /// TODO: Support Replicated databases, only with Shared/ReplicatedMergeTree. - /// Figure out how to make the refreshed data appear all at once on other - /// replicas; maybe a replicated SYSTEM SYNC REPLICA query before the rename? - /// The database doesn't support UUID so we'll ignore it. The UUID could be set here because of either /// a) the initiator of `ON CLUSTER` query generated it to ensure the same UUIDs are used on different hosts; or /// b) `RESTORE from backup` query generated it to ensure the same UUIDs are used on different hosts. 
@@ -1421,8 +1435,14 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, interpreter.execute(); } else - throw Exception(storage_already_exists_error_code, - "{} {}.{} already exists", storage_name, backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + { + if (database->getTable(create.getTable(), getContext())->isDictionary()) + throw Exception(ErrorCodes::DICTIONARY_ALREADY_EXISTS, + "Dictionary {}.{} already exists", backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + else + throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, + "Table {}.{} already exists", backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + } } else if (!create.attach) { diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index 84432415f5e..1b6e6be2ea2 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -65,6 +65,7 @@ namespace ProfileEvents { extern const Event Query; + extern const Event InitialQuery; extern const Event QueriesWithSubqueries; extern const Event SelectQuery; extern const Event InsertQuery; @@ -94,7 +95,8 @@ void InterpreterFactory::registerInterpreter(const std::string & name, CreatorFn InterpreterFactory::InterpreterPtr InterpreterFactory::get(ASTPtr & query, ContextMutablePtr context, const SelectQueryOptions & options) { ProfileEvents::increment(ProfileEvents::Query); - + if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) + ProfileEvents::increment(ProfileEvents::InitialQuery); /// SELECT and INSERT query will handle QueriesWithSubqueries on their own. if (!(query->as() || query->as() || diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index c8e05fcd5e3..724cfca6a80 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -274,7 +274,7 @@ Chain InterpreterInsertQuery::buildChain( auto sample = getSampleBlock(columns, table, metadata_snapshot); Chain sink = buildSink(table, metadata_snapshot, thread_status_holder, running_group, elapsed_counter_ms); - Chain chain = buildPreSinkChain(sink.getInputHeader(), table, metadata_snapshot, sample, thread_status_holder); + Chain chain = buildPreSinkChain(sink.getInputHeader(), table, metadata_snapshot, sample); chain.appendChain(std::move(sink)); return chain; @@ -317,25 +317,31 @@ Chain InterpreterInsertQuery::buildSink( return out; } +bool InterpreterInsertQuery::shouldAddSquashingFroStorage(const StoragePtr & table) const +{ + auto context_ptr = getContext(); + const Settings & settings = context_ptr->getSettingsRef(); + const ASTInsertQuery * query = nullptr; + if (query_ptr) + query = query_ptr->as(); + + /// Do not squash blocks if it is a sync INSERT into Distributed, since it lead to double bufferization on client and server side. + /// Client-side bufferization might cause excessive timeouts (especially in case of big blocks). 
+ return !(settings.distributed_foreground_insert && table->isRemote()) && !async_insert && !no_squash && !(query && query->watch); +} + Chain InterpreterInsertQuery::buildPreSinkChain( const Block & subsequent_header, const StoragePtr & table, const StorageMetadataPtr & metadata_snapshot, - const Block & query_sample_block, - ThreadStatusesHolderPtr thread_status_holder) + const Block & query_sample_block) { - ThreadStatus * thread_status = current_thread; - - if (!thread_status_holder) - thread_status = nullptr; - auto context_ptr = getContext(); const ASTInsertQuery * query = nullptr; if (query_ptr) query = query_ptr->as(); - const Settings & settings = context_ptr->getSettingsRef(); bool null_as_default = query && query->select && context_ptr->getSettingsRef().insert_null_as_default; /// We create a pipeline of several streams, into which we will write data. @@ -366,26 +372,6 @@ Chain InterpreterInsertQuery::buildPreSinkChain( /// because some clients break insertion protocol (columns != header) out.addSource(std::make_shared(query_sample_block, adding_missing_defaults_actions)); - /// It's important to squash blocks as early as possible (before other transforms), - /// because other transforms may work inefficient if block size is small. - - /// Do not squash blocks if it is a sync INSERT into Distributed, since it lead to double bufferization on client and server side. - /// Client-side bufferization might cause excessive timeouts (especially in case of big blocks). - if (!(settings.distributed_foreground_insert && table->isRemote()) && !async_insert && !no_squash && !(query && query->watch)) - { - bool table_prefers_large_blocks = table->prefersLargeBlocks(); - - out.addSource(std::make_shared( - input_header(), - table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size, - table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL)); - } - - auto counting = std::make_shared(input_header(), thread_status, getContext()->getQuota()); - counting->setProcessListElement(context_ptr->getProcessListElement()); - counting->setProgressCallback(context_ptr->getProgressCallback()); - out.addSource(std::move(counting)); - return out; } @@ -533,7 +519,7 @@ BlockIO InterpreterInsertQuery::execute() { /// Change query sample block columns to Nullable to allow inserting nullable columns, where NULL values will be substituted with /// default column values (in AddingDefaultsTransform), so all values will be cast correctly. 
- if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name)) + if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && !isVariant(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name)) query_sample_block.setColumn(col_idx, ColumnWithTypeAndName(makeNullableOrLowCardinalityNullable(query_columns[col_idx].column), makeNullableOrLowCardinalityNullable(query_columns[col_idx].type), query_columns[col_idx].name)); } } @@ -558,8 +544,7 @@ BlockIO InterpreterInsertQuery::execute() } for (size_t i = 0; i < pre_streams_size; ++i) { - auto out = buildPreSinkChain(sink_chains[0].getInputHeader(), table, metadata_snapshot, - query_sample_block, /* thread_status_holder= */ nullptr); + auto out = buildPreSinkChain(sink_chains[0].getInputHeader(), table, metadata_snapshot, query_sample_block); presink_chains.emplace_back(std::move(out)); } } @@ -592,6 +577,29 @@ BlockIO InterpreterInsertQuery::execute() return std::make_shared(in_header); }); + pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr + { + auto context_ptr = getContext(); + auto counting = std::make_shared(in_header, nullptr, context_ptr->getQuota()); + counting->setProcessListElement(context_ptr->getProcessListElement()); + counting->setProgressCallback(context_ptr->getProgressCallback()); + + return counting; + }); + + if (shouldAddSquashingFroStorage(table)) + { + bool table_prefers_large_blocks = table->prefersLargeBlocks(); + + pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr + { + return std::make_shared( + in_header, + table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size, + table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL); + }); + } + size_t num_select_threads = pipeline.getNumThreads(); for (auto & chain : presink_chains) @@ -634,7 +642,27 @@ BlockIO InterpreterInsertQuery::execute() } else { - presink_chains.at(0).appendChain(std::move(sink_chains.at(0))); + auto & chain = presink_chains.at(0); + chain.appendChain(std::move(sink_chains.at(0))); + + if (shouldAddSquashingFroStorage(table)) + { + bool table_prefers_large_blocks = table->prefersLargeBlocks(); + + auto squashing = std::make_shared( + chain.getInputHeader(), + table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size, + table_prefers_large_blocks ? 
settings.min_insert_block_size_bytes : 0ULL); + + chain.addSource(std::move(squashing)); + } + + auto context_ptr = getContext(); + auto counting = std::make_shared(chain.getInputHeader(), nullptr, context_ptr->getQuota()); + counting->setProcessListElement(context_ptr->getProcessListElement()); + counting->setProgressCallback(context_ptr->getProgressCallback()); + chain.addSource(std::move(counting)); + res.pipeline = QueryPipeline(std::move(presink_chains[0])); res.pipeline.setNumThreads(std::min(res.pipeline.getNumThreads(), settings.max_threads)); res.pipeline.setConcurrencyControl(settings.use_concurrency_control); diff --git a/src/Interpreters/InterpreterInsertQuery.h b/src/Interpreters/InterpreterInsertQuery.h index 845cb6b730b..74baf4bc4f6 100644 --- a/src/Interpreters/InterpreterInsertQuery.h +++ b/src/Interpreters/InterpreterInsertQuery.h @@ -59,6 +59,8 @@ public: void addBuffer(std::unique_ptr buffer) { owned_buffers.push_back(std::move(buffer)); } + bool shouldAddSquashingFroStorage(const StoragePtr & table) const; + private: Block getSampleBlock(const Names & names, const StoragePtr & table, const StorageMetadataPtr & metadata_snapshot) const; @@ -81,8 +83,7 @@ private: const Block & subsequent_header, const StoragePtr & table, const StorageMetadataPtr & metadata_snapshot, - const Block & query_sample_block, - ThreadStatusesHolderPtr thread_status_holder); + const Block & query_sample_block); }; diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 946a62c39c1..d0cf9f1160c 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -864,38 +864,7 @@ bool InterpreterSelectQuery::adjustParallelReplicasAfterAnalysis() ASTSelectQuery & query = getSelectQuery(); /// While only_analyze we don't know anything about parts, so any decision about how many parallel replicas to use would be wrong - if (!storage || !context->canUseParallelReplicasOnInitiator()) - return false; - - /// check if IN operator with subquery is present in the query - /// if so, disable parallel replicas - if (query_analyzer->getPreparedSets()->hasSubqueries()) - { - bool in_subqueries = false; - const auto & sets = query_analyzer->getPreparedSets(); - const auto subqueries = sets->getSubqueries(); - for (const auto & subquery : subqueries) - { - if (subquery->isINSubquery()) - { - in_subqueries = true; - break; - } - } - - if (in_subqueries) - { - if (settings.allow_experimental_parallel_reading_from_replicas == 2) - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "IN with subquery is not supported with parallel replicas"); - - context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); - context->setSetting("max_parallel_replicas", UInt64{0}); - LOG_DEBUG(log, "Disabling parallel replicas to execute a query with IN with subquery"); - return true; - } - } - - if (options.only_analyze) + if (!storage || options.only_analyze || !context->canUseParallelReplicasOnInitiator()) return false; if (getTrivialCount(0).has_value()) diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index 16bc4b1fe2e..cc1d7dd6531 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -56,7 +56,7 @@ InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery( size_t num_children = ast->list_of_selects->children.size(); if (!num_children) - throw 
Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no children in ASTSelectWithUnionQuery"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No children in ASTSelectWithUnionQuery"); /// Note that we pass 'required_result_column_names' to first SELECT. /// And for the rest, we pass names at the corresponding positions of 'required_result_column_names' in the result of first SELECT, diff --git a/src/Interpreters/InterpreterShowFunctionsQuery.cpp b/src/Interpreters/InterpreterShowFunctionsQuery.cpp index e83f61eac53..829670d7929 100644 --- a/src/Interpreters/InterpreterShowFunctionsQuery.cpp +++ b/src/Interpreters/InterpreterShowFunctionsQuery.cpp @@ -25,13 +25,13 @@ String InterpreterShowFunctionsQuery::getRewrittenQuery() const auto & query = query_ptr->as(); - DatabasePtr systemDb = DatabaseCatalog::instance().getSystemDatabase(); + DatabasePtr system_db = DatabaseCatalog::instance().getSystemDatabase(); String rewritten_query = fmt::format( R"( SELECT * FROM {}.{})", - systemDb->getDatabaseName(), + system_db->getDatabaseName(), functions_table); if (!query.like.empty()) diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 9a80553f149..19449cd9e28 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -1083,7 +1083,7 @@ void InterpreterSystemQuery::syncReplica(ASTSystemQuery & query) auto sync_timeout = getContext()->getSettingsRef().receive_timeout.totalMilliseconds(); if (!storage_replicated->waitForProcessingQueue(sync_timeout, query.sync_replica_mode, query.src_replicas)) { - LOG_ERROR(log, "SYNC REPLICA {}: Timed out!", table_id.getNameForLogs()); + LOG_ERROR(log, "SYNC REPLICA {}: Timed out.", table_id.getNameForLogs()); throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "SYNC REPLICA {}: command timed out. " \ "See the 'receive_timeout' setting", table_id.getNameForLogs()); } diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp index bf2d1eb79cd..6251a9604e1 100644 --- a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp +++ b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp @@ -168,7 +168,7 @@ private: has_asterisks = true; if (!qualified_asterisk->qualifier) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: qualified asterisk must have a qualifier"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Qualified asterisk must have a qualifier"); auto & identifier = qualified_asterisk->qualifier->as(); @@ -183,7 +183,7 @@ private: transformer->as()) IASTColumnsTransformer::transform(transformer, columns); else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: qualified asterisk must only have children of IASTColumnsTransformer type"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Qualified asterisk must only have children of IASTColumnsTransformer type"); } } } diff --git a/src/Interpreters/MergeJoin.cpp b/src/Interpreters/MergeJoin.cpp index 901c82029ee..d5fb0208d45 100644 --- a/src/Interpreters/MergeJoin.cpp +++ b/src/Interpreters/MergeJoin.cpp @@ -239,7 +239,7 @@ public: /// SortCursorImpl can work with permutation, but MergeJoinCursor can't. 
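// The SYNC REPLICA path above throws TIMEOUT_EXCEEDED when waitForProcessingQueue()
// does not drain the queue within receive_timeout. A generic sketch of that
// wait-with-deadline shape using only the standard library; the PendingWork type
// and function names are stand-ins, not the real replication queue API.

#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <stdexcept>

// Stand-in for a queue that notifies 'cv' whenever 'pending' decreases.
struct PendingWork
{
    std::mutex mutex;
    std::condition_variable cv;
    size_t pending = 0;
};

// Returns true if the queue drained in time; the caller turns 'false' into an
// error that names the relevant timeout setting, as the interpreter does above.
bool waitForDrain(PendingWork & work, std::chrono::milliseconds timeout)
{
    std::unique_lock<std::mutex> lock(work.mutex);
    return work.cv.wait_for(lock, timeout, [&] { return work.pending == 0; });
}

void syncOrThrow(PendingWork & work, std::chrono::milliseconds receive_timeout)
{
    if (!waitForDrain(work, receive_timeout))
        throw std::runtime_error("SYNC REPLICA: command timed out. See the 'receive_timeout' setting");
}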
if (impl.permutation) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: MergeJoinCursor doesn't support permutation"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeJoinCursor doesn't support permutation"); } size_t position() const { return impl.getRow(); } diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index a3d1b84fdc1..502b961ced8 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -342,6 +342,11 @@ bool MutationsInterpreter::Source::hasProjection(const String & name) const return part && part->hasProjection(name); } +bool MutationsInterpreter::Source::hasBrokenProjection(const String & name) const +{ + return part && part->hasBrokenProjection(name); +} + bool MutationsInterpreter::Source::isCompactPart() const { return part && part->getType() == MergeTreeDataPartType::Compact; @@ -807,7 +812,7 @@ void MutationsInterpreter::prepare(bool dry_run) { mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); const auto & projection = projections_desc.get(command.projection_name); - if (!source.hasProjection(projection.name)) + if (!source.hasProjection(projection.name) || source.hasBrokenProjection(projection.name)) { for (const auto & column : projection.required_columns) dependencies.emplace(column, ColumnDependency::PROJECTION); @@ -994,6 +999,13 @@ void MutationsInterpreter::prepare(bool dry_run) if (!source.hasProjection(projection.name)) continue; + /// Always rebuild broken projections. + if (source.hasBrokenProjection(projection.name)) + { + materialized_projections.insert(projection.name); + continue; + } + if (need_rebuild_projections) { materialized_projections.insert(projection.name); diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index eda94190185..4c35ec34b58 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -126,6 +126,7 @@ public: bool materializeTTLRecalculateOnly() const; bool hasSecondaryIndex(const String & name) const; bool hasProjection(const String & name) const; + bool hasBrokenProjection(const String & name) const; bool isCompactPart() const; void read( diff --git a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp index 0fdc9347ee9..107b435ded4 100644 --- a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp +++ b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp @@ -337,7 +337,7 @@ static ASTPtr getPartitionPolicy(const NamesAndTypesList & primary_keys) WhichDataType which(type); if (which.isNullable()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: MySQL primary key must be not null, it is a bug."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "MySQL's primary key must be not null, it is a bug."); if (which.isDate() || which.isDate32() || which.isDateTime() || which.isDateTime64()) { diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp index cc3db726f01..76f75cde1dc 100644 --- a/src/Interpreters/PreparedSets.cpp +++ b/src/Interpreters/PreparedSets.cpp @@ -98,12 +98,8 @@ FutureSetFromSubquery::FutureSetFromSubquery( std::unique_ptr source_, StoragePtr external_table_, std::shared_ptr external_table_set_, - const Settings & settings, - bool in_subquery_) - : external_table(std::move(external_table_)) - , external_table_set(std::move(external_table_set_)) - , source(std::move(source_)) - , in_subquery(in_subquery_) + const Settings & 
settings) + : external_table(std::move(external_table_)), external_table_set(std::move(external_table_set_)), source(std::move(source_)) { set_and_key = std::make_shared(); set_and_key->key = std::move(key); @@ -281,16 +277,10 @@ FutureSetFromSubqueryPtr PreparedSets::addFromSubquery( std::unique_ptr source, StoragePtr external_table, FutureSetFromSubqueryPtr external_table_set, - const Settings & settings, - bool in_subquery) + const Settings & settings) { auto from_subquery = std::make_shared( - toString(key, {}), - std::move(source), - std::move(external_table), - std::move(external_table_set), - settings, - in_subquery); + toString(key, {}), std::move(source), std::move(external_table), std::move(external_table_set), settings); auto [it, inserted] = sets_from_subqueries.emplace(key, from_subquery); @@ -340,15 +330,6 @@ std::shared_ptr PreparedSets::findSubquery(const Hash & k return it->second; } -void PreparedSets::markAsINSubquery(const Hash & key) -{ - auto it = sets_from_subqueries.find(key); - if (it == sets_from_subqueries.end()) - return; - - it->second->markAsINSubquery(); -} - std::shared_ptr PreparedSets::findStorage(const Hash & key) const { auto it = sets_from_storage.find(key); diff --git a/src/Interpreters/PreparedSets.h b/src/Interpreters/PreparedSets.h index 7178cff73b9..3419d3b6839 100644 --- a/src/Interpreters/PreparedSets.h +++ b/src/Interpreters/PreparedSets.h @@ -101,8 +101,7 @@ public: std::unique_ptr source_, StoragePtr external_table_, std::shared_ptr external_table_set_, - const Settings & settings, - bool in_subquery_); + const Settings & settings); FutureSetFromSubquery( String key, @@ -118,8 +117,6 @@ public: QueryTreeNodePtr detachQueryTree() { return std::move(query_tree); } void setQueryPlan(std::unique_ptr source_); - void markAsINSubquery() { in_subquery = true; } - bool isINSubquery() const { return in_subquery; } private: SetAndKeyPtr set_and_key; @@ -128,11 +125,6 @@ private: std::unique_ptr source; QueryTreeNodePtr query_tree; - bool in_subquery = false; // subquery used in IN operator - // the flag can be removed after enabling new analyzer and removing interpreter - // or after enabling support IN operator with subqueries in parallel replicas - // Note: it's necessary with interpreter since prepared sets used also for GLOBAL JOINs, - // with new analyzer it's not a case }; using FutureSetFromSubqueryPtr = std::shared_ptr; @@ -160,8 +152,7 @@ public: std::unique_ptr source, StoragePtr external_table, FutureSetFromSubqueryPtr external_table_set, - const Settings & settings, - bool in_subquery = false); + const Settings & settings); FutureSetFromSubqueryPtr addFromSubquery( const Hash & key, @@ -171,7 +162,6 @@ public: FutureSetFromTuplePtr findTuple(const Hash & key, const DataTypes & types) const; FutureSetFromStoragePtr findStorage(const Hash & key) const; FutureSetFromSubqueryPtr findSubquery(const Hash & key) const; - void markAsINSubquery(const Hash & key); using Subqueries = std::vector; Subqueries getSubqueries() const; diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index 5b3b87114ae..3bd7b2d4206 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -295,7 +295,7 @@ ProcessListEntry::~ProcessListEntry() auto user_process_list_it = parent.user_to_queries.find(user); if (user_process_list_it == parent.user_to_queries.end()) { - LOG_ERROR(getLogger("ProcessList"), "Logical error: cannot find user in ProcessList"); + LOG_ERROR(getLogger("ProcessList"), "Cannot find user in 
ProcessList"); std::terminate(); } @@ -323,7 +323,7 @@ ProcessListEntry::~ProcessListEntry() if (!found) { - LOG_ERROR(getLogger("ProcessList"), "Logical error: cannot find query by query_id and pointer to ProcessListElement in ProcessListForUser"); + LOG_ERROR(getLogger("ProcessList"), "Cannot find query by query_id and pointer to ProcessListElement in ProcessListForUser"); std::terminate(); } diff --git a/src/Interpreters/ProfileEventsExt.cpp b/src/Interpreters/ProfileEventsExt.cpp index bd421ae8e33..dd8306066e7 100644 --- a/src/Interpreters/ProfileEventsExt.cpp +++ b/src/Interpreters/ProfileEventsExt.cpp @@ -99,7 +99,7 @@ static void dumpMemoryTracker(ProfileEventsSnapshot const & snapshot, DB::Mutabl } void getProfileEvents( - const String & server_display_name, + const String & host_name, DB::InternalProfileEventsQueuePtr profile_queue, DB::Block & block, ThreadIdToCountersSnapshot & last_sent_snapshots) @@ -139,8 +139,8 @@ void getProfileEvents( } last_sent_snapshots = std::move(new_snapshots); - dumpProfileEvents(group_snapshot, columns, server_display_name); - dumpMemoryTracker(group_snapshot, columns, server_display_name); + dumpProfileEvents(group_snapshot, columns, host_name); + dumpMemoryTracker(group_snapshot, columns, host_name); Block curr_block; diff --git a/src/Interpreters/ProfileEventsExt.h b/src/Interpreters/ProfileEventsExt.h index cc338530510..9099e6902ec 100644 --- a/src/Interpreters/ProfileEventsExt.h +++ b/src/Interpreters/ProfileEventsExt.h @@ -26,7 +26,7 @@ using ThreadIdToCountersSnapshot = std::unordered_mapsetAlias(alias); } + else if (function.name == "variantElement" && column_type_id == TypeIndex::Variant) + { + const auto * literal = arguments[1]->as(); + if (!literal) + return; + + String subcolumn_name; + auto value_type = literal->value.getType(); + if (value_type != Field::Types::String) + return; + + subcolumn_name = literal->value.get(); + ast = transformToSubcolumn(name_in_storage, subcolumn_name); + ast->setAlias(alias); + } else { auto it = binary_function_to_subcolumn.find(function.name); diff --git a/src/Interpreters/RewriteSumFunctionWithSumAndCountVisitor.cpp b/src/Interpreters/RewriteSumFunctionWithSumAndCountVisitor.cpp index b654d28d750..2f5e597bdab 100644 --- a/src/Interpreters/RewriteSumFunctionWithSumAndCountVisitor.cpp +++ b/src/Interpreters/RewriteSumFunctionWithSumAndCountVisitor.cpp @@ -100,7 +100,10 @@ void RewriteSumFunctionWithSumAndCountMatcher::visit(const ASTFunction & functio if (!new_ast) return; else + { + new_ast->setAlias(ast->tryGetAlias()); ast = new_ast; + } } else if (column_id == 1) { @@ -116,7 +119,10 @@ void RewriteSumFunctionWithSumAndCountMatcher::visit(const ASTFunction & functio if (!new_ast) return; else + { + new_ast->setAlias(ast->tryGetAlias()); ast = new_ast; + } } } diff --git a/src/Interpreters/RewriteUniqToCountVisitor.cpp b/src/Interpreters/RewriteUniqToCountVisitor.cpp index ddec6fe063e..a2e3a790c27 100644 --- a/src/Interpreters/RewriteUniqToCountVisitor.cpp +++ b/src/Interpreters/RewriteUniqToCountVisitor.cpp @@ -156,7 +156,11 @@ void RewriteUniqToCountMatcher::visit(ASTPtr & ast, Data & /*data*/) }; if (match_subquery_with_distinct() || match_subquery_with_group_by()) + { + auto main_alias = expr_list->children[0]->tryGetAlias(); expr_list->children[0] = makeASTFunction("count"); + expr_list->children[0]->setAlias(main_alias); + } } } diff --git a/src/Interpreters/RowRefs.cpp b/src/Interpreters/RowRefs.cpp index 4335cde47f9..9785ba46dab 100644 --- a/src/Interpreters/RowRefs.cpp +++ 
b/src/Interpreters/RowRefs.cpp @@ -175,45 +175,42 @@ private: // the array becomes immutable void sort() { - if (!sorted.load(std::memory_order_acquire)) + if (sorted.load(std::memory_order_acquire)) + return; + + std::lock_guard l(lock); + + if (sorted.load(std::memory_order_relaxed)) + return; + + if constexpr (std::is_arithmetic_v && !std::is_floating_point_v) { - std::lock_guard l(lock); - - if (!sorted.load(std::memory_order_relaxed)) + if (likely(entries.size() > 256)) { - if constexpr (std::is_arithmetic_v && !std::is_floating_point_v) + struct RadixSortTraits : RadixSortNumTraits { - if (likely(entries.size() > 256)) - { - struct RadixSortTraits : RadixSortNumTraits - { - using Element = Entry; - using Result = Element; + using Element = Entry; + using Result = Element; - static TKey & extractKey(Element & elem) { return elem.value; } - static Result extractResult(Element & elem) { return elem; } - }; - - if constexpr (is_descending) - RadixSort::executeLSD(entries.data(), entries.size(), true); - else - RadixSort::executeLSD(entries.data(), entries.size(), false); - - sorted.store(true, std::memory_order_release); - return; - } - } - - if constexpr (is_descending) - ::sort(entries.begin(), entries.end(), GreaterEntryOperator()); - else - ::sort(entries.begin(), entries.end(), LessEntryOperator()); + static TKey & extractKey(Element & elem) { return elem.value; } + static Result extractResult(Element & elem) { return elem; } + }; + RadixSort::executeLSDWithTrySort(entries.data(), entries.size(), is_descending /*reverse*/); sorted.store(true, std::memory_order_release); + return; } } + + if constexpr (is_descending) + ::sort(entries.begin(), entries.end(), GreaterEntryOperator()); + else + ::sort(entries.begin(), entries.end(), LessEntryOperator()); + + sorted.store(true, std::memory_order_release); } }; + } AsofRowRefs createAsofRowRef(TypeIndex type, ASOFJoinInequality inequality) diff --git a/src/Interpreters/Session.cpp b/src/Interpreters/Session.cpp index 533f33033e3..b52f8a507e3 100644 --- a/src/Interpreters/Session.cpp +++ b/src/Interpreters/Session.cpp @@ -349,10 +349,9 @@ void Session::authenticate(const Credentials & credentials_, const Poco::Net::So try { - auto auth_result = global_context->getAccessControl().authenticate(credentials_, address.host()); + auto auth_result = global_context->getAccessControl().authenticate(credentials_, address.host(), getClientInfo().getLastForwardedFor()); user_id = auth_result.user_id; settings_from_auth_server = auth_result.settings; - LOG_DEBUG(log, "{} Authenticated with global context as user {}", toString(auth_id), toString(*user_id)); } @@ -430,11 +429,11 @@ void Session::setClientConnectionId(uint32_t connection_id) prepared_client_info->connection_id = connection_id; } -void Session::setHttpClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer) +void Session::setHTTPClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer) { if (session_context) { - session_context->setHttpClientInfo(http_method, http_user_agent, http_referer); + session_context->setHTTPClientInfo(http_method, http_user_agent, http_referer); } else { diff --git a/src/Interpreters/Session.h b/src/Interpreters/Session.h index cde000d89fa..334560a33c8 100644 --- a/src/Interpreters/Session.h +++ b/src/Interpreters/Session.h @@ -65,7 +65,7 @@ public: void setClientInterface(ClientInfo::Interface interface); void setClientVersion(UInt64 client_version_major, UInt64 
client_version_minor, UInt64 client_version_patch, unsigned client_tcp_protocol_version); void setClientConnectionId(uint32_t connection_id); - void setHttpClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer); + void setHTTPClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer); void setForwardedFor(const String & forwarded_for); void setQuotaClientKey(const String & quota_key); void setConnectionClientVersion(UInt64 client_version_major, UInt64 client_version_minor, UInt64 client_version_patch, unsigned client_tcp_protocol_version); diff --git a/src/Interpreters/Set.cpp b/src/Interpreters/Set.cpp index 84260faafd4..8f11754b3be 100644 --- a/src/Interpreters/Set.cpp +++ b/src/Interpreters/Set.cpp @@ -275,7 +275,7 @@ void Set::appendSetElements(SetKeyColumns & holder) void Set::checkIsCreated() const { if (!is_created.load()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: Trying to use set before it has been built."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to use set before it has been built."); } ColumnPtr Set::execute(const ColumnsWithTypeAndName & columns, bool negative) const @@ -283,7 +283,7 @@ ColumnPtr Set::execute(const ColumnsWithTypeAndName & columns, bool negative) co size_t num_key_columns = columns.size(); if (0 == num_key_columns) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no columns passed to Set::execute method."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No columns passed to Set::execute method."); auto res = ColumnUInt8::create(); ColumnUInt8::Container & vec_res = res->getData(); diff --git a/src/Interpreters/Set.h b/src/Interpreters/Set.h index 5a65d40d89f..8b44f36b278 100644 --- a/src/Interpreters/Set.h +++ b/src/Interpreters/Set.h @@ -77,6 +77,7 @@ public: const DataTypes & getElementsTypes() const { return set_elements_types; } bool hasExplicitSetElements() const { return fill_set_elements || (!set_elements.empty() && set_elements.front()->size() == data.getTotalRowCount()); } + bool hasSetElements() const { return !set_elements.empty(); } Columns getSetElements() const { checkIsCreated(); return { set_elements.begin(), set_elements.end() }; } void checkColumnsNumber(size_t num_key_columns) const; diff --git a/src/Interpreters/SetVariants.cpp b/src/Interpreters/SetVariants.cpp index cd9148a01cf..0fb2e5189d4 100644 --- a/src/Interpreters/SetVariants.cpp +++ b/src/Interpreters/SetVariants.cpp @@ -146,7 +146,7 @@ typename SetVariantsTemplate::Type SetVariantsTemplate::choose return Type::keys128; if (size_of_field == 32) return Type::keys256; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Numeric column has sizeOfField not in 1, 2, 4, 8, 16, 32."); } /// If the keys fit in N bits, we will use a hash table for N-bit-packed keys diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index 2fb782befa1..6580dc3e9b7 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -211,16 +211,17 @@ std::shared_ptr createSystemLog( if (!settings.empty()) log_settings.engine += (storage_policy.empty() ? " " : ", ") + settings; } - - /// Add comment to AST. So it will be saved when the table will be renamed. - log_settings.engine += fmt::format(" COMMENT {} ", quoteString(comment)); } /// Validate engine definition syntax to prevent some configuration errors. 
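// The RowRefs.cpp change earlier in this diff flattens a double-checked locking
// pattern: the atomic 'sorted' flag is read with acquire ordering, the mutex is
// taken only on the slow path, the flag is re-checked under the lock, and the
// result is published with release ordering after sorting. A self-contained
// sketch of the same idiom; the class and member names are illustrative only.

#include <algorithm>
#include <atomic>
#include <mutex>
#include <utility>
#include <vector>

// Values are sorted lazily, at most once, even when sort() races from several threads.
class LazilySortedValues
{
public:
    explicit LazilySortedValues(std::vector<int> values_) : values(std::move(values_)) {}

    void sort()
    {
        if (sorted.load(std::memory_order_acquire))   // fast path: already sorted
            return;

        std::lock_guard<std::mutex> guard(lock);

        if (sorted.load(std::memory_order_relaxed))   // re-check under the lock
            return;

        std::sort(values.begin(), values.end());
        sorted.store(true, std::memory_order_release); // publish the sorted state
    }

private:
    std::vector<int> values;
    std::mutex lock;
    std::atomic<bool> sorted{false};
};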
ParserStorageWithComment storage_parser; - - parseQuery(storage_parser, log_settings.engine.data(), log_settings.engine.data() + log_settings.engine.size(), + auto storage_ast = parseQuery(storage_parser, log_settings.engine.data(), log_settings.engine.data() + log_settings.engine.size(), "Storage to create table for " + config_prefix, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH); + auto & storage_with_comment = storage_ast->as(); + + /// Add comment to AST. So it will be saved when the table will be renamed. + if (!storage_with_comment.comment || storage_with_comment.comment->as().value.safeGet().empty()) + log_settings.engine += fmt::format(" COMMENT {} ", quoteString(comment)); log_settings.queue_settings.flush_interval_milliseconds = config.getUInt64(config_prefix + ".flush_interval_milliseconds", TSystemLog::getDefaultFlushIntervalMilliseconds()); diff --git a/src/Interpreters/TablesStatus.cpp b/src/Interpreters/TablesStatus.cpp index 005a4515c3a..911a028f813 100644 --- a/src/Interpreters/TablesStatus.cpp +++ b/src/Interpreters/TablesStatus.cpp @@ -35,7 +35,7 @@ void TableStatus::read(ReadBuffer & in) void TablesStatusRequest::write(WriteBuffer & out, UInt64 server_protocol_revision) const { if (server_protocol_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: method TablesStatusRequest::write is called for unsupported server revision"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Method TablesStatusRequest::write is called for unsupported server revision"); writeVarUInt(tables.size(), out); for (const auto & table_name : tables) diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp index 130ce2194fd..3de7e217e53 100644 --- a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp +++ b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp @@ -158,7 +158,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTFunction & node, const ASTPtr &, D void TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk & node, const ASTPtr &, Data & data) { if (!node.qualifier) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: qualified asterisk must have a qualifier"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Qualified asterisk must have a qualifier"); /// @note it could contain table alias as table name. DatabaseAndTableWithAlias db_and_table(node.qualifier); diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 2aa52c33048..9a311d20c28 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -77,11 +77,10 @@ const std::unordered_set possibly_injective_function_names */ void appendUnusedGroupByColumn(ASTSelectQuery * select_query) { - /// You must insert a constant that is not the name of the column in the table. Such a case is rare, but it happens. - /// Also start unused_column integer must not intersect with ([1, source_columns.size()]) - /// might be in positional GROUP BY. + /// Since ASTLiteral is different from ASTIdentifier, so we can use a special constant String Literal for this, + /// and do not need to worry about it conflict with the name of the column in the table. 
select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, std::make_shared()); - select_query->groupBy()->children.emplace_back(std::make_shared(static_cast(-1))); + select_query->groupBy()->children.emplace_back(std::make_shared("__unused_group_by_column")); } /// Eliminates injective function calls and constant expressions from group by statement. diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index ecd021328e7..bb6df2da8d9 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -262,8 +262,7 @@ struct ExistsExpressionData select_with_union_query->list_of_selects->children.push_back(std::move(select_query)); select_with_union_query->children.push_back(select_with_union_query->list_of_selects); - auto new_subquery = std::make_shared(); - new_subquery->children.push_back(select_with_union_query); + auto new_subquery = std::make_shared(std::move(select_with_union_query)); auto function = makeASTFunction("in", std::make_shared(1u), new_subquery); func = *function; diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index af8bd19370b..b5c3e00e299 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -106,7 +106,7 @@ std::optional evaluateConstantExpressionImpl(c if (result_column->empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, - "Logical error: empty result column after evaluation " + "Empty result column after evaluation " "of constant expression for IN, VALUES, or LIMIT, or aggregate function parameter, or a table function argument"); /// Expressions like rand() or now() are not constant @@ -661,7 +661,7 @@ namespace const ActionsDAG::NodeRawConstPtrs & target_expr, ConjunctionMap && conjunction) { - auto columns = ActionsDAG::evaluatePartialResult(conjunction, target_expr, false); + auto columns = ActionsDAG::evaluatePartialResult(conjunction, target_expr, /* input_rows_count= */ 1, /* throw_on_error= */ false); for (const auto & column : columns) if (!column.column) return {}; diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index 7daf9babf9f..d3365553875 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include namespace zkutil diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index a377d2e0b97..f2aa51bd6de 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -102,8 +103,13 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int QUERY_WAS_CANCELLED; extern const int INCORRECT_DATA; + extern const int SUPPORT_IS_DISABLED; } +namespace FailPoints +{ + extern const char execute_query_calling_empty_set_result_func_on_exception[]; +} static void checkASTSizeLimits(const IAST & ast, const Settings & settings) { @@ -662,15 +668,17 @@ static std::tuple executeQueryImpl( if (query_span && query_span->trace_id != UUID{}) LOG_TRACE(getLogger("executeQuery"), "Query span trace_id for opentelemetry log: {}", query_span->trace_id); + /// Used for logging query start time in system.query_log auto query_start_time = std::chrono::system_clock::now(); - /// Used to set the watch in QueryStatus and the output formats. 
It is not based on query_start_time as that might be based on - /// the value passed by the client + /// Used for: + /// * Setting the watch in QueryStatus (controls timeouts and progress) and the output formats + /// * Logging query duration (system.query_log) Stopwatch start_watch{CLOCK_MONOTONIC}; const auto & client_info = context->getClientInfo(); - if (!internal) + if (!internal && client_info.initial_query_start_time == 0) { // If it's not an internal query and we don't see an initial_query_start_time yet, initialize it // to current time. Internal queries are those executed without an independent client context, @@ -678,15 +686,7 @@ static std::tuple executeQueryImpl( // possible to have unset initial_query_start_time for non-internal and non-initial queries. For // example, the query is from an initiator that is running an old version of clickhouse. // On the other hand, if it's initialized then take it as the start of the query - if (client_info.initial_query_start_time == 0) - { - context->setInitialQueryStartTime(query_start_time); - } - else - { - query_start_time = std::chrono::time_point( - std::chrono::microseconds{client_info.initial_query_start_time_microseconds}); - } + context->setInitialQueryStartTime(query_start_time); } assert(internal || CurrentThread::get().getQueryContext()); @@ -709,10 +709,7 @@ static std::tuple executeQueryImpl( { if (settings.dialect == Dialect::kusto && !internal) { - ParserKQLStatement parser(end, settings.allow_settings_after_format_in_insert); - - /// TODO: parser should fail early when max_query_size limit is reached. - ast = parseKQLQuery(parser, begin, end, "", max_query_size, settings.max_parser_depth); + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Kusto dialect is disabled until these two bugs will be fixed: https://github.com/ClickHouse/ClickHouse/issues/59037 and https://github.com/ClickHouse/ClickHouse/issues/59036"); } else if (settings.dialect == Dialect::prql && !internal) { @@ -724,6 +721,27 @@ static std::tuple executeQueryImpl( ParserQuery parser(end, settings.allow_settings_after_format_in_insert); /// TODO: parser should fail early when max_query_size limit is reached. ast = parseQuery(parser, begin, end, "", max_query_size, settings.max_parser_depth); + +#if 0 + /// Verify that AST formatting is consistent: + /// If you format AST, parse it back, and format it again, you get the same string. 
+ + String formatted1 = ast->formatWithPossiblyHidingSensitiveData(0, true, true); + + ASTPtr ast2 = parseQuery(parser, + formatted1.data(), + formatted1.data() + formatted1.size(), + "", max_query_size, settings.max_parser_depth); + + chassert(ast2); + + String formatted2 = ast2->formatWithPossiblyHidingSensitiveData(0, true, true); + + if (formatted1 != formatted2) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Inconsistent AST formatting: the query:\n{}\nWas parsed and formatted back as:\n{}", + formatted1, formatted2); +#endif } const char * query_end = end; @@ -1367,7 +1385,7 @@ void executeQuery( BlockIO streams; OutputFormatPtr output_format; - auto update_format_for_exception_if_needed = [&]() + auto update_format_on_exception_if_needed = [&]() { if (!output_format) { @@ -1380,8 +1398,20 @@ void executeQuery( /// Force an update of the headers before we start writing result_details.content_type = output_format->getContentType(); result_details.format = format_name; - set_result_details(result_details); - set_result_details = nullptr; + + fiu_do_on(FailPoints::execute_query_calling_empty_set_result_func_on_exception, { + // it will throw std::bad_function_call + set_result_details = nullptr; + set_result_details(result_details); + }); + + if (set_result_details) + { + /// reset set_result_details func to avoid calling in SCOPE_EXIT() + auto set_result_details_copy = set_result_details; + set_result_details = nullptr; + set_result_details_copy(result_details); + } } } catch (const DB::Exception & e) @@ -1400,7 +1430,7 @@ void executeQuery( { if (handle_exception_in_output_format) { - update_format_for_exception_if_needed(); + update_format_on_exception_if_needed(); if (output_format) handle_exception_in_output_format(*output_format); } @@ -1501,13 +1531,17 @@ void executeQuery( } catch (...) { + /// first execute on exception callback, it includes updating query_log + /// otherwise closing record ('ExceptionWhileProcessing') can be not appended in query_log + /// due to possible exceptions in functions called below (passed as parameter here) + streams.onException(); + if (handle_exception_in_output_format) { - update_format_for_exception_if_needed(); + update_format_on_exception_if_needed(); if (output_format) handle_exception_in_output_format(*output_format); } - streams.onException(); throw; } diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp index d16e01ef2d2..67a909ba6b4 100644 --- a/src/Interpreters/getHeaderForProcessingStage.cpp +++ b/src/Interpreters/getHeaderForProcessingStage.cpp @@ -167,8 +167,7 @@ Block getHeaderForProcessingStage( return result; } } - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical Error: unknown processed stage."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown processed stage."); } } - diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index c7a1cab8bac..fd8f5b154c4 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -237,17 +237,36 @@ static std::unordered_map collectOffsetsColumns( { auto & offsets_column = offsets_columns[stream_name]; if (!offsets_column) + { offsets_column = current_offsets_column; + } + else + { + /// If we are inside Variant element, it may happen that + /// offsets are different, because when we read Variant + /// element as a subcolumn, we expand this column according + /// to the discriminators, so, offsets column can be changed. 
+ /// In this case we should select the original offsets column + /// of this stream, which is the smallest one. + bool inside_variant_element = false; + for (const auto & elem : subpath) + inside_variant_element |= elem.type == ISerialization::Substream::VariantElement; - #ifndef NDEBUG - const auto & offsets_data = assert_cast(*offsets_column).getData(); - const auto & current_offsets_data = assert_cast(*current_offsets_column).getData(); + if (offsets_column->size() != current_offsets_column->size() && inside_variant_element) + offsets_column = offsets_column->size() < current_offsets_column->size() ? offsets_column : current_offsets_column; +#ifndef NDEBUG + else + { + const auto & offsets_data = assert_cast(*offsets_column).getData(); + const auto & current_offsets_data = assert_cast(*current_offsets_column).getData(); - if (offsets_data != current_offsets_data) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Found non-equal columns with offsets (sizes: {} and {}) for stream {}", - offsets_data.size(), current_offsets_data.size(), stream_name); - #endif + if (offsets_data != current_offsets_data) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Found non-equal columns with offsets (sizes: {} and {}) for stream {}", + offsets_data.size(), current_offsets_data.size(), stream_name); + } +#endif + } } }, available_column->type, res_columns[i]); } diff --git a/src/Interpreters/parseColumnsListForTableFunction.cpp b/src/Interpreters/parseColumnsListForTableFunction.cpp index 87f76f7f824..551a883d093 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.cpp +++ b/src/Interpreters/parseColumnsListForTableFunction.cpp @@ -60,6 +60,17 @@ void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS); } } + + if (!settings.allow_experimental_variant_type) + { + if (isVariant(type)) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Cannot create column with type '{}' because experimental Variant type is not allowed. 
" + "Set setting allow_experimental_variant_type = 1 in order to allow it", type->getName()); + } + } } ColumnsDescription parseColumnsListFromString(const std::string & structure, const ContextPtr & context) diff --git a/src/Interpreters/parseColumnsListForTableFunction.h b/src/Interpreters/parseColumnsListForTableFunction.h index ef1bbe5498e..1fbbfa4b12f 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.h +++ b/src/Interpreters/parseColumnsListForTableFunction.h @@ -18,12 +18,14 @@ struct DataTypeValidationSettings : allow_suspicious_low_cardinality_types(settings.allow_suspicious_low_cardinality_types) , allow_experimental_object_type(settings.allow_experimental_object_type) , allow_suspicious_fixed_string_types(settings.allow_suspicious_fixed_string_types) + , allow_experimental_variant_type(settings.allow_experimental_variant_type) { } bool allow_suspicious_low_cardinality_types = true; bool allow_experimental_object_type = true; bool allow_suspicious_fixed_string_types = true; + bool allow_experimental_variant_type = true; }; void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings & settings); diff --git a/src/Interpreters/replaceForPositionalArguments.cpp b/src/Interpreters/replaceForPositionalArguments.cpp index 241dd7cf92c..cceb0650fcd 100644 --- a/src/Interpreters/replaceForPositionalArguments.cpp +++ b/src/Interpreters/replaceForPositionalArguments.cpp @@ -10,7 +10,8 @@ namespace DB namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int BAD_ARGUMENTS; +extern const int ILLEGAL_TYPE_OF_ARGUMENT; } bool replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * select_query, ASTSelectQuery::Expression expression) @@ -27,14 +28,38 @@ bool replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * sel return false; auto which = ast_literal->value.getType(); - if (which != Field::Types::UInt64) + if (which != Field::Types::UInt64 && which != Field::Types::Int64) return false; - auto pos = ast_literal->value.get(); + UInt64 pos; + + if (which == Field::Types::UInt64) + { + pos = ast_literal->value.get(); + } + else if (which == Field::Types::Int64) + { + auto value = ast_literal->value.get(); + if (value > 0) + pos = value; + else + { + if (static_cast(std::abs(value)) > columns.size()) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Negative positional argument number {} is out of bounds. 
Expected in range [-{}, -1]", + value, + columns.size()); + pos = columns.size() + value + 1; + } + } + else + { + return false; + } + if (!pos || pos > columns.size()) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Positional argument out of bounds: {} (expected in range [1, {}]", - pos, columns.size()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Positional argument out of bounds: {} (expected in range [1, {}]", pos, columns.size()); const auto & column = columns[--pos]; if (typeid_cast(column.get()) || typeid_cast(column.get())) diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index 258853e8162..1d17585cc96 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -159,7 +159,6 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log if (config.getBool("logger.use_syslog", false)) { - //const std::string & cmd_name = commandName(); auto syslog_level = Poco::Logger::parseLevel(config.getString("logger.syslog_level", log_level_string)); if (syslog_level > max_log_level) { @@ -228,22 +227,24 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log split->open(); logger.close(); - logger.setChannel(split); - // Global logging level (it can be overridden for specific loggers). + logger.setChannel(split); logger.setLevel(max_log_level); - // Set level to all already created loggers - std::vector names; - //logger_root = Logger::root(); - logger.root().names(names); - for (const auto & name : names) - logger.root().get(name).setLevel(max_log_level); - - // Attach to the root logger. + // Global logging level and channel (it can be overridden for specific loggers). logger.root().setLevel(max_log_level); logger.root().setChannel(logger.getChannel()); + // Set level and channel to all already created loggers + std::vector names; + logger.names(names); + + for (const auto & name : names) + { + logger.get(name).setLevel(max_log_level); + logger.get(name).setChannel(split); + } + // Explicitly specified log levels for specific loggers. { Poco::Util::AbstractConfiguration::Keys loggers_level; diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index e229095df1b..a6543190904 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -104,6 +104,16 @@ void ASTAlterCommand::formatImpl(const FormatSettings & settings, FormatState & { settings.ostr << (settings.hilite ? hilite_keyword : "") << " REMOVE " << remove_property; } + else if (settings_changes) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << " MODIFY SETTING " << (settings.hilite ? hilite_none : ""); + settings_changes->formatImpl(settings, state, frame); + } + else if (settings_resets) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << " RESET SETTING " << (settings.hilite ? hilite_none : ""); + settings_resets->formatImpl(settings, state, frame); + } else { if (first) diff --git a/src/Parsers/ASTBackupQuery.cpp b/src/Parsers/ASTBackupQuery.cpp index 2c26e723687..bdb78eaf971 100644 --- a/src/Parsers/ASTBackupQuery.cpp +++ b/src/Parsers/ASTBackupQuery.cpp @@ -261,23 +261,24 @@ ASTPtr ASTBackupQuery::clone() const if (settings) res->settings = settings->clone(); + cloneOutputOptions(*res); + return res; } -void ASTBackupQuery::formatImpl(const FormatSettings & format, FormatState &, FormatStateStacked) const +void ASTBackupQuery::formatQueryImpl(const FormatSettings & fs, FormatState &, FormatStateStacked) const { - format.ostr << (format.hilite ? 
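// replaceForPositionalArguments above now accepts negative positions: -1 refers to
// the last expression of the select list, -N to the N-th from the end, and values
// outside the valid range raise BAD_ARGUMENTS. A small sketch of that index
// mapping as a free function; the helper name is hypothetical.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <stdexcept>

// Maps a 1-based positional argument (possibly negative) to a 0-based index,
// mirroring the arithmetic above: for value < 0, pos = size + value + 1.
size_t resolvePositionalArgument(int64_t value, size_t num_columns)
{
    if (value < 0 && static_cast<uint64_t>(-value) > num_columns)
        throw std::invalid_argument("Negative positional argument number is out of bounds");

    uint64_t pos = value > 0 ? static_cast<uint64_t>(value)
                             : num_columns - static_cast<uint64_t>(-value) + 1;

    if (pos == 0 || pos > num_columns)
        throw std::invalid_argument("Positional argument out of bounds");

    return pos - 1;  // positions are 1-based in the query text
}

// With 3 columns {a, b, c}: 1 -> a, 3 -> c, -1 -> c, -3 -> a.
void resolvePositionalArgumentExample()
{
    assert(resolvePositionalArgument(1, 3) == 0);
    assert(resolvePositionalArgument(-1, 3) == 2);
    assert(resolvePositionalArgument(-3, 3) == 0);
}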
hilite_keyword : "") << ((kind == Kind::BACKUP) ? "BACKUP " : "RESTORE ") - << (format.hilite ? hilite_none : ""); + fs.ostr << (fs.hilite ? hilite_keyword : "") << ((kind == Kind::BACKUP) ? "BACKUP " : "RESTORE ") << (fs.hilite ? hilite_none : ""); - formatElements(elements, format); - formatOnCluster(format); + formatElements(elements, fs); + formatOnCluster(fs); - format.ostr << (format.hilite ? hilite_keyword : "") << ((kind == Kind::BACKUP) ? " TO " : " FROM ") << (format.hilite ? hilite_none : ""); - backup_name->format(format); + fs.ostr << (fs.hilite ? hilite_keyword : "") << ((kind == Kind::BACKUP) ? " TO " : " FROM ") << (fs.hilite ? hilite_none : ""); + backup_name->format(fs); if (settings || base_backup_name) - formatSettings(settings, base_backup_name, cluster_host_ids, format); + formatSettings(settings, base_backup_name, cluster_host_ids, fs); } ASTPtr ASTBackupQuery::getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const diff --git a/src/Parsers/ASTBackupQuery.h b/src/Parsers/ASTBackupQuery.h index 0201c2b14f9..a56cdebc7b3 100644 --- a/src/Parsers/ASTBackupQuery.h +++ b/src/Parsers/ASTBackupQuery.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -40,7 +40,7 @@ class ASTFunction; * For the BACKUP command this clause allows to set the name which an object will have inside the backup. * And for the RESTORE command this clause allows to set the name which an object will have after RESTORE has finished. */ -class ASTBackupQuery : public IAST, public ASTQueryWithOnCluster +class ASTBackupQuery : public ASTQueryWithOutput, public ASTQueryWithOnCluster { public: enum Kind @@ -91,7 +91,7 @@ public: String getID(char) const override; ASTPtr clone() const override; - void formatImpl(const FormatSettings & format, FormatState &, FormatStateStacked) const override; + void formatQueryImpl(const FormatSettings & fs, FormatState &, FormatStateStacked) const override; ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override; QueryKind getQueryKind() const override; diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index 49a0140625c..b1209e72b61 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -96,6 +96,7 @@ public: bool is_populate{false}; bool is_create_empty{false}; /// CREATE TABLE ... EMPTY AS SELECT ... bool replace_view{false}; /// CREATE OR REPLACE VIEW + bool has_uuid{false}; // CREATE TABLE x UUID '...' ASTColumns * columns_list = nullptr; diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index e309dec2131..ae9b8ddbe85 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -71,6 +71,13 @@ namespace size_t count = 0; /// Mostly it's either 0 or 1. There are only a few cases where `count` can be greater than 1 (e.g. see `encrypt`). /// In all known cases secret arguments are consecutive bool are_named = false; /// Arguments like `password = 'password'` are considered as named arguments. + /// E.g. 
"headers" in `url('..', headers('foo' = '[HIDDEN]'))` + std::vector nested_maps; + + bool hasSecrets() const + { + return count != 0 || !nested_maps.empty(); + } }; Result getResult() const { return result; } @@ -127,6 +134,10 @@ namespace /// encrypt('mode', 'plaintext', 'key' [, iv, aad]) findEncryptionFunctionSecretArguments(); } + else if (function.name == "url") + { + findURLSecretArguments(); + } } void findMySQLFunctionSecretArguments() @@ -143,6 +154,25 @@ namespace } } + /// Returns the number of arguments excluding "headers" and "extra_credentials" (which should + /// always be at the end). Marks "headers" as secret, if found. + size_t excludeS3OrURLNestedMaps() + { + size_t count = arguments->size(); + while (count > 0) + { + const ASTFunction * f = arguments->at(count - 1)->as(); + if (!f) + break; + if (f->name == "headers") + result.nested_maps.push_back(f->name); + else if (f->name != "extra_credentials") + break; + count -= 1; + } + return count; + } + void findS3FunctionSecretArguments(bool is_cluster_function) { /// s3Cluster('cluster_name', 'url', ...) has 'url' as its second argument. @@ -156,9 +186,10 @@ namespace } /// We should check other arguments first because we don't need to do any replacement in case of - /// s3('url', NOSIGN, 'format' [, 'compression']) - /// s3('url', 'format', 'structure' [, 'compression']) - if ((url_arg_idx + 3 <= arguments->size()) && (arguments->size() <= url_arg_idx + 4)) + /// s3('url', NOSIGN, 'format' [, 'compression'] [, extra_credentials(..)] [, headers(..)]) + /// s3('url', 'format', 'structure' [, 'compression'] [, extra_credentials(..)] [, headers(..)]) + size_t count = excludeS3OrURLNestedMaps(); + if ((url_arg_idx + 3 <= count) && (count <= url_arg_idx + 4)) { String second_arg; if (tryGetStringFromArgument(url_arg_idx + 1, &second_arg)) @@ -174,7 +205,14 @@ namespace /// We're going to replace 'aws_secret_access_key' with '[HIDDEN]' for the following signatures: /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) /// s3Cluster('cluster_name', 'url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'compression') - markSecretArgument(url_arg_idx + 2); + if (url_arg_idx + 2 < count) + markSecretArgument(url_arg_idx + 2); + } + + void findURLSecretArguments() + { + if (!isNamedCollectionName(0)) + excludeS3OrURLNestedMaps(); } bool tryGetStringFromArgument(size_t arg_idx, String * res, bool allow_identifier = true) const @@ -347,6 +385,10 @@ namespace /// S3('url', ['aws_access_key_id', 'aws_secret_access_key',] ...) findS3TableEngineSecretArguments(); } + else if (engine_name == "URL") + { + findURLSecretArguments(); + } } void findExternalDistributedTableEngineSecretArguments() @@ -373,9 +415,10 @@ namespace } /// We should check other arguments first because we don't need to do any replacement in case of - /// S3('url', NOSIGN, 'format' [, 'compression']) - /// S3('url', 'format', 'compression') - if ((3 <= arguments->size()) && (arguments->size() <= 4)) + /// S3('url', NOSIGN, 'format' [, 'compression'] [, extra_credentials(..)] [, headers(..)]) + /// S3('url', 'format', 'compression' [, extra_credentials(..)] [, headers(..)]) + size_t count = excludeS3OrURLNestedMaps(); + if ((3 <= count) && (count <= 4)) { String second_arg; if (tryGetStringFromArgument(1, &second_arg)) @@ -383,7 +426,7 @@ namespace if (boost::iequals(second_arg, "NOSIGN")) return; /// The argument after 'url' is "NOSIGN". 
- if (arguments->size() == 3) + if (count == 3) { if (second_arg == "auto" || KnownFormatNames::instance().exists(second_arg)) return; /// The argument after 'url' is a format: S3('url', 'format', ...) @@ -391,11 +434,12 @@ namespace } } - /// We replace 'aws_secret_access_key' with '[HIDDEN'] for the following signatures: + /// We replace 'aws_secret_access_key' with '[HIDDEN]' for the following signatures: /// S3('url', 'aws_access_key_id', 'aws_secret_access_key') /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format') /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'compression') - markSecretArgument(2); + if (2 < count) + markSecretArgument(2); } void findDatabaseEngineSecretArguments() @@ -724,6 +768,25 @@ ASTSelectWithUnionQuery * ASTFunction::tryGetQueryArgument() const } +static bool formatNamedArgWithHiddenValue(IAST * arg, const IAST::FormatSettings & settings, IAST::FormatState & state, IAST::FormatStateStacked frame) +{ + const auto * equals_func = arg->as(); + if (!equals_func || (equals_func->name != "equals")) + return false; + const auto * expr_list = equals_func->arguments->as(); + if (!expr_list) + return false; + const auto & equal_args = expr_list->children; + if (equal_args.size() != 2) + return false; + + equal_args[0]->formatImpl(settings, state, frame); + settings.ostr << (settings.hilite ? IAST::hilite_operator : "") << " = " << (settings.hilite ? IAST::hilite_none : ""); + settings.ostr << "'[HIDDEN]'"; + + return true; +} + void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { frame.expression_list_prepend_whitespace = false; @@ -772,34 +835,37 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format const auto * literal = arguments->children[0]->as(); const auto * function = arguments->children[0]->as(); - bool negate = name == "negate"; bool is_tuple = literal && literal->value.getType() == Field::Types::Tuple; // do not add parentheses for tuple literal, otherwise extra parens will be added `-((3, 7, 3), 1)` -> `-(((3, 7, 3), 1))` bool literal_need_parens = literal && !is_tuple; + // negate always requires parentheses, otherwise -(-1) will be printed as --1 - bool negate_need_parens = negate && (literal_need_parens || (function && function->name == "negate")); - // We don't need parentheses around a single literal. - bool need_parens = !literal && frame.need_parens && !negate_need_parens; + bool inside_parens = name == "negate" && (literal_need_parens || (function && function->name == "negate")); + + /// We DO need parentheses around a single literal + /// For example, SELECT (NOT 0) + (NOT 0) cannot be transformed into SELECT NOT 0 + NOT 0, since + /// this is equal to SELECT NOT (0 + NOT 0) + bool outside_parens = frame.need_parens && !inside_parens; // do not add extra parentheses for functions inside negate, i.e. -(-toUInt64(-(1))) - if (negate_need_parens) + if (inside_parens) nested_need_parens.need_parens = false; - if (need_parens) + if (outside_parens) settings.ostr << '('; settings.ostr << (settings.hilite ? hilite_operator : "") << func[1] << (settings.hilite ? 
hilite_none : ""); - if (negate_need_parens) + if (inside_parens) settings.ostr << '('; arguments->formatImpl(settings, state, nested_need_parens); written = true; - if (negate_need_parens) + if (inside_parens) settings.ostr << ')'; - if (need_parens) + if (outside_parens) settings.ostr << ')'; break; @@ -971,7 +1037,15 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format } } - if (!written && name == "lambda"sv) + const auto & first_argument = arguments->children[0]; + const ASTIdentifier * first_argument_identifier = first_argument->as(); + const ASTFunction * first_argument_function = first_argument->as(); + bool first_argument_is_tuple = first_argument_function && first_argument_function->name == "tuple"; + + /// Only these types of arguments are accepted by the parser of the '->' operator. + bool acceptable_first_argument_for_lambda_expression = first_argument_identifier || first_argument_is_tuple; + + if (!written && name == "lambda"sv && acceptable_first_argument_for_lambda_expression) { /// Special case: zero elements tuple in lhs of lambda is printed as (). /// Special case: one-element tuple in lhs of lambda is printed as its element. @@ -979,19 +1053,17 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format if (frame.need_parens) settings.ostr << '('; - const auto * first_arg_func = arguments->children[0]->as(); - if (first_arg_func - && first_arg_func->name == "tuple" - && first_arg_func->arguments - && (first_arg_func->arguments->children.size() == 1 || first_arg_func->arguments->children.empty())) + if (first_argument_is_tuple + && first_argument_function->arguments + && (first_argument_function->arguments->children.size() == 1 || first_argument_function->arguments->children.empty())) { - if (first_arg_func->arguments->children.size() == 1) - first_arg_func->arguments->children[0]->formatImpl(settings, state, nested_need_parens); + if (first_argument_function->arguments->children.size() == 1) + first_argument_function->arguments->children[0]->formatImpl(settings, state, nested_need_parens); else settings.ostr << "()"; } else - arguments->children[0]->formatImpl(settings, state, nested_need_parens); + first_argument->formatImpl(settings, state, nested_need_parens); settings.ostr << (settings.hilite ? hilite_operator : "") << " -> " << (settings.hilite ? hilite_none : ""); arguments->children[1]->formatImpl(settings, state, nested_need_parens); @@ -1133,17 +1205,37 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format if (argument->as()) settings.ostr << "SETTINGS "; - if (!settings.show_secrets && (secret_arguments.start <= i) && (i < secret_arguments.start + secret_arguments.count)) + if (!settings.show_secrets) { - if (secret_arguments.are_named) + if (secret_arguments.start <= i && i < secret_arguments.start + secret_arguments.count) { - assert_cast(argument.get())->arguments->children[0]->formatImpl(settings, state, nested_dont_need_parens); - settings.ostr << (settings.hilite ? hilite_operator : "") << " = " << (settings.hilite ? hilite_none : ""); + if (secret_arguments.are_named) + { + assert_cast(argument.get())->arguments->children[0]->formatImpl(settings, state, nested_dont_need_parens); + settings.ostr << (settings.hilite ? hilite_operator : "") << " = " << (settings.hilite ? hilite_none : ""); + } + settings.ostr << "'[HIDDEN]'"; + if (size <= secret_arguments.start + secret_arguments.count && !secret_arguments.are_named) + break; /// All other arguments should also be hidden. 
+ continue; + } + + const ASTFunction * function = argument->as(); + if (function && function->arguments && std::count(secret_arguments.nested_maps.begin(), secret_arguments.nested_maps.end(), function->name) != 0) + { + /// headers('foo' = '[HIDDEN]', 'bar' = '[HIDDEN]') + settings.ostr << (settings.hilite ? hilite_function : "") << function->name << (settings.hilite ? hilite_none : "") << "("; + for (size_t j = 0; j < function->arguments->children.size(); ++j) + { + if (j != 0) + settings.ostr << ", "; + auto inner_arg = function->arguments->children[j]; + if (!formatNamedArgWithHiddenValue(inner_arg.get(), settings, state, nested_dont_need_parens)) + inner_arg->formatImpl(settings, state, nested_dont_need_parens); + } + settings.ostr << ")"; + continue; } - settings.ostr << "'[HIDDEN]'"; - if (size <= secret_arguments.start + secret_arguments.count && !secret_arguments.are_named) - break; /// All other arguments should also be hidden. - continue; } if ((i == 1) && special_hilite_regexp @@ -1166,7 +1258,7 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format bool ASTFunction::hasSecretParts() const { - return (FunctionSecretArgumentsFinder{*this}.getResult().count > 0) || childrenHaveSecretParts(); + return (FunctionSecretArgumentsFinder{*this}.getResult().hasSecrets()) || childrenHaveSecretParts(); } String getFunctionName(const IAST * ast) diff --git a/src/Parsers/ASTSelectWithUnionQuery.cpp b/src/Parsers/ASTSelectWithUnionQuery.cpp index 48b4ae3c38d..c377e4bd66b 100644 --- a/src/Parsers/ASTSelectWithUnionQuery.cpp +++ b/src/Parsers/ASTSelectWithUnionQuery.cpp @@ -71,8 +71,7 @@ void ASTSelectWithUnionQuery::formatQueryImpl(const FormatSettings & settings, F } else { - auto sub_query = std::make_shared(); - sub_query->children.push_back(*it); + auto sub_query = std::make_shared(*it); sub_query->formatImpl(settings, state, frame); } } diff --git a/src/Parsers/ASTSubquery.h b/src/Parsers/ASTSubquery.h index ef277a63126..e92a88b04dd 100644 --- a/src/Parsers/ASTSubquery.h +++ b/src/Parsers/ASTSubquery.h @@ -26,6 +26,13 @@ public: return clone; } + ASTSubquery() = default; + + ASTSubquery(ASTPtr child) + { + children.emplace_back(std::move(child)); + } + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; String getAliasOrColumnName() const override; String tryGetAlias() const override; diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index eeb76e3bb9e..486555ae86d 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -123,7 +123,7 @@ bool ParserSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) throw Exception(ErrorCodes::BAD_ARGUMENTS, "EXPLAIN in a subquery cannot have a table function or table override"); /// Replace subquery `(EXPLAIN SELECT ...)` - /// with `(SELECT * FROM viewExplain("", "", SELECT ...))` + /// with `(SELECT * FROM viewExplain('', '', (SELECT ...)))` String kind_str = ASTExplainQuery::toString(explain_query.getKind()); @@ -141,7 +141,7 @@ bool ParserSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) auto view_explain = makeASTFunction("viewExplain", std::make_shared(kind_str), std::make_shared(settings_str), - explained_ast); + std::make_shared(explained_ast)); result_node = buildSelectFromTableFunction(view_explain); } else @@ -161,8 +161,7 @@ bool ParserSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return false; ++pos; - node = std::make_shared(); - 
node->children.push_back(result_node); + node = std::make_shared(std::move(result_node)); return true; } @@ -250,7 +249,7 @@ bool ParserTableAsStringLiteralIdentifier::parseImpl(Pos & pos, ASTPtr & node, E ReadBufferFromMemory in(pos->begin, pos->size()); String s; - if (!tryReadQuotedStringInto(s, in)) + if (!tryReadQuotedString(s, in)) { expected.add(pos, "string literal"); return false; @@ -935,7 +934,7 @@ bool ParserNumber::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { if (float_value < 0) throw Exception(ErrorCodes::LOGICAL_ERROR, - "Logical error: token number cannot begin with minus, " + "Token number cannot begin with minus, " "but parsed float number is less than zero."); if (negative) diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index 1e9383f96ae..6d267a7d215 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -225,8 +225,7 @@ static bool modifyAST(ASTPtr ast, SubqueryFunctionType type) select_with_union_query->list_of_selects->children.push_back(std::move(select_query)); select_with_union_query->children.push_back(select_with_union_query->list_of_selects); - auto new_subquery = std::make_shared(); - new_subquery->children.push_back(select_with_union_query); + auto new_subquery = std::make_shared(std::move(select_with_union_query)); ast->children[0]->children.back() = std::move(new_subquery); return true; @@ -1582,8 +1581,7 @@ public: if (!ParserToken(TokenType::ClosingRoundBracket).ignore(pos, expected)) return false; - auto subquery = std::make_shared(); - subquery->children.push_back(std::move(node)); + auto subquery = std::make_shared(std::move(node)); elements = {makeASTFunction("exists", subquery)}; finished = true; diff --git a/src/Parsers/IParser.h b/src/Parsers/IParser.h index d53b58baa7c..198ec0346ff 100644 --- a/src/Parsers/IParser.h +++ b/src/Parsers/IParser.h @@ -9,6 +9,7 @@ #include #include #include +#include namespace DB @@ -73,6 +74,21 @@ public: if (unlikely(max_depth > 0 && depth > max_depth)) throw Exception(ErrorCodes::TOO_DEEP_RECURSION, "Maximum parse depth ({}) exceeded. " "Consider rising max_parser_depth parameter.", max_depth); + + /** Sometimes the maximum parser depth can be set to a high value by the user, + * but we still want to avoid stack overflow. + * For this purpose, we can use the checkStackSize function, but it is too heavy. + * The solution is to check not too frequently. + * The frequency is arbitrary, but not too large, not too small, + * and a power of two to simplify the division. 
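  * For example, since 8192 is a power of two, the check `depth % 8192 == 0` compiles down to
  * the cheap bit test `(depth & 8191) == 0`, so the fast path pays almost nothing for the
  * periodic checkStackSize() call.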
+ */ +#if defined(USE_MUSL) || defined(SANITIZER) || !defined(NDEBUG) + static constexpr uint32_t check_frequency = 128; +#else + static constexpr uint32_t check_frequency = 8192; +#endif + if (depth % check_frequency == 0) + checkStackSize(); } ALWAYS_INLINE void decreaseDepth() diff --git a/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp b/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp index adac892b49d..044cc2e0622 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp +++ b/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp @@ -359,7 +359,7 @@ std::unique_ptr KQLFunctionFactory::get(String & kql_functio return std::make_unique(); case KQLFunctionValue::extract_json: - return std::make_unique(); + return std::make_unique(); case KQLFunctionValue::has_any_index: return std::make_unique(); @@ -389,7 +389,7 @@ std::unique_ptr KQLFunctionFactory::get(String & kql_functio return std::make_unique(); case KQLFunctionValue::parse_json: - return std::make_unique(); + return std::make_unique(); case KQLFunctionValue::parse_url: return std::make_unique(); diff --git a/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp b/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp index 0f9ca67d6dc..afb8809c69e 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp +++ b/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp @@ -240,7 +240,7 @@ bool ExtractAll::convertImpl(String & out, IParser::Pos & pos) return true; } -bool ExtractJson::convertImpl(String & out, IParser::Pos & pos) +bool ExtractJSON::convertImpl(String & out, IParser::Pos & pos) { String datatype = "String"; ParserKeyword s_kql("typeof"); @@ -431,7 +431,7 @@ bool ParseCSV::convertImpl(String & out, IParser::Pos & pos) return true; } -bool ParseJson::convertImpl(String & out, IParser::Pos & pos) +bool ParseJSON::convertImpl(String & out, IParser::Pos & pos) { const String fn_name = getKQLFunctionName(pos); if (fn_name.empty()) diff --git a/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.h b/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.h index 492a59263ec..9b0c6327e01 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.h +++ b/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.h @@ -62,7 +62,7 @@ protected: bool convertImpl(String & out, IParser::Pos & pos) override; }; -class ExtractJson : public IParserKQLFunction +class ExtractJSON : public IParserKQLFunction { protected: const char * getName() const override { return "extract_json(), extractjson()"; } @@ -125,7 +125,7 @@ protected: bool convertImpl(String & out, IParser::Pos & pos) override; }; -class ParseJson : public IParserKQLFunction +class ParseJSON : public IParserKQLFunction { protected: const char * getName() const override { return "parse_json()"; } diff --git a/src/Parsers/Kusto/ParserKQLQuery.cpp b/src/Parsers/Kusto/ParserKQLQuery.cpp index 47986943662..30e9921e744 100644 --- a/src/Parsers/Kusto/ParserKQLQuery.cpp +++ b/src/Parsers/Kusto/ParserKQLQuery.cpp @@ -576,20 +576,19 @@ bool ParserKQLSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!ParserKQLTableFunction().parse(pos, select_node, expected)) return false; - ASTPtr node_subquery = std::make_shared(); - node_subquery->children.push_back(select_node); + ASTPtr node_subquery = std::make_shared(std::move(select_node)); ASTPtr node_table_expr = std::make_shared(); node_table_expr->as()->subquery = node_subquery; node_table_expr->children.emplace_back(node_subquery); - ASTPtr node_table_in_select_query_emlement = 
std::make_shared(); - node_table_in_select_query_emlement->as()->table_expression = node_table_expr; + ASTPtr node_table_in_select_query_element = std::make_shared(); + node_table_in_select_query_element->as()->table_expression = node_table_expr; ASTPtr res = std::make_shared(); - res->children.emplace_back(node_table_in_select_query_emlement); + res->children.emplace_back(node_table_in_select_query_element); node = res; return true; @@ -618,20 +617,19 @@ bool ParserSimpleCHSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe ASTSelectQuery::Expression::TABLES, parent_select_node->as()->tables()); } - ASTPtr node_subquery = std::make_shared(); - node_subquery->children.push_back(sub_select_node); + ASTPtr node_subquery = std::make_shared(std::move(sub_select_node)); ASTPtr node_table_expr = std::make_shared(); node_table_expr->as()->subquery = node_subquery; node_table_expr->children.emplace_back(node_subquery); - ASTPtr node_table_in_select_query_emlement = std::make_shared(); - node_table_in_select_query_emlement->as()->table_expression = node_table_expr; + ASTPtr node_table_in_select_query_element = std::make_shared(); + node_table_in_select_query_element->as()->table_expression = node_table_expr; ASTPtr res = std::make_shared(); - res->children.emplace_back(node_table_in_select_query_emlement); + res->children.emplace_back(node_table_in_select_query_element); node = res; return true; diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index 1f6f68c9d8e..27c6e6258e3 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -684,6 +684,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe query->database = table_id->getDatabase(); query->table = table_id->getTable(); query->uuid = table_id->uuid; + query->has_uuid = table_id->uuid != UUIDHelpers::Nil; if (query->database) query->children.push_back(query->database); @@ -783,6 +784,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe query->database = table_id->getDatabase(); query->table = table_id->getTable(); query->uuid = table_id->uuid; + query->has_uuid = table_id->uuid != UUIDHelpers::Nil; query->cluster = cluster_str; if (query->database) diff --git a/src/Parsers/ParserDataType.cpp b/src/Parsers/ParserDataType.cpp index 3e2a6facac6..b75f17dca72 100644 --- a/src/Parsers/ParserDataType.cpp +++ b/src/Parsers/ParserDataType.cpp @@ -116,6 +116,18 @@ bool ParserDataType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!type_name_suffix.empty()) type_name = type_name_upper + " " + type_name_suffix; + /// skip trailing comma in types, e.g. 
Tuple(Int, String,) + if (pos->type == TokenType::Comma) + { + Expected test_expected; + auto test_pos = pos; + ++test_pos; + if (ParserToken(TokenType::ClosingRoundBracket).ignore(test_pos, test_expected)) + { // the end of the type definition was reached and there was a trailing comma + ++pos; + } + } + auto function_node = std::make_shared(); function_node->name = type_name; function_node->no_empty_args = true; @@ -133,6 +145,9 @@ bool ParserDataType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!args_parser.parse(pos, expr_list_args, expected)) return false; + if (pos->type == TokenType::Comma) + // ignore trailing comma inside Nested structures like Tuple(Int, Tuple(Int, String),) + ++pos; if (pos->type != TokenType::ClosingRoundBracket) return false; ++pos; diff --git a/src/Parsers/ParserPartition.cpp b/src/Parsers/ParserPartition.cpp index 80debc13c67..f7d972dd4af 100644 --- a/src/Parsers/ParserPartition.cpp +++ b/src/Parsers/ParserPartition.cpp @@ -18,8 +18,6 @@ bool ParserPartition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserKeyword s_all("ALL"); ParserStringLiteral parser_string_literal; ParserSubstitution parser_substitution; - ParserLiteral literal_parser; - ParserTupleOfLiterals tuple_of_literals; ParserExpression parser_expr; auto partition = std::make_shared(); @@ -45,34 +43,35 @@ bool ParserPartition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ASTPtr value; std::optional fields_count; - if (literal_parser.parse(pos, value, expected) || tuple_of_literals.parse(pos, value, expected)) - { - auto * literal = value->as(); - if (literal->value.getType() == Field::Types::Tuple) - { - fields_count = literal->value.get().size(); - } - else - { - fields_count = 1; - } - } - else if (parser_substitution.parse(pos, value, expected)) + if (parser_substitution.parse(pos, value, expected)) { /// It can be tuple substitution fields_count = std::nullopt; } else if (parser_expr.parse(pos, value, expected)) { - const auto * tuple_ast = value->as(); - if (tuple_ast && tuple_ast->name == "tuple") + if (const auto * tuple_ast = value->as(); tuple_ast) { + if (tuple_ast->name != "tuple") + return false; + const auto * arguments_ast = tuple_ast->arguments->as(); if (arguments_ast) fields_count = arguments_ast->children.size(); else fields_count = 0; } + else if (const auto* literal_ast = value->as(); literal_ast) + { + if (literal_ast->value.getType() == Field::Types::Tuple) + { + fields_count = literal_ast->value.get().size(); + } + else + { + fields_count = 1; + } + } else return false; } diff --git a/src/Parsers/ParserQuery.cpp b/src/Parsers/ParserQuery.cpp index 7ed69940bed..22ddc25019f 100644 --- a/src/Parsers/ParserQuery.cpp +++ b/src/Parsers/ParserQuery.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -61,7 +60,6 @@ bool ParserQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserExternalDDLQuery external_ddl_p; ParserTransactionControl transaction_control_p; ParserDeleteQuery delete_p; - ParserBackupQuery backup_p; bool res = query_with_output_p.parse(pos, node, expected) || insert_p.parse(pos, node, expected) @@ -86,8 +84,7 @@ bool ParserQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) || grant_p.parse(pos, node, expected) || external_ddl_p.parse(pos, node, expected) || transaction_control_p.parse(pos, node, expected) - || delete_p.parse(pos, node, expected) - || backup_p.parse(pos, node, expected); + || delete_p.parse(pos, node, expected); return res; } diff --git 
a/src/Parsers/ParserQueryWithOutput.cpp b/src/Parsers/ParserQueryWithOutput.cpp index f03df6cacfe..7a627ae5f6a 100644 --- a/src/Parsers/ParserQueryWithOutput.cpp +++ b/src/Parsers/ParserQueryWithOutput.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,7 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec ParserShowGrantsQuery show_grants_p; ParserShowPrivilegesQuery show_privileges_p; ParserExplainQuery explain_p(end, allow_settings_after_format_in_insert); + ParserBackupQuery backup_p; ASTPtr query; @@ -94,7 +96,8 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec || show_access_p.parse(pos, query, expected) || show_access_entities_p.parse(pos, query, expected) || show_grants_p.parse(pos, query, expected) - || show_privileges_p.parse(pos, query, expected); + || show_privileges_p.parse(pos, query, expected) + || backup_p.parse(pos, query, expected); if (!parsed) return false; diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index efccadcbe1a..ace6d500482 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -64,6 +64,7 @@ #include #include +#include #include #include #include @@ -71,7 +72,6 @@ #include #include #include -#include #include #include #include @@ -98,14 +98,6 @@ namespace ErrorCodes extern const int SUPPORT_IS_DISABLED; } -/** ClickHouse query planner. - * - * TODO: Support projections. - * TODO: Support trivial count using partition predicates. - * TODO: Support trivial count for table functions. - * TODO: Support indexes for IN function. - */ - namespace { @@ -1066,7 +1058,7 @@ void addBuildSubqueriesForSetsStepIfNeeded( Planner subquery_planner( query_tree, subquery_options, - std::make_shared()); //planner_context->getGlobalPlannerContext()); + std::make_shared(nullptr, nullptr)); subquery_planner.buildQueryPlanIfNeeded(); subquery->setQueryPlan(std::make_unique(std::move(subquery_planner).extractQueryPlan())); @@ -1169,7 +1161,10 @@ Planner::Planner(const QueryTreeNodePtr & query_tree_, SelectQueryOptions & select_query_options_) : query_tree(query_tree_) , select_query_options(select_query_options_) - , planner_context(buildPlannerContext(query_tree, select_query_options, std::make_shared())) + , planner_context(buildPlannerContext(query_tree, select_query_options, + std::make_shared( + findQueryForParallelReplicas(query_tree, select_query_options), + findTableForParallelReplicas(query_tree, select_query_options)))) { } @@ -1232,6 +1227,8 @@ void Planner::buildPlanForUnionNode() query_planner.buildQueryPlanIfNeeded(); for (const auto & row_policy : query_planner.getUsedRowPolicies()) used_row_policies.insert(row_policy); + const auto & mapping = query_planner.getQueryNodeToPlanStepMapping(); + query_node_to_plan_step_mapping.insert(mapping.begin(), mapping.end()); auto query_node_plan = std::make_unique(std::move(query_planner).extractQueryPlan()); query_plans_headers.push_back(query_node_plan->getCurrentDataStream().header); query_plans.push_back(std::move(query_node_plan)); @@ -1411,16 +1408,27 @@ void Planner::buildPlanForQueryNode() } } - auto top_level_identifiers = collectTopLevelColumnIdentifiers(query_tree, planner_context); - auto join_tree_query_plan = buildJoinTreeQueryPlan(query_tree, - select_query_info, - select_query_options, - top_level_identifiers, - planner_context); + JoinTreeQueryPlan join_tree_query_plan; + if (planner_context->getMutableQueryContext()->canUseTaskBasedParallelReplicas() + && 
planner_context->getGlobalPlannerContext()->parallel_replicas_node == &query_node) + { + join_tree_query_plan = buildQueryPlanForParallelReplicas(query_node, planner_context, select_query_info.storage_limits); + } + else + { + auto top_level_identifiers = collectTopLevelColumnIdentifiers(query_tree, planner_context); + join_tree_query_plan = buildJoinTreeQueryPlan(query_tree, + select_query_info, + select_query_options, + top_level_identifiers, + planner_context); + } auto from_stage = join_tree_query_plan.from_stage; query_plan = std::move(join_tree_query_plan.query_plan); used_row_policies = std::move(join_tree_query_plan.used_row_policies); + auto & mapping = join_tree_query_plan.query_node_to_plan_step_mapping; + query_node_to_plan_step_mapping.insert(mapping.begin(), mapping.end()); LOG_TRACE(getLogger("Planner"), "Query {} from stage {} to stage {}{}", query_tree->formatConvertedASTForErrorMessage(), @@ -1690,6 +1698,8 @@ void Planner::buildPlanForQueryNode() if (!select_query_options.only_analyze) addBuildSubqueriesForSetsStepIfNeeded(query_plan, select_query_options, planner_context, result_actions_to_execute); + + query_node_to_plan_step_mapping[&query_node] = query_plan.getRootNode(); } SelectQueryInfo Planner::buildSelectQueryInfo() const diff --git a/src/Planner/Planner.h b/src/Planner/Planner.h index 2177ed59fc6..ae78f05cbd4 100644 --- a/src/Planner/Planner.h +++ b/src/Planner/Planner.h @@ -65,6 +65,11 @@ public: return planner_context; } + /// We support mapping QueryNode -> QueryPlanStep (the last step added to plan from this query) + /// It is useful for parallel replicas analysis. + using QueryNodeToPlanStepMapping = std::unordered_map; + const QueryNodeToPlanStepMapping & getQueryNodeToPlanStepMapping() const { return query_node_to_plan_step_mapping; } + private: void buildPlanForUnionNode(); @@ -76,6 +81,7 @@ private: QueryPlan query_plan; StorageLimitsList storage_limits; std::set used_row_policies; + QueryNodeToPlanStepMapping query_node_to_plan_step_mapping; }; } diff --git a/src/Planner/PlannerActionsVisitor.cpp b/src/Planner/PlannerActionsVisitor.cpp index aef6f11aa26..511e9396a35 100644 --- a/src/Planner/PlannerActionsVisitor.cpp +++ b/src/Planner/PlannerActionsVisitor.cpp @@ -495,8 +495,8 @@ PlannerActionsVisitorImpl::NodeNameAndNodeMinLevel PlannerActionsVisitorImpl::vi return visitFunction(node); throw Exception(ErrorCodes::UNSUPPORTED_METHOD, - "Expected column, constant, function. Actual {}", - node->formatASTForErrorMessage()); + "Expected column, constant, function. Actual {} with type: {}", + node->formatASTForErrorMessage(), node_type); } PlannerActionsVisitorImpl::NodeNameAndNodeMinLevel PlannerActionsVisitorImpl::visitColumn(const QueryTreeNodePtr & node) diff --git a/src/Planner/PlannerContext.h b/src/Planner/PlannerContext.h index d7ea4fd95dd..fe9eabc558b 100644 --- a/src/Planner/PlannerContext.h +++ b/src/Planner/PlannerContext.h @@ -18,10 +18,18 @@ namespace DB * * 1. Column identifiers. */ + +class QueryNode; +class TableNode; + class GlobalPlannerContext { public: - GlobalPlannerContext() = default; + explicit GlobalPlannerContext(const QueryNode * parallel_replicas_node_, const TableNode * parallel_replicas_table_) + : parallel_replicas_node(parallel_replicas_node_) + , parallel_replicas_table(parallel_replicas_table_) + { + } /** Create column identifier for column node. 
* @@ -38,6 +46,13 @@ public: /// Check if context has column identifier bool hasColumnIdentifier(const ColumnIdentifier & column_identifier); + /// The query which will be executed with parallel replicas. + /// In case if only the most inner subquery can be executed with parallel replicas, node is nullptr. + const QueryNode * const parallel_replicas_node = nullptr; + /// Table which is used with parallel replicas reading. Now, only one table is supported by the protocol. + /// It is the left-most table of the query (in JOINs, UNIONs and subqueries). + const TableNode * const parallel_replicas_table = nullptr; + private: std::unordered_set column_identifiers; }; diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index ab25f6d2423..227ac86d3a5 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -84,32 +84,38 @@ namespace { /// Check if current user has privileges to SELECT columns from table -void checkAccessRights(const TableNode & table_node, const Names & column_names, const ContextPtr & query_context) +/// Throws an exception if access to any column from `column_names` is not granted +/// If `column_names` is empty, check access to any columns and return names of accessible columns +NameSet checkAccessRights(const TableNode & table_node, Names & column_names, const ContextPtr & query_context) { /// StorageDummy is created on preliminary stage, ignore access check for it. if (typeid_cast(table_node.getStorage().get())) - return; + return {}; const auto & storage_id = table_node.getStorageID(); const auto & storage_snapshot = table_node.getStorageSnapshot(); if (column_names.empty()) { + NameSet accessible_columns; /** For a trivial queries like "SELECT count() FROM table", "SELECT 1 FROM table" access is granted if at least * one table column is accessible. */ auto access = query_context->getAccess(); - for (const auto & column : storage_snapshot->metadata->getColumns()) { if (access->isGranted(AccessType::SELECT, storage_id.database_name, storage_id.table_name, column.name)) - return; + accessible_columns.insert(column.name); } - throw Exception(ErrorCodes::ACCESS_DENIED, - "{}: Not enough privileges. To execute this query, it's necessary to have the grant SELECT for at least one column on {}", - query_context->getUserName(), - storage_id.getFullTableName()); + if (accessible_columns.empty()) + { + throw Exception(ErrorCodes::ACCESS_DENIED, + "{}: Not enough privileges. To execute this query, it's necessary to have the grant SELECT for at least one column on {}", + query_context->getUserName(), + storage_id.getFullTableName()); + } + return accessible_columns; } // In case of cross-replication we don't know what database is used for the table. @@ -117,6 +123,8 @@ void checkAccessRights(const TableNode & table_node, const Names & column_names, // Each shard will use the default database (in the case of cross-replication shards may have different defaults). 
if (storage_id.hasDatabase()) query_context->checkAccess(AccessType::SELECT, storage_id, column_names); + + return {}; } bool shouldIgnoreQuotaAndLimits(const TableNode & table_node) @@ -133,7 +141,7 @@ bool shouldIgnoreQuotaAndLimits(const TableNode & table_node) return false; } -NameAndTypePair chooseSmallestColumnToReadFromStorage(const StoragePtr & storage, const StorageSnapshotPtr & storage_snapshot) +NameAndTypePair chooseSmallestColumnToReadFromStorage(const StoragePtr & storage, const StorageSnapshotPtr & storage_snapshot, const NameSet & column_names_allowed_to_select) { /** We need to read at least one column to find the number of rows. * We will find a column with minimum . @@ -167,6 +175,18 @@ NameAndTypePair chooseSmallestColumnToReadFromStorage(const StoragePtr & storage auto column_sizes = storage->getColumnSizes(); auto column_names_and_types = storage_snapshot->getColumns(GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns()); + if (!column_names_allowed_to_select.empty()) + { + auto it = column_names_and_types.begin(); + while (it != column_names_and_types.end()) + { + if (!column_names_allowed_to_select.contains(it->name)) + it = column_names_and_types.erase(it); + else + ++it; + } + } + if (!column_sizes.empty()) { for (auto & column_name_and_type : column_names_and_types) @@ -330,12 +350,13 @@ void prepareBuildQueryPlanForTableExpression(const QueryTreeNodePtr & table_expr /** The current user must have the SELECT privilege. * We do not check access rights for table functions because they have been already checked in ITableFunction::execute(). */ + NameSet columns_names_allowed_to_select; if (table_node) { auto column_names_with_aliases = columns_names; const auto & alias_columns_names = table_expression_data.getAliasColumnsNames(); column_names_with_aliases.insert(column_names_with_aliases.end(), alias_columns_names.begin(), alias_columns_names.end()); - checkAccessRights(*table_node, column_names_with_aliases, query_context); + columns_names_allowed_to_select = checkAccessRights(*table_node, column_names_with_aliases, query_context); } if (columns_names.empty()) @@ -346,8 +367,7 @@ void prepareBuildQueryPlanForTableExpression(const QueryTreeNodePtr & table_expr { const auto & storage = table_node ? table_node->getStorage() : table_function_node->getStorage(); const auto & storage_snapshot = table_node ? 
table_node->getStorageSnapshot() : table_function_node->getStorageSnapshot(); - additional_column_to_read = chooseSmallestColumnToReadFromStorage(storage, storage_snapshot); - + additional_column_to_read = chooseSmallestColumnToReadFromStorage(storage, storage_snapshot, columns_names_allowed_to_select); } else if (query_node || union_node) { @@ -593,6 +613,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres auto * union_node = table_expression->as(); QueryPlan query_plan; + std::unordered_map query_node_to_plan_step_mapping; std::set used_row_policies; if (table_node || table_function_node) @@ -603,6 +624,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres auto table_expression_query_info = select_query_info; table_expression_query_info.table_expression = table_expression; table_expression_query_info.filter_actions_dag = table_expression_data.getFilterActions(); + table_expression_query_info.analyzer_can_use_parallel_replicas_on_follower = table_node == planner_context->getGlobalPlannerContext()->parallel_replicas_table; size_t max_streams = settings.max_threads; size_t max_threads_execute_query = settings.max_threads; @@ -895,6 +917,8 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres /// Propagate storage limits to subquery subquery_planner.addStorageLimits(*select_query_info.storage_limits); subquery_planner.buildQueryPlanIfNeeded(); + const auto & mapping = subquery_planner.getQueryNodeToPlanStepMapping(); + query_node_to_plan_step_mapping.insert(mapping.begin(), mapping.end()); query_plan = std::move(subquery_planner).extractQueryPlan(); } } @@ -954,6 +978,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres .query_plan = std::move(query_plan), .from_stage = from_stage, .used_row_policies = std::move(used_row_policies), + .query_node_to_plan_step_mapping = std::move(query_node_to_plan_step_mapping), }; } @@ -1500,11 +1525,16 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_ if (join_clauses_and_actions.right_join_expressions_actions) left_join_tree_query_plan.actions_dags.emplace_back(std::move(join_clauses_and_actions.right_join_expressions_actions)); + auto mapping = std::move(left_join_tree_query_plan.query_node_to_plan_step_mapping); + auto & r_mapping = right_join_tree_query_plan.query_node_to_plan_step_mapping; + mapping.insert(r_mapping.begin(), r_mapping.end()); + return JoinTreeQueryPlan{ .query_plan = std::move(result_plan), .from_stage = QueryProcessingStage::FetchColumns, .used_row_policies = std::move(left_join_tree_query_plan.used_row_policies), .actions_dags = std::move(left_join_tree_query_plan.actions_dags), + .query_node_to_plan_step_mapping = std::move(mapping), }; } @@ -1591,6 +1621,7 @@ JoinTreeQueryPlan buildQueryPlanForArrayJoinNode(const QueryTreeNodePtr & array_ .from_stage = QueryProcessingStage::FetchColumns, .used_row_policies = std::move(join_tree_query_plan.used_row_policies), .actions_dags = std::move(join_tree_query_plan.actions_dags), + .query_node_to_plan_step_mapping = std::move(join_tree_query_plan.query_node_to_plan_step_mapping), }; } diff --git a/src/Planner/PlannerJoinTree.h b/src/Planner/PlannerJoinTree.h index c5a7d14fa55..a21438d466f 100644 --- a/src/Planner/PlannerJoinTree.h +++ b/src/Planner/PlannerJoinTree.h @@ -17,6 +17,7 @@ struct JoinTreeQueryPlan QueryProcessingStage::Enum from_stage; std::set used_row_policies; std::vector actions_dags; + std::unordered_map 
query_node_to_plan_step_mapping; }; /// Build JOIN TREE query plan for query node diff --git a/src/Planner/Utils.cpp b/src/Planner/Utils.cpp index 63f68ccf838..50ffa83a272 100644 --- a/src/Planner/Utils.cpp +++ b/src/Planner/Utils.cpp @@ -37,6 +37,8 @@ #include #include +#include + namespace DB { @@ -130,6 +132,34 @@ ASTPtr queryNodeToSelectQuery(const QueryTreeNodePtr & query_node) return result_ast; } +static void removeCTEs(ASTPtr & ast) +{ + std::stack stack; + stack.push(ast.get()); + while (!stack.empty()) + { + auto * node = stack.top(); + stack.pop(); + + if (auto * subquery = typeid_cast(node)) + subquery->cte_name = {}; + + for (const auto & child : node->children) + stack.push(child.get()); + } +} + +ASTPtr queryNodeToDistributedSelectQuery(const QueryTreeNodePtr & query_node) +{ + auto ast = queryNodeToSelectQuery(query_node); + /// Remove CTEs information from distributed queries. + /// Now, if cte_name is set for subquery node, AST -> String serialization will only print cte name. + /// But CTE is defined only for top-level query part, so may not be sent. + /// Removing cte_name forces subquery to be always printed. + removeCTEs(ast); + return ast; +} + /** There are no limits on the maximum size of the result for the subquery. * Since the result of the query is not the result of the entire query. */ diff --git a/src/Planner/Utils.h b/src/Planner/Utils.h index 1b8397f47cc..8df26d598b1 100644 --- a/src/Planner/Utils.h +++ b/src/Planner/Utils.h @@ -34,6 +34,9 @@ Block buildCommonHeaderForUnion(const Blocks & queries_headers, SelectUnionMode /// Convert query node to ASTSelectQuery ASTPtr queryNodeToSelectQuery(const QueryTreeNodePtr & query_node); +/// Convert query node to ASTSelectQuery for distributed processing +ASTPtr queryNodeToDistributedSelectQuery(const QueryTreeNodePtr & query_node); + /// Build context for subquery execution ContextPtr buildSubqueryContext(const ContextPtr & context); diff --git a/src/Planner/findParallelReplicasQuery.cpp b/src/Planner/findParallelReplicasQuery.cpp new file mode 100644 index 00000000000..362f7109f47 --- /dev/null +++ b/src/Planner/findParallelReplicasQuery.cpp @@ -0,0 +1,438 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int UNSUPPORTED_METHOD; +} + +/// Returns a list of (sub)queries (candidates) which may support parallel replicas. +/// The rule is : +/// subquery has only LEFT or ALL INNER JOIN (or none), and left part is MergeTree table or subquery candidate as well. +/// +/// Additional checks are required, so we return many candidates. The innermost subquery is on top. +std::stack getSupportingParallelReplicasQuery(const IQueryTreeNode * query_tree_node) +{ + std::stack res; + + while (query_tree_node) + { + auto join_tree_node_type = query_tree_node->getNodeType(); + + switch (join_tree_node_type) + { + case QueryTreeNodeType::TABLE: + { + const auto & table_node = query_tree_node->as(); + const auto & storage = table_node.getStorage(); + /// Here we check StorageDummy as well, to support a query tree with replaced storages. 
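                /// Hypothetical illustration (the query below is an assumption, not from this patch): for
                ///     SELECT count() FROM t1 LEFT JOIN t2 USING (k)
                /// the walk descends through the LEFT JOIN into the left-most table t1, and the collected
                /// candidates are kept only if t1 (or the StorageDummy that replaced it during the temporary
                /// analysis) is backed by MergeTree; a RIGHT JOIN or a table function on the left side would
                /// make the function return an empty stack instead.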
+ if (std::dynamic_pointer_cast(storage) || typeid_cast(storage.get())) + return res; + + return {}; + } + case QueryTreeNodeType::TABLE_FUNCTION: + { + return {}; + } + case QueryTreeNodeType::QUERY: + { + const auto & query_node_to_process = query_tree_node->as(); + query_tree_node = query_node_to_process.getJoinTree().get(); + res.push(&query_node_to_process); + break; + } + case QueryTreeNodeType::UNION: + { + const auto & union_node = query_tree_node->as(); + const auto & union_queries = union_node.getQueries().getNodes(); + + if (union_queries.empty()) + return {}; + + query_tree_node = union_queries.front().get(); + break; + } + case QueryTreeNodeType::ARRAY_JOIN: + { + const auto & array_join_node = query_tree_node->as(); + query_tree_node = array_join_node.getTableExpression().get(); + break; + } + case QueryTreeNodeType::JOIN: + { + const auto & join_node = query_tree_node->as(); + auto join_kind = join_node.getKind(); + auto join_strictness = join_node.getStrictness(); + + bool can_parallelize_join = + join_kind == JoinKind::Left + || (join_kind == JoinKind::Inner && join_strictness == JoinStrictness::All); + + if (!can_parallelize_join) + return {}; + + query_tree_node = join_node.getLeftTableExpression().get(); + break; + } + default: + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Unexpected node type for table expression. " + "Expected table, table function, query, union, join or array join. Actual {}", + query_tree_node->getNodeTypeName()); + } + } + } + + return res; +} + +class ReplaceTableNodeToDummyVisitor : public InDepthQueryTreeVisitor +{ +public: + using Base = InDepthQueryTreeVisitor; + using Base::Base; + + void visitImpl(const QueryTreeNodePtr & node) + { + auto * table_node = node->as(); + auto * table_function_node = node->as(); + + if (table_node || table_function_node) + { + const auto & storage_snapshot = table_node ? table_node->getStorageSnapshot() : table_function_node->getStorageSnapshot(); + auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); + + auto storage_dummy + = std::make_shared(storage_snapshot->storage.getStorageID(), ColumnsDescription(storage_snapshot->getColumns(get_column_options))); + + auto dummy_table_node = std::make_shared(std::move(storage_dummy), context); + + dummy_table_node->setAlias(node->getAlias()); + replacement_map.emplace(node.get(), std::move(dummy_table_node)); + } + } + + ContextPtr context; + std::unordered_map replacement_map; +}; + +QueryTreeNodePtr replaceTablesWithDummyTables(const QueryTreeNodePtr & query, const ContextPtr & context) +{ + ReplaceTableNodeToDummyVisitor visitor; + visitor.context = context; + visitor.visit(query); + + return query->cloneAndReplace(visitor.replacement_map); +} + +/// Find the best candidate for parallel replicas execution by verifying query plan. +/// If query plan has only Expression, Filter of Join steps, we can execute it fully remotely and check the next query. +/// Otherwise we can execute current query up to WithMergableStage only. +const QueryNode * findQueryForParallelReplicas( + std::stack stack, + const std::unordered_map & mapping) +{ + const QueryPlan::Node * prev_checked_node = nullptr; + const QueryNode * res = nullptr; + + while (!stack.empty()) + { + const QueryNode * subquery_node = stack.top(); + stack.pop(); + + auto it = mapping.find(subquery_node); + /// This should not happen ideally. 
+ if (it == mapping.end()) + break; + + const QueryPlan::Node * curr_node = it->second; + const QueryPlan::Node * next_node_to_check = curr_node; + bool can_distribute_full_node = true; + + while (next_node_to_check && next_node_to_check != prev_checked_node) + { + const auto & children = next_node_to_check->children; + auto * step = next_node_to_check->step.get(); + + if (children.empty()) + { + /// Found a source step. This should be possible only in the first iteration. + if (prev_checked_node) + return nullptr; + + next_node_to_check = nullptr; + } + else if (children.size() == 1) + { + const auto * expression = typeid_cast(step); + const auto * filter = typeid_cast(step); + if (!expression && !filter) + can_distribute_full_node = false; + + next_node_to_check = children.front(); + } + else + { + const auto * join = typeid_cast(step); + /// We've checked that JOIN is INNER/LEFT in query tree. + /// Don't distribute UNION node. + if (!join) + return res; + + next_node_to_check = children.front(); + } + } + + /// Current node contains steps like GROUP BY / DISTINCT + /// Will try to execute query up to WithMergableStage + if (!can_distribute_full_node) + { + /// Current query node does not contain subqueries. + /// We can execute parallel replicas over storage::read. + if (!res) + return nullptr; + + return subquery_node; + } + + /// Query is simple enough to be fully distributed. + res = subquery_node; + prev_checked_node = curr_node; + } + + return res; +} + +const QueryNode * findQueryForParallelReplicas(const QueryTreeNodePtr & query_tree_node, SelectQueryOptions & select_query_options) +{ + if (select_query_options.only_analyze) + return nullptr; + + auto * query_node = query_tree_node->as(); + auto * union_node = query_tree_node->as(); + + if (!query_node && !union_node) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Expected QUERY or UNION node. Actual {}", + query_tree_node->formatASTForErrorMessage()); + + auto context = query_node ? query_node->getContext() : union_node->getContext(); + + if (!context->canUseParallelReplicasOnInitiator()) + return nullptr; + + auto stack = getSupportingParallelReplicasQuery(query_tree_node.get()); + /// Empty stack means that storage does not support parallel replicas. + if (stack.empty()) + return nullptr; + + /// We don't have any subquery and storage can process parallel replicas by itself. + if (stack.top() == query_tree_node.get()) + return nullptr; + + /// This is needed to avoid infinite recursion. + auto mutable_context = Context::createCopy(context); + mutable_context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); + + /// Here we replace tables to dummy, in order to build a temporary query plan for parallel replicas analysis. + ResultReplacementMap replacement_map; + auto updated_query_tree = replaceTablesWithDummyTables(query_tree_node, mutable_context); + + SelectQueryOptions options; + Planner planner(updated_query_tree, options, std::make_shared(nullptr, nullptr)); + planner.buildQueryPlanIfNeeded(); + + /// This part is a bit clumsy. + /// We updated a query_tree with dummy storages, and mapping is using updated_query_tree now. + /// But QueryNode result should be taken from initial query tree. + /// So that we build a list of candidates again, and call findQueryForParallelReplicas for it. 
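    /// Hypothetical illustration: for
    ///     SELECT k, count() FROM (SELECT k FROM merge_tree_table WHERE x > 0) GROUP BY k
    /// the inner subquery yields only Expression/Filter steps over the storage read and is fully
    /// distributable, while the outer query adds an aggregation step, so the outer query node is
    /// the one expected to be chosen here and executed with parallel replicas up to the mergeable stage.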
+ auto new_stack = getSupportingParallelReplicasQuery(updated_query_tree.get()); + const auto & mapping = planner.getQueryNodeToPlanStepMapping(); + const auto * res = findQueryForParallelReplicas(new_stack, mapping); + + /// Now, return a query from initial stack. + if (res) + { + while (!new_stack.empty()) + { + if (res == new_stack.top()) + return stack.top(); + + stack.pop(); + new_stack.pop(); + } + } + + return res; +} + +static const TableNode * findTableForParallelReplicas(const IQueryTreeNode * query_tree_node) +{ + std::stack right_join_nodes; + while (query_tree_node || !right_join_nodes.empty()) + { + if (!query_tree_node) + { + query_tree_node = right_join_nodes.top(); + right_join_nodes.pop(); + } + + auto join_tree_node_type = query_tree_node->getNodeType(); + + switch (join_tree_node_type) + { + case QueryTreeNodeType::TABLE: + { + const auto & table_node = query_tree_node->as(); + const auto & storage = table_node.getStorage(); + if (std::dynamic_pointer_cast(storage) || typeid_cast(storage.get())) + return &table_node; + + query_tree_node = nullptr; + break; + } + case QueryTreeNodeType::TABLE_FUNCTION: + { + query_tree_node = nullptr; + break; + } + case QueryTreeNodeType::QUERY: + { + const auto & query_node_to_process = query_tree_node->as(); + query_tree_node = query_node_to_process.getJoinTree().get(); + break; + } + case QueryTreeNodeType::UNION: + { + const auto & union_node = query_tree_node->as(); + const auto & union_queries = union_node.getQueries().getNodes(); + + query_tree_node = nullptr; + if (!union_queries.empty()) + query_tree_node = union_queries.front().get(); + + break; + } + case QueryTreeNodeType::ARRAY_JOIN: + { + const auto & array_join_node = query_tree_node->as(); + query_tree_node = array_join_node.getTableExpression().get(); + break; + } + case QueryTreeNodeType::JOIN: + { + const auto & join_node = query_tree_node->as(); + query_tree_node = join_node.getLeftTableExpression().get(); + right_join_nodes.push(join_node.getRightTableExpression().get()); + break; + } + default: + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Unexpected node type for table expression. " + "Expected table, table function, query, union, join or array join. Actual {}", + query_tree_node->getNodeTypeName()); + } + } + } + + return nullptr; +} + +const TableNode * findTableForParallelReplicas(const QueryTreeNodePtr & query_tree_node, SelectQueryOptions & select_query_options) +{ + if (select_query_options.only_analyze) + return nullptr; + + auto * query_node = query_tree_node->as(); + auto * union_node = query_tree_node->as(); + + if (!query_node && !union_node) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Expected QUERY or UNION node. Actual {}", + query_tree_node->formatASTForErrorMessage()); + + auto context = query_node ? 
query_node->getContext() : union_node->getContext(); + + if (!context->canUseParallelReplicasOnFollower()) + return nullptr; + + return findTableForParallelReplicas(query_tree_node.get()); +} + +JoinTreeQueryPlan buildQueryPlanForParallelReplicas( + const QueryNode & query_node, + const PlannerContextPtr & planner_context, + std::shared_ptr storage_limits) +{ + auto processed_stage = QueryProcessingStage::WithMergeableState; + auto context = planner_context->getQueryContext(); + + QueryTreeNodePtr modified_query_tree = query_node.clone(); + + Block initial_header = InterpreterSelectQueryAnalyzer::getSampleBlock( + modified_query_tree, context, SelectQueryOptions(processed_stage).analyze()); + + rewriteJoinToGlobalJoin(modified_query_tree, context); + modified_query_tree = buildQueryTreeForShard(planner_context, modified_query_tree); + ASTPtr modified_query_ast = queryNodeToDistributedSelectQuery(modified_query_tree); + + Block header = InterpreterSelectQueryAnalyzer::getSampleBlock( + modified_query_tree, context, SelectQueryOptions(processed_stage).analyze()); + + ClusterProxy::SelectStreamFactory select_stream_factory = + ClusterProxy::SelectStreamFactory( + header, + {}, + {}, + processed_stage); + + QueryPlan query_plan; + ClusterProxy::executeQueryWithParallelReplicas( + query_plan, + select_stream_factory, + modified_query_ast, + context, + storage_limits); + + auto converting = ActionsDAG::makeConvertingActions( + header.getColumnsWithTypeAndName(), + initial_header.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Position); + + /// initial_header is a header expected by initial query. + /// header is a header which is returned by the follower. + /// They are different because tables will have different aliases (e.g. _table1 or _table5). + /// Here we just rename columns by position, with the hope the types would match. + auto step = std::make_unique(query_plan.getCurrentDataStream(), std::move(converting)); + step->setStepDescription("Convert distributed names"); + query_plan.addStep(std::move(step)); + + return {std::move(query_plan), std::move(processed_stage), {}, {}, {}}; +} + +} diff --git a/src/Planner/findQueryForParallelReplicas.h b/src/Planner/findQueryForParallelReplicas.h new file mode 100644 index 00000000000..f5dc69dfa0e --- /dev/null +++ b/src/Planner/findQueryForParallelReplicas.h @@ -0,0 +1,38 @@ +#pragma once +#include +#include + +namespace DB +{ + +class QueryNode; +class TableNode; + +class IQueryTreeNode; +using QueryTreeNodePtr = std::shared_ptr; + +struct SelectQueryOptions; + +/// Find a qury which can be executed with parallel replicas up to WithMergableStage. +/// Returned query will always contain some (>1) subqueries, possibly with joins. +const QueryNode * findQueryForParallelReplicas(const QueryTreeNodePtr & query_tree_node, SelectQueryOptions & select_query_options); + +/// Find a table from which we should read on follower replica. It's the left-most table within all JOINs and UNIONs. +const TableNode * findTableForParallelReplicas(const QueryTreeNodePtr & query_tree_node, SelectQueryOptions & select_query_options); + +struct JoinTreeQueryPlan; + +class PlannerContext; +using PlannerContextPtr = std::shared_ptr; + +struct StorageLimits; +using StorageLimitsList = std::list; + +/// Execute QueryNode with parallel replicas up to WithMergableStage and return a plan. +/// This method does not check that QueryNode is valid. Ideally it should be a result of findParallelReplicasQuery. 
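/// Usage sketch, mirroring the call site added in Planner::buildPlanForQueryNode:
///     if (planner_context->getMutableQueryContext()->canUseTaskBasedParallelReplicas()
///         && planner_context->getGlobalPlannerContext()->parallel_replicas_node == &query_node)
///         join_tree_query_plan = buildQueryPlanForParallelReplicas(query_node, planner_context, select_query_info.storage_limits);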
+JoinTreeQueryPlan buildQueryPlanForParallelReplicas( + const QueryNode & query_node, + const PlannerContextPtr & planner_context, + std::shared_ptr storage_limits); + +} diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 79b7ca17a5a..45523700a5d 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -81,7 +81,7 @@ IIRowSchemaReader::IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & fo { } -void IIRowSchemaReader::setContext(ContextPtr & context) +void IIRowSchemaReader::setContext(const ContextPtr & context) { ColumnsDescription columns; if (tryParseColumnsListFromString(hints_str, columns, context, hints_parsing_error)) diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 94df71a88b4..23c6606a6bd 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -34,7 +34,7 @@ public: virtual bool hasStrictOrderOfColumns() const { return true; } virtual bool needContext() const { return false; } - virtual void setContext(ContextPtr &) {} + virtual void setContext(const ContextPtr &) {} virtual void setMaxRowsAndBytesToRead(size_t, size_t) {} virtual size_t getNumRowsRead() const { return 0; } @@ -56,7 +56,7 @@ public: IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_ = nullptr); bool needContext() const override { return !hints_str.empty(); } - void setContext(ContextPtr & context) override; + void setContext(const ContextPtr & context) override; protected: void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 8dc8fa516dc..8ef2cda5587 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -212,7 +212,7 @@ static AvroDeserializer::DeserializeFn createDecimalDeserializeFn(const avro::No }; } -static std::string nodeToJson(avro::NodePtr root_node) +static std::string nodeToJSON(avro::NodePtr root_node) { std::ostringstream ss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM ss.exceptions(std::ios::failbit); @@ -641,7 +641,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(const avro throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not compatible with Avro {}:\n{}", - target_type->getName(), avro::toString(root_node->type()), nodeToJson(root_node)); + target_type->getName(), avro::toString(root_node->type()), nodeToJSON(root_node)); } AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(const avro::NodePtr & root_node) diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp index 1048bdad22f..9d6c8420069 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp @@ -130,6 +130,11 @@ namespace DB reinterpret_cast(internal_data.data() + start), end - start, reinterpret_cast(arrow_null_bytemap_raw_ptr)); + else if constexpr (std::is_same_v) + status = builder.AppendValues( + reinterpret_cast(internal_data.data() + start), + end - start, + reinterpret_cast(arrow_null_bytemap_raw_ptr)); else status = builder.AppendValues(internal_data.data() + start, end - start, reinterpret_cast(arrow_null_bytemap_raw_ptr)); checkStatus(status, write_column->getName(), format_name); diff --git 
a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 2c0a2524357..dd7d6c6b024 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -392,7 +392,7 @@ bool CSVFormatReader::readFieldImpl(ReadBuffer & istr, DB::IColumn & column, con if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) { /// If value is null but type is not nullable then use default value instead. - return SerializationNullable::deserializeTextCSVImpl(column, istr, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(column, istr, format_settings, serialization); } /// Read the column normally. diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index c4b3c8feb8c..fe4d4e3be08 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp index 2602f8b881d..f91f7cf536b 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,6 @@ #include #include #include -#include namespace DB @@ -603,6 +601,8 @@ bool ConstantExpressionTemplate::parseLiteralAndAssertType( memcpy(buf, istr.position(), bytes_to_copy); buf[bytes_to_copy] = 0; + const bool hex_like = bytes_to_copy >= 2 && buf[0] == '0' && (buf[1] == 'x' || buf[1] == 'X'); + char * pos_double = buf; errno = 0; Float64 float_value = std::strtod(buf, &pos_double); @@ -614,13 +614,13 @@ bool ConstantExpressionTemplate::parseLiteralAndAssertType( char * pos_integer = buf; errno = 0; - UInt64 uint_value = std::strtoull(buf, &pos_integer, 0); + UInt64 uint_value = std::strtoull(buf, &pos_integer, hex_like ? 
16 : 10); if (pos_integer == pos_double && errno != ERANGE && (!negative || uint_value <= (1ULL << 63))) { istr.position() += pos_integer - buf; if (negative && type_info.main_type == Type::Int64) number = static_cast(-uint_value); - else if (!negative && type_info.main_type == Type::UInt64) + else if (type_info.main_type == Type::UInt64 && (!negative || uint_value == 0)) number = uint_value; else return false; diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index 53cb5a77898..62d33d36206 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -215,7 +215,7 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase( { } -void JSONColumnsSchemaReaderBase::setContext(ContextPtr & ctx) +void JSONColumnsSchemaReaderBase::setContext(const ContextPtr & ctx) { ColumnsDescription columns; if (tryParseColumnsListFromString(hints_str, columns, ctx, hints_parsing_error)) diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h index fe80d77cd87..ee7e79afc54 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h @@ -84,7 +84,7 @@ public: void transformTypesFromDifferentFilesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; bool needContext() const override { return !hints_str.empty(); } - void setContext(ContextPtr & ctx) override; + void setContext(const ContextPtr & ctx) override; void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override { diff --git a/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockInputFormat.cpp b/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockInputFormat.cpp index 23c6114fb39..572b3b0703f 100644 --- a/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockInputFormat.cpp @@ -60,7 +60,7 @@ void registerInputFormatJSONColumnsWithMetadata(FormatFactory & factory) factory.registerInputFormat( "JSONColumnsWithMetadata", [](ReadBuffer & buf, - const Block &sample, + const Block & sample, const RowInputFormatParams &, const FormatSettings & settings) { diff --git a/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockInputFormat.h b/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockInputFormat.h index 9a6ed79c522..265f76a74c1 100644 --- a/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockInputFormat.h +++ b/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockInputFormat.h @@ -15,7 +15,7 @@ public: bool checkChunkEnd() override; private: - const Block & header; + const Block header; const bool validate_types_from_metadata; }; diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 0ef19a9c14f..6fa94356cd3 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -179,7 +179,7 @@ void JSONEachRowRowInputFormat::readJSONObject(MutableColumns & columns) else if (column_index == NESTED_FIELD) readNestedData(name_ref.toString(), columns); else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: illegal value of column_index"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Illegal value of column_index"); } else { diff --git 
a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp index f78ce530ecb..67652a2cb0d 100644 --- a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace DB @@ -70,27 +71,36 @@ void JSONRowInputFormat::resetReadBuffer() JSONEachRowRowInputFormat::resetReadBuffer(); } -JSONRowSchemaReader::JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : JSONRowSchemaReader(std::make_unique(in_), format_settings_) +JSONRowSchemaReader::JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool fallback_to_json_each_row_) + : JSONRowSchemaReader(std::make_unique(in_), format_settings_, fallback_to_json_each_row_) { } -JSONRowSchemaReader::JSONRowSchemaReader(std::unique_ptr buf, const DB::FormatSettings & format_settings_) - : JSONEachRowSchemaReader(*buf, format_settings_), peekable_buf(std::move(buf)) +JSONRowSchemaReader::JSONRowSchemaReader(std::unique_ptr buf, const DB::FormatSettings & format_settings_, bool fallback_to_json_each_row_) + : JSONEachRowSchemaReader(*buf, format_settings_), peekable_buf(std::move(buf)), fallback_to_json_each_row(fallback_to_json_each_row_) { } NamesAndTypesList JSONRowSchemaReader::readSchema() { skipBOMIfExists(*peekable_buf); - PeekableReadBufferCheckpoint checkpoint(*peekable_buf); - /// Try to parse metadata, if failed, try to parse data as JSONEachRow format - NamesAndTypesList names_and_types; - if (JSONUtils::checkAndSkipObjectStart(*peekable_buf) && JSONUtils::tryReadMetadata(*peekable_buf, names_and_types)) - return names_and_types; - peekable_buf->rollbackToCheckpoint(true); - return JSONEachRowSchemaReader::readSchema(); + if (fallback_to_json_each_row) + { + PeekableReadBufferCheckpoint checkpoint(*peekable_buf); + /// Try to parse metadata, if failed, try to parse data as JSONEachRow format + NamesAndTypesList names_and_types; + if (JSONUtils::checkAndSkipObjectStart(*peekable_buf) && JSONUtils::tryReadMetadata(*peekable_buf, names_and_types)) + return names_and_types; + + peekable_buf->rollbackToCheckpoint(true); + return JSONEachRowSchemaReader::readSchema(); + } + else + { + JSONUtils::skipObjectStart(*peekable_buf); + return JSONUtils::readMetadata(*peekable_buf); + } } void registerInputFormatJSON(FormatFactory & factory) @@ -109,19 +119,19 @@ void registerInputFormatJSON(FormatFactory & factory) void registerJSONSchemaReader(FormatFactory & factory) { - auto register_schema_reader = [&](const String & format) + auto register_schema_reader = [&](const String & format, bool fallback_to_json_each_row) { factory.registerSchemaReader( - format, [](ReadBuffer & buf, const FormatSettings & format_settings) { return std::make_unique(buf, format_settings); }); + format, [fallback_to_json_each_row](ReadBuffer & buf, const FormatSettings & format_settings) { return std::make_unique(buf, format_settings, fallback_to_json_each_row); }); factory.registerAdditionalInfoForSchemaCacheGetter(format, [](const FormatSettings & settings) { return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); }); }; - register_schema_reader("JSON"); + register_schema_reader("JSON", true); /// JSONCompact has the same suffix with metadata. 
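    /// With the new flag, "JSON" keeps falling back to JSONEachRow-style inference when the metadata
    /// object cannot be parsed, while "JSONCompact" relies on the metadata object only
    /// (fallback_to_json_each_row = false), so a malformed header is reported as an error instead of
    /// being reinterpreted as row data.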
- register_schema_reader("JSONCompact"); + register_schema_reader("JSONCompact", false); } } diff --git a/src/Processors/Formats/Impl/JSONRowInputFormat.h b/src/Processors/Formats/Impl/JSONRowInputFormat.h index b2e1d8a3d6d..6db5cee380a 100644 --- a/src/Processors/Formats/Impl/JSONRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONRowInputFormat.h @@ -45,16 +45,17 @@ private: class JSONRowSchemaReader : public JSONEachRowSchemaReader { public: - JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool fallback_to_json_each_row_); NamesAndTypesList readSchema() override; bool hasStrictOrderOfColumns() const override { return false; } private: - JSONRowSchemaReader(std::unique_ptr buf, const FormatSettings & format_settings_); + JSONRowSchemaReader(std::unique_ptr buf, const FormatSettings & format_settings_, bool fallback_to_json_each_row_); std::unique_ptr peekable_buf; + bool fallback_to_json_each_row; }; } diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp index 7e8b4accf4d..9c7f095e661 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp @@ -409,7 +409,7 @@ bool MySQLDumpRowInputFormat::readField(IColumn & column, size_t column_idx) const auto & type = types[column_idx]; const auto & serialization = serializations[column_idx]; if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) - return SerializationNullable::deserializeTextQuotedImpl(column, *in, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, *in, format_settings, serialization); serialization->deserializeTextQuoted(column, *in, format_settings); return true; diff --git a/src/Processors/Formats/Impl/Parquet/Write.cpp b/src/Processors/Formats/Impl/Parquet/Write.cpp index 02ca2734ff8..4d71e0102d8 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.cpp +++ b/src/Processors/Formats/Impl/Parquet/Write.cpp @@ -409,7 +409,7 @@ PODArray & compress(PODArray & source, PODArray & scratch, Com #pragma clang diagnostic pop if (max_dest_size > std::numeric_limits::max()) - throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source.size())); + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", ReadableSize(source.size())); scratch.resize(max_dest_size); diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 432e944a246..29bc0012dc0 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -147,7 +147,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex const auto & type = getPort().getHeader().getByPosition(index).type; const auto & serialization = serializations[index]; if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) - read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], *in, format_settings, serialization); + read_columns[index] = SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(*columns[index], *in, format_settings, serialization); else serialization->deserializeTextEscaped(*columns[index], *in, format_settings); } diff --git 
a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 6f6dae334e5..85b1797dab8 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -168,7 +168,7 @@ bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & t if (is_raw) { if (as_nullable) - return SerializationNullable::deserializeTextRawImpl(column, *buf, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(column, *buf, format_settings, serialization); serialization->deserializeTextRaw(column, *buf, format_settings); return true; @@ -176,7 +176,7 @@ bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & t if (as_nullable) - return SerializationNullable::deserializeTextEscapedImpl(column, *buf, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(column, *buf, format_settings, serialization); serialization->deserializeTextEscaped(column, *buf, format_settings); return true; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 00a270e9611..32abd532a52 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 6d8fe1e5a2c..1c43a0fa331 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -11,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int SYNTAX_ERROR; + extern const int INVALID_TEMPLATE_FORMAT; } TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_, @@ -193,13 +194,25 @@ void registerOutputFormatTemplate(FormatFactory & factory) const FormatSettings & settings) { ParsedTemplateFormatString resultset_format; + auto idx_resultset_by_name = [&](const String & partName) + { + return static_cast(TemplateBlockOutputFormat::stringToResultsetPart(partName)); + }; if (settings.template_settings.resultset_format.empty()) { /// Default format string: "${data}" - resultset_format.delimiters.resize(2); - resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); - resultset_format.format_idx_to_column_idx.emplace_back(0); - resultset_format.column_names.emplace_back("data"); + if (settings.template_settings.resultset_format_template.empty()) + { + resultset_format.delimiters.resize(2); + resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); + resultset_format.format_idx_to_column_idx.emplace_back(0); + resultset_format.column_names.emplace_back("data"); + } + else + { + resultset_format = ParsedTemplateFormatString(); + resultset_format.parse(settings.template_settings.resultset_format_template, idx_resultset_by_name); + } } else { @@ -207,20 +220,34 @@ void registerOutputFormatTemplate(FormatFactory & factory) resultset_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & partName) - { - return 
static_cast(TemplateBlockOutputFormat::stringToResultsetPart(partName)); - }); + idx_resultset_by_name); + if (!settings.template_settings.resultset_format_template.empty()) + { + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_resultset or format_template_resultset_format, but not both"); + } } - ParsedTemplateFormatString row_format = ParsedTemplateFormatString( + ParsedTemplateFormatString row_format; + auto idx_row_by_name = [&](const String & colName) + { + return sample.getPositionByName(colName); + }; + if (settings.template_settings.row_format.empty()) + { + row_format = ParsedTemplateFormatString(); + row_format.parse(settings.template_settings.row_format_template, idx_row_by_name); + } + else + { + row_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & colName) - { - return sample.getPositionByName(colName); - }); - + idx_row_by_name); + if (!settings.template_settings.row_format_template.empty()) + { + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_template_row_format, but not both"); + } + } return std::make_shared(sample, buf, settings, resultset_format, row_format, settings.template_settings.row_between_delimiter); }); diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index a6e4600d83b..f5edfb7c9d4 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -609,7 +609,9 @@ void registerTemplateSchemaReader(FormatFactory & factory) { size_t index = 0; auto idx_getter = [&](const String &) -> std::optional { return index++; }; - auto row_format = fillRowFormat(settings, idx_getter, false); + ParsedTemplateFormatString row_format; + if (!settings.template_settings.row_format.empty()) + row_format = fillRowFormat(settings, idx_getter, false); std::unordered_set visited_escaping_rules; String result = fmt::format("row_format={}, resultset_format={}, row_between_delimiter={}", settings.template_settings.row_format, diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 3e61bfbc794..8659dcd2318 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -293,7 +293,7 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx) const auto & type = types[column_idx]; const auto & serialization = serializations[column_idx]; if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) - read = SerializationNullable::deserializeTextQuotedImpl(column, *buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, *buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, *buf, format_settings); } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index bf2765bfd1e..f82a8c8ab64 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -37,7 +37,7 @@ public: void resetReadBuffer() override; /// TODO: remove context somehow. 
- void setContext(ContextPtr & context_) { context = Context::createCopy(context_); } + void setContext(const ContextPtr & context_) { context = Context::createCopy(context_); } const BlockMissingValues & getMissingValues() const override { return block_missing_values; } diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index a56c24a740a..fcf338577f8 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -136,7 +136,7 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co auto * curr_position = in->position(); if (curr_position < prev_position) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: parsing is non-deterministic."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Parsing is non-deterministic."); if (isNativeNumber(type) || isDate(type) || isDateTime(type) || isDateTime64(type)) { diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 478ce41f924..2ad6a825c8f 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include diff --git a/src/Processors/Merges/Algorithms/MergedData.h b/src/Processors/Merges/Algorithms/MergedData.h index f92d20d22e1..7ffde835ad0 100644 --- a/src/Processors/Merges/Algorithms/MergedData.h +++ b/src/Processors/Merges/Algorithms/MergedData.h @@ -100,7 +100,7 @@ public: merged_rows = 0; sum_blocks_granularity = 0; ++total_chunks; - total_allocated_bytes += chunk.allocatedBytes(); + total_allocated_bytes += chunk.bytes(); need_flush = false; return chunk; @@ -122,7 +122,7 @@ public: { size_t merged_bytes = 0; for (const auto & column : columns) - merged_bytes += column->allocatedBytes(); + merged_bytes += column->byteSize(); if (merged_bytes >= max_block_size_bytes) return true; } diff --git a/src/Processors/QueryPlan/CreatingSetsStep.cpp b/src/Processors/QueryPlan/CreatingSetsStep.cpp index 3e4dfb0c7d1..f13a717004f 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.cpp +++ b/src/Processors/QueryPlan/CreatingSetsStep.cpp @@ -157,6 +157,34 @@ void addCreatingSetsStep(QueryPlan & query_plan, PreparedSets::Subqueries subque query_plan.unitePlans(std::move(creating_sets), std::move(plans)); } +QueryPipelineBuilderPtr addCreatingSetsTransform(QueryPipelineBuilderPtr pipeline, PreparedSets::Subqueries subqueries, ContextPtr context) +{ + DataStreams input_streams; + input_streams.emplace_back(DataStream{pipeline->getHeader()}); + + QueryPipelineBuilders pipelines; + pipelines.reserve(1 + subqueries.size()); + pipelines.push_back(std::move(pipeline)); + + auto plan_settings = QueryPlanOptimizationSettings::fromContext(context); + auto pipeline_settings = BuildQueryPipelineSettings::fromContext(context); + + for (auto & future_set : subqueries) + { + if (future_set->get()) + continue; + + auto plan = future_set->build(context); + if (!plan) + continue; + + input_streams.emplace_back(plan->getCurrentDataStream()); + pipelines.emplace_back(plan->buildQueryPipeline(plan_settings, pipeline_settings)); + } + + return CreatingSetsStep(input_streams).updatePipeline(std::move(pipelines), pipeline_settings); +} + std::vector> DelayedCreatingSetsStep::makePlansForSets(DelayedCreatingSetsStep && step) { std::vector> plans; diff --git 
a/src/Processors/QueryPlan/CreatingSetsStep.h b/src/Processors/QueryPlan/CreatingSetsStep.h index a90b70a2fa4..292ec19914c 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.h +++ b/src/Processors/QueryPlan/CreatingSetsStep.h @@ -72,4 +72,6 @@ void addCreatingSetsStep(QueryPlan & query_plan, PreparedSets::Subqueries subque void addCreatingSetsStep(QueryPlan & query_plan, PreparedSetsPtr prepared_sets, ContextPtr context); +QueryPipelineBuilderPtr addCreatingSetsTransform(QueryPipelineBuilderPtr pipeline, PreparedSets::Subqueries subqueries, ContextPtr context); + } diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 34a1fc2bb88..3fc2d64b11f 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -66,7 +66,7 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: NameSet sort_columns; for (const auto & col : sorting_step->getSortDescription()) sort_columns.insert(col.column_name); - auto [needed_for_sorting, unneeded_for_sorting] = expression_step->getExpression()->splitActionsBySortingDescription(sort_columns); + auto [needed_for_sorting, unneeded_for_sorting, _] = expression_step->getExpression()->splitActionsBySortingDescription(sort_columns); // No calculations can be postponed. if (unneeded_for_sorting->trivial()) diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp index 7902b36f80e..49e1a49f131 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp @@ -4,69 +4,35 @@ #include #include #include -#include -#include +#include +#include namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -namespace -{ - -void matchDAGOutputNodesOrderWithHeader(ActionsDAGPtr & actions_dag, const Block & expected_header) -{ - std::unordered_map output_name_to_node; - for (const auto * output_node : actions_dag->getOutputs()) - output_name_to_node.emplace(output_node->result_name, output_node); - - std::unordered_set used_output_nodes; - - ActionsDAG::NodeRawConstPtrs updated_outputs; - updated_outputs.reserve(expected_header.columns()); - - for (const auto & column : expected_header) - { - auto output_node_it = output_name_to_node.find(column.name); - if (output_node_it == output_name_to_node.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Invalid move to PREWHERE optimization. 
Cannot find column {} in output", - column.name); - - updated_outputs.push_back(output_node_it->second); - used_output_nodes.insert(output_node_it->second); - } - - ActionsDAG::NodeRawConstPtrs unused_outputs; - for (const auto * output_node : actions_dag->getOutputs()) - { - if (used_output_nodes.contains(output_node)) - continue; - - unused_outputs.push_back(output_node); - } - - auto & actions_dag_outputs = actions_dag->getOutputs(); - actions_dag_outputs = std::move(updated_outputs); - actions_dag_outputs.insert(actions_dag_outputs.end(), unused_outputs.begin(), unused_outputs.end()); -} - -} - - namespace QueryPlanOptimizations { -void optimizePrewhere(Stack & stack, QueryPlan::Nodes & nodes) +static void removeFromOutput(ActionsDAG & dag, const std::string name) +{ + const auto * node = &dag.findInOutputs(name); + auto & outputs = dag.getOutputs(); + for (size_t i = 0; i < outputs.size(); ++i) + { + if (node == outputs[i]) + { + outputs.erase(outputs.begin() + i); + return; + } + } +} + +void optimizePrewhere(Stack & stack, QueryPlan::Nodes &) { if (stack.size() < 3) return; - const auto & frame = stack.back(); + auto & frame = stack.back(); /** Assume that on stack there are at least 3 nodes: * @@ -82,60 +48,26 @@ void optimizePrewhere(Stack & stack, QueryPlan::Nodes & nodes) if (storage_prewhere_info && storage_prewhere_info->prewhere_actions) return; - const QueryPlan::Node * filter_node = (stack.rbegin() + 1)->node; + QueryPlan::Node * filter_node = (stack.rbegin() + 1)->node; const auto * filter_step = typeid_cast(filter_node->step.get()); if (!filter_step) return; - /** Collect required filter output columns. - * Collect output nodes that are mapped to input nodes. - * Collect input node to output nodes mapping. - */ - ColumnsWithTypeAndName required_columns_after_filter; - std::unordered_set output_nodes_mapped_to_input; - std::unordered_map> input_node_to_output_names; - - for (const auto * output_node : filter_step->getExpression()->getOutputs()) - { - const auto * node_without_alias = output_node; - while (node_without_alias->type == ActionsDAG::ActionType::ALIAS) - node_without_alias = node_without_alias->children[0]; - - if (node_without_alias->type == ActionsDAG::ActionType::INPUT) - { - output_nodes_mapped_to_input.emplace(output_node->result_name); - - auto output_names_it = input_node_to_output_names.find(node_without_alias->result_name); - if (output_names_it == input_node_to_output_names.end()) - { - auto [insert_it, _] = input_node_to_output_names.emplace(node_without_alias->result_name, std::vector()); - output_names_it = insert_it; - } - - output_names_it->second.push_back(output_node->result_name); - } - - if (output_node->result_name == filter_step->getFilterColumnName() && filter_step->removesFilterColumn()) - continue; - - required_columns_after_filter.push_back(ColumnWithTypeAndName(output_node->result_type, output_node->result_name)); - } - const auto & context = read_from_merge_tree->getContext(); const auto & settings = context->getSettingsRef(); if (!settings.allow_experimental_analyzer) return; - const auto & table_expression_modifiers = read_from_merge_tree->getQueryInfo().table_expression_modifiers; - bool is_final = table_expression_modifiers && table_expression_modifiers->hasFinal(); + bool is_final = read_from_merge_tree->isQueryWithFinal(); bool optimize_move_to_prewhere = settings.optimize_move_to_prewhere && (!is_final || settings.optimize_move_to_prewhere_if_final); if (!optimize_move_to_prewhere) return; const auto & storage_snapshot = 
read_from_merge_tree->getStorageSnapshot(); - if (table_expression_modifiers && table_expression_modifiers->hasSampleSizeRatio()) + ColumnsWithTypeAndName required_columns_after_filter; + if (read_from_merge_tree->isQueryWithSampling()) { const auto & sampling_key = storage_snapshot->getMetadataForQuery()->getSamplingKey(); const auto & sampling_source_columns = sampling_key.expression->getRequiredColumnsWithTypes(); @@ -170,7 +102,8 @@ void optimizePrewhere(Stack & stack, QueryPlan::Nodes & nodes) filter_step->getFilterColumnName(), read_from_merge_tree->getContext(), is_final); - if (!optimize_result.has_value()) + + if (optimize_result.prewhere_nodes.empty()) return; PrewhereInfoPtr prewhere_info; @@ -181,201 +114,85 @@ void optimizePrewhere(Stack & stack, QueryPlan::Nodes & nodes) prewhere_info->need_filter = true; - auto & prewhere_filter_actions = optimize_result->prewhere_filter_actions; + auto filter_expression = filter_step->getExpression(); + const auto & filter_column_name = filter_step->getFilterColumnName(); - ActionsChain actions_chain; - - std::string prewere_filter_node_name = prewhere_filter_actions->getOutputs().at(0)->result_name; - actions_chain.addStep(std::make_unique(prewhere_filter_actions)); - - auto & filter_actions = optimize_result->filter_actions; - - /** Merge tree where optimizer splits conjunctions in filter expression into 2 parts: - * 1. Filter expressions. - * 2. Prewhere filter expressions. - * - * There can be cases when all expressions are moved to PREWHERE, but it is not - * enough to produce required filter output columns. - * - * Example: SELECT (a AND b) AS cond FROM test_table WHERE cond AND c; - * In this example condition expressions `a`, `b`, `c` can move to PREWHERE, but PREWHERE will not contain expression `and(a, b)`. - * It will contain only `a`, `b`, `c`, `and(a, b, c)` expressions. - * - * In such scenario we need to create additional step to calculate `and(a, b)` expression after PREWHERE. - */ - bool need_additional_filter_after_prewhere = false; - - if (!filter_actions) + if (optimize_result.fully_moved_to_prewhere && filter_step->removesFilterColumn()) { - /// Any node from PREWHERE filter actions can be used as possible output node - std::unordered_set possible_prewhere_output_nodes; - for (const auto & node : prewhere_filter_actions->getNodes()) - possible_prewhere_output_nodes.insert(node.result_name); + removeFromOutput(*filter_expression, filter_column_name); + auto & outputs = filter_expression->getOutputs(); + size_t size = outputs.size(); + outputs.insert(outputs.end(), optimize_result.prewhere_nodes.begin(), optimize_result.prewhere_nodes.end()); + filter_expression->removeUnusedActions(false); + outputs.resize(size); + } - for (auto & required_column : required_columns_after_filter) + auto split_result = filter_step->getExpression()->split(optimize_result.prewhere_nodes, true); + + /// This is the leak of abstraction. + /// Splited actions may have inputs which are needed only for PREWHERE. + /// This is fine for ActionsDAG to have such a split, but it breaks defaults calculation. + /// + /// See 00950_default_prewhere for example. + /// Table has structure `APIKey UInt8, SessionType UInt8` and default `OperatingSystem = SessionType+1` + /// For a query with `SELECT OperatingSystem WHERE APIKey = 42 AND SessionType = 42` we push everything to PREWHERE + /// and columns APIKey, SessionType are removed from inputs (cause only OperatingSystem is needed). 
+ /// However, column OperatingSystem is calculated after PREWHERE stage, based on SessionType value. + /// If column SessionType is removed by PREWHERE actions, we use zero as default, and get a wrong result. + /// + /// So, here we restore removed inputs for PREWHERE actions + { + std::unordered_set first_outputs(split_result.first->getOutputs().begin(), split_result.first->getOutputs().end()); + for (const auto * input : split_result.first->getInputs()) { - if (!possible_prewhere_output_nodes.contains(required_column.name) && - !output_nodes_mapped_to_input.contains(required_column.name)) + if (!first_outputs.contains(input)) { - need_additional_filter_after_prewhere = true; - break; + split_result.first->getOutputs().push_back(input); + /// Add column to second actions as input. + /// Do not add it to result, so it would be removed. + split_result.second->addInput(input->result_name, input->result_type); } } } - /** If there are additional filter actions after PREWHERE filter actions, we create filter actions dag using PREWHERE filter - * actions output columns as filter actions dag input columns. - * Then we merge this filter actions dag nodes with old filter step actions dag nodes, to reuse some expressions from - * PREWHERE filter actions. - */ - if (need_additional_filter_after_prewhere || filter_actions) + ActionsDAG::NodeRawConstPtrs conditions; + conditions.reserve(split_result.split_nodes_mapping.size()); + for (const auto * condition : optimize_result.prewhere_nodes) + conditions.push_back(split_result.split_nodes_mapping.at(condition)); + + prewhere_info->prewhere_actions = std::move(split_result.first); + prewhere_info->remove_prewhere_column = optimize_result.fully_moved_to_prewhere && filter_step->removesFilterColumn(); + + if (conditions.size() == 1) { - auto merged_filter_actions = std::make_shared(actions_chain.getLastStepAvailableOutputColumns()); - merged_filter_actions->getOutputs().clear(); - merged_filter_actions->mergeNodes(std::move(*filter_step->getExpression()->clone())); - - /// Add old filter step filter column to outputs - for (const auto & node : merged_filter_actions->getNodes()) - { - if (node.result_name == filter_step->getFilterColumnName()) - { - merged_filter_actions->getOutputs().push_back(&node); - break; - } - } - - filter_actions = std::move(merged_filter_actions); - - /// If there is filter after PREWHERE, we can ignore filtering during PREWHERE stage - prewhere_info->need_filter = false; - - actions_chain.addStep(std::make_unique(filter_actions)); - } - - auto required_output_actions = std::make_shared(required_columns_after_filter); - actions_chain.addStep(std::make_unique(required_output_actions)); - - actions_chain.finalize(); - - prewhere_filter_actions->projectInput(false); - - auto & prewhere_actions_chain_node = actions_chain[0]; - prewhere_info->prewhere_actions = std::move(prewhere_filter_actions); - prewhere_info->prewhere_column_name = prewere_filter_node_name; - prewhere_info->remove_prewhere_column = !prewhere_actions_chain_node->getChildRequiredOutputColumnsNames().contains(prewere_filter_node_name); - - read_from_merge_tree->updatePrewhereInfo(prewhere_info); - - QueryPlan::Node * replace_old_filter_node = nullptr; - bool remove_filter_node = false; - - if (filter_actions) - { - filter_actions->projectInput(false); - - /// Match dag output nodes with old filter step header - matchDAGOutputNodesOrderWithHeader(filter_actions, filter_step->getOutputStream().header); - - auto & filter_actions_chain_node = actions_chain[1]; - bool 
remove_filter_column = !filter_actions_chain_node->getChildRequiredOutputColumnsNames().contains(filter_step->getFilterColumnName()); - auto after_prewhere_filter_step = std::make_unique(read_from_merge_tree->getOutputStream(), - filter_actions, - filter_step->getFilterColumnName(), - remove_filter_column); - - auto & node = nodes.emplace_back(); - node.children.emplace_back(frame.node); - node.step = std::move(after_prewhere_filter_step); - - replace_old_filter_node = &node; + prewhere_info->prewhere_column_name = conditions.front()->result_name; + prewhere_info->prewhere_actions->getOutputs().push_back(conditions.front()); } else { - auto rename_actions_dag = std::make_shared(read_from_merge_tree->getOutputStream().header.getColumnsWithTypeAndName()); - bool apply_rename_step = false; + prewhere_info->remove_prewhere_column = true; - ActionsDAG::NodeRawConstPtrs updated_outputs; - - /** If in output after read from merge tree there are column names without aliases, - * apply old filter step aliases to them. - */ - for (const auto * output_node : rename_actions_dag->getOutputs()) - { - const auto alias_it = input_node_to_output_names.find(output_node->result_name); - if (alias_it == input_node_to_output_names.end()) - { - updated_outputs.push_back(output_node); - continue; - } - - for (auto & output_name : alias_it->second) - { - if (output_name == output_node->result_name) - { - updated_outputs.push_back(output_node); - continue; - } - - updated_outputs.push_back(&rename_actions_dag->addAlias(*output_node, output_name)); - apply_rename_step = true; - } - } - - rename_actions_dag->getOutputs() = std::move(updated_outputs); - - bool apply_match_step = false; - - /// If column order does not match old filter step column order, match dag output nodes with header - if (!blocksHaveEqualStructure(read_from_merge_tree->getOutputStream().header, filter_step->getOutputStream().header)) - { - apply_match_step = true; - matchDAGOutputNodesOrderWithHeader(rename_actions_dag, filter_step->getOutputStream().header); - } - - if (apply_rename_step || apply_match_step) - { - auto rename_step = std::make_unique(read_from_merge_tree->getOutputStream(), rename_actions_dag); - if (apply_rename_step) - rename_step->setStepDescription("Change column names to column identifiers"); - - auto & node = nodes.emplace_back(); - node.children.emplace_back(frame.node); - node.step = std::move(rename_step); - - replace_old_filter_node = &node; - } - else - { - replace_old_filter_node = frame.node; - remove_filter_node = true; - } + FunctionOverloadResolverPtr func_builder_and = std::make_unique(std::make_shared()); + const auto * node = &prewhere_info->prewhere_actions->addFunction(func_builder_and, std::move(conditions), {}); + prewhere_info->prewhere_column_name = node->result_name; + prewhere_info->prewhere_actions->getOutputs().push_back(node); } - QueryPlan::Node * filter_parent_node = (stack.rbegin() + 2)->node; + read_from_merge_tree->updatePrewhereInfo(prewhere_info); - for (auto & filter_parent_child : filter_parent_node->children) + if (!optimize_result.fully_moved_to_prewhere) { - if (filter_parent_child == filter_node) - { - filter_parent_child = replace_old_filter_node; - - size_t stack_size = stack.size(); - - /** If filter step is completely replaced with PREWHERE filter actions, remove it from stack. - * Otherwise replace old filter step with new filter step after PREWHERE. 
- */ - if (remove_filter_node) - { - std::swap(stack[stack_size - 1], stack[stack_size - 2]); - stack.pop_back(); - } - else - { - stack[stack_size - 2] = Frame{.node = replace_old_filter_node, .next_child = 1}; - } - - break; - } + filter_node->step = std::make_unique( + read_from_merge_tree->getOutputStream(), + std::move(split_result.second), + filter_step->getFilterColumnName(), + filter_step->removesFilterColumn()); + } + else + { + filter_node->step = std::make_unique( + read_from_merge_tree->getOutputStream(), + std::move(split_result.second)); } } diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index c8c95e7443f..fafd6d1dc00 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -118,6 +118,34 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s optimizePrewhere(stack, nodes); optimizePrimaryKeyCondition(stack); + auto & frame = stack.back(); + + if (frame.next_child == 0) + { + + if (optimization_settings.read_in_order) + optimizeReadInOrder(*frame.node, nodes); + + if (optimization_settings.distinct_in_order) + tryDistinctReadInOrder(frame.node); + } + + /// Traverse all children first. + if (frame.next_child < frame.node->children.size()) + { + auto next_frame = Frame{.node = frame.node->children[frame.next_child]}; + ++frame.next_child; + stack.push_back(next_frame); + continue; + } + + stack.pop_back(); + } + + stack.push_back({.node = &root}); + + while (!stack.empty()) + { { /// NOTE: frame cannot be safely used after stack was modified. auto & frame = stack.back(); @@ -126,19 +154,14 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s { has_reading_from_mt |= typeid_cast(frame.node->step.get()) != nullptr; - if (optimization_settings.read_in_order) - optimizeReadInOrder(*frame.node, nodes); - /// Projection optimization relies on PK optimization if (optimization_settings.optimize_projection) num_applied_projection += optimizeUseAggregateProjections(*frame.node, nodes, optimization_settings.optimize_use_implicit_projections); + if (optimization_settings.aggregation_in_order) optimizeAggregationInOrder(*frame.node, nodes); - - if (optimization_settings.distinct_in_order) - tryDistinctReadInOrder(frame.node); } /// Traverse all children first. 
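
The optimizeTreeSecondPass hunk above replaces a single traversal with two sequential stack-based passes over the query plan: the first applies the read-in-order and distinct-in-order rewrites on a node's first visit, the second (re)starts from the root for the projection and aggregation-in-order rewrites. Below is a minimal, self-contained sketch of that explicit-stack traversal pattern only; Node, Frame, traverse and the pass callbacks are hypothetical stand-ins for illustration, not the real QueryPlan::Node, Frame and optimization-settings checks used in optimizeTreeSecondPass.

// Sketch of the Frame{node, next_child} traversal pattern, assuming
// hypothetical Node/Frame types (not the ClickHouse classes).
#include <cstddef>
#include <iostream>
#include <vector>

struct Node
{
    int id = 0;
    std::vector<Node *> children;
};

struct Frame
{
    Node * node = nullptr;
    size_t next_child = 0;
};

// Visits every node once in pre-order, using an explicit stack instead of
// recursion: on first visit (next_child == 0) run the callback, then descend
// into children one by one, and pop the frame when all children are done.
template <typename F>
void traverse(Node & root, F on_first_visit)
{
    std::vector<Frame> stack;
    stack.push_back({.node = &root});

    while (!stack.empty())
    {
        auto & frame = stack.back();

        if (frame.next_child == 0)
            on_first_visit(*frame.node);

        if (frame.next_child < frame.node->children.size())
        {
            auto next_frame = Frame{.node = frame.node->children[frame.next_child]};
            ++frame.next_child;
            stack.push_back(next_frame);
            continue;
        }

        stack.pop_back();
    }
}

int main()
{
    Node leaf1{1}, leaf2{2}, mid{3, {&leaf1, &leaf2}}, root{4, {&mid}};

    // First pass: stands in for the read-in-order / distinct-in-order rewrites.
    traverse(root, [](Node & n) { std::cout << "pass1 visit " << n.id << '\n'; });

    // Second pass: stands in for the projection / aggregation-in-order rewrites,
    // which in the patch run only after the first pass has finished for the whole tree.
    traverse(root, [](Node & n) { std::cout << "pass2 visit " << n.id << '\n'; });
}
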
diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index 1ac759df1d1..a183f50dee5 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -223,7 +223,7 @@ bool analyzeProjectionCandidate( { const auto & created_projections = part_with_ranges.data_part->getProjectionParts(); auto it = created_projections.find(candidate.projection->name); - if (it != created_projections.end()) + if (it != created_projections.end() && !it->second->is_broken) { projection_parts.push_back(it->second); } diff --git a/src/Processors/QueryPlan/Optimizations/splitFilter.cpp b/src/Processors/QueryPlan/Optimizations/splitFilter.cpp index 8c212936195..561ad7302c6 100644 --- a/src/Processors/QueryPlan/Optimizations/splitFilter.cpp +++ b/src/Processors/QueryPlan/Optimizations/splitFilter.cpp @@ -14,19 +14,33 @@ size_t trySplitFilter(QueryPlan::Node * node, QueryPlan::Nodes & nodes) return 0; const auto & expr = filter_step->getExpression(); + const std::string & filter_column_name = filter_step->getFilterColumnName(); /// Do not split if there are function like runningDifference. if (expr->hasStatefulFunctions()) return 0; - auto split = expr->splitActionsForFilter(filter_step->getFilterColumnName()); + bool filter_name_clashs_with_input = false; + if (filter_step->removesFilterColumn()) + { + for (const auto * input : expr->getInputs()) + { + if (input->result_name == filter_column_name) + { + filter_name_clashs_with_input = true; + break; + } + } + } + + auto split = expr->splitActionsForFilter(filter_column_name); if (split.second->trivial()) return 0; bool remove_filter = false; if (filter_step->removesFilterColumn()) - remove_filter = split.second->removeUnusedResult(filter_step->getFilterColumnName()); + remove_filter = split.second->removeUnusedResult(filter_column_name); auto description = filter_step->getStepDescription(); @@ -34,10 +48,25 @@ size_t trySplitFilter(QueryPlan::Node * node, QueryPlan::Nodes & nodes) node->children.swap(filter_node.children); node->children.push_back(&filter_node); + std::string split_filter_name = filter_column_name; + if (filter_name_clashs_with_input) + { + split_filter_name = "__split_filter"; + + for (auto & filter_output : split.first->getOutputs()) + { + if (filter_output->result_name == filter_column_name) + { + filter_output = &split.first->addAlias(*filter_output, split_filter_name); + break; + } + } + } + filter_node.step = std::make_unique( filter_node.children.at(0)->step->getOutputStream(), std::move(split.first), - filter_step->getFilterColumnName(), + std::move(split_filter_name), remove_filter); node->step = std::make_unique(filter_node.step->getOutputStream(), std::move(split.second)); diff --git a/src/Processors/QueryPlan/PartsSplitter.cpp b/src/Processors/QueryPlan/PartsSplitter.cpp index 7c66c0cc8df..0fc6ddd6408 100644 --- a/src/Processors/QueryPlan/PartsSplitter.cpp +++ b/src/Processors/QueryPlan/PartsSplitter.cpp @@ -54,7 +54,7 @@ public: Values getValue(size_t part_idx, size_t mark) const { - const auto & index = parts[part_idx].data_part->index; + const auto & index = parts[part_idx].data_part->getIndex(); Values values(index.size()); for (size_t i = 0; i < values.size(); ++i) { @@ -228,7 +228,7 @@ struct SplitPartsRangesResult RangesInDataParts intersecting_parts_ranges; }; -SplitPartsRangesResult splitPartsRanges(RangesInDataParts ranges_in_data_parts) +SplitPartsRangesResult 
splitPartsRanges(RangesInDataParts ranges_in_data_parts, const LoggerPtr & logger) { /** Split ranges in data parts into intersecting ranges in data parts and non intersecting ranges in data parts. * @@ -483,10 +483,15 @@ SplitPartsRangesResult splitPartsRanges(RangesInDataParts ranges_in_data_parts) intersecting_ranges_in_data_parts.end(), [](const auto & lhs, const auto & rhs) { return lhs.part_index_in_query < rhs.part_index_in_query; }); + LOG_TEST(logger, "Non intersecting ranges in data parts {}", non_intersecting_ranges_in_data_parts.getDescriptions().describe()); + LOG_TEST(logger, "Intersecting ranges in data parts {}", intersecting_ranges_in_data_parts.getDescriptions().describe()); + return {std::move(non_intersecting_ranges_in_data_parts), std::move(intersecting_ranges_in_data_parts)}; } -std::pair, std::vector> splitIntersectingPartsRangesIntoLayers(RangesInDataParts intersecting_ranges_in_data_parts, size_t max_layers) +std::pair, std::vector> splitIntersectingPartsRangesIntoLayers(RangesInDataParts intersecting_ranges_in_data_parts, + size_t max_layers, + const LoggerPtr & logger) { // We will advance the iterator pointing to the mark with the smallest PK value until // there will be not less than rows_per_layer rows in the current layer (roughly speaking). @@ -591,8 +596,18 @@ std::pair, std::vector> splitIntersecting result_layers.back() = std::move(current_layer_builder.getCurrentRangesInDataParts()); } - for (auto & layer : result_layers) + size_t result_layers_size = result_layers.size(); + LOG_TEST(logger, "Split intersecting ranges into {} layers", result_layers_size); + + for (size_t i = 0; i < result_layers_size; ++i) { + auto & layer = result_layers[i]; + + LOG_TEST(logger, "Layer {} {} filter values in ({}, {}])", + i, + layer.getDescriptions().describe(), + i ? ::toString(borders[i - 1]) : "-inf", i < borders.size() ? 
::toString(borders[i]) : "+inf"); + std::stable_sort( layer.begin(), layer.end(), @@ -712,23 +727,32 @@ SplitPartsWithRangesByPrimaryKeyResult splitPartsWithRangesByPrimaryKey( size_t max_layers, ContextPtr context, ReadingInOrderStepGetter && in_order_reading_step_getter, - bool force_process_all_ranges) + bool split_parts_ranges_into_intersecting_and_non_intersecting_final, + bool split_intersecting_parts_ranges_into_layers) { if (max_layers <= 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "max_layer should be greater than 1"); + auto logger = getLogger("PartsSplitter"); + SplitPartsWithRangesByPrimaryKeyResult result; RangesInDataParts intersecting_parts_ranges = std::move(parts); - if (!force_process_all_ranges) + if (split_parts_ranges_into_intersecting_and_non_intersecting_final) { - SplitPartsRangesResult split_result = splitPartsRanges(intersecting_parts_ranges); + SplitPartsRangesResult split_result = splitPartsRanges(intersecting_parts_ranges, logger); result.non_intersecting_parts_ranges = std::move(split_result.non_intersecting_parts_ranges); intersecting_parts_ranges = std::move(split_result.intersecting_parts_ranges); } - auto && [layers, borders] = splitIntersectingPartsRangesIntoLayers(intersecting_parts_ranges, max_layers); + if (!split_intersecting_parts_ranges_into_layers) + { + result.merging_pipes.emplace_back(in_order_reading_step_getter(intersecting_parts_ranges)); + return result; + } + + auto && [layers, borders] = splitIntersectingPartsRangesIntoLayers(intersecting_parts_ranges, max_layers, logger); auto filters = buildFilters(primary_key, borders); result.merging_pipes.resize(layers.size()); diff --git a/src/Processors/QueryPlan/PartsSplitter.h b/src/Processors/QueryPlan/PartsSplitter.h index f1ed1cb0b9c..9bceb344589 100644 --- a/src/Processors/QueryPlan/PartsSplitter.h +++ b/src/Processors/QueryPlan/PartsSplitter.h @@ -34,5 +34,6 @@ SplitPartsWithRangesByPrimaryKeyResult splitPartsWithRangesByPrimaryKey( size_t max_layers, ContextPtr context, ReadingInOrderStepGetter && in_order_reading_step_getter, - bool force_process_all_ranges); + bool split_parts_ranges_into_intersecting_and_non_intersecting, + bool split_intersecting_parts_ranges_into_layers); } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 5ed56f59fc1..25e58588bb7 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -89,6 +89,34 @@ size_t countPartitions(const MergeTreeData::DataPartsVector & prepared_parts) return countPartitions(prepared_parts, get_partition_id); } +bool restoreDAGInputs(ActionsDAG & dag, const NameSet & inputs) +{ + std::unordered_set outputs(dag.getOutputs().begin(), dag.getOutputs().end()); + bool added = false; + for (const auto * input : dag.getInputs()) + { + if (inputs.contains(input->result_name) && !outputs.contains(input)) + { + dag.getOutputs().push_back(input); + added = true; + } + } + + return added; +} + +bool restorePrewhereInputs(PrewhereInfo & info, const NameSet & inputs) +{ + bool added = false; + if (info.row_level_filter) + added = added || restoreDAGInputs(*info.row_level_filter, inputs); + + if (info.prewhere_actions) + added = added || restoreDAGInputs(*info.prewhere_actions, inputs); + + return added; +} + } namespace ProfileEvents @@ -786,18 +814,13 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder( /// To fix this, we prohibit removing any input in prewhere actions. 
Instead, projection actions will be added after sorting. /// See 02354_read_in_order_prewhere.sql as an example. bool have_input_columns_removed_after_prewhere = false; - if (prewhere_info && prewhere_info->prewhere_actions) + if (prewhere_info) { - auto & outputs = prewhere_info->prewhere_actions->getOutputs(); - std::unordered_set outputs_set(outputs.begin(), outputs.end()); - for (const auto * input : prewhere_info->prewhere_actions->getInputs()) - { - if (!outputs_set.contains(input)) - { - outputs.push_back(input); - have_input_columns_removed_after_prewhere = true; - } - } + NameSet sorting_columns; + for (const auto & column : metadata_for_reading->getSortingKey().expression->getRequiredColumnsWithTypes()) + sorting_columns.insert(column.name); + + have_input_columns_removed_after_prewhere = restorePrewhereInputs(*prewhere_info, sorting_columns); } /// Let's split ranges to avoid reading much data. @@ -984,7 +1007,6 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder( /// Thus we need to merge all partition parts into a single sorted stream. Pipe pipe = Pipe::unitePipes(std::move(pipes)); merge_streams(pipe); - out_projection = createProjection(pipe_header); return pipe; } @@ -1133,6 +1155,14 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( auto sorting_expr = std::make_shared(metadata_for_reading->getSortingKey().expression->getActionsDAG().clone()); + if (prewhere_info) + { + NameSet sorting_columns; + for (const auto & column : metadata_for_reading->getSortingKey().expression->getRequiredColumnsWithTypes()) + sorting_columns.insert(column.name); + restorePrewhereInputs(*prewhere_info, sorting_columns); + } + for (size_t range_index = 0; range_index < parts_to_merge_ranges.size() - 1; ++range_index) { /// If do_not_merge_across_partitions_select_final is true and there is only one part in partition @@ -1175,7 +1205,8 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( /// Parts of non-zero level still may contain duplicate PK values to merge on FINAL if there's is_deleted column, /// so we have to process all ranges. It would be more optimal to remove this flag and add an extra filtering step. - bool force_process_all_ranges = !data.merging_params.is_deleted_column.empty(); + bool split_parts_ranges_into_intersecting_and_non_intersecting_final = settings.split_parts_ranges_into_intersecting_and_non_intersecting_final && + data.merging_params.is_deleted_column.empty(); SplitPartsWithRangesByPrimaryKeyResult split_ranges_result = splitPartsWithRangesByPrimaryKey( metadata_for_reading->getPrimaryKey(), @@ -1184,7 +1215,8 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( num_streams, context, std::move(in_order_reading_step_getter), - force_process_all_ranges); + split_parts_ranges_into_intersecting_and_non_intersecting_final, + settings.split_intersecting_parts_ranges_into_layers_final); for (auto && non_intersecting_parts_range : split_ranges_result.non_intersecting_parts_ranges) non_intersecting_parts_by_primary_key.push_back(std::move(non_intersecting_parts_range)); @@ -1802,13 +1834,20 @@ Pipe ReadFromMergeTree::spreadMarkRanges( if (!final && result.sampling.use_sampling) { + NameSet sampling_columns; + /// Add columns needed for `sample_by_ast` to `column_names_to_read`. /// Skip this if final was used, because such columns were already added from PK. 
for (const auto & column : result.sampling.filter_expression->getRequiredColumns().getNames()) { if (!names.contains(column)) column_names_to_read.push_back(column); + + sampling_columns.insert(column); } + + if (prewhere_info) + restorePrewhereInputs(*prewhere_info, sampling_columns); } if (final) @@ -2002,6 +2041,24 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons }); } + /// Some extra columns could be added by sample/final/in-order/etc + /// Remove them from header if not needed. + if (!blocksHaveEqualStructure(pipe.getHeader(), getOutputStream().header)) + { + auto convert_actions_dag = ActionsDAG::makeConvertingActions( + pipe.getHeader().getColumnsWithTypeAndName(), + getOutputStream().header.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Name, + true); + + auto converting_dag_expr = std::make_shared(convert_actions_dag); + + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared(header, converting_dag_expr); + }); + } + for (const auto & processor : pipe.getProcessors()) processors.emplace_back(processor); diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp index bf2e49727ed..b845101125b 100644 --- a/src/Processors/QueryPlan/ReadFromPreparedSource.cpp +++ b/src/Processors/QueryPlan/ReadFromPreparedSource.cpp @@ -7,7 +7,7 @@ namespace DB { ReadFromPreparedSource::ReadFromPreparedSource(Pipe pipe_) - : SourceStepWithFilter(DataStream{.header = pipe_.getHeader()}) + : ISourceStep(DataStream{.header = pipe_.getHeader()}) , pipe(std::move(pipe_)) { } @@ -35,11 +35,4 @@ ReadFromStorageStep::ReadFromStorageStep( processor->setStorageLimits(query_info.storage_limits); } -void ReadFromStorageStep::applyFilters() -{ - for (const auto & processor : pipe.getProcessors()) - if (auto * source = dynamic_cast(processor.get())) - source->setKeyCondition(filter_nodes.nodes, context); -} - } diff --git a/src/Processors/QueryPlan/ReadFromPreparedSource.h b/src/Processors/QueryPlan/ReadFromPreparedSource.h index 2eea48553b3..b40a656cee3 100644 --- a/src/Processors/QueryPlan/ReadFromPreparedSource.h +++ b/src/Processors/QueryPlan/ReadFromPreparedSource.h @@ -2,7 +2,6 @@ #include #include -#include #include #include @@ -10,7 +9,7 @@ namespace DB { /// Create source from prepared pipe. 
-class ReadFromPreparedSource : public SourceStepWithFilter +class ReadFromPreparedSource : public ISourceStep { public: explicit ReadFromPreparedSource(Pipe pipe_); @@ -28,7 +27,6 @@ public: ReadFromStorageStep(Pipe pipe_, String storage_name, ContextPtr context_, const SelectQueryInfo & query_info_); String getName() const override { return "ReadFromStorage"; } - void applyFilters() override; private: ContextPtr context; diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 4dd79903965..93c73a66b78 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -12,7 +12,7 @@ #include #include #include -#include "Common/logger_useful.h" +#include #include #include #include @@ -178,11 +178,11 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStream throw; } - double max_remote_delay = 0.0; + UInt32 max_remote_delay = 0; for (const auto & try_result : try_results) { if (!try_result.is_up_to_date) - max_remote_delay = std::max(try_result.staleness, max_remote_delay); + max_remote_delay = std::max(try_result.delay, max_remote_delay); } if (try_results.empty() || local_delay < max_remote_delay) @@ -375,10 +375,11 @@ ReadFromParallelRemoteReplicasStep::ReadFromParallelRemoteReplicasStep( , storage_limits(std::move(storage_limits_)) , log(log_) { - std::vector description; + chassert(cluster->getShardCount() == 1); - for (const auto & address : cluster->getShardsAddresses()) - description.push_back(fmt::format("Replica: {}", address[0].host_name)); + std::vector description; + for (const auto & pool : cluster->getShardsInfo().front().per_replica_pools) + description.push_back(fmt::format("Replica: {}", pool->getHost())); setStepDescription(boost::algorithm::join(description, ", ")); } @@ -399,51 +400,44 @@ void ReadFromParallelRemoteReplicasStep::initializePipeline(QueryPipelineBuilder const Settings & current_settings = context->getSettingsRef(); auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); + const auto & shard = cluster->getShardsInfo().at(0); size_t all_replicas_count = current_settings.max_parallel_replicas; - if (all_replicas_count > cluster->getShardsInfo().size()) + if (all_replicas_count > shard.getAllNodeCount()) { - LOG_INFO(getLogger("ReadFromParallelRemoteReplicasStep"), - "The number of replicas requested ({}) is bigger than the real number available in the cluster ({}). "\ - "Will use the latter number to execute the query.", current_settings.max_parallel_replicas, cluster->getShardsInfo().size()); - all_replicas_count = cluster->getShardsInfo().size(); + LOG_INFO( + getLogger("ReadFromParallelRemoteReplicasStep"), + "The number of replicas requested ({}) is bigger than the real number available in the cluster ({}). " + "Will use the latter number to execute the query.", + current_settings.max_parallel_replicas, + shard.getAllNodeCount()); + all_replicas_count = shard.getAllNodeCount(); } - /// Find local shard. It might happen that there is no local shard, but that's fine - for (const auto & shard: cluster->getShardsInfo()) - { - if (shard.isLocal()) - { - IConnections::ReplicaInfo replica_info - { - .all_replicas_count = all_replicas_count, - /// `shard_num` will be equal to the number of the given replica in the cluster (set by `Cluster::getClusterWithReplicasAsShards`). - /// we should use this number specifically because efficiency of data distribution by consistent hash depends on it. 
- .number_of_current_replica = shard.shard_num - 1, - }; - addPipeForSingeReplica(pipes, shard.pool, replica_info); - } + std::vector shuffled_pool; + if (all_replicas_count < shard.getAllNodeCount()) + { + shuffled_pool = shard.pool->getShuffledPools(current_settings); + shuffled_pool.resize(all_replicas_count); + } + else + { + /// try to preserve replicas order if all replicas in cluster are used for query execution + /// it's important for data locality during query execution + auto priority_func = [](size_t i) { return Priority{static_cast(i)}; }; + shuffled_pool = shard.pool->getShuffledPools(current_settings, priority_func); } - auto current_shard = cluster->getShardsInfo().begin(); - while (pipes.size() != all_replicas_count) + for (size_t i=0; i < all_replicas_count; ++i) { - if (current_shard->isLocal()) - { - ++current_shard; - continue; - } - IConnections::ReplicaInfo replica_info { .all_replicas_count = all_replicas_count, - /// `shard_num` will be equal to the number of the given replica in the cluster (set by `Cluster::getClusterWithReplicasAsShards`). /// we should use this number specifically because efficiency of data distribution by consistent hash depends on it. - .number_of_current_replica = current_shard->shard_num - 1, + .number_of_current_replica = i, }; - addPipeForSingeReplica(pipes, current_shard->pool, replica_info); - ++current_shard; + addPipeForSingeReplica(pipes, shuffled_pool[i].pool, replica_info); } auto pipe = Pipe::unitePipes(std::move(pipes)); @@ -456,7 +450,8 @@ void ReadFromParallelRemoteReplicasStep::initializePipeline(QueryPipelineBuilder } -void ReadFromParallelRemoteReplicasStep::addPipeForSingeReplica(Pipes & pipes, std::shared_ptr pool, IConnections::ReplicaInfo replica_info) +void ReadFromParallelRemoteReplicasStep::addPipeForSingeReplica( + Pipes & pipes, const ConnectionPoolPtr & pool, IConnections::ReplicaInfo replica_info) { bool add_agg_info = stage == QueryProcessingStage::WithMergeableState; bool add_totals = false; @@ -476,7 +471,14 @@ void ReadFromParallelRemoteReplicasStep::addPipeForSingeReplica(Pipes & pipes, s assert(output_stream); auto remote_query_executor = std::make_shared( - pool, query_string, output_stream->header, context, throttler, scalars, external_tables, stage, + pool, + query_string, + output_stream->header, + context, + throttler, + scalars, + external_tables, + stage, RemoteQueryExecutor::Extension{.parallel_reading_coordinator = coordinator, .replica_info = std::move(replica_info)}); remote_query_executor->setLogger(log); diff --git a/src/Processors/QueryPlan/ReadFromRemote.h b/src/Processors/QueryPlan/ReadFromRemote.h index f853a12910b..498d584e85a 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.h +++ b/src/Processors/QueryPlan/ReadFromRemote.h @@ -9,10 +9,6 @@ namespace DB { - -class ConnectionPoolWithFailover; -using ConnectionPoolWithFailoverPtr = std::shared_ptr; - class Throttler; using ThrottlerPtr = std::shared_ptr; @@ -91,8 +87,7 @@ public: void enforceAggregationInOrder(); private: - - void addPipeForSingeReplica(Pipes & pipes, std::shared_ptr pool, IConnections::ReplicaInfo replica_info); + void addPipeForSingeReplica(Pipes & pipes, const ConnectionPoolPtr & pool, IConnections::ReplicaInfo replica_info); ClusterPtr cluster; ASTPtr query_ast; diff --git a/src/Processors/Sources/WaitForAsyncInsertSource.h b/src/Processors/Sources/WaitForAsyncInsertSource.h index 1029c164941..78af6294202 100644 --- a/src/Processors/Sources/WaitForAsyncInsertSource.h +++ 
b/src/Processors/Sources/WaitForAsyncInsertSource.h @@ -33,7 +33,7 @@ protected: { auto status = insert_future.wait_for(std::chrono::milliseconds(timeout_ms)); if (status == std::future_status::deferred) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: got future in deferred state"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got future in deferred state"); if (status == std::future_status::timeout) throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Wait for async insert timeout ({} ms) exceeded)", timeout_ms); diff --git a/src/Processors/TTL/ITTLAlgorithm.cpp b/src/Processors/TTL/ITTLAlgorithm.cpp index 79140137df8..761f43e2422 100644 --- a/src/Processors/TTL/ITTLAlgorithm.cpp +++ b/src/Processors/TTL/ITTLAlgorithm.cpp @@ -11,8 +11,9 @@ namespace ErrorCodes } ITTLAlgorithm::ITTLAlgorithm( - const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) - : description(description_) + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + : ttl_expressions(ttl_expressions_) + , description(description_) , old_ttl_info(old_ttl_info_) , current_time(current_time_) , force(force_) diff --git a/src/Processors/TTL/ITTLAlgorithm.h b/src/Processors/TTL/ITTLAlgorithm.h index 49cd2c46d9d..d79aa8a8dfc 100644 --- a/src/Processors/TTL/ITTLAlgorithm.h +++ b/src/Processors/TTL/ITTLAlgorithm.h @@ -8,6 +8,12 @@ namespace DB { +struct TTLExpressions +{ + ExpressionActionsPtr expression; + ExpressionActionsPtr where_expression; +}; + /** * Represents the actions, which are required to do * with data, when TTL is expired: delete, aggregate, etc. @@ -18,7 +24,7 @@ public: using TTLInfo = IMergeTreeDataPart::TTLInfo; using MutableDataPartPtr = MergeTreeMutableDataPartPtr; - ITTLAlgorithm(const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + ITTLAlgorithm(const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); virtual ~ITTLAlgorithm() = default; virtual void execute(Block & block) = 0; @@ -39,6 +45,7 @@ protected: bool isTTLExpired(time_t ttl) const; UInt32 getTimestampByIndex(const IColumn * column, size_t index) const; + const TTLExpressions ttl_expressions; const TTLDescription description; const TTLInfo old_ttl_info; const time_t current_time; diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.cpp b/src/Processors/TTL/TTLAggregationAlgorithm.cpp index 2537e21dd40..45e8a96412e 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.cpp +++ b/src/Processors/TTL/TTLAggregationAlgorithm.cpp @@ -5,13 +5,14 @@ namespace DB { TTLAggregationAlgorithm::TTLAggregationAlgorithm( + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_, const Block & header_, const MergeTreeData & storage_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , header(header_) { current_key_value.resize(description.group_by_keys.size()); @@ -75,8 +76,8 @@ void TTLAggregationAlgorithm::execute(Block & block) const auto & column_names = header.getNames(); MutableColumns aggregate_columns = header.cloneEmptyColumns(); - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column = 
executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); size_t rows_aggregated = 0; size_t current_key_start = 0; @@ -157,8 +158,8 @@ void TTLAggregationAlgorithm::execute(Block & block) /// If some rows were aggregated we have to recalculate ttl info's if (some_rows_were_aggregated) { - auto ttl_column_after_aggregation = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column_after_aggregation = executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column_after_aggregation = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column_after_aggregation = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); for (size_t i = 0; i < block.rows(); ++i) { bool where_filter_passed = !where_column_after_aggregation || where_column_after_aggregation->getBool(i); diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.h b/src/Processors/TTL/TTLAggregationAlgorithm.h index 0e4bf092ed6..f7bf19a202b 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.h +++ b/src/Processors/TTL/TTLAggregationAlgorithm.h @@ -13,6 +13,7 @@ class TTLAggregationAlgorithm final : public ITTLAlgorithm { public: TTLAggregationAlgorithm( + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLColumnAlgorithm.cpp b/src/Processors/TTL/TTLColumnAlgorithm.cpp index 04c4d7b9348..e27050564ce 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.cpp +++ b/src/Processors/TTL/TTLColumnAlgorithm.cpp @@ -4,6 +4,7 @@ namespace DB { TTLColumnAlgorithm::TTLColumnAlgorithm( + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, @@ -12,7 +13,7 @@ TTLColumnAlgorithm::TTLColumnAlgorithm( const ExpressionActionsPtr & default_expression_, const String & default_column_name_, bool is_compact_part_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , column_name(column_name_) , default_expression(default_expression_) , default_column_name(default_column_name_) @@ -49,7 +50,7 @@ void TTLColumnAlgorithm::execute(Block & block) if (default_column) default_column = default_column->convertToFullColumnIfConst(); - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); auto & column_with_type = block.getByName(column_name); const IColumn * values_column = column_with_type.column.get(); diff --git a/src/Processors/TTL/TTLColumnAlgorithm.h b/src/Processors/TTL/TTLColumnAlgorithm.h index 30de77dcc2a..f34dae952d1 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.h +++ b/src/Processors/TTL/TTLColumnAlgorithm.h @@ -11,6 +11,7 @@ class TTLColumnAlgorithm final : public ITTLAlgorithm { public: TTLColumnAlgorithm( + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & 
old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.cpp b/src/Processors/TTL/TTLDeleteAlgorithm.cpp index f176df2d003..6f9bc315276 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.cpp +++ b/src/Processors/TTL/TTLDeleteAlgorithm.cpp @@ -4,8 +4,8 @@ namespace DB { TTLDeleteAlgorithm::TTLDeleteAlgorithm( - const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) { if (!isMinTTLExpired()) new_ttl_info = old_ttl_info; @@ -19,8 +19,8 @@ void TTLDeleteAlgorithm::execute(Block & block) if (!block || !isMinTTLExpired()) return; - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column = executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); MutableColumns result_columns; const auto & column_names = block.getNames(); @@ -54,7 +54,7 @@ void TTLDeleteAlgorithm::execute(Block & block) void TTLDeleteAlgorithm::finalize(const MutableDataPartPtr & data_part) const { - if (description.where_expression) + if (ttl_expressions.where_expression) data_part->ttl_infos.rows_where_ttl[description.result_column] = new_ttl_info; else data_part->ttl_infos.table_ttl = new_ttl_info; diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.h b/src/Processors/TTL/TTLDeleteAlgorithm.h index 292a29bfa27..622e45acecb 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.h +++ b/src/Processors/TTL/TTLDeleteAlgorithm.h @@ -10,7 +10,7 @@ namespace DB class TTLDeleteAlgorithm final : public ITTLAlgorithm { public: - TTLDeleteAlgorithm(const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + TTLDeleteAlgorithm(const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); void execute(Block & block) override; void finalize(const MutableDataPartPtr & data_part) const override; diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp index eba364aa2b8..b7cddf3c165 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp @@ -4,13 +4,14 @@ namespace DB { TTLUpdateInfoAlgorithm::TTLUpdateInfoAlgorithm( + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , ttl_update_field(ttl_update_field_) , ttl_update_key(ttl_update_key_) { @@ -21,7 +22,7 @@ void TTLUpdateInfoAlgorithm::execute(Block & block) if (!block) return; - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); + auto ttl_column = 
executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); for (size_t i = 0; i < block.rows(); ++i) { UInt32 cur_ttl = ITTLAlgorithm::getTimestampByIndex(ttl_column.get(), i); diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h index 45eecbde3d0..0cf31765aef 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h @@ -20,6 +20,7 @@ class TTLUpdateInfoAlgorithm : public ITTLAlgorithm { public: TTLUpdateInfoAlgorithm( + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, diff --git a/src/Processors/Transforms/AggregatingTransform.cpp b/src/Processors/Transforms/AggregatingTransform.cpp index 2fd9f102159..74da97f2199 100644 --- a/src/Processors/Transforms/AggregatingTransform.cpp +++ b/src/Processors/Transforms/AggregatingTransform.cpp @@ -600,6 +600,12 @@ IProcessor::Status AggregatingTransform::prepare() if (is_consume_finished) { output.finish(); + /// input.isFinished() means that merging is done. Now we can release our reference to aggregation states. + /// TODO: there is another case, when output port is getting closed first. + /// E.g. `select ... group by x limit 10`, if it was two-level aggregation and first few buckets contained already enough rows + /// limit will stop merging. It turned out to be not trivial to both release aggregation states and ensure that + /// ManyAggregatedData holds the last references to them to trigger parallel destruction in its dtor. Will work on that. + many_data.reset(); return Status::Finished; } else @@ -828,8 +834,6 @@ void AggregatingTransform::initGenerate() processors = Pipe::detachProcessors(std::move(pipe)); } - - many_data.reset(); } } diff --git a/src/Processors/Transforms/AggregatingTransform.h b/src/Processors/Transforms/AggregatingTransform.h index e05528afdc7..e167acde067 100644 --- a/src/Processors/Transforms/AggregatingTransform.h +++ b/src/Processors/Transforms/AggregatingTransform.h @@ -71,16 +71,12 @@ struct AggregatingTransformParams struct ManyAggregatedData { ManyAggregatedDataVariants variants; - std::vector> mutexes; std::atomic num_finished = 0; - explicit ManyAggregatedData(size_t num_threads = 0) : variants(num_threads), mutexes(num_threads) + explicit ManyAggregatedData(size_t num_threads = 0) : variants(num_threads) { for (auto & elem : variants) elem = std::make_shared(); - - for (auto & mut : mutexes) - mut = std::make_unique(); } ~ManyAggregatedData() diff --git a/src/Processors/Transforms/ColumnGathererTransform.cpp b/src/Processors/Transforms/ColumnGathererTransform.cpp index d7f52a538e1..b2e8e9bc89e 100644 --- a/src/Processors/Transforms/ColumnGathererTransform.cpp +++ b/src/Processors/Transforms/ColumnGathererTransform.cpp @@ -17,9 +17,14 @@ namespace ErrorCodes } ColumnGathererStream::ColumnGathererStream( - size_t num_inputs, ReadBuffer & row_sources_buf_, size_t block_preferred_size_) - : sources(num_inputs), row_sources_buf(row_sources_buf_) - , block_preferred_size(block_preferred_size_) + size_t num_inputs, + ReadBuffer & row_sources_buf_, + size_t block_preferred_size_rows_, + size_t block_preferred_size_bytes_) + : sources(num_inputs) + , row_sources_buf(row_sources_buf_) + , block_preferred_size_rows(block_preferred_size_rows_) + , block_preferred_size_bytes(block_preferred_size_bytes_) { if (num_inputs == 0) throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "There are no streams to 
gather"); @@ -124,10 +129,11 @@ ColumnGathererTransform::ColumnGathererTransform( const Block & header, size_t num_inputs, ReadBuffer & row_sources_buf_, - size_t block_preferred_size_) + size_t block_preferred_size_rows_, + size_t block_preferred_size_bytes_) : IMergingTransform( num_inputs, header, header, /*have_all_inputs_=*/ true, /*limit_hint_=*/ 0, /*always_read_till_end_=*/ false, - num_inputs, row_sources_buf_, block_preferred_size_) + num_inputs, row_sources_buf_, block_preferred_size_rows_, block_preferred_size_bytes_) , log(getLogger("ColumnGathererStream")) { if (header.columns() != 1) diff --git a/src/Processors/Transforms/ColumnGathererTransform.h b/src/Processors/Transforms/ColumnGathererTransform.h index 885cb3f81ba..4e56cffa46a 100644 --- a/src/Processors/Transforms/ColumnGathererTransform.h +++ b/src/Processors/Transforms/ColumnGathererTransform.h @@ -5,7 +5,6 @@ #include #include - namespace Poco { class Logger; } @@ -57,7 +56,11 @@ using MergedRowSources = PODArray; class ColumnGathererStream final : public IMergingAlgorithm { public: - ColumnGathererStream(size_t num_inputs, ReadBuffer & row_sources_buf_, size_t block_preferred_size_ = DEFAULT_BLOCK_SIZE); + ColumnGathererStream( + size_t num_inputs, + ReadBuffer & row_sources_buf_, + size_t block_preferred_size_rows_, + size_t block_preferred_size_bytes_); const char * getName() const override { return "ColumnGathererStream"; } void initialize(Inputs inputs) override; @@ -92,13 +95,12 @@ private: std::vector sources; ReadBuffer & row_sources_buf; - const size_t block_preferred_size; + const size_t block_preferred_size_rows; + const size_t block_preferred_size_bytes; Source * source_to_fully_copy = nullptr; ssize_t next_required_source = -1; - size_t cur_block_preferred_size = 0; - UInt64 merged_rows = 0; UInt64 merged_bytes = 0; }; @@ -110,7 +112,8 @@ public: const Block & header, size_t num_inputs, ReadBuffer & row_sources_buf_, - size_t block_preferred_size_ = DEFAULT_BLOCK_SIZE); + size_t block_preferred_size_rows_, + size_t block_preferred_size_bytes_); String getName() const override { return "ColumnGathererTransform"; } @@ -134,15 +137,22 @@ void ColumnGathererStream::gather(Column & column_res) if (next_required_source == -1) { /// Start new column. - cur_block_preferred_size = std::min(static_cast(row_sources_end - row_source_pos), block_preferred_size); - column_res.reserve(cur_block_preferred_size); + /// Actually reserve works only for fixed size columns. + /// So it's safe to ignore preferred size in bytes and call reserve for number of rows. + size_t size_to_reserve = std::min(static_cast(row_sources_end - row_source_pos), block_preferred_size_rows); + column_res.reserve(size_to_reserve); } - size_t cur_size = column_res.size(); next_required_source = -1; - while (row_source_pos < row_sources_end && cur_size < cur_block_preferred_size) + + /// We use do ... while here to ensure there will be at least one iteration of this loop. + /// Because the column_res.byteSize() could be bigger than block_preferred_size_bytes already at this point. + do { + if (row_source_pos >= row_sources_end) + break; + RowSourcePart row_source = *row_source_pos; size_t source_num = row_source.getSourceNum(); Source & source = sources[source_num]; @@ -159,6 +169,7 @@ void ColumnGathererStream::gather(Column & column_res) /// Consecutive optimization. 
TODO: precompute lengths size_t len = 1; size_t max_len = std::min(static_cast(row_sources_end - row_source_pos), source.size - source.pos); // interval should be in the same block + while (len < max_len && row_source_pos->data == row_source.data) { ++len; @@ -181,12 +192,10 @@ void ColumnGathererStream::gather(Column & column_res) column_res.insertFrom(*source.column, source.pos); else column_res.insertRangeFrom(*source.column, source.pos, len); - - cur_size += len; } source.pos += len; - } + } while (column_res.size() < block_preferred_size_rows && column_res.byteSize() < block_preferred_size_bytes); } } diff --git a/src/Processors/Transforms/CreatingSetsTransform.cpp b/src/Processors/Transforms/CreatingSetsTransform.cpp index cc0b5926e66..eeb8f4a6060 100644 --- a/src/Processors/Transforms/CreatingSetsTransform.cpp +++ b/src/Processors/Transforms/CreatingSetsTransform.cpp @@ -163,7 +163,7 @@ void CreatingSetsTransform::startSubquery() done_with_table = !external_table; if ((done_with_set && !set_from_cache) && done_with_table) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: nothing to do with subquery"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Nothing to do with subquery"); if (table_out.initialized()) { diff --git a/src/Processors/Transforms/TTLCalcTransform.cpp b/src/Processors/Transforms/TTLCalcTransform.cpp index 2b4ed96d4e3..0e55507cb3a 100644 --- a/src/Processors/Transforms/TTLCalcTransform.cpp +++ b/src/Processors/Transforms/TTLCalcTransform.cpp @@ -4,7 +4,24 @@ namespace DB { +static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) +{ + auto expr = ttl_descr.buildExpression(context); + auto expr_queries = expr.sets->getSubqueries(); + subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); + + auto where_expr = ttl_descr.buildWhereExpression(context); + if (where_expr.sets) + { + auto where_expr_queries = where_expr.sets->getSubqueries(); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + } + + return {expr.expression, where_expr.expression}; +} + TTLCalcTransform::TTLCalcTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -21,33 +38,39 @@ TTLCalcTransform::TTLCalcTransform( { const auto & rows_ttl = metadata_snapshot_->getRowsTTL(); algorithms.emplace_back(std::make_unique( - rows_ttl, TTLUpdateField::TABLE_TTL, rows_ttl.result_column, old_ttl_infos.table_ttl, current_time_, force_)); + getExpressions(rows_ttl, subqueries_for_sets, context), rows_ttl, + TTLUpdateField::TABLE_TTL, rows_ttl.result_column, old_ttl_infos.table_ttl, current_time_, force_)); } for (const auto & where_ttl : metadata_snapshot_->getRowsWhereTTLs()) algorithms.emplace_back(std::make_unique( - where_ttl, TTLUpdateField::ROWS_WHERE_TTL, where_ttl.result_column, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); + getExpressions(where_ttl, subqueries_for_sets, context), where_ttl, + TTLUpdateField::ROWS_WHERE_TTL, where_ttl.result_column, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); for (const auto & group_by_ttl : metadata_snapshot_->getGroupByTTLs()) algorithms.emplace_back(std::make_unique( - group_by_ttl, TTLUpdateField::GROUP_BY_TTL, group_by_ttl.result_column, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, 
force_)); + getExpressions(group_by_ttl, subqueries_for_sets, context), group_by_ttl, + TTLUpdateField::GROUP_BY_TTL, group_by_ttl.result_column, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_)); if (metadata_snapshot_->hasAnyColumnTTL()) { for (const auto & [name, description] : metadata_snapshot_->getColumnTTLs()) { algorithms.emplace_back(std::make_unique( - description, TTLUpdateField::COLUMNS_TTL, name, old_ttl_infos.columns_ttl[name], current_time_, force_)); + getExpressions(description, subqueries_for_sets, context), description, + TTLUpdateField::COLUMNS_TTL, name, old_ttl_infos.columns_ttl[name], current_time_, force_)); } } for (const auto & move_ttl : metadata_snapshot_->getMoveTTLs()) algorithms.emplace_back(std::make_unique( - move_ttl, TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); + getExpressions(move_ttl, subqueries_for_sets, context), move_ttl, + TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); for (const auto & recompression_ttl : metadata_snapshot_->getRecompressionTTLs()) algorithms.emplace_back(std::make_unique( - recompression_ttl, TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); + getExpressions(recompression_ttl, subqueries_for_sets, context), recompression_ttl, + TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); } void TTLCalcTransform::consume(Chunk chunk) diff --git a/src/Processors/Transforms/TTLCalcTransform.h b/src/Processors/Transforms/TTLCalcTransform.h index baa31c01c52..37b378bc5de 100644 --- a/src/Processors/Transforms/TTLCalcTransform.h +++ b/src/Processors/Transforms/TTLCalcTransform.h @@ -15,6 +15,7 @@ class TTLCalcTransform : public IAccumulatingTransform { public: TTLCalcTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -23,6 +24,8 @@ public: bool force_ ); + PreparedSets::Subqueries getSubqueries() { return std::move(subqueries_for_sets); } + String getName() const override { return "TTL_CALC"; } Status prepare() override; @@ -35,6 +38,7 @@ protected: private: std::vector algorithms; + PreparedSets::Subqueries subqueries_for_sets; /// ttl_infos and empty_columns are updating while reading const MergeTreeData::MutableDataPartPtr & data_part; diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index db9326f9acf..42b932becec 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -16,7 +16,24 @@ namespace DB { +static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) +{ + auto expr = ttl_descr.buildExpression(context); + auto expr_queries = expr.sets->getSubqueries(); + subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); + + auto where_expr = ttl_descr.buildWhereExpression(context); + if (where_expr.sets) + { + auto where_expr_queries = where_expr.sets->getSubqueries(); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + } + + return {expr.expression, 
where_expr.expression}; +} + TTLTransform::TTLTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -33,10 +50,11 @@ TTLTransform::TTLTransform( { const auto & rows_ttl = metadata_snapshot_->getRowsTTL(); auto algorithm = std::make_unique( - rows_ttl, old_ttl_infos.table_ttl, current_time_, force_); + getExpressions(rows_ttl, subqueries_for_sets, context), rows_ttl, + old_ttl_infos.table_ttl, current_time_, force_); /// Skip all data if table ttl is expired for part - if (algorithm->isMaxTTLExpired() && !rows_ttl.where_expression) + if (algorithm->isMaxTTLExpired() && !rows_ttl.where_expression_ast) all_data_dropped = true; delete_algorithm = algorithm.get(); @@ -45,11 +63,13 @@ TTLTransform::TTLTransform( for (const auto & where_ttl : metadata_snapshot_->getRowsWhereTTLs()) algorithms.emplace_back(std::make_unique( - where_ttl, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); + getExpressions(where_ttl, subqueries_for_sets, context), where_ttl, + old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); for (const auto & group_by_ttl : metadata_snapshot_->getGroupByTTLs()) algorithms.emplace_back(std::make_unique( - group_by_ttl, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_, + getExpressions(group_by_ttl, subqueries_for_sets, context), group_by_ttl, + old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_, getInputPort().getHeader(), storage_)); if (metadata_snapshot_->hasAnyColumnTTL()) @@ -75,18 +95,21 @@ TTLTransform::TTLTransform( } algorithms.emplace_back(std::make_unique( - description, old_ttl_infos.columns_ttl[name], current_time_, + getExpressions(description, subqueries_for_sets, context), description, + old_ttl_infos.columns_ttl[name], current_time_, force_, name, default_expression, default_column_name, isCompactPart(data_part))); } } for (const auto & move_ttl : metadata_snapshot_->getMoveTTLs()) algorithms.emplace_back(std::make_unique( - move_ttl, TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); + getExpressions(move_ttl, subqueries_for_sets, context), move_ttl, + TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); for (const auto & recompression_ttl : metadata_snapshot_->getRecompressionTTLs()) algorithms.emplace_back(std::make_unique( - recompression_ttl, TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); + getExpressions(recompression_ttl, subqueries_for_sets, context), recompression_ttl, + TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); } Block reorderColumns(Block block, const Block & header) diff --git a/src/Processors/Transforms/TTLTransform.h b/src/Processors/Transforms/TTLTransform.h index 3606db7f4c2..57563116711 100644 --- a/src/Processors/Transforms/TTLTransform.h +++ b/src/Processors/Transforms/TTLTransform.h @@ -16,6 +16,7 @@ class TTLTransform : public IAccumulatingTransform { public: TTLTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -28,6 +29,8 @@ public: Status prepare() override; + 
PreparedSets::Subqueries getSubqueries() { return std::move(subqueries_for_sets); } + protected: void consume(Chunk chunk) override; Chunk generate() override; @@ -40,6 +43,8 @@ private: const TTLDeleteAlgorithm * delete_algorithm = nullptr; bool all_data_dropped = false; + PreparedSets::Subqueries subqueries_for_sets; + /// ttl_infos and empty_columns are updating while reading const MergeTreeData::MutableDataPartPtr & data_part; LoggerPtr log; diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 960cc019001..91bbf04f327 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -244,7 +245,9 @@ Chain buildPushingToViewsChain( // Do not deduplicate insertions into MV if the main insertion is Ok if (disable_deduplication_for_children) + { insert_context->setSetting("insert_deduplicate", Field{false}); + } // Processing of blocks for MVs is done block by block, and there will // be no parallel reading after (plus it is not a costless operation) @@ -301,6 +304,46 @@ Chain buildPushingToViewsChain( auto & target_name = runtime_stats->target_name; auto * view_counter_ms = &runtime_stats->elapsed_ms; + const auto & insert_settings = insert_context->getSettingsRef(); + ContextMutablePtr view_insert_context = insert_context; + + if (!disable_deduplication_for_children && + insert_settings.update_insert_deduplication_token_in_dependent_materialized_views && + !insert_settings.insert_deduplication_token.value.empty()) + { + /** Update deduplication token passed to dependent MV with current view id. So it is possible to properly handle + * deduplication in complex INSERT flows. + * + * Example: + * + * landing -┬--> mv_1_1 ---> ds_1_1 ---> mv_2_1 --┬-> ds_2_1 ---> mv_3_1 ---> ds_3_1 + * | | + * └--> mv_1_2 ---> ds_1_2 ---> mv_2_2 --┘ + * + * Here we want to avoid deduplication for two different blocks generated from `mv_2_1` and `mv_2_2` that will + * be inserted into `ds_2_1`. + * + * We are forced to use view id instead of table id because there are some possible INSERT flows where no tables + * are involved. 
+ * + * Example: + * + * landing -┬--> mv_1_1 --┬-> ds_1_1 + * | | + * └--> mv_1_2 --┘ + * + */ + auto insert_deduplication_token = insert_settings.insert_deduplication_token.value; + + if (view_id.hasUUID()) + insert_deduplication_token += "_" + toString(view_id.uuid); + else + insert_deduplication_token += "_" + view_id.getFullNameNotQuoted(); + + view_insert_context = Context::createCopy(insert_context); + view_insert_context->setSetting("insert_deduplication_token", insert_deduplication_token); + } + if (auto * materialized_view = dynamic_cast(view.get())) { auto lock = materialized_view->tryLockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout); @@ -368,8 +411,25 @@ Chain buildPushingToViewsChain( insert_columns.emplace_back(column.name); } - InterpreterInsertQuery interpreter(nullptr, insert_context, false, false, false); + InterpreterInsertQuery interpreter(nullptr, view_insert_context, false, false, false); out = interpreter.buildChain(inner_table, inner_metadata_snapshot, insert_columns, thread_status_holder, view_counter_ms); + + if (interpreter.shouldAddSquashingFroStorage(inner_table)) + { + bool table_prefers_large_blocks = inner_table->prefersLargeBlocks(); + const auto & settings = view_insert_context->getSettingsRef(); + + out.addSource(std::make_shared( + out.getInputHeader(), + table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size, + table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL)); + } + + auto counting = std::make_shared(out.getInputHeader(), current_thread, view_insert_context->getQuota()); + counting->setProcessListElement(view_insert_context->getProcessListElement()); + counting->setProgressCallback(view_insert_context->getProgressCallback()); + out.addSource(std::move(counting)); + out.addStorageHolder(view); out.addStorageHolder(inner_table); } @@ -378,7 +438,7 @@ Chain buildPushingToViewsChain( runtime_stats->type = QueryViewsLogElement::ViewType::LIVE; query = live_view->getInnerQuery(); // Used only to log in system.query_views_log out = buildPushingToViewsChain( - view, view_metadata_snapshot, insert_context, ASTPtr(), + view, view_metadata_snapshot, view_insert_context, ASTPtr(), /* no_destination= */ true, thread_status_holder, running_group, view_counter_ms, async_insert, storage_header); } @@ -387,13 +447,13 @@ Chain buildPushingToViewsChain( runtime_stats->type = QueryViewsLogElement::ViewType::WINDOW; query = window_view->getMergeableQuery(); // Used only to log in system.query_views_log out = buildPushingToViewsChain( - view, view_metadata_snapshot, insert_context, ASTPtr(), + view, view_metadata_snapshot, view_insert_context, ASTPtr(), /* no_destination= */ true, thread_status_holder, running_group, view_counter_ms, async_insert); } else out = buildPushingToViewsChain( - view, view_metadata_snapshot, insert_context, ASTPtr(), + view, view_metadata_snapshot, view_insert_context, ASTPtr(), /* no_destination= */ false, thread_status_holder, running_group, view_counter_ms, async_insert); diff --git a/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp b/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp index 6c7c7447070..8a13973b970 100644 --- a/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp +++ b/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp @@ -37,7 +37,7 @@ InputFormatPtr getInputFormatFromASTInsertQuery( const auto * ast_insert_query = ast->as(); if (!ast_insert_query) - throw Exception(ErrorCodes::LOGICAL_ERROR, 
"Logical error: query requires data to insert, but it is not INSERT query"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Query requires data to insert, but it is not INSERT query"); if (ast_insert_query->infile && context->getApplicationType() == Context::ApplicationType::SERVER) throw Exception(ErrorCodes::UNKNOWN_TYPE_OF_QUERY, "Query has infile and was send directly to server"); @@ -47,7 +47,7 @@ InputFormatPtr getInputFormatFromASTInsertQuery( if (input_function) throw Exception(ErrorCodes::INVALID_USAGE_OF_INPUT, "FORMAT must be specified for function input()"); else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: INSERT query requires format to be set"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "INSERT query requires format to be set"); } /// Data could be in parsed (ast_insert_query.data) and in not parsed yet (input_buffer_tail_part) part of query. @@ -105,7 +105,7 @@ std::unique_ptr getReadBufferFromASTInsertQuery(const ASTPtr & ast) { const auto * insert_query = ast->as(); if (!insert_query) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: query requires data to insert, but it is not INSERT query"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Query requires data to insert, but it is not INSERT query"); if (insert_query->infile) { diff --git a/src/QueryPipeline/ExecutionSpeedLimits.cpp b/src/QueryPipeline/ExecutionSpeedLimits.cpp index f8ae4c76d0f..05fd394db77 100644 --- a/src/QueryPipeline/ExecutionSpeedLimits.cpp +++ b/src/QueryPipeline/ExecutionSpeedLimits.cpp @@ -113,7 +113,7 @@ static bool handleOverflowMode(OverflowMode mode, int code, FormatStringHelper #include -#include "Core/Protocol.h" +#include #include #include #include @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ namespace ProfileEvents extern const Event SuspendSendingQueryToShard; extern const Event ReadTaskRequestsReceived; extern const Event MergeTreeReadTaskRequestsReceived; + extern const Event ParallelReplicasAvailableCount; } namespace DB @@ -62,11 +64,65 @@ RemoteQueryExecutor::RemoteQueryExecutor( { } +RemoteQueryExecutor::RemoteQueryExecutor( + ConnectionPoolPtr pool, + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler, + const Scalars & scalars_, + const Tables & external_tables_, + QueryProcessingStage::Enum stage_, + std::optional extension_) + : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_) +{ + create_connections = [this, pool, throttler, extension_](AsyncCallback) + { + const Settings & current_settings = context->getSettingsRef(); + auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); + + ConnectionPoolWithFailover::TryResult result; + std::string fail_message; + if (main_table) + { + auto table_name = main_table.getQualifiedName(); + + ConnectionEstablisher connection_establisher(pool, &timeouts, current_settings, log, &table_name); + connection_establisher.run(result, fail_message); + } + else + { + ConnectionEstablisher connection_establisher(pool, &timeouts, current_settings, log, nullptr); + connection_establisher.run(result, fail_message); + } + + std::vector connection_entries; + if (!result.entry.isNull() && result.is_usable) + { + if (extension_ && extension_->parallel_reading_coordinator) + ProfileEvents::increment(ProfileEvents::ParallelReplicasAvailableCount); + + connection_entries.emplace_back(std::move(result.entry)); + } + + auto res = 
std::make_unique(std::move(connection_entries), current_settings, throttler); + if (extension_ && extension_->replica_info) + res->setReplicaInfo(*extension_->replica_info); + + return res; + }; +} + RemoteQueryExecutor::RemoteQueryExecutor( Connection & connection, - const String & query_, const Block & header_, ContextPtr context_, - ThrottlerPtr throttler, const Scalars & scalars_, const Tables & external_tables_, - QueryProcessingStage::Enum stage_, std::optional extension_) + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler, + const Scalars & scalars_, + const Tables & external_tables_, + QueryProcessingStage::Enum stage_, + std::optional extension_) : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_) { create_connections = [this, &connection, throttler, extension_](AsyncCallback) @@ -80,9 +136,14 @@ RemoteQueryExecutor::RemoteQueryExecutor( RemoteQueryExecutor::RemoteQueryExecutor( std::shared_ptr connection_ptr, - const String & query_, const Block & header_, ContextPtr context_, - ThrottlerPtr throttler, const Scalars & scalars_, const Tables & external_tables_, - QueryProcessingStage::Enum stage_, std::optional extension_) + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler, + const Scalars & scalars_, + const Tables & external_tables_, + QueryProcessingStage::Enum stage_, + std::optional extension_) : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_) { create_connections = [this, connection_ptr, throttler, extension_](AsyncCallback) @@ -96,12 +157,18 @@ RemoteQueryExecutor::RemoteQueryExecutor( RemoteQueryExecutor::RemoteQueryExecutor( std::vector && connections_, - const String & query_, const Block & header_, ContextPtr context_, - const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, - QueryProcessingStage::Enum stage_, std::optional extension_) + const String & query_, + const Block & header_, + ContextPtr context_, + const ThrottlerPtr & throttler, + const Scalars & scalars_, + const Tables & external_tables_, + QueryProcessingStage::Enum stage_, + std::optional extension_) : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_) { - create_connections = [this, connections_, throttler, extension_](AsyncCallback) mutable { + create_connections = [this, connections_, throttler, extension_](AsyncCallback) mutable + { auto res = std::make_unique(std::move(connections_), context->getSettingsRef(), throttler); if (extension_ && extension_->replica_info) res->setReplicaInfo(*extension_->replica_info); @@ -182,7 +249,19 @@ RemoteQueryExecutor::~RemoteQueryExecutor() { /// Set was_cancelled, so the query won't be sent after creating connections. was_cancelled = true; - read_context->cancel(); + + /// Cancellation may throw (i.e. some timeout), and in case of pipeline + /// had not been properly created properly (EXCEPTION_BEFORE_START) + /// cancel will not be sent, so cancellation will be done from dtor and + /// will throw. + try + { + read_context->cancel(); + } + catch (...) + { + tryLogCurrentException(log ? log : getLogger("RemoteQueryExecutor")); + } } /** If interrupted in the middle of the loop of communication with replicas, then interrupt @@ -190,7 +269,17 @@ RemoteQueryExecutor::~RemoteQueryExecutor() * these connections did not remain hanging in the out-of-sync state. 
*/ if (established || (isQueryPending() && connections)) - connections->disconnect(); + { + /// May also throw (so as cancel() above) + try + { + connections->disconnect(); + } + catch (...) + { + tryLogCurrentException(log ? log : getLogger("RemoteQueryExecutor")); + } + } } /** If we receive a block with slightly different column types, or with excessive columns, diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index 444f1258f3e..6b1539bd08e 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -50,29 +50,55 @@ public: std::shared_ptr task_iterator = nullptr; std::shared_ptr parallel_reading_coordinator = nullptr; std::optional replica_info = {}; - GetPriorityForLoadBalancing::Func priority_func; }; + /// Takes a connection pool for a node (not cluster) + RemoteQueryExecutor( + ConnectionPoolPtr pool, + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler = nullptr, + const Scalars & scalars_ = Scalars(), + const Tables & external_tables_ = Tables(), + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, + std::optional extension_ = std::nullopt); + /// Takes already set connection. RemoteQueryExecutor( Connection & connection, - const String & query_, const Block & header_, ContextPtr context_, - ThrottlerPtr throttler_ = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::optional extension_ = std::nullopt); + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler_ = nullptr, + const Scalars & scalars_ = Scalars(), + const Tables & external_tables_ = Tables(), + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, + std::optional extension_ = std::nullopt); /// Takes already set connection. RemoteQueryExecutor( std::shared_ptr connection, - const String & query_, const Block & header_, ContextPtr context_, - ThrottlerPtr throttler_ = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::optional extension_ = std::nullopt); + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler_ = nullptr, + const Scalars & scalars_ = Scalars(), + const Tables & external_tables_ = Tables(), + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, + std::optional extension_ = std::nullopt); /// Accepts several connections already taken from pool. RemoteQueryExecutor( std::vector && connections_, - const String & query_, const Block & header_, ContextPtr context_, - const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::optional extension_ = std::nullopt); + const String & query_, + const Block & header_, + ContextPtr context_, + const ThrottlerPtr & throttler = nullptr, + const Scalars & scalars_ = Scalars(), + const Tables & external_tables_ = Tables(), + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, + std::optional extension_ = std::nullopt); /// Takes a pool and gets one or several connections from it. 
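For orientation, a minimal sketch of how the new single-pool constructor above is meant to be used from ReadFromParallelRemoteReplicasStep::addPipeForSingeReplica (names such as query_string, coordinator and replica_info are taken from the call site earlier in this patch; the rest is assumed surrounding context, not part of the change):

    // One RemoteQueryExecutor per replica, each built from that replica's own
    // ConnectionPoolPtr rather than from a cluster-wide pool-with-failover.
    auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
        pool,                     // ConnectionPoolPtr for a single node, not a cluster
        query_string,
        output_stream->header,
        context,
        throttler,
        scalars,
        external_tables,
        stage,
        RemoteQueryExecutor::Extension{
            .parallel_reading_coordinator = coordinator,
            .replica_info = std::move(replica_info)});
    remote_query_executor->setLogger(log);

Internally, the pool variant establishes a single connection through ConnectionEstablisher and, when a parallel reading coordinator is attached and the replica turns out to be usable, increments ProfileEvents::ParallelReplicasAvailableCount.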
RemoteQueryExecutor( diff --git a/src/QueryPipeline/RemoteQueryExecutorReadContext.h b/src/QueryPipeline/RemoteQueryExecutorReadContext.h index adfc0c5eacf..50df7e2db35 100644 --- a/src/QueryPipeline/RemoteQueryExecutorReadContext.h +++ b/src/QueryPipeline/RemoteQueryExecutorReadContext.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Server/CloudPlacementInfo.cpp b/src/Server/CloudPlacementInfo.cpp new file mode 100644 index 00000000000..0790f825a45 --- /dev/null +++ b/src/Server/CloudPlacementInfo.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace PlacementInfo +{ + +namespace +{ + std::string getConfigPath(std::string_view path) + { + return fmt::format("{}.{}", PLACEMENT_CONFIG_PREFIX, path); + } + + String loadAvailabilityZoneFromFile(const Poco::Util::AbstractConfiguration & config) + { + auto az_file = config.getString(getConfigPath("availability_zone_from_file"), DEFAULT_AZ_FILE_PATH); + + if (!std::filesystem::exists(az_file)) + return ""; + + String availability_zone_from_file; + + ReadBufferFromFile in(az_file); + readStringUntilEOF(availability_zone_from_file, in); + Poco::trimInPlace(availability_zone_from_file); + + return availability_zone_from_file; + } +} + + +PlacementInfo & PlacementInfo::instance() +{ + static PlacementInfo instance; + return instance; +} + +void PlacementInfo::initialize(const Poco::Util::AbstractConfiguration & config) +{ + use_imds = config.getBool(getConfigPath("use_imds"), false); + + if (use_imds) + { + availability_zone = S3::getRunningAvailabilityZone(); + } + else + { + availability_zone = config.getString(getConfigPath("availability_zone"), ""); + + if (availability_zone.empty()) + availability_zone = loadAvailabilityZoneFromFile(config); + + if (availability_zone.empty()) + LOG_WARNING(log, "Availability zone info not found"); + } + + LOG_DEBUG(log, "Loaded info: availability_zone: {}", availability_zone); + initialized = true; +} + +std::string PlacementInfo::getAvailabilityZone() const +{ + if (!initialized) + { + LOG_WARNING(log, "Placement info has not been loaded"); + return ""; + } + + return availability_zone; +} + +} +} diff --git a/src/Server/CloudPlacementInfo.h b/src/Server/CloudPlacementInfo.h new file mode 100644 index 00000000000..407f668142f --- /dev/null +++ b/src/Server/CloudPlacementInfo.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +namespace PlacementInfo +{ + +static constexpr auto PLACEMENT_CONFIG_PREFIX = "placement"; +static constexpr auto DEFAULT_AZ_FILE_PATH = "/run/instance-metadata/node-zone"; + +/// A singleton providing information on where in cloud server is running. 
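A usage sketch for the PlacementInfo singleton declared below; the exact place where the server calls initialize() is not shown in this hunk, so that call site is an assumption here:

    // Expected resolution order, per CloudPlacementInfo.cpp above:
    //   placement.use_imds = true -> ask the cloud metadata service (S3::getRunningAvailabilityZone())
    //   otherwise                 -> placement.availability_zone from config,
    //                                falling back to the file named by
    //                                placement.availability_zone_from_file
    //                                (default /run/instance-metadata/node-zone)
    using DB::PlacementInfo::PlacementInfo;
    PlacementInfo::instance().initialize(config);            // config: Poco::Util::AbstractConfiguration
    std::string az = PlacementInfo::instance().getAvailabilityZone();  // returns "" and logs a warning if not initialized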
+class PlacementInfo : private boost::noncopyable +{ +public: + static PlacementInfo & instance(); + + void initialize(const Poco::Util::AbstractConfiguration & config); + + std::string getAvailabilityZone() const; + +private: + PlacementInfo() = default; + + LoggerPtr log = getLogger("CloudPlacementInfo"); + + bool initialized; + + bool use_imds; + std::string availability_zone; +}; + +} +} diff --git a/src/Server/GRPCServer.cpp b/src/Server/GRPCServer.cpp index f31a8d6feb5..15765f99b4b 100644 --- a/src/Server/GRPCServer.cpp +++ b/src/Server/GRPCServer.cpp @@ -76,7 +76,7 @@ namespace static std::once_flag once_flag; std::call_once(once_flag, [&config] { - static LoggerPtr logger = getLogger("grpc"); + static LoggerRawPtr logger = getRawLogger("grpc"); gpr_set_log_function([](gpr_log_func_args* args) { if (args->severity == GPR_LOG_SEVERITY_DEBUG) @@ -419,7 +419,11 @@ namespace void read(GRPCQueryInfo & query_info_, const CompletionCallback & callback) override { if (!query_info.has_value()) + { callback(false); + return; + } + query_info_ = std::move(query_info).value(); query_info.reset(); callback(true); @@ -486,7 +490,11 @@ namespace void read(GRPCQueryInfo & query_info_, const CompletionCallback & callback) override { if (!query_info.has_value()) + { callback(false); + return; + } + query_info_ = std::move(query_info).value(); query_info.reset(); callback(true); @@ -614,7 +622,7 @@ namespace class Call { public: - Call(CallType call_type_, std::unique_ptr responder_, IServer & iserver_, LoggerPtr log_); + Call(CallType call_type_, std::unique_ptr responder_, IServer & iserver_, LoggerRawPtr log_); ~Call(); void start(const std::function & on_finish_call_callback); @@ -656,7 +664,7 @@ namespace const CallType call_type; std::unique_ptr responder; IServer & iserver; - LoggerPtr log = nullptr; + LoggerRawPtr log = nullptr; std::optional session; ContextMutablePtr query_context; @@ -718,7 +726,7 @@ namespace }; // NOLINTEND(clang-analyzer-optin.performance.Padding) - Call::Call(CallType call_type_, std::unique_ptr responder_, IServer & iserver_, LoggerPtr log_) + Call::Call(CallType call_type_, std::unique_ptr responder_, IServer & iserver_, LoggerRawPtr log_) : call_type(call_type_), responder(std::move(responder_)), iserver(iserver_), log(log_) { } @@ -1843,7 +1851,7 @@ private: GRPCServer::GRPCServer(IServer & iserver_, const Poco::Net::SocketAddress & address_to_listen_) : iserver(iserver_) , address_to_listen(address_to_listen_) - , log(getLogger("GRPCServer")) + , log(getRawLogger("GRPCServer")) , runner(std::make_unique(*this)) {} diff --git a/src/Server/GRPCServer.h b/src/Server/GRPCServer.h index a9c8161298f..70c16d3e9af 100644 --- a/src/Server/GRPCServer.h +++ b/src/Server/GRPCServer.h @@ -3,10 +3,11 @@ #include "config.h" #if USE_GRPC + +#include "clickhouse_grpc.grpc.pb.h" #include #include #include -#include "clickhouse_grpc.grpc.pb.h" namespace Poco { class Logger; } @@ -47,7 +48,7 @@ private: IServer & iserver; const Poco::Net::SocketAddress address_to_listen; - LoggerPtr log; + LoggerRawPtr log; GRPCService grpc_service; std::unique_ptr grpc_server; std::unique_ptr queue; diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 72e7c5552f8..35a95c0534d 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -125,7 +125,7 @@ namespace ErrorCodes namespace { -bool tryAddHttpOptionHeadersFromConfig(HTTPServerResponse & response, const Poco::Util::LayeredConfiguration & config) +bool tryAddHTTPOptionHeadersFromConfig(HTTPServerResponse & 
response, const Poco::Util::LayeredConfiguration & config) { if (config.has("http_options_response")) { @@ -153,7 +153,7 @@ bool tryAddHttpOptionHeadersFromConfig(HTTPServerResponse & response, const Poco void processOptionsRequest(HTTPServerResponse & response, const Poco::Util::LayeredConfiguration & config) { /// If can add some headers from config - if (tryAddHttpOptionHeadersFromConfig(response, config)) + if (tryAddHTTPOptionHeadersFromConfig(response, config)) { response.setKeepAlive(false); response.setStatusAndReason(HTTPResponse::HTTP_NO_CONTENT); @@ -496,7 +496,7 @@ bool HTTPHandler::authenticateUser( else if (request.getMethod() == HTTPServerRequest::HTTP_POST) http_method = ClientInfo::HTTPMethod::POST; - session->setHttpClientInfo(http_method, request.get("User-Agent", ""), request.get("Referer", "")); + session->setHTTPClientInfo(http_method, request.get("User-Agent", ""), request.get("Referer", "")); session->setForwardedFor(request.get("X-Forwarded-For", "")); session->setQuotaClientKey(quota_key); @@ -1065,7 +1065,7 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse response.set("X-ClickHouse-Server-Display-Name", server_display_name); if (!request.get("Origin", "").empty()) - tryAddHttpOptionHeadersFromConfig(response, server.config()); + tryAddHTTPOptionHeadersFromConfig(response, server.config()); /// For keep-alive to work. if (request.getVersion() == HTTPServerRequest::HTTP_1_1) diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp index 66b55f68217..9e4a440ddb2 100644 --- a/src/Server/HTTPHandlerFactory.cpp +++ b/src/Server/HTTPHandlerFactory.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -7,6 +8,7 @@ #include #include "HTTPHandler.h" +#include "Server/PrometheusMetricsWriter.h" #include "StaticRequestHandler.h" #include "ReplicasStatusHandler.h" #include "InterserverIOHTTPHandler.h" @@ -113,9 +115,12 @@ HTTPRequestHandlerFactoryPtr createHandlerFactory(IServer & server, const Poco:: else if (name == "InterserverIOHTTPHandler-factory" || name == "InterserverIOHTTPSHandler-factory") return createInterserverHTTPHandlerFactory(server, name); else if (name == "PrometheusHandler-factory") - return createPrometheusMainHandlerFactory(server, config, async_metrics, name); + { + auto metrics_writer = std::make_shared(config, "prometheus", async_metrics); + return createPrometheusMainHandlerFactory(server, config, metrics_writer, name); + } - throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: Unknown HTTP handler factory name."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown HTTP handler factory name."); } @@ -208,7 +213,7 @@ void addDefaultHandlersFactory( /// Otherwise it will be created separately, see createHandlerFactory(...). 
if (config.has("prometheus") && config.getInt("prometheus.port", 0) == 0) { - PrometheusMetricsWriter writer(config, "prometheus", async_metrics); + auto writer = std::make_shared(config, "prometheus", async_metrics); auto creator = [&server, writer] () -> std::unique_ptr { return std::make_unique(server, writer); diff --git a/src/Server/HTTPHandlerFactory.h b/src/Server/HTTPHandlerFactory.h index 94b02e52277..427d495f659 100644 --- a/src/Server/HTTPHandlerFactory.h +++ b/src/Server/HTTPHandlerFactory.h @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -130,10 +131,10 @@ createPrometheusHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & config_prefix); -HTTPRequestHandlerFactoryPtr -createPrometheusMainHandlerFactory(IServer & server, +HTTPRequestHandlerFactoryPtr createPrometheusMainHandlerFactory( + IServer & server, const Poco::Util::AbstractConfiguration & config, - AsynchronousMetrics & async_metrics, + PrometheusMetricsWriterPtr metrics_writer, const std::string & name); /// @param server - used in handlers to check IServer::isCancelled() diff --git a/src/Server/MySQLHandler.cpp b/src/Server/MySQLHandler.cpp index c159a09c874..72fe3b7cea9 100644 --- a/src/Server/MySQLHandler.cpp +++ b/src/Server/MySQLHandler.cpp @@ -57,16 +57,109 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; } - static const size_t PACKET_HEADER_SIZE = 4; static const size_t SSL_REQUEST_PAYLOAD_SIZE = 32; -static String showWarningsReplacementQuery(const String & query); -static String showCountWarningsReplacementQuery(const String & query); -static String selectEmptyReplacementQuery(const String & query); -static String showTableStatusReplacementQuery(const String & query); -static String killConnectionIdReplacementQuery(const String & query); -static String selectLimitReplacementQuery(const String & query); +static bool checkShouldReplaceQuery(const String & query, const String & prefix) +{ + return query.length() >= prefix.length() + && std::equal(prefix.begin(), prefix.end(), query.begin(), [](char a, char b) { return std::tolower(a) == std::tolower(b); }); +} + +static bool isFederatedServerSetupSetCommand(const String & query) +{ + re2::RE2::Options regexp_options; + regexp_options.set_case_sensitive(false); + static const re2::RE2 expr( + "(^(SET NAMES(.*)))" + "|(^(SET character_set_results(.*)))" + "|(^(SET FOREIGN_KEY_CHECKS(.*)))" + "|(^(SET AUTOCOMMIT(.*)))" + "|(^(SET sql_mode(.*)))" + "|(^(SET @@(.*)))" + "|(^(SET SESSION TRANSACTION ISOLATION LEVEL(.*)))", regexp_options); + assert(expr.ok()); + return re2::RE2::FullMatch(query, expr); +} + +/// Always return an empty set with appropriate column definitions for SHOW WARNINGS queries +/// See also: https://dev.mysql.com/doc/refman/8.0/en/show-warnings.html +static String showWarningsReplacementQuery([[maybe_unused]] const String & query) +{ + return "SELECT '' AS Level, 0::UInt32 AS Code, '' AS Message WHERE false"; +} + +static String showCountWarningsReplacementQuery([[maybe_unused]] const String & query) +{ + return "SELECT 0::UInt64 AS `@@session.warning_count`"; +} + +/// Replace "[query(such as SHOW VARIABLES...)]" into "". +static String selectEmptyReplacementQuery(const String & query) +{ + std::ignore = query; + return "select ''"; +} + +/// Replace "SHOW TABLE STATUS LIKE 'xx'" into "SELECT ... FROM system.tables WHERE name LIKE 'xx'". 
+static String showTableStatusReplacementQuery(const String & query) +{ + const String prefix = "SHOW TABLE STATUS LIKE "; + if (query.size() > prefix.size()) + { + String suffix = query.data() + prefix.length(); + return ( + "SELECT" + " name AS Name," + " engine AS Engine," + " '10' AS Version," + " 'Dynamic' AS Row_format," + " 0 AS Rows," + " 0 AS Avg_row_length," + " 0 AS Data_length," + " 0 AS Max_data_length," + " 0 AS Index_length," + " 0 AS Data_free," + " 'NULL' AS Auto_increment," + " metadata_modification_time AS Create_time," + " metadata_modification_time AS Update_time," + " metadata_modification_time AS Check_time," + " 'utf8_bin' AS Collation," + " 'NULL' AS Checksum," + " '' AS Create_options," + " '' AS Comment" + " FROM system.tables" + " WHERE name LIKE " + + suffix); + } + return query; +} + +static std::optional setSettingReplacementQuery(const String & query, const String & mysql_setting, const String & clickhouse_setting) +{ + const String prefix = "SET " + mysql_setting; + // if (query.length() >= prefix.length() && boost::iequals(std::string_view(prefix), std::string_view(query.data(), 3))) + if (checkShouldReplaceQuery(query, prefix)) + return "SET " + clickhouse_setting + String(query.data() + prefix.length()); + return std::nullopt; +} + +/// Replace "KILL QUERY [connection_id]" into "KILL QUERY WHERE query_id LIKE 'mysql:[connection_id]:xxx'". +static String killConnectionIdReplacementQuery(const String & query) +{ + const String prefix = "KILL QUERY "; + if (query.size() > prefix.size()) + { + String suffix = query.data() + prefix.length(); + static const re2::RE2 expr("^[0-9]"); + if (re2::RE2::FullMatch(suffix, expr)) + { + String replacement = fmt::format("KILL QUERY WHERE query_id LIKE 'mysql:{}:%'", suffix); + return replacement; + } + } + return query; +} MySQLHandler::MySQLHandler( IServer & server_, @@ -88,12 +181,14 @@ MySQLHandler::MySQLHandler( if (ssl_enabled) server_capabilities |= CLIENT_SSL; - replacements.emplace("SHOW WARNINGS", showWarningsReplacementQuery); - replacements.emplace("SHOW COUNT(*) WARNINGS", showCountWarningsReplacementQuery); - replacements.emplace("KILL QUERY", killConnectionIdReplacementQuery); - replacements.emplace("SHOW TABLE STATUS LIKE", showTableStatusReplacementQuery); - replacements.emplace("SHOW VARIABLES", selectEmptyReplacementQuery); - replacements.emplace("SET SQL_SELECT_LIMIT", selectLimitReplacementQuery); + queries_replacements.emplace("SHOW WARNINGS", showWarningsReplacementQuery); + queries_replacements.emplace("SHOW COUNT(*) WARNINGS", showCountWarningsReplacementQuery); + queries_replacements.emplace("KILL QUERY", killConnectionIdReplacementQuery); + queries_replacements.emplace("SHOW TABLE STATUS LIKE", showTableStatusReplacementQuery); + queries_replacements.emplace("SHOW VARIABLES", selectEmptyReplacementQuery); + settings_replacements.emplace("SQL_SELECT_LIMIT", "limit"); + settings_replacements.emplace("NET_WRITE_TIMEOUT", "send_timeout"); + settings_replacements.emplace("NET_READ_TIMEOUT", "receive_timeout"); } void MySQLHandler::run() @@ -324,8 +419,6 @@ void MySQLHandler::comPing() packet_endpoint->sendPacket(OKPacket(0x0, client_capabilities, 0, 0, 0), true); } -static bool isFederatedServerSetupSetCommand(const String & query); - void MySQLHandler::comQuery(ReadBuffer & payload, bool binary_protocol) { String query = String(payload.position(), payload.buffer().end()); @@ -342,17 +435,29 @@ void MySQLHandler::comQuery(ReadBuffer & payload, bool binary_protocol) bool should_replace = false; bool 
with_output = false; - for (auto const & x : replacements) + // Queries replacements + for (auto const & [query_to_replace, replacement_fn] : queries_replacements) { - if (0 == strncasecmp(x.first.c_str(), query.c_str(), x.first.size())) + if (checkShouldReplaceQuery(query, query_to_replace)) { should_replace = true; - replacement_query = x.second(query); + replacement_query = replacement_fn(query); break; } } - ReadBufferFromString replacement(replacement_query); + // Settings replacements + if (!should_replace) + for (auto const & [mysql_setting, clickhouse_setting] : settings_replacements) + { + const auto replacement_query_opt = setSettingReplacementQuery(query, mysql_setting, clickhouse_setting); + if (replacement_query_opt.has_value()) + { + should_replace = true; + replacement_query = replacement_query_opt.value(); + break; + } + } auto query_context = session->makeQueryContext(); query_context->setCurrentQueryId(fmt::format("mysql:{}:{}", connection_id, toString(UUIDHelpers::generateV4()))); @@ -385,7 +490,14 @@ void MySQLHandler::comQuery(ReadBuffer & payload, bool binary_protocol) } }; - executeQuery(should_replace ? replacement : payload, *out, false, query_context, set_result_details, QueryFlags{}, format_settings); + if (should_replace) + { + ReadBufferFromString replacement(replacement_query); + executeQuery(replacement, *out, false, query_context, set_result_details, QueryFlags{}, format_settings); + } + else + executeQuery(payload, *out, false, query_context, set_result_details, QueryFlags{}, format_settings); + if (!with_output) packet_endpoint->sendPacket(OKPacket(0x00, client_capabilities, affected_rows, 0, 0), true); @@ -531,99 +643,4 @@ void MySQLHandlerSSL::finishHandshakeSSL( } #endif - -static bool isFederatedServerSetupSetCommand(const String & query) -{ - re2::RE2::Options regexp_options; - regexp_options.set_case_sensitive(false); - static const re2::RE2 expr( - "(^(SET NAMES(.*)))" - "|(^(SET character_set_results(.*)))" - "|(^(SET FOREIGN_KEY_CHECKS(.*)))" - "|(^(SET AUTOCOMMIT(.*)))" - "|(^(SET sql_mode(.*)))" - "|(^(SET @@(.*)))" - "|(^(SET SESSION TRANSACTION ISOLATION LEVEL(.*)))", regexp_options); - assert(expr.ok()); - return re2::RE2::FullMatch(query, expr); -} - -/// Always return an empty set with appropriate column definitions for SHOW WARNINGS queries -/// See also: https://dev.mysql.com/doc/refman/8.0/en/show-warnings.html -static String showWarningsReplacementQuery([[maybe_unused]] const String & query) -{ - return "SELECT '' AS Level, 0::UInt32 AS Code, '' AS Message WHERE false"; -} - -static String showCountWarningsReplacementQuery([[maybe_unused]] const String & query) -{ - return "SELECT 0::UInt64 AS `@@session.warning_count`"; -} - -/// Replace "[query(such as SHOW VARIABLES...)]" into "". -static String selectEmptyReplacementQuery(const String & query) -{ - std::ignore = query; - return "select ''"; -} - -/// Replace "SHOW TABLE STATUS LIKE 'xx'" into "SELECT ... FROM system.tables WHERE name LIKE 'xx'". 
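
In the comQuery() path shown above, replacement now happens in two stages: whole-query rewrites keyed by a case-insensitive prefix (queries_replacements), and a fallback that only renames MySQL settings to their ClickHouse counterparts (settings_replacements). The patch's checkShouldReplaceQuery() helper is not visible in this hunk, so the sketch below substitutes a plausible case-insensitive prefix test; everything else is a simplified stand-in for illustration, not the handler's actual code.

```cpp
// Minimal sketch of the two-stage replacement dispatch used in comQuery().
// checkShouldReplaceQuery() is assumed here to be a case-insensitive prefix test.
#include <strings.h>   // strncasecmp (POSIX)
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>

using QueryReplacementFn = std::function<std::string(const std::string &)>;

static bool checkShouldReplaceQuery(const std::string & query, const std::string & prefix)
{
    return query.size() >= prefix.size()
        && 0 == strncasecmp(prefix.c_str(), query.c_str(), prefix.size());
}

static std::optional<std::string> setSettingReplacement(
    const std::string & query, const std::string & mysql_setting, const std::string & clickhouse_setting)
{
    const std::string prefix = "SET " + mysql_setting;
    if (checkShouldReplaceQuery(query, prefix))
        return "SET " + clickhouse_setting + query.substr(prefix.size());
    return std::nullopt;
}

int main()
{
    std::unordered_map<std::string, QueryReplacementFn> queries_replacements
        = {{"SHOW WARNINGS", [](const std::string &) { return std::string("SELECT '' AS Level WHERE false"); }}};
    std::unordered_map<std::string, std::string> settings_replacements
        = {{"SQL_SELECT_LIMIT", "limit"}, {"NET_READ_TIMEOUT", "receive_timeout"}};

    std::string query = "SET NET_READ_TIMEOUT = 30";
    std::string replacement = query;

    bool replaced = false;
    for (const auto & [prefix, fn] : queries_replacements)               /// 1) whole-query rewrites
        if (checkShouldReplaceQuery(query, prefix))
        {
            replacement = fn(query);
            replaced = true;
            break;
        }

    if (!replaced)
        for (const auto & [mysql_name, ch_name] : settings_replacements) /// 2) per-setting renames
            if (auto rewritten = setSettingReplacement(query, mysql_name, ch_name))
            {
                replacement = *rewritten;
                break;
            }

    std::cout << replacement << '\n';   /// prints "SET receive_timeout = 30"
}
```
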
-static String showTableStatusReplacementQuery(const String & query) -{ - const String prefix = "SHOW TABLE STATUS LIKE "; - if (query.size() > prefix.size()) - { - String suffix = query.data() + prefix.length(); - return ( - "SELECT" - " name AS Name," - " engine AS Engine," - " '10' AS Version," - " 'Dynamic' AS Row_format," - " 0 AS Rows," - " 0 AS Avg_row_length," - " 0 AS Data_length," - " 0 AS Max_data_length," - " 0 AS Index_length," - " 0 AS Data_free," - " 'NULL' AS Auto_increment," - " metadata_modification_time AS Create_time," - " metadata_modification_time AS Update_time," - " metadata_modification_time AS Check_time," - " 'utf8_bin' AS Collation," - " 'NULL' AS Checksum," - " '' AS Create_options," - " '' AS Comment" - " FROM system.tables" - " WHERE name LIKE " - + suffix); - } - return query; -} - -static String selectLimitReplacementQuery(const String & query) -{ - const String prefix = "SET SQL_SELECT_LIMIT"; - if (query.starts_with(prefix)) - return "SET limit" + std::string(query.data() + prefix.length()); - return query; -} - -/// Replace "KILL QUERY [connection_id]" into "KILL QUERY WHERE query_id LIKE 'mysql:[connection_id]:xxx'". -static String killConnectionIdReplacementQuery(const String & query) -{ - const String prefix = "KILL QUERY "; - if (query.size() > prefix.size()) - { - String suffix = query.data() + prefix.length(); - static const re2::RE2 expr("^[0-9]"); - if (re2::RE2::FullMatch(suffix, expr)) - { - String replacement = fmt::format("KILL QUERY WHERE query_id LIKE 'mysql:{}:%'", suffix); - return replacement; - } - } - return query; -} - } diff --git a/src/Server/MySQLHandler.h b/src/Server/MySQLHandler.h index 867a90a6205..2deb2b8f435 100644 --- a/src/Server/MySQLHandler.h +++ b/src/Server/MySQLHandler.h @@ -92,9 +92,13 @@ protected: MySQLProtocol::PacketEndpointPtr packet_endpoint; std::unique_ptr session; - using ReplacementFn = std::function; - using Replacements = std::unordered_map; - Replacements replacements; + using QueryReplacementFn = std::function; + using QueriesReplacements = std::unordered_map; + QueriesReplacements queries_replacements; + + /// MySQL setting name --> ClickHouse setting name + using SettingsReplacements = std::unordered_map; + SettingsReplacements settings_replacements; std::mutex prepared_statements_mutex; UInt32 current_prepared_statement_id TSA_GUARDED_BY(prepared_statements_mutex) = 0; diff --git a/src/Server/PrometheusMetricsWriter.cpp b/src/Server/PrometheusMetricsWriter.cpp index 3d09c2165e5..d0fdcd61493 100644 --- a/src/Server/PrometheusMetricsWriter.cpp +++ b/src/Server/PrometheusMetricsWriter.cpp @@ -4,6 +4,8 @@ #include #include +#include "config.h" + namespace { @@ -38,8 +40,83 @@ void convertHelpToSingleLine(std::string & help) std::replace(help.begin(), help.end(), '\n', ' '); } +constexpr auto profile_events_prefix = "ClickHouseProfileEvents_"; +constexpr auto current_metrics_prefix = "ClickHouseMetrics_"; +constexpr auto asynchronous_metrics_prefix = "ClickHouseAsyncMetrics_"; +constexpr auto error_metrics_prefix = "ClickHouseErrorMetric_"; + +void writeEvent(DB::WriteBuffer & wb, ProfileEvents::Event event) +{ + const auto counter = ProfileEvents::global_counters[event].load(std::memory_order_relaxed); + + std::string metric_name{ProfileEvents::getName(static_cast(event))}; + std::string metric_doc{ProfileEvents::getDocumentation(static_cast(event))}; + + convertHelpToSingleLine(metric_doc); + + if (!replaceInvalidChars(metric_name)) + return; + + std::string key{profile_events_prefix + metric_name}; + + 
writeOutLine(wb, "# HELP", key, metric_doc); + writeOutLine(wb, "# TYPE", key, "counter"); + writeOutLine(wb, key, counter); } +void writeMetric(DB::WriteBuffer & wb, size_t metric) +{ + const auto value = CurrentMetrics::values[metric].load(std::memory_order_relaxed); + + std::string metric_name{CurrentMetrics::getName(static_cast(metric))}; + std::string metric_doc{CurrentMetrics::getDocumentation(static_cast(metric))}; + + convertHelpToSingleLine(metric_doc); + + if (!replaceInvalidChars(metric_name)) + return; + + std::string key{current_metrics_prefix + metric_name}; + + writeOutLine(wb, "# HELP", key, metric_doc); + writeOutLine(wb, "# TYPE", key, "gauge"); + writeOutLine(wb, key, value); +} + +void writeAsyncMetrics(DB::WriteBuffer & wb, const DB::AsynchronousMetricValues & values) +{ + for (const auto & name_value : values) + { + std::string key{asynchronous_metrics_prefix + name_value.first}; + + if (!replaceInvalidChars(key)) + continue; + + auto value = name_value.second; + + std::string metric_doc{value.documentation}; + convertHelpToSingleLine(metric_doc); + + writeOutLine(wb, "# HELP", key, metric_doc); + writeOutLine(wb, "# TYPE", key, "gauge"); + writeOutLine(wb, key, value.value); + } +} + +} + +#if USE_NURAFT +namespace ProfileEvents +{ + extern const std::vector keeper_profile_events; +} + +namespace CurrentMetrics +{ + extern const std::vector keeper_metrics; +} +#endif + namespace DB { @@ -60,65 +137,17 @@ void PrometheusMetricsWriter::write(WriteBuffer & wb) const if (send_events) { for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i) - { - const auto counter = ProfileEvents::global_counters[i].load(std::memory_order_relaxed); - - std::string metric_name{ProfileEvents::getName(static_cast(i))}; - std::string metric_doc{ProfileEvents::getDocumentation(static_cast(i))}; - - convertHelpToSingleLine(metric_doc); - - if (!replaceInvalidChars(metric_name)) - continue; - std::string key{profile_events_prefix + metric_name}; - - writeOutLine(wb, "# HELP", key, metric_doc); - writeOutLine(wb, "# TYPE", key, "counter"); - writeOutLine(wb, key, counter); - } + writeEvent(wb, i); } if (send_metrics) { for (size_t i = 0, end = CurrentMetrics::end(); i < end; ++i) - { - const auto value = CurrentMetrics::values[i].load(std::memory_order_relaxed); - - std::string metric_name{CurrentMetrics::getName(static_cast(i))}; - std::string metric_doc{CurrentMetrics::getDocumentation(static_cast(i))}; - - convertHelpToSingleLine(metric_doc); - - if (!replaceInvalidChars(metric_name)) - continue; - std::string key{current_metrics_prefix + metric_name}; - - writeOutLine(wb, "# HELP", key, metric_doc); - writeOutLine(wb, "# TYPE", key, "gauge"); - writeOutLine(wb, key, value); - } + writeMetric(wb, i); } if (send_asynchronous_metrics) - { - auto async_metrics_values = async_metrics.getValues(); - for (const auto & name_value : async_metrics_values) - { - std::string key{asynchronous_metrics_prefix + name_value.first}; - - if (!replaceInvalidChars(key)) - continue; - - auto value = name_value.second; - - std::string metric_doc{value.documentation}; - convertHelpToSingleLine(metric_doc); - - writeOutLine(wb, "# HELP", key, metric_doc); - writeOutLine(wb, "# TYPE", key, "gauge"); - writeOutLine(wb, key, value.value); - } - } + writeAsyncMetrics(wb, async_metrics.getValues()); if (send_errors) { @@ -152,4 +181,24 @@ void PrometheusMetricsWriter::write(WriteBuffer & wb) const } +void KeeperPrometheusMetricsWriter::write([[maybe_unused]] WriteBuffer & wb) 
const +{ +#if USE_NURAFT + if (send_events) + { + for (auto event : ProfileEvents::keeper_profile_events) + writeEvent(wb, event); + } + + if (send_metrics) + { + for (auto metric : CurrentMetrics::keeper_metrics) + writeMetric(wb, metric); + } + + if (send_asynchronous_metrics) + writeAsyncMetrics(wb, async_metrics.getValues()); +#endif +} + } diff --git a/src/Server/PrometheusMetricsWriter.h b/src/Server/PrometheusMetricsWriter.h index b909a0ddcf6..933ad909ee0 100644 --- a/src/Server/PrometheusMetricsWriter.h +++ b/src/Server/PrometheusMetricsWriter.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -19,20 +20,25 @@ public: const Poco::Util::AbstractConfiguration & config, const std::string & config_name, const AsynchronousMetrics & async_metrics_); - void write(WriteBuffer & wb) const; + virtual void write(WriteBuffer & wb) const; -private: + virtual ~PrometheusMetricsWriter() = default; + +protected: const AsynchronousMetrics & async_metrics; - const bool send_events; const bool send_metrics; const bool send_asynchronous_metrics; const bool send_errors; - - static inline constexpr auto profile_events_prefix = "ClickHouseProfileEvents_"; - static inline constexpr auto current_metrics_prefix = "ClickHouseMetrics_"; - static inline constexpr auto asynchronous_metrics_prefix = "ClickHouseAsyncMetrics_"; - static inline constexpr auto error_metrics_prefix = "ClickHouseErrorMetric_"; }; +class KeeperPrometheusMetricsWriter : public PrometheusMetricsWriter +{ + using PrometheusMetricsWriter::PrometheusMetricsWriter; + + void write(WriteBuffer & wb) const override; +}; + +using PrometheusMetricsWriterPtr = std::shared_ptr; + } diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index 8690ec9121e..dff960f7031 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "Server/PrometheusMetricsWriter.h" #include @@ -34,7 +35,7 @@ void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPSe WriteBufferFromHTTPServerResponse wb(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout, write_event); try { - metrics_writer.write(wb); + metrics_writer->write(wb); wb.finalize(); } catch (...) 
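
The Prometheus change above pulls the per-metric formatting out of write() into file-local writeEvent/writeMetric/writeAsyncMetrics helpers, makes write() virtual, and adds a Keeper-only subclass that reuses the same helpers over a restricted metric set behind USE_NURAFT, with handlers now holding the writer through a shared_ptr. A schematic of that shape, using simplified stand-ins for WriteBuffer and the counter registries (the names below are illustrative, not ClickHouse's):

```cpp
// Schematic of the writer hierarchy: a base class exposes a virtual write(),
// shared helpers format individual metrics, and a Keeper-specific subclass
// reuses them over a smaller metric set. Types are simplified stand-ins.
#include <iostream>
#include <map>
#include <memory>
#include <ostream>
#include <string>
#include <vector>

static void writeMetricLine(std::ostream & out, const std::string & name, double value, const std::string & type)
{
    out << "# TYPE " << name << ' ' << type << '\n' << name << ' ' << value << '\n';
}

class MetricsWriter
{
public:
    explicit MetricsWriter(std::map<std::string, double> metrics_) : metrics(std::move(metrics_)) {}
    virtual ~MetricsWriter() = default;

    virtual void write(std::ostream & out) const
    {
        for (const auto & [name, value] : metrics)
            writeMetricLine(out, "ClickHouseMetrics_" + name, value, "gauge");
    }

protected:
    std::map<std::string, double> metrics;
};

class KeeperMetricsWriter : public MetricsWriter
{
public:
    using MetricsWriter::MetricsWriter;

    void write(std::ostream & out) const override
    {
        /// Only a curated subset is exported for Keeper.
        for (const auto & name : keeper_metrics)
            if (auto it = metrics.find(name); it != metrics.end())
                writeMetricLine(out, "ClickHouseMetrics_" + it->first, it->second, "gauge");
    }

private:
    std::vector<std::string> keeper_metrics{"ZooKeeperRequests"};
};

using MetricsWriterPtr = std::shared_ptr<MetricsWriter>;

int main()
{
    /// Handlers hold the writer by shared_ptr, matching the switch from a
    /// const reference to PrometheusMetricsWriterPtr in the patch.
    MetricsWriterPtr writer = std::make_shared<KeeperMetricsWriter>(
        std::map<std::string, double>{{"ZooKeeperRequests", 42}, {"Query", 3}});
    writer->write(std::cout);
}
```
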
@@ -54,7 +55,7 @@ HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactory( AsynchronousMetrics & async_metrics, const std::string & config_prefix) { - PrometheusMetricsWriter writer(config, config_prefix + ".handler", async_metrics); + auto writer = std::make_shared(config, config_prefix + ".handler", async_metrics); auto creator = [&server, writer]() -> std::unique_ptr { return std::make_unique(server, writer); @@ -66,13 +67,12 @@ HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactory( } HTTPRequestHandlerFactoryPtr createPrometheusMainHandlerFactory( - IServer & server, const Poco::Util::AbstractConfiguration & config, AsynchronousMetrics & async_metrics, const std::string & name) + IServer & server, const Poco::Util::AbstractConfiguration & config, PrometheusMetricsWriterPtr metrics_writer, const std::string & name) { auto factory = std::make_shared(name); - PrometheusMetricsWriter writer(config, "prometheus", async_metrics); - auto creator = [&server, writer]() -> std::unique_ptr + auto creator = [&server, metrics_writer] { - return std::make_unique(server, writer); + return std::make_unique(server, metrics_writer); }; auto handler = std::make_shared>(std::move(creator)); diff --git a/src/Server/PrometheusRequestHandler.h b/src/Server/PrometheusRequestHandler.h index 9ec54cc2e4e..d120752c8c5 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -13,12 +13,12 @@ class PrometheusRequestHandler : public HTTPRequestHandler { private: IServer & server; - const PrometheusMetricsWriter & metrics_writer; + PrometheusMetricsWriterPtr metrics_writer; public: - PrometheusRequestHandler(IServer & server_, const PrometheusMetricsWriter & metrics_writer_) + PrometheusRequestHandler(IServer & server_, PrometheusMetricsWriterPtr metrics_writer_) : server(server_) - , metrics_writer(metrics_writer_) + , metrics_writer(std::move(metrics_writer_)) { } diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index ec6b374518d..9464ef74586 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -184,7 +183,15 @@ void validateClientInfo(const ClientInfo & session_client_info, const ClientInfo namespace DB { -TCPHandler::TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_, const ProfileEvents::Event & read_event_, const ProfileEvents::Event & write_event_) +TCPHandler::TCPHandler( + IServer & server_, + TCPServer & tcp_server_, + const Poco::Net::StreamSocket & socket_, + bool parse_proxy_protocol_, + std::string server_display_name_, + std::string host_name_, + const ProfileEvents::Event & read_event_, + const ProfileEvents::Event & write_event_) : Poco::Net::TCPServerConnection(socket_) , server(server_) , tcp_server(tcp_server_) @@ -193,11 +200,20 @@ TCPHandler::TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::N , read_event(read_event_) , write_event(write_event_) , server_display_name(std::move(server_display_name_)) + , host_name(std::move(host_name_)) { } -TCPHandler::TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, TCPProtocolStackData & stack_data, std::string server_display_name_, const ProfileEvents::Event & read_event_, const ProfileEvents::Event & write_event_) -: Poco::Net::TCPServerConnection(socket_) +TCPHandler::TCPHandler( + IServer & server_, + TCPServer & 
tcp_server_, + const Poco::Net::StreamSocket & socket_, + TCPProtocolStackData & stack_data, + std::string server_display_name_, + std::string host_name_, + const ProfileEvents::Event & read_event_, + const ProfileEvents::Event & write_event_) + : Poco::Net::TCPServerConnection(socket_) , server(server_) , tcp_server(tcp_server_) , log(getLogger("TCPHandler")) @@ -207,6 +223,7 @@ TCPHandler::TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::N , write_event(write_event_) , default_database(stack_data.default_database) , server_display_name(std::move(server_display_name_)) + , host_name(std::move(host_name_)) { if (!forwarded_for.empty()) LOG_TRACE(log, "Forwarded client address: {}", forwarded_for); @@ -926,7 +943,7 @@ void TCPHandler::processInsertQuery() auto wait_status = result.future.wait_for(std::chrono::milliseconds(timeout_ms)); if (wait_status == std::future_status::deferred) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: got future in deferred state"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got future in deferred state"); if (wait_status == std::future_status::timeout) throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Wait for async insert timeout ({} ms) exceeded)", timeout_ms); @@ -1201,7 +1218,7 @@ void TCPHandler::sendExtremes(const Block & extremes) void TCPHandler::sendProfileEvents() { Block block; - ProfileEvents::getProfileEvents(server_display_name, state.profile_queue, block, last_sent_snapshots); + ProfileEvents::getProfileEvents(host_name, state.profile_queue, block, last_sent_snapshots); if (block.rows() != 0) { initProfileEventsBlockOutput(block); diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 26cecf46662..fc42a614f5c 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -147,8 +147,24 @@ public: * because it allows to check the IP ranges of the trusted proxy. * Proxy-forwarded (original client) IP address is used for quota accounting if quota is keyed by forwarded IP. */ - TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_, const ProfileEvents::Event & read_event_ = ProfileEvents::end(), const ProfileEvents::Event & write_event_ = ProfileEvents::end()); - TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, TCPProtocolStackData & stack_data, std::string server_display_name_, const ProfileEvents::Event & read_event_ = ProfileEvents::end(), const ProfileEvents::Event & write_event_ = ProfileEvents::end()); + TCPHandler( + IServer & server_, + TCPServer & tcp_server_, + const Poco::Net::StreamSocket & socket_, + bool parse_proxy_protocol_, + String server_display_name_, + String host_name_, + const ProfileEvents::Event & read_event_ = ProfileEvents::end(), + const ProfileEvents::Event & write_event_ = ProfileEvents::end()); + TCPHandler( + IServer & server_, + TCPServer & tcp_server_, + const Poco::Net::StreamSocket & socket_, + TCPProtocolStackData & stack_data, + String server_display_name_, + String host_name_, + const ProfileEvents::Event & read_event_ = ProfileEvents::end(), + const ProfileEvents::Event & write_event_ = ProfileEvents::end()); ~TCPHandler() override; void run() override; @@ -225,6 +241,7 @@ private: /// It is the name of the server that will be sent to the client. 
String server_display_name; + String host_name; void runImpl(); diff --git a/src/Server/TCPHandlerFactory.h b/src/Server/TCPHandlerFactory.h index d65c9898b23..4e9963d2c6e 100644 --- a/src/Server/TCPHandlerFactory.h +++ b/src/Server/TCPHandlerFactory.h @@ -19,6 +19,7 @@ private: IServer & server; bool parse_proxy_protocol = false; LoggerPtr log; + std::string host_name; std::string server_display_name; ProfileEvents::Event read_event; @@ -42,7 +43,8 @@ public: , read_event(read_event_) , write_event(write_event_) { - server_display_name = server.config().getString("display_name", getFQDNOrHostName()); + host_name = getFQDNOrHostName(); + server_display_name = server.config().getString("display_name", host_name); } Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) override @@ -50,7 +52,7 @@ public: try { LOG_TRACE(log, "TCP Request. Address: {}", socket.peerAddress().toString()); - return new TCPHandler(server, tcp_server, socket, parse_proxy_protocol, server_display_name, read_event, write_event); + return new TCPHandler(server, tcp_server, socket, parse_proxy_protocol, server_display_name, host_name, read_event, write_event); } catch (const Poco::Net::NetException &) { @@ -64,7 +66,7 @@ public: try { LOG_TRACE(log, "TCP Request. Address: {}", socket.peerAddress().toString()); - return new TCPHandler(server, tcp_server, socket, stack_data, server_display_name, read_event, write_event); + return new TCPHandler(server, tcp_server, socket, stack_data, server_display_name, host_name, read_event, write_event); } catch (const Poco::Net::NetException &) { diff --git a/src/Server/WebUIRequestHandler.cpp b/src/Server/WebUIRequestHandler.cpp index ac7a3bfccf3..e45d2a55acb 100644 --- a/src/Server/WebUIRequestHandler.cpp +++ b/src/Server/WebUIRequestHandler.cpp @@ -17,6 +17,7 @@ INCBIN(resource_play_html, SOURCE_DIR "/programs/server/play.html"); INCBIN(resource_dashboard_html, SOURCE_DIR "/programs/server/dashboard.html"); INCBIN(resource_uplot_js, SOURCE_DIR "/programs/server/js/uplot.js"); +INCBIN(resource_lz_string_js, SOURCE_DIR "/programs/server/js/lz-string.js"); INCBIN(resource_binary_html, SOURCE_DIR "/programs/server/binary.html"); @@ -59,6 +60,9 @@ void WebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerR static re2::RE2 uplot_url = R"(https://[^\s"'`]+u[Pp]lot[^\s"'`]*\.js)"; RE2::Replace(&html, uplot_url, "/js/uplot.js"); + static re2::RE2 lz_string_url = R"(https://[^\s"'`]+lz-string[^\s"'`]*\.js)"; + RE2::Replace(&html, lz_string_url, "/js/lz-string.js"); + WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, keep_alive_timeout).write(html); } else if (request.getURI().starts_with("/binary")) @@ -71,6 +75,11 @@ void WebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerR response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_OK); WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, keep_alive_timeout).write(reinterpret_cast(gresource_uplot_jsData), gresource_uplot_jsSize); } + else if (request.getURI() == "/js/lz-string.js") + { + response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_OK); + WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, keep_alive_timeout).write(reinterpret_cast(gresource_lz_string_jsData), gresource_lz_string_jsSize); + } else { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_NOT_FOUND); diff --git 
a/src/Storages/DataLakes/IStorageDataLake.h b/src/Storages/DataLakes/IStorageDataLake.h index db3f835494f..582b55c505b 100644 --- a/src/Storages/DataLakes/IStorageDataLake.h +++ b/src/Storages/DataLakes/IStorageDataLake.h @@ -38,25 +38,25 @@ public: static ColumnsDescription getTableStructureFromData( Configuration & base_configuration, const std::optional & format_settings, - ContextPtr local_context) + const ContextPtr & local_context) { auto configuration = getConfigurationForDataRead(base_configuration, local_context); return Storage::getTableStructureFromData(configuration, format_settings, local_context); } - static Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context) + static Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context) { return Storage::getConfiguration(engine_args, local_context, /* get_format_from_file */false); } - Configuration updateConfigurationAndGetCopy(ContextPtr local_context) override + Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); return Storage::getConfiguration(); } - void updateConfiguration(ContextPtr local_context) override + void updateConfiguration(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); @@ -64,7 +64,7 @@ public: private: static Configuration getConfigurationForDataRead( - const Configuration & base_configuration, ContextPtr local_context, const Strings & keys = {}, bool attach = false) + const Configuration & base_configuration, const ContextPtr & local_context, const Strings & keys = {}, bool attach = false) { auto configuration{base_configuration}; configuration.update(local_context); @@ -94,12 +94,12 @@ private: } } - static Strings getDataFiles(const Configuration & configuration, ContextPtr local_context) + static Strings getDataFiles(const Configuration & configuration, const ContextPtr & local_context) { return MetadataParser().getFiles(configuration, local_context); } - void updateConfigurationImpl(ContextPtr local_context) + void updateConfigurationImpl(const ContextPtr & local_context) { const bool updated = base_configuration.update(local_context); auto new_keys = getDataFiles(base_configuration, local_context); diff --git a/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp index e01a9a831c0..df1536f53fc 100644 --- a/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp @@ -596,10 +596,11 @@ Strings IcebergMetadata::getDataFiles() const auto status = status_int_column->getInt(i); const auto data_path = std::string(file_path_string_column->getDataAt(i).toView()); const auto pos = data_path.find(configuration.url.key); - const auto file_path = data_path.substr(pos); if (pos == std::string::npos) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected to find {} in data path: {}", configuration.url.key, data_path); + const auto file_path = data_path.substr(pos); + if (ManifestEntryStatus(status) == ManifestEntryStatus::DELETED) { LOG_TEST(log, "Processing delete file for path: {}", file_path); diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp index 8a1a2cdbd8f..345f2553ccb 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp +++ b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp @@ -61,7 
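
The IcebergMetadata change above is a plain ordering fix: the old code called data_path.substr(pos) before comparing pos with std::string::npos, so a missing key surfaced as std::out_of_range (substr throws when pos exceeds the string size) instead of the intended BAD_ARGUMENTS error. Reduced to its essence, with a generic exception standing in for the ClickHouse one:

```cpp
// The bug pattern fixed in IcebergMetadata::getDataFiles(): substr() must only
// run after find() has been checked, otherwise a miss throws std::out_of_range
// before the intended, more descriptive exception can be raised.
#include <stdexcept>
#include <string>

std::string relativeToKey(const std::string & data_path, const std::string & key)
{
    const auto pos = data_path.find(key);
    if (pos == std::string::npos)        /// check first ...
        throw std::runtime_error("Expected to find " + key + " in data path: " + data_path);
    return data_path.substr(pos);        /// ... then slice
}
```
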
+61,7 @@ StorageIceberg::StorageIceberg( ColumnsDescription StorageIceberg::getTableStructureFromData( Configuration & base_configuration, const std::optional &, - ContextPtr local_context) + const ContextPtr & local_context) { auto configuration{base_configuration}; configuration.update(local_context); @@ -69,7 +69,7 @@ ColumnsDescription StorageIceberg::getTableStructureFromData( return ColumnsDescription(metadata->getTableSchema()); } -void StorageIceberg::updateConfigurationImpl(ContextPtr local_context) +void StorageIceberg::updateConfigurationImpl(const ContextPtr & local_context) { const bool updated = base_configuration.update(local_context); auto new_metadata = parseIcebergMetadata(base_configuration, local_context); diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.h b/src/Storages/DataLakes/Iceberg/StorageIceberg.h index 4e63da5508a..7cae89442ff 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.h +++ b/src/Storages/DataLakes/Iceberg/StorageIceberg.h @@ -52,28 +52,28 @@ public: static ColumnsDescription getTableStructureFromData( Configuration & base_configuration, const std::optional &, - ContextPtr local_context); + const ContextPtr & local_context); static Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context) { return StorageS3::getConfiguration(engine_args, local_context, /* get_format_from_file */false); } - Configuration updateConfigurationAndGetCopy(ContextPtr local_context) override + Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); return StorageS3::getConfiguration(); } - void updateConfiguration(ContextPtr local_context) override + void updateConfiguration(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); } private: - void updateConfigurationImpl(ContextPtr local_context); + void updateConfigurationImpl(const ContextPtr & local_context); std::unique_ptr current_metadata; Configuration base_configuration; diff --git a/src/Storages/FileLog/DirectoryWatcherBase.cpp b/src/Storages/FileLog/DirectoryWatcherBase.cpp index 8209483fac9..f1cf0866de7 100644 --- a/src/Storages/FileLog/DirectoryWatcherBase.cpp +++ b/src/Storages/FileLog/DirectoryWatcherBase.cpp @@ -34,8 +34,8 @@ DirectoryWatcherBase::DirectoryWatcherBase( if (!std::filesystem::is_directory(path)) throw Exception(ErrorCodes::BAD_FILE_TYPE, "Path {} is not a directory", path); - fd = inotify_init(); - if (fd == -1) + inotify_fd = inotify_init(); + if (inotify_fd == -1) throw ErrnoException(ErrorCodes::IO_SETUP_ERROR, "Cannot initialize inotify"); watch_task = getContext()->getSchedulePool().createTask("directory_watch", [this] { watchFunc(); }); @@ -56,7 +56,7 @@ void DirectoryWatcherBase::watchFunc() if (eventMask() & DirectoryWatcherBase::DW_ITEM_MOVED_TO) mask |= IN_MOVED_TO; - int wd = inotify_add_watch(fd, path.c_str(), mask); + int wd = inotify_add_watch(inotify_fd, path.c_str(), mask); if (wd == -1) { owner.onError(Exception(ErrorCodes::IO_SETUP_ERROR, "Watch directory {} failed", path)); @@ -65,16 +65,20 @@ void DirectoryWatcherBase::watchFunc() std::string buffer; buffer.resize(buffer_size); - pollfd pfd; - pfd.fd = fd; - pfd.events = POLLIN; + pollfd pfds[2]; + /// inotify descriptor + pfds[0].fd = inotify_fd; + pfds[0].events = POLLIN; + // notifier + pfds[1].fd = event_pipe.fds_rw[0]; + pfds[1].events = POLLIN; while (!stopped) { const auto & settings = 
owner.storage.getFileLogSettings(); - if (poll(&pfd, 1, static_cast(milliseconds_to_wait)) > 0 && pfd.revents & POLLIN) + if (poll(pfds, 2, static_cast(milliseconds_to_wait)) > 0 && pfds[0].revents & POLLIN) { milliseconds_to_wait = settings->poll_directory_watch_events_backoff_init.totalMilliseconds(); - ssize_t n = read(fd, buffer.data(), buffer.size()); + ssize_t n = read(inotify_fd, buffer.data(), buffer.size()); int i = 0; if (n > 0) { @@ -130,7 +134,7 @@ void DirectoryWatcherBase::watchFunc() DirectoryWatcherBase::~DirectoryWatcherBase() { stop(); - int err = ::close(fd); + int err = ::close(inotify_fd); chassert(!err || errno == EINTR); } @@ -143,6 +147,7 @@ void DirectoryWatcherBase::start() void DirectoryWatcherBase::stop() { stopped = true; + ::write(event_pipe.fds_rw[1], "\0", 1); if (watch_task) watch_task->deactivate(); } diff --git a/src/Storages/FileLog/DirectoryWatcherBase.h b/src/Storages/FileLog/DirectoryWatcherBase.h index a640f686c8a..0dfb58fbc5c 100644 --- a/src/Storages/FileLog/DirectoryWatcherBase.h +++ b/src/Storages/FileLog/DirectoryWatcherBase.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -85,10 +86,6 @@ public: void watchFunc(); -protected: - void start(); - void stop(); - private: FileLogDirectoryWatcher & owner; @@ -102,7 +99,11 @@ private: int event_mask; uint64_t milliseconds_to_wait; - int fd; + int inotify_fd; + PipeFDs event_pipe; + + void start(); + void stop(); }; } diff --git a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp b/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp index 65df2c020ba..6b6151f5474 100644 --- a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp @@ -37,7 +37,7 @@ namespace ErrorCodes AsynchronousReadBufferFromHDFS::AsynchronousReadBufferFromHDFS( IAsynchronousReader & reader_, const ReadSettings & settings_, std::shared_ptr impl_) - : ReadBufferFromFileBase(settings_.remote_fs_buffer_size, nullptr, 0) + : BufferWithOwnMemory(settings_.remote_fs_buffer_size) , reader(reader_) , base_priority(settings_.priority) , impl(std::move(impl_)) diff --git a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h b/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h index 1d3e8b8e3e9..10e2749fd4a 100644 --- a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h +++ b/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h @@ -21,7 +21,7 @@ namespace DB class IAsynchronousReader; -class AsynchronousReadBufferFromHDFS : public ReadBufferFromFileBase +class AsynchronousReadBufferFromHDFS : public BufferWithOwnMemory, public WithFileName, public WithFileSize { public: AsynchronousReadBufferFromHDFS( diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index ab21c4946e4..5e937d3d31d 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -67,6 +67,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; extern const int CANNOT_COMPILE_REGEXP; + extern const int CANNOT_DETECT_FORMAT; } namespace { @@ -194,7 +195,7 @@ StorageHDFS::StorageHDFS( const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_, const bool distributed_processing_, ASTPtr partition_by_) @@ -206,7 +207,8 @@ StorageHDFS::StorageHDFS( , distributed_processing(distributed_processing_) , partition_by(partition_by_) { - 
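
The DirectoryWatcherBase change above applies the classic self-pipe trick: poll() now watches the read end of an internal pipe next to the inotify descriptor, and stop() writes one byte into the pipe so the watch loop wakes immediately instead of sleeping out the poll timeout. A minimal standalone version of that wake-up mechanism, with error handling trimmed and the inotify side omitted:

```cpp
// Minimal self-pipe wake-up for a poll() loop, as used to make the directory
// watcher stoppable without waiting for the poll timeout. Linux/POSIX only.
#include <atomic>
#include <cstdio>
#include <poll.h>
#include <thread>
#include <unistd.h>

int main()
{
    int pipe_fds[2];
    if (pipe(pipe_fds) != 0)
        return 1;

    std::atomic<bool> stopped{false};

    std::thread watcher([&]
    {
        pollfd pfd{};
        pfd.fd = pipe_fds[0];           /// in the real code this sits next to the inotify fd
        pfd.events = POLLIN;

        while (!stopped)
        {
            int ready = poll(&pfd, 1, /*timeout_ms=*/60'000);
            if (ready > 0 && (pfd.revents & POLLIN))
                break;                  /// woken up by stop(), exit promptly
        }
        std::puts("watcher exited");
    });

    /// stop(): flip the flag, then poke the pipe so poll() returns right away.
    stopped = true;
    ssize_t written = write(pipe_fds[1], "\0", 1);
    (void)written;

    watcher.join();
    close(pipe_fds[0]);
    close(pipe_fds[1]);
}
```
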
FormatFactory::instance().checkFormatName(format_name); + if (format_name != "auto") + FormatFactory::instance().checkFormatName(format_name); context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); checkHDFSURL(uri_); @@ -217,11 +219,19 @@ StorageHDFS::StorageHDFS( if (columns_.empty()) { - auto columns = getTableStructureFromData(format_name, uri_, compression_method, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromData(uri_, compression_method_, context_); + else + columns = getTableStructureFromData(format_name, uri_, compression_method, context_); + storage_metadata.setColumns(columns); } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromData(uri_, compression_method_, context_).second; + /// We don't allow special columns in HDFS storage. if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine HDFS doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -243,25 +253,25 @@ namespace ReadBufferIterator( const std::vector & paths_with_info_, const String & uri_without_path_, - const String & format_, + std::optional format_, const String & compression_method_, const ContextPtr & context_) : WithContext(context_) , paths_with_info(paths_with_info_) , uri_without_path(uri_without_path_) - , format(format_) + , format(std::move(format_)) , compression_method(compression_method_) { } - std::pair, std::optional> next() override + Data next() override { bool is_first = current_index == 0; /// For default mode check cached columns for all paths on first iteration. if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) { if (auto cached_columns = tryGetColumnsFromCache(paths_with_info)) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } StorageHDFS::PathWithInfo path_with_info; @@ -271,10 +281,17 @@ namespace if (current_index == paths_with_info.size()) { if (is_first) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", format); - return {nullptr, std::nullopt}; + { + if (format) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. " + "You can specify table structure manually", *format); + + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. 
You can specify table structure manually"); + } + return {nullptr, std::nullopt, format}; } path_with_info = paths_with_info[current_index++]; @@ -285,7 +302,7 @@ namespace { std::vector paths = {path_with_info}; if (auto cached_columns = tryGetColumnsFromCache(paths)) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } auto compression = chooseCompressionMethod(path_with_info.path, compression_method); @@ -293,7 +310,7 @@ namespace if (!getContext()->getSettingsRef().hdfs_skip_empty_files || !impl->eof()) { const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - return {wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)), std::nullopt}; + return {wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)), std::nullopt, format}; } } } @@ -304,7 +321,7 @@ namespace return; String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, format, std::nullopt, getContext()); + auto key = getKeyForSchemaCache(source, *format, std::nullopt, getContext()); StorageHDFS::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -315,7 +332,7 @@ namespace return; String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, format, std::nullopt, getContext()); + auto key = getKeyForSchemaCache(source, *format, std::nullopt, getContext()); StorageHDFS::getSchemaCache(getContext()).addColumns(key, columns); } @@ -328,10 +345,15 @@ namespace Strings sources; sources.reserve(paths_with_info.size()); std::transform(paths_with_info.begin(), paths_with_info.end(), std::back_inserter(sources), [&](const StorageHDFS::PathWithInfo & path_with_info){ return uri_without_path + path_with_info.path; }); - auto cache_keys = getKeysForSchemaCache(sources, format, {}, getContext()); + auto cache_keys = getKeysForSchemaCache(sources, *format, {}, getContext()); StorageHDFS::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { if (current_index != 0) @@ -340,13 +362,27 @@ namespace return ""; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= paths_with_info.size()); + auto path_with_info = paths_with_info[current_index - 1]; + auto compression = chooseCompressionMethod(path_with_info.path, compression_method); + auto impl = std::make_unique(uri_without_path, path_with_info.path, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); + const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; + return wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); + } + private: std::optional tryGetColumnsFromCache(const std::vector & paths_with_info_) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs) + auto context = getContext(); + + if (!context->getSettingsRef().schema_inference_use_cache_for_hdfs) return std::nullopt; - auto & schema_cache = StorageHDFS::getSchemaCache(getContext()); + auto & schema_cache = StorageHDFS::getSchemaCache(context); for (const auto & path_with_info : paths_with_info_) { auto get_last_mod_time = [&]() -> std::optional @@ -354,7 
+390,7 @@ namespace if (path_with_info.info) return path_with_info.info->last_mod_time; - auto builder = createHDFSBuilder(uri_without_path + "/", getContext()->getGlobalContext()->getConfigRef()); + auto builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); auto fs = createHDFSFS(builder.get()); HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path_with_info.path.c_str())); if (hdfs_info) @@ -364,10 +400,28 @@ namespace }; String url = uri_without_path + path_with_info.path; - auto cache_key = getKeyForSchemaCache(url, format, {}, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(url, *format, {}, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(url, format_name, {}, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -375,29 +429,49 @@ namespace const std::vector & paths_with_info; const String & uri_without_path; - const String & format; + std::optional format; const String & compression_method; size_t current_index = 0; }; } -ColumnsDescription StorageHDFS::getTableStructureFromData( - const String & format, +std::pair StorageHDFS::getTableStructureAndFormatFromDataImpl( + std::optional format, const String & uri, const String & compression_method, - ContextPtr ctx) + const ContextPtr & ctx) { const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); auto paths_with_info = getPathsList(path_from_uri, uri, ctx); - if (paths_with_info.empty() && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) + if (paths_with_info.empty() && (!format || !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format))) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files in HDFS with provided path." + " You can specify table structure manually", *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path." - " You must specify table structure manually", format); + "The data format cannot be detected by the contents of the files, because there are no files in HDFS with provided path." 
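
When the format is unknown, the HDFS ReadBufferIterator above still consults the schema cache: it probes a cache key for every registered input format and, on the first hit, returns the cached columns and pins that format for the remaining files. A simplified sketch of that probing loop; the cache layout and format registry here are invented stand-ins, not the real SchemaCache API:

```cpp
// Sketch of "format unknown" schema-cache probing: try a cache key per known
// input format; the first hit yields both the columns and the format to use
// for the rest of the files.
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <vector>

using Columns = std::vector<std::string>;

struct SchemaCache
{
    std::map<std::string, Columns> entries;        /// key: "<url>:<format>"

    std::optional<Columns> tryGet(const std::string & key) const
    {
        auto it = entries.find(key);
        if (it == entries.end())
            return std::nullopt;
        return it->second;
    }
};

std::optional<Columns> tryGetColumns(
    const SchemaCache & cache,
    const std::string & url,
    std::optional<std::string> & format,           /// filled in on a successful probe
    const std::vector<std::string> & all_input_formats)
{
    if (format)
        return cache.tryGet(url + ":" + *format);

    for (const auto & candidate : all_input_formats)
        if (auto columns = cache.tryGet(url + ":" + candidate))
        {
            format = candidate;                    /// format is now known; same for all files
            return columns;
        }
    return std::nullopt;
}

int main()
{
    SchemaCache cache{{{"hdfs://host/data/t.parquet:Parquet", {"id UInt64", "name String"}}}};
    std::optional<std::string> format;             /// "auto": not specified by the user

    auto columns = tryGetColumns(cache, "hdfs://host/data/t.parquet", format, {"CSV", "Parquet", "ORC"});
    if (columns)
        std::cout << "detected format: " << *format << ", " << columns->size() << " columns\n";
}
```
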
+ " You can specify the format manually"); + } ReadBufferIterator read_buffer_iterator(paths_with_info, uri_without_path, format, compression_method, ctx); - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths_with_info.size() > 1, ctx); + if (format) + return {readSchemaFromFormat(*format, std::nullopt, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, ctx); +} + +std::pair StorageHDFS::getTableStructureAndFormatFromData(const String & uri, const String & compression_method, const ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, uri, compression_method, ctx); +} + +ColumnsDescription StorageHDFS::getTableStructureFromData(const String & format, const String & uri, const String & compression_method, const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(format, uri, compression_method, ctx).first; } class HDFSSource::DisclosedGlobIterator::Impl @@ -533,7 +607,7 @@ StorageHDFS::PathWithInfo HDFSSource::URISIterator::next() HDFSSource::HDFSSource( const ReadFromFormatInfo & info, StorageHDFSPtr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, bool need_only_count_) @@ -712,7 +786,7 @@ public: HDFSSink(const String & uri, const String & format, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const CompressionMethod compression_method) : SinkToStorage(sample_block) { @@ -1073,7 +1147,7 @@ void registerStorageHDFS(StorageFactory & factory) } if (format_name == "auto") - format_name = FormatFactory::instance().getFormatFromFileName(url, true); + format_name = FormatFactory::instance().tryGetFormatFromFileName(url).value_or("auto"); String compression_method; if (engine_args.size() == 3) diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index 7170763c959..b36ff7ea37e 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -44,7 +44,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_ = "", bool distributed_processing_ = false, ASTPtr partition_by = nullptr); @@ -86,7 +86,12 @@ public: const String & format, const String & uri, const String & compression_method, - ContextPtr ctx); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + const String & uri, + const String & compression_method, + const ContextPtr & ctx); static SchemaCache & getSchemaCache(const ContextPtr & ctx); @@ -97,6 +102,12 @@ protected: friend class ReadFromHDFS; private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + const String & uri, + const String & compression_method, + const ContextPtr & ctx); + std::vector uris; String format_name; String compression_method; @@ -141,7 +152,7 @@ public: HDFSSource( const ReadFromFormatInfo & info, StorageHDFSPtr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, bool need_only_count_); diff --git a/src/Storages/HDFS/StorageHDFSCluster.cpp b/src/Storages/HDFS/StorageHDFSCluster.cpp index fad29436102..714d6391543 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.cpp +++ b/src/Storages/HDFS/StorageHDFSCluster.cpp @@ -43,12 +43,10 @@ StorageHDFSCluster::StorageHDFSCluster( const String & format_name_, 
const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const String & compression_method_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageHDFSCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const String & compression_method) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageHDFSCluster (" + table_id_.table_name + ")")) , uri(uri_) , format_name(format_name_) - , compression_method(compression_method_) { checkHDFSURL(uri_); context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); @@ -57,11 +55,20 @@ StorageHDFSCluster::StorageHDFSCluster( if (columns_.empty()) { - auto columns = StorageHDFS::getTableStructureFromData(format_name, uri_, compression_method, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageHDFS::getTableStructureAndFormatFromData(uri_, compression_method, context_); + else + columns = StorageHDFS::getTableStructureFromData(format_name, uri_, compression_method, context_); storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageHDFS::getTableStructureAndFormatFromData(uri_, compression_method, context_).second; + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -69,13 +76,14 @@ StorageHDFSCluster::StorageHDFSCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageHDFSCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageHDFSCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function hdfsCluster, got '{}'", queryToString(query)); - TableFunctionHDFSCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionHDFSCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } diff --git a/src/Storages/HDFS/StorageHDFSCluster.h b/src/Storages/HDFS/StorageHDFSCluster.h index 7c4c41a573a..40884f98984 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.h +++ b/src/Storages/HDFS/StorageHDFSCluster.h @@ -28,8 +28,7 @@ public: const String & format_name_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const String & compression_method_, - bool structure_argument_was_provided_); + const String & compression_method); std::string getName() const override { return "HDFSCluster"; } @@ -42,11 +41,10 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; String uri; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/IMessageProducer.cpp b/src/Storages/IMessageProducer.cpp index c723ec77b70..20c47f6f0b4 
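
The StorageHDFSCluster/IStorageCluster change above replaces the old structure_argument_was_provided flag with an updateQueryToSendIfNeeded() hook: before the initiator ships the query to other nodes, it injects the structure and format it has already resolved, so workers never repeat schema inference or format detection. The sketch below reduces the AST manipulation to a plain argument struct; all names are illustrative:

```cpp
// Sketch of what updateQueryToSendIfNeeded() accomplishes for *Cluster storages:
// fill in any "auto"/missing structure and format arguments with the values the
// initiator has already resolved before the query is sent to worker nodes.
#include <iostream>
#include <string>

struct TableFunctionArgs
{
    std::string url;
    std::string format;      /// may be "auto" as written by the user
    std::string structure;   /// may be empty if the user omitted it
};

void updateStructureAndFormatIfNeeded(TableFunctionArgs & args,
                                      const std::string & resolved_structure,
                                      const std::string & resolved_format)
{
    if (args.format.empty() || args.format == "auto")
        args.format = resolved_format;
    if (args.structure.empty() || args.structure == "auto")
        args.structure = resolved_structure;
}

int main()
{
    TableFunctionArgs args{"hdfs://host/data/*.parquet", "auto", ""};
    updateStructureAndFormatIfNeeded(args, "id UInt64, name String", "Parquet");
    std::cout << "hdfsCluster(cluster, '" << args.url << "', '" << args.format
              << "', '" << args.structure << "')\n";
}
```
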
100644 --- a/src/Storages/IMessageProducer.cpp +++ b/src/Storages/IMessageProducer.cpp @@ -12,7 +12,16 @@ void AsynchronousMessageProducer::start(const ContextPtr & context) { LOG_TEST(log, "Executing startup"); - initialize(); + try + { + initialize(); + } + catch (...) + { + finished = true; + throw; + } + producing_task = context->getSchedulePool().createTask(getProducingTaskName(), [this] { LOG_TEST(log, "Starting producing task loop"); diff --git a/src/Storages/IStorageCluster.cpp b/src/Storages/IStorageCluster.cpp index 812b213cf33..3129da30f54 100644 --- a/src/Storages/IStorageCluster.cpp +++ b/src/Storages/IStorageCluster.cpp @@ -32,12 +32,10 @@ namespace DB IStorageCluster::IStorageCluster( const String & cluster_name_, const StorageID & table_id_, - LoggerPtr log_, - bool structure_argument_was_provided_) + LoggerPtr log_) : IStorage(table_id_) , log(log_) , cluster_name(cluster_name_) - , structure_argument_was_provided(structure_argument_was_provided_) { } @@ -130,8 +128,7 @@ void IStorageCluster::read( query_to_send = interpreter.getQueryInfo().query->clone(); } - if (!structure_argument_was_provided) - addColumnsStructureToQuery(query_to_send, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), context); + updateQueryToSendIfNeeded(query_to_send, storage_snapshot, context); RestoreQualifiedNamesVisitor::Data data; data.distributed_table = DatabaseAndTableWithAlias(*getTableExpression(query_info.query->as(), 0)); diff --git a/src/Storages/IStorageCluster.h b/src/Storages/IStorageCluster.h index 8d93e94be9a..f3283247672 100644 --- a/src/Storages/IStorageCluster.h +++ b/src/Storages/IStorageCluster.h @@ -19,8 +19,7 @@ public: IStorageCluster( const String & cluster_name_, const StorageID & table_id_, - LoggerPtr log_, - bool structure_argument_was_provided_); + LoggerPtr log_); void read( QueryPlan & query_plan, @@ -42,13 +41,11 @@ public: protected: virtual void updateBeforeRead(const ContextPtr &) {} - - virtual void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) = 0; + virtual void updateQueryToSendIfNeeded(ASTPtr & /*query*/, const StorageSnapshotPtr & /*storage_snapshot*/, const ContextPtr & /*context*/) {} private: LoggerPtr log; String cluster_name; - bool structure_argument_was_provided; }; diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 0cb9eb84bf8..000d36752cb 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -335,7 +335,9 @@ void DataPartStorageOnDiskBase::backup( const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const { fs::path part_path_on_disk = fs::path{root_path} / part_dir; fs::path part_path_in_backup = fs::path{path_in_backup} / part_dir; @@ -377,7 +379,7 @@ void DataPartStorageOnDiskBase::backup( bool copy_encrypted = !backup_settings.decrypt_files_from_encrypted_disks; - for (const auto & filepath : files_to_backup) + auto backup_file = [&](const String & filepath) { auto filepath_on_disk = part_path_on_disk / filepath; auto filepath_in_backup = part_path_in_backup / filepath; @@ -385,8 +387,10 @@ void DataPartStorageOnDiskBase::backup( if (files_without_checksums.contains(filepath)) { backup_entries.emplace_back(filepath_in_backup, 
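
The AsynchronousMessageProducer::start() change at the top of this hunk is a small exception-safety fix: if initialize() throws, the producer marks itself finished before rethrowing, so later shutdown paths do not wait on a producing task that was never scheduled. In miniature, under assumed simplified types:

```cpp
// If initialization fails, leave the producer in a state that shutdown code can
// recognize (finished == true) before propagating the error.
#include <atomic>
#include <iostream>
#include <stdexcept>

struct Producer
{
    std::atomic<bool> finished{false};

    void start()
    {
        try
        {
            initialize();
        }
        catch (...)
        {
            finished = true;   /// keep the object in a stoppable state
            throw;
        }
        /// only reached on success: schedule the producing task here
    }

    void initialize() { throw std::runtime_error("broker unavailable"); }
};

int main()
{
    Producer producer;
    try { producer.start(); }
    catch (const std::exception & e) { std::cout << "start failed: " << e.what() << '\n'; }
    std::cout << "finished flag: " << producer.finished << '\n';   /// prints 1
}
```
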
std::make_unique(disk, filepath_on_disk, read_settings, copy_encrypted)); - continue; + return; } + else if (is_projection_part && allow_backup_broken_projection && !disk->exists(filepath_on_disk)) + return; if (make_temporary_hard_links) { @@ -411,6 +415,31 @@ void DataPartStorageOnDiskBase::backup( backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries.emplace_back(filepath_in_backup, std::move(backup_entry)); + }; + + auto * log = &Poco::Logger::get("DataPartStorageOnDiskBase::backup"); + + for (const auto & filepath : files_to_backup) + { + if (is_projection_part && allow_backup_broken_projection) + { + try + { + backup_file(filepath); + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::FILE_DOESNT_EXIST) + throw; + + LOG_ERROR(log, "Cannot backup file {} of projection part {}. Will try to ignore it", filepath, part_dir); + continue; + } + } + else + { + backup_file(filepath); + } } } diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 52dc850c7fd..75bf3d6f93c 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -58,7 +58,9 @@ public: const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const override; + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const override; MutableDataPartStoragePtr freeze( const std::string & to, diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index ce70fbe18e5..168c5f729ce 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -903,7 +903,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( || part_name.empty() || std::string::npos != tmp_prefix.find_first_of("/.") || std::string::npos != part_name.find_first_of("/.")) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: tmp_prefix and part_name cannot be empty or contain '.' or '/' characters."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "`tmp_prefix` and `part_name` cannot be empty or contain '.' or '/' characters."); auto part_dir = tmp_prefix + part_name; auto part_relative_path = data.getRelativeDataPath() + String(to_detached ? 
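
In the backup path above, each file now goes through a backup_file lambda, and for projection parts, only when allow_backup_broken_projection is set, a missing file is skipped (via an exists() check or by catching FILE_DOESNT_EXIST) rather than failing the whole part backup. A compact model of that policy, with file access mocked out:

```cpp
// Compact model of the "skip missing projection files" backup policy: ordinary
// part files still fail hard, projection files may be skipped when the caller
// allows backing up broken projections.
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

struct FileMissing : std::runtime_error { using std::runtime_error::runtime_error; };

void backupOneFile(const std::set<std::string> & files_on_disk, const std::string & path)
{
    if (!files_on_disk.count(path))
        throw FileMissing("No such file: " + path);
    std::cout << "backed up " << path << '\n';
}

void backupFiles(const std::set<std::string> & files_on_disk,
                 const std::vector<std::string> & files_to_backup,
                 bool is_projection_part,
                 bool allow_backup_broken_projection)
{
    for (const auto & path : files_to_backup)
    {
        try
        {
            backupOneFile(files_on_disk, path);
        }
        catch (const FileMissing &)
        {
            if (!(is_projection_part && allow_backup_broken_projection))
                throw;                                      /// regular parts must be complete
            std::cout << "skipping missing projection file " << path << '\n';
        }
    }
}

int main()
{
    backupFiles({"columns.txt"}, {"columns.txt", "data.bin"},
                /*is_projection_part=*/true, /*allow_backup_broken_projection=*/true);
}
```
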
"detached/" : ""); diff --git a/src/Storages/MergeTree/EphemeralLockInZooKeeper.cpp b/src/Storages/MergeTree/EphemeralLockInZooKeeper.cpp index 1ffb5177430..cbdeabffa97 100644 --- a/src/Storages/MergeTree/EphemeralLockInZooKeeper.cpp +++ b/src/Storages/MergeTree/EphemeralLockInZooKeeper.cpp @@ -17,7 +17,7 @@ EphemeralLockInZooKeeper::EphemeralLockInZooKeeper(const String & path_prefix_, : zookeeper(zookeeper_), path_prefix(path_prefix_), path(path_), conflict_path(conflict_path_) { if (conflict_path.empty() && path.size() <= path_prefix.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: name of the main node is shorter than prefix."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Name of the main node is shorter than prefix."); } template @@ -179,7 +179,7 @@ EphemeralLocksInAllPartitions::EphemeralLocksInAllPartitions( size_t prefix_size = block_numbers_path.size() + 1 + partitions[i].size() + 1 + path_prefix.size(); const String & path = dynamic_cast(*lock_responses[i]).path_created; if (path.size() <= prefix_size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: name of the sequential node is shorter than prefix."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Name of the sequential node is shorter than prefix."); UInt64 number = parse(path.c_str() + prefix_size, path.size() - prefix_size); locks.push_back(LockInfo{path, partitions[i], number}); diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 5899ef58cd5..d06d9791a53 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -223,7 +223,9 @@ public: const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const = 0; + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const = 0; /// Creates hardlinks into 'to/dir_path' for every file in data part. /// Callback is called after hardlinks are created, but before 'delete-on-destroy.txt' marker is removed. diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 87f23b0da2a..11ede661f78 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -313,13 +313,13 @@ IMergeTreeDataPart::IMergeTreeDataPart( const IMergeTreeDataPart * parent_part_) : DataPartStorageHolder(data_part_storage_) , storage(storage_) - , mutable_name(name_) , name(mutable_name) , info(info_) , index_granularity_info(storage_, part_type_) , part_type(part_type_) , parent_part(parent_part_) , parent_part_name(parent_part ? 
parent_part->name : "") + , mutable_name(name_) { if (parent_part) { @@ -342,6 +342,27 @@ IMergeTreeDataPart::~IMergeTreeDataPart() decrementTypeMetric(part_type); } + +const IMergeTreeDataPart::Index & IMergeTreeDataPart::getIndex() const +{ + std::scoped_lock lock(index_mutex); + if (!index_loaded) + loadIndex(); + index_loaded = true; + return index; +} + + +void IMergeTreeDataPart::setIndex(Columns index_) +{ + std::scoped_lock lock(index_mutex); + if (!index.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "The index of data part can be set only once"); + index = std::move(index_); + index_loaded = true; +} + + void IMergeTreeDataPart::setName(const String & new_name) { mutable_name = new_name; @@ -548,6 +569,7 @@ void IMergeTreeDataPart::removeIfNeeded() UInt64 IMergeTreeDataPart::getIndexSizeInBytes() const { + std::scoped_lock lock(index_mutex); UInt64 res = 0; for (const ColumnPtr & column : index) res += column->byteSize(); @@ -556,6 +578,7 @@ UInt64 IMergeTreeDataPart::getIndexSizeInBytes() const UInt64 IMergeTreeDataPart::getIndexSizeInAllocatedBytes() const { + std::scoped_lock lock(index_mutex); UInt64 res = 0; for (const ColumnPtr & column : index) res += column->allocatedBytes(); @@ -669,26 +692,33 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks loadColumns(require_columns_checksums); loadChecksums(require_columns_checksums); loadIndexGranularity(); + + if (!storage.getSettings()->primary_key_lazy_load) + getIndex(); + calculateColumnsAndSecondaryIndicesSizesOnDisk(); - loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity` loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. loadPartitionAndMinMaxIndex(); + bool has_broken_projections = false; if (!parent_part) { loadTTLInfos(); - loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */); + loadProjections(require_columns_checksums, check_consistency, has_broken_projections, false /* if_not_loaded */); } - if (check_consistency) + if (check_consistency && !has_broken_projections) checkConsistency(require_columns_checksums); loadDefaultCompressionCodec(); } catch (...) { + /// Don't scare people with broken part error + if (!isRetryableException(std::current_exception())) + LOG_ERROR(storage.log, "Part {} is broken and need manual correction", getDataPartStorage().getFullPath()); + // There could be conditions that data part to be loaded is broken, but some of meta infos are already written // into meta data before exception, need to clean them all. 
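The new getIndex()/setIndex() pair replaces direct access to the primary-key index with a lazily loaded, mutex-guarded accessor, so the index is read from disk only on first use (see the primary_key_lazy_load setting added later in this diff). A minimal sketch of that pattern follows; LazyIndexHolder and its members are illustrative names, not ClickHouse types.

#include <mutex>
#include <stdexcept>
#include <vector>

class LazyIndexHolder
{
public:
    using Index = std::vector<int>;

    /// Loads the index on first access; later calls return the cached value.
    const Index & getIndex() const
    {
        std::scoped_lock lock(mutex);
        if (!loaded)
            load();            /// the expensive read happens at most once
        loaded = true;
        return index;
    }

    /// A writer may hand over a freshly built index, but only once.
    void setIndex(Index new_index)
    {
        std::scoped_lock lock(mutex);
        if (!index.empty())
            throw std::logic_error("The index can be set only once");
        index = std::move(new_index);
        loaded = true;
    }

private:
    void load() const { index = Index{1, 2, 3}; }   /// stand-in for reading primary.idx

    mutable std::mutex mutex;
    mutable Index index;          /// guarded by mutex
    mutable bool loaded = false;  /// guarded by mutex
};

Keeping both the data and the loaded flag mutable is what lets a const accessor populate the cache; the header changes below formalize this with index_mutex and TSA_GUARDED_BY annotations.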
- LOG_ERROR(storage.log, "Part {} is broken and need manual correction", getDataPartStorage().getFullPath()); metadata_manager->deleteAll(/*include_projection*/ true); metadata_manager->assertAllDeleted(/*include_projection*/ true); throw; @@ -741,7 +771,7 @@ void IMergeTreeDataPart::addProjectionPart( projection_parts[projection_name] = std::move(projection_part); } -void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded) +void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded) { auto metadata_snapshot = storage.getInMemoryMetadataPtr(); for (const auto & projection : metadata_snapshot->projections) @@ -758,10 +788,34 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch else { auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build(); - part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); + + try + { + part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); + } + catch (...) + { + if (isRetryableException(std::current_exception())) + throw; + + auto message = getCurrentExceptionMessage(true); + LOG_ERROR(&Poco::Logger::get("IMergeTreeDataPart"), + "Cannot load projection {}, will consider it broken. Reason: {}", projection.name, message); + + has_broken_projection = true; + part->setBrokenReason(message, getCurrentExceptionCode()); + } + addProjectionPart(projection.name, std::move(part)); } } + else if (checksums.has(path)) + { + auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build(); + part->setBrokenReason("Projection directory " + path + " does not exist while loading projections", ErrorCodes::NO_FILE_IN_DATA_PART); + addProjectionPart(projection.name, std::move(part)); + has_broken_projection = true; + } } } @@ -776,8 +830,11 @@ void IMergeTreeDataPart::appendFilesOfIndexGranularity(Strings & /* files */) co { } -void IMergeTreeDataPart::loadIndex() +void IMergeTreeDataPart::loadIndex() const { + /// Memory for index must not be accounted as memory usage for query, because it belongs to a table. + MemoryTrackerBlockerInThread temporarily_disable_memory_tracker; + /// It can be empty in case of mutations if (!index_granularity.isInitialized()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Index granularity is not loaded before index loading"); @@ -814,6 +871,7 @@ void IMergeTreeDataPart::loadIndex() for (size_t i = 0; i < key_size; ++i) { + loaded_index[i]->shrinkToFit(); loaded_index[i]->protect(); if (loaded_index[i]->size() != marks_count) throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all data from index file {}(expected size: " @@ -1156,7 +1214,8 @@ void IMergeTreeDataPart::loadChecksums(bool require) /// Check the data while we are at it. LOG_WARNING(storage.log, "Checksums for part {} not found. 
Will calculate them from data on disk.", name); - checksums = checkDataPart(shared_from_this(), false); + bool noop; + checksums = checkDataPart(shared_from_this(), false, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */false); writeChecksums(checksums, {}); bytes_on_disk = checksums.getTotalSizeOnDisk(); @@ -1663,7 +1722,7 @@ try metadata_manager->deleteAll(true); metadata_manager->assertAllDeleted(true); - getDataPartStorage().rename(to.parent_path(), to.filename(), storage.log, remove_new_dir_if_exists, fsync_dir); + getDataPartStorage().rename(to.parent_path(), to.filename(), storage.log.load(), remove_new_dir_if_exists, fsync_dir); metadata_manager->updateAll(true); auto new_projection_root_path = to.string(); @@ -1758,7 +1817,7 @@ void IMergeTreeDataPart::remove() } bool is_temporary_part = is_temp || state == MergeTreeDataPartState::Temporary; - getDataPartStorage().remove(std::move(can_remove_callback), checksums, projection_checksums, is_temporary_part, storage.log); + getDataPartStorage().remove(std::move(can_remove_callback), checksums, projection_checksums, is_temporary_part, storage.log.load()); } std::optional IMergeTreeDataPart::getRelativePathForPrefix(const String & prefix, bool detached, bool broken) const @@ -1775,7 +1834,7 @@ std::optional IMergeTreeDataPart::getRelativePathForPrefix(const String if (detached && parent_part) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot detach projection"); - return getDataPartStorage().getRelativePathForPrefix(storage.log, prefix, detached, broken); + return getDataPartStorage().getRelativePathForPrefix(storage.log.load(), prefix, detached, broken); } std::optional IMergeTreeDataPart::getRelativePathForDetachedPart(const String & prefix, bool broken) const @@ -1841,7 +1900,7 @@ MutableDataPartStoragePtr IMergeTreeDataPart::makeCloneOnDisk( throw Exception(ErrorCodes::LOGICAL_ERROR, "Can not clone data part {} to empty directory.", name); String path_to_clone = fs::path(storage.relative_data_path) / directory_name / ""; - return getDataPartStorage().clonePart(path_to_clone, getDataPartStorage().getPartDirectory(), disk, read_settings, write_settings, storage.log, cancellation_hook); + return getDataPartStorage().clonePart(path_to_clone, getDataPartStorage().getPartDirectory(), disk, read_settings, write_settings, storage.log.load(), cancellation_hook); } UInt64 IMergeTreeDataPart::getIndexSizeFromFile() const @@ -2163,6 +2222,32 @@ std::optional IMergeTreeDataPart::getStreamNameForColumn( return getStreamNameOrHash(stream_name, extension, storage_); } +void IMergeTreeDataPart::markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const +{ + auto it = projection_parts.find(projection_name); + if (it == projection_parts.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no projection part '{}'", projection_name); + it->second->setBrokenReason(message, code); +} + +bool IMergeTreeDataPart::hasBrokenProjection(const String & projection_name) const +{ + auto it = projection_parts.find(projection_name); + if (it == projection_parts.end()) + return false; + return it->second->is_broken; +} + +void IMergeTreeDataPart::setBrokenReason(const String & message, int code) const +{ + std::lock_guard lock(broken_reason_mutex); + if (is_broken) + return; + is_broken = true; + exception = message; + exception_code = code; +} + bool isCompactPart(const MergeTreeDataPartPtr & data_part) { return (data_part && data_part->getType() == MergeTreeDataPartType::Compact); diff 
--git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 640a1f1d0a3..0d7acfab891 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,7 @@ public: using ColumnSizeByName = std::unordered_map; using NameToNumber = std::unordered_map; + using Index = Columns; using IndexSizeByName = std::unordered_map; using Type = MergeTreeDataPartType; @@ -212,10 +214,6 @@ public: const MergeTreeData & storage; -private: - String mutable_name; - mutable MergeTreeDataPartState state{MergeTreeDataPartState::Temporary}; - public: const String & name; // const ref to private mutable_name MergeTreePartInfo info; @@ -261,6 +259,12 @@ public: /// Frozen by ALTER TABLE ... FREEZE ... It is used for information purposes in system.parts table. mutable std::atomic is_frozen {false}; + /// If it is a projection part, it can be broken sometimes. + mutable std::atomic is_broken {false}; + mutable std::string exception; + mutable int exception_code = 0; + mutable std::mutex broken_reason_mutex; + /// Indicates that the part was marked Outdated by PartCheckThread because the part was not committed to ZooKeeper mutable bool is_unexpected_local_part = false; @@ -303,12 +307,6 @@ public: /// Throws an exception if state of the part is not in affordable_states void assertState(const std::initializer_list & affordable_states) const; - /// Primary key (correspond to primary.idx file). - /// Always loaded in RAM. Contains each index_granularity-th value of primary key tuple. - /// Note that marks (also correspond to primary key) is not always in RAM, but cached. See MarkCache.h. - using Index = Columns; - Index index; - MergeTreePartition partition; /// Amount of rows between marks @@ -363,6 +361,9 @@ public: /// Version of part metadata (columns, pk and so on). Managed properly only for replicated merge tree. int32_t metadata_version; + const Index & getIndex() const; + void setIndex(Columns index_); + /// For data in RAM ('index') UInt64 getIndexSizeInBytes() const; UInt64 getIndexSizeInAllocatedBytes() const; @@ -423,9 +424,16 @@ public: void addProjectionPart(const String & projection_name, std::shared_ptr && projection_part); + void markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const; + bool hasProjection(const String & projection_name) const { return projection_parts.contains(projection_name); } - void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); + bool hasBrokenProjection(const String & projection_name) const; + + /// Return true, if all projections were loaded successfully and none was marked as broken. + void loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded = false); + + void setBrokenReason(const String & message, int code) const; /// Return set of metadata file names without checksums. For example, /// columns.txt or checksums.txt itself. @@ -554,6 +562,12 @@ public: mutable std::atomic last_removal_attempt_time = 0; protected: + /// Primary key (correspond to primary.idx file). + /// Lazily loaded in RAM. Contains each index_granularity-th value of primary key tuple. + /// Note that marks (also correspond to primary key) are not always in RAM, but cached. See MarkCache.h. 
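loadProjections() now takes a has_broken_projection out-parameter: a projection that fails to load is marked broken with a reason instead of failing the whole data part. Reduced to a sketch under invented names (ProjectionPart, loadProjections as shown here are simplified stand-ins); the real code additionally rethrows retryable exceptions and records an error code.

#include <exception>
#include <map>
#include <memory>
#include <string>

struct ProjectionPart
{
    bool is_broken = false;
    std::string broken_reason;

    void setBrokenReason(std::string reason)
    {
        is_broken = true;
        broken_reason = std::move(reason);
    }

    void load() { /* may throw if the projection directory is corrupted */ }
};

using ProjectionParts = std::map<std::string, std::shared_ptr<ProjectionPart>>;

/// Loads every projection; a failure only marks that projection as broken.
void loadProjections(ProjectionParts & projections, bool & has_broken_projection)
{
    for (auto & [name, part] : projections)
    {
        try
        {
            part->load();
        }
        catch (const std::exception & e)
        {
            part->setBrokenReason(e.what());   /// keep the reason for checks and system tables
            has_broken_projection = true;
        }
    }
}

Consistency checks, merges and backups can then skip parts whose is_broken flag is set, which is what the later MergeTask and MergeTreeData hunks do.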
+ mutable std::mutex index_mutex; + mutable Index index TSA_GUARDED_BY(index_mutex); + mutable bool index_loaded TSA_GUARDED_BY(index_mutex) = false; /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk ColumnSize total_columns_size; @@ -579,7 +593,7 @@ protected: const IMergeTreeDataPart * parent_part; String parent_part_name; - std::map> projection_parts; + mutable std::map> projection_parts; mutable PartMetadataManagerPtr metadata_manager; @@ -610,6 +624,9 @@ protected: void initializeIndexGranularityInfo(); private: + String mutable_name; + mutable MergeTreeDataPartState state{MergeTreeDataPartState::Temporary}; + /// In compact parts order of columns is necessary NameToNumber column_name_to_position; @@ -647,8 +664,8 @@ private: virtual void appendFilesOfIndexGranularity(Strings & files) const; - /// Loads index file. - void loadIndex(); + /// Loads the index file. + void loadIndex() const TSA_REQUIRES(index_mutex); void appendFilesOfIndex(Strings & files) const; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 4b5b7ca8018..e6ae63da7e3 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include namespace DB { @@ -433,7 +436,7 @@ MergeTask::StageRuntimeContextPtr MergeTask::VerticalMergeStage::getContextForNe bool MergeTask::ExecuteAndFinalizeHorizontalPart::execute() { assert(subtasks_iterator != subtasks.end()); - if ((*subtasks_iterator)()) + if ((this->**subtasks_iterator)()) return true; /// Move to the next subtask in an array of subtasks @@ -588,7 +591,15 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const auto pipe = Pipe::unitePipes(std::move(pipes)); ctx->rows_sources_read_buf->seek(0, 0); - auto transform = std::make_unique(pipe.getHeader(), pipe.numOutputPorts(), *ctx->rows_sources_read_buf); + + const auto data_settings = global_ctx->data->getSettings(); + auto transform = std::make_unique( + pipe.getHeader(), + pipe.numOutputPorts(), + *ctx->rows_sources_read_buf, + data_settings->merge_max_block_size, + data_settings->merge_max_block_size_bytes); + pipe.addTransform(std::move(transform)); ctx->column_parts_pipeline = QueryPipeline(std::move(pipe)); @@ -720,8 +731,9 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c MergeTreeData::DataPartsVector projection_parts; for (const auto & part : global_ctx->future_part->parts) { - auto it = part->getProjectionParts().find(projection.name); - if (it != part->getProjectionParts().end()) + auto actual_projection_parts = part->getProjectionParts(); + auto it = actual_projection_parts.find(projection.name); + if (it != actual_projection_parts.end() && !it->second->is_broken) projection_parts.push_back(it->second); } if (projection_parts.size() < global_ctx->future_part->parts.size()) @@ -815,7 +827,7 @@ bool MergeTask::MergeProjectionsStage::finalizeProjectionsAndWholeMerge() const bool MergeTask::VerticalMergeStage::execute() { assert(subtasks_iterator != subtasks.end()); - if ((*subtasks_iterator)()) + if ((this->**subtasks_iterator)()) return true; /// Move to the next subtask in an array of subtasks @@ -826,7 +838,7 @@ bool MergeTask::VerticalMergeStage::execute() bool MergeTask::MergeProjectionsStage::execute() { assert(subtasks_iterator != subtasks.end()); - if ((*subtasks_iterator)()) + if ((this->**subtasks_iterator)()) return true; /// Move to the next subtask in an array of subtasks @@ 
-1047,13 +1059,14 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() break; } - auto res_pipe = Pipe::unitePipes(std::move(pipes)); - res_pipe.addTransform(std::move(merged_transform)); + auto builder = std::make_unique(); + builder->init(Pipe::unitePipes(std::move(pipes))); + builder->addTransform(std::move(merged_transform)); #ifndef NDEBUG if (!sort_description.empty()) { - res_pipe.addSimpleTransform([&](const Block & header_) + builder->addSimpleTransform([&](const Block & header_) { auto transform = std::make_shared(header_, sort_description); return transform; @@ -1075,26 +1088,34 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() } if (DistinctSortedTransform::isApplicable(header, sort_description, global_ctx->deduplicate_by_columns)) - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + builder->addTransform(std::make_shared( + builder->getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); else - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + builder->addTransform(std::make_shared( + builder->getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); } + PreparedSets::Subqueries subqueries; + if (ctx->need_remove_expired_values) - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), *global_ctx->data, global_ctx->metadata_snapshot, global_ctx->new_data_part, global_ctx->time_of_merge, ctx->force_ttl)); + { + auto transform = std::make_shared(global_ctx->context, builder->getHeader(), *global_ctx->data, global_ctx->metadata_snapshot, global_ctx->new_data_part, global_ctx->time_of_merge, ctx->force_ttl); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } if (global_ctx->metadata_snapshot->hasSecondaryIndices()) { const auto & indices = global_ctx->metadata_snapshot->getSecondaryIndices(); - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), indices.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext()))); - res_pipe.addTransform(std::make_shared(res_pipe.getHeader())); + builder->addTransform(std::make_shared( + builder->getHeader(), indices.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext()))); + builder->addTransform(std::make_shared(builder->getHeader())); } - global_ctx->merged_pipeline = QueryPipeline(std::move(res_pipe)); + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), global_ctx->context); + + global_ctx->merged_pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); /// Dereference unique_ptr and pass horizontal_stage_progress by reference global_ctx->merged_pipeline.setProgressCallback(MergeProgressCallback(global_ctx->merge_list_element_ptr, global_ctx->watch_prev_elapsed, *global_ctx->horizontal_stage_progress)); /// Is calculated inside MergeProgressCallback. 
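The subtask arrays above switch from std::function-wrapped lambdas to pointers to member functions, dispatched with (this->**subtasks_iterator)(); the MergeTask.h change that follows gives the motivation (shorter, more readable stack traces). A compact sketch of the idiom with a hypothetical Stage class:

#include <array>
#include <cstdio>

class Stage
{
public:
    /// Runs the current subtask; returns true while the stage still has work to do.
    bool execute()
    {
        if ((this->**subtasks_iterator)())
            return true;

        ++subtasks_iterator;
        return subtasks_iterator != subtasks.end();
    }

private:
    bool prepare()     { std::puts("prepare");     return false; }
    bool executeImpl() { std::puts("executeImpl"); return false; }

    using Subtasks = std::array<bool (Stage::*)(), 2>;

    /// Plain pointers to members: no type-erased std::function frames in backtraces.
    const Subtasks subtasks{&Stage::prepare, &Stage::executeImpl};
    Subtasks::const_iterator subtasks_iterator = subtasks.begin();
};

A backtrace taken inside a subtask now shows Stage::prepare or Stage::executeImpl directly rather than the call operator of a lambda wrapped in std::function.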
diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h index 6f5336baaad..7fb4797e482 100644 --- a/src/Storages/MergeTree/MergeTask.h +++ b/src/Storages/MergeTree/MergeTask.h @@ -246,15 +246,16 @@ private: bool prepare(); bool executeImpl(); - using ExecuteAndFinalizeHorizontalPartSubtasks = std::array, 2>; + /// NOTE: Using pointer-to-member instead of std::function and lambda makes stacktraces much more concise and readable + using ExecuteAndFinalizeHorizontalPartSubtasks = std::array; - ExecuteAndFinalizeHorizontalPartSubtasks subtasks + const ExecuteAndFinalizeHorizontalPartSubtasks subtasks { - [this] () { return prepare(); }, - [this] () { return executeImpl(); } + &ExecuteAndFinalizeHorizontalPart::prepare, + &ExecuteAndFinalizeHorizontalPart::executeImpl }; - ExecuteAndFinalizeHorizontalPartSubtasks::iterator subtasks_iterator = subtasks.begin(); + ExecuteAndFinalizeHorizontalPartSubtasks::const_iterator subtasks_iterator = subtasks.begin(); MergeAlgorithm chooseMergeAlgorithm() const; @@ -323,16 +324,17 @@ private: bool executeVerticalMergeForAllColumns() const; bool finalizeVerticalMergeForAllColumns() const; - using VerticalMergeStageSubtasks = std::array, 3>; + /// NOTE: Using pointer-to-member instead of std::function and lambda makes stacktraces much more concise and readable + using VerticalMergeStageSubtasks = std::array; - VerticalMergeStageSubtasks subtasks + const VerticalMergeStageSubtasks subtasks { - [this] () { return prepareVerticalMergeForAllColumns(); }, - [this] () { return executeVerticalMergeForAllColumns(); }, - [this] () { return finalizeVerticalMergeForAllColumns(); } + &VerticalMergeStage::prepareVerticalMergeForAllColumns, + &VerticalMergeStage::executeVerticalMergeForAllColumns, + &VerticalMergeStage::finalizeVerticalMergeForAllColumns }; - VerticalMergeStageSubtasks::iterator subtasks_iterator = subtasks.begin(); + VerticalMergeStageSubtasks::const_iterator subtasks_iterator = subtasks.begin(); void prepareVerticalMergeForOneColumn() const; bool executeVerticalMergeForOneColumn() const; @@ -373,16 +375,17 @@ private: bool executeProjections() const; bool finalizeProjectionsAndWholeMerge() const; - using MergeProjectionsStageSubtasks = std::array, 3>; + /// NOTE: Using pointer-to-member instead of std::function and lambda makes stacktraces much more concise and readable + using MergeProjectionsStageSubtasks = std::array; - MergeProjectionsStageSubtasks subtasks + const MergeProjectionsStageSubtasks subtasks { - [this] () { return mergeMinMaxIndexAndPrepareProjections(); }, - [this] () { return executeProjections(); }, - [this] () { return finalizeProjectionsAndWholeMerge(); } + &MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections, + &MergeProjectionsStage::executeProjections, + &MergeProjectionsStage::finalizeProjectionsAndWholeMerge }; - MergeProjectionsStageSubtasks::iterator subtasks_iterator = subtasks.begin(); + MergeProjectionsStageSubtasks::const_iterator subtasks_iterator = subtasks.begin(); MergeProjectionsRuntimeContextPtr ctx; GlobalRuntimeContextPtr global_ctx; @@ -392,14 +395,14 @@ private: using Stages = std::array; - Stages stages + const Stages stages { std::make_shared(), std::make_shared(), std::make_shared() }; - Stages::iterator stages_iterator = stages.begin(); + Stages::const_iterator stages_iterator = stages.begin(); /// Check for persisting block number column static bool supportsBlockNumberColumn(GlobalRuntimeContextPtr global_ctx) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp 
b/src/Storages/MergeTree/MergeTreeData.cpp index 39c113c240e..5b297de3fda 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -61,7 +60,6 @@ #include #include #include -#include #include #include #include @@ -75,7 +73,6 @@ #include #include #include -#include #include #include #include @@ -93,9 +90,7 @@ #include #include -#include #include -#include #include #include #include @@ -300,7 +295,11 @@ void MergeTreeData::initializeDirectoriesAndFormatVersion(const std::string & re if (disk->isBroken()) continue; - if (!disk->isReadOnly()) + /// Write once disk is almost the same as read-only for MergeTree, + /// since it does not support move, that is required for any + /// operation over MergeTree, so avoid writing format_version.txt + /// into it as well, to avoid leaving it after DROP. + if (!disk->isReadOnly() && !disk->isWriteOnce()) { auto buf = disk->writeFile(format_version_path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, getContext()->getWriteSettings()); writeIntText(format_version.toUnderType(), *buf); @@ -354,8 +353,7 @@ MergeTreeData::MergeTreeData( , merging_params(merging_params_) , require_part_metadata(require_part_metadata_) , broken_part_callback(broken_part_callback_) - , log_name(std::make_shared(table_id_.getNameForLogs())) - , log(getLogger(*log_name)) + , log(table_id_.getNameForLogs()) , storage_settings(std::move(storage_settings_)) , pinned_part_uuids(std::make_shared()) , data_parts_by_info(data_parts_indexes.get()) @@ -871,7 +869,7 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat if (is_optional) return; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: Sign column for storage {} is empty", storage); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Sign column for storage {} is empty", storage); } bool miss_column = true; @@ -898,7 +896,7 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat if (is_optional) return; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: Version column for storage {} is empty", storage); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Version column for storage {} is empty", storage); } bool miss_column = true; @@ -927,12 +925,12 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat if (is_optional) return; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: is_deleted ({}) column for storage {} is empty", is_deleted_column, storage); + throw Exception(ErrorCodes::LOGICAL_ERROR, "`is_deleted` ({}) column for storage {} is empty", is_deleted_column, storage); } else { if (version_column.empty() && !is_optional) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: Version column ({}) for storage {} is empty while is_deleted ({}) is not.", + throw Exception(ErrorCodes::LOGICAL_ERROR, "Version column ({}) for storage {} is empty while is_deleted ({}) is not.", version_column, storage, is_deleted_column); bool miss_is_deleted_column = true; @@ -1296,7 +1294,7 @@ MergeTreeData::LoadPartResult MergeTreeData::loadDataPart( res.is_broken = true; tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part_name, part_path)); - res.size_of_part = calculatePartSizeSafe(res.part, log); + res.size_of_part = calculatePartSizeSafe(res.part, log.load()); auto part_size_str = res.size_of_part ? 
formatReadableSizeWithBinarySuffix(*res.size_of_part) : "failed to calculate size"; LOG_ERROR(log, @@ -1327,7 +1325,7 @@ MergeTreeData::LoadPartResult MergeTreeData::loadDataPart( if (part_disk_ptr->exists(marker_path)) { /// NOTE: getBytesOnDisk() cannot be used here, since it may be zero if checksums.txt does not exist. - res.size_of_part = calculatePartSizeSafe(res.part, log); + res.size_of_part = calculatePartSizeSafe(res.part, log.load()); res.is_broken = true; auto part_size_str = res.size_of_part ? formatReadableSizeWithBinarySuffix(*res.size_of_part) : "failed to calculate size"; @@ -2114,7 +2112,7 @@ size_t MergeTreeData::clearOldTemporaryDirectories(const String & root_path, siz { /// Actually we don't rely on temporary_directories_lifetime when removing old temporaries directories, /// it's just an extra level of protection just in case we have a bug. - LOG_INFO(LogFrequencyLimiter(log, 10), "{} is in use (by merge/mutation/INSERT) (consider increasing temporary_directories_lifetime setting)", full_path); + LOG_INFO(LogFrequencyLimiter(log.load(), 10), "{} is in use (by merge/mutation/INSERT) (consider increasing temporary_directories_lifetime setting)", full_path); continue; } else if (!disk->exists(it->path())) @@ -2734,12 +2732,20 @@ void MergeTreeData::rename(const String & new_table_path, const StorageID & new_ void MergeTreeData::renameInMemory(const StorageID & new_table_id) { IStorage::renameInMemory(new_table_id); - std::atomic_store(&log_name, std::make_shared(new_table_id.getNameForLogs())); - log = getLogger(*log_name); + log.store(new_table_id.getNameForLogs()); } void MergeTreeData::dropAllData() { + /// In case there is read-only/write-once disk we cannot allow to call dropAllData(), but dropping tables is allowed. + /// + /// Note, that one may think that drop on write-once disk should be + /// supported, since it is pretty trivial to implement + /// MetadataStorageFromPlainObjectStorageTransaction::removeDirectory(), + /// however removing part requires moveDirectory() as well. 
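Throughout MergeTreeData the pair of log_name (a shared_ptr swapped with std::atomic_load/store) and log is replaced by a single AtomicLogger, so RENAME TABLE can publish a new name while other threads keep logging. I have not inspected AtomicLogger itself, so the following is only an approximation of the behaviour using C++20's std::atomic<std::shared_ptr<...>>; the class and method names are assumptions.

#include <atomic>
#include <memory>
#include <string>

/// Approximation of an atomically renameable logger name (not ClickHouse's AtomicLogger).
class RenameableLogName
{
public:
    explicit RenameableLogName(std::string initial)
        : name(std::make_shared<const std::string>(std::move(initial))) {}

    /// Readers get a consistent snapshot even if a concurrent RENAME is in progress.
    std::string loadName() const { return *name.load(); }

    /// RENAME TABLE publishes the new name atomically.
    void store(std::string new_name)
    {
        name.store(std::make_shared<const std::string>(std::move(new_name)));
    }

private:
    std::atomic<std::shared_ptr<const std::string>> name;   /// requires C++20
};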
+ if (isStaticStorage()) + return; + LOG_TRACE(log, "dropAllData: waiting for locks."); auto settings_ptr = getSettings(); @@ -5286,7 +5292,7 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( if (hold_table_lock && !table_lock) table_lock = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); - if (backup_settings.check_parts) + if (backup_settings.check_projection_parts) part->checkConsistencyWithProjections(/* require_part_metadata= */ true); BackupEntries backup_entries_from_part; @@ -5298,7 +5304,8 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( read_settings, make_temporary_hard_links, backup_entries_from_part, - &temp_dirs); + &temp_dirs, + false, false); auto projection_parts = part->getProjectionParts(); for (const auto & [projection_name, projection_part] : projection_parts) @@ -5311,7 +5318,9 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( read_settings, make_temporary_hard_links, backup_entries_from_part, - &temp_dirs); + &temp_dirs, + projection_part->is_broken, + backup_settings.allow_backup_broken_projections); } if (hold_storage_and_part_ptrs) @@ -6249,13 +6258,13 @@ ReservationPtr MergeTreeData::tryReserveSpacePreferringTTLRules( log, "Would like to reserve space on volume '{}' by TTL rule of table '{}' but volume was not found", move_ttl_entry->destination_name, - *std::atomic_load(&log_name)); + log.loadName()); else if (move_ttl_entry->destination_type == DataDestinationType::DISK && !move_ttl_entry->if_exists) LOG_WARNING( log, "Would like to reserve space on disk '{}' by TTL rule of table '{}' but disk was not found", move_ttl_entry->destination_name, - *std::atomic_load(&log_name)); + log.loadName()); } else if (is_insert && !perform_ttl_move_on_insert) { @@ -6264,7 +6273,7 @@ ReservationPtr MergeTreeData::tryReserveSpacePreferringTTLRules( "TTL move on insert to {} {} for table {} is disabled", (move_ttl_entry->destination_type == DataDestinationType::VOLUME ? 
"volume" : "disk"), move_ttl_entry->destination_name, - *std::atomic_load(&log_name)); + log.loadName()); } else { @@ -6280,13 +6289,13 @@ ReservationPtr MergeTreeData::tryReserveSpacePreferringTTLRules( log, "Would like to reserve space on volume '{}' by TTL rule of table '{}' but there is not enough space", move_ttl_entry->destination_name, - *std::atomic_load(&log_name)); + log.loadName()); else if (move_ttl_entry->destination_type == DataDestinationType::DISK) LOG_WARNING( log, "Would like to reserve space on disk '{}' by TTL rule of table '{}' but there is not enough space", move_ttl_entry->destination_name, - *std::atomic_load(&log_name)); + log.loadName()); } } } @@ -6815,7 +6824,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( { for (const auto & part : real_parts) { - const auto & primary_key_column = *part->index[0]; + const auto & primary_key_column = *part->getIndex()[0]; auto & min_column = assert_cast(*partition_minmax_count_columns[pos]); insert(min_column, primary_key_column[0]); } @@ -6826,7 +6835,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( { for (const auto & part : real_parts) { - const auto & primary_key_column = *part->index[0]; + const auto & primary_key_column = *part->getIndex()[0]; auto & max_column = assert_cast(*partition_minmax_count_columns[pos]); insert(max_column, primary_key_column[primary_key_column.size() - 1]); } @@ -7071,6 +7080,8 @@ std::pair MergeTreeData::cloneAn const ReadSettings & read_settings, const WriteSettings & write_settings) { + chassert(!isStaticStorage()); + /// Check that the storage policy contains the disk where the src_part is located. bool does_storage_policy_allow_same_disk = false; for (const DiskPtr & disk : getStoragePolicy()->getDisks()) @@ -7803,21 +7814,39 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & bool MergeTreeData::partsContainSameProjections(const DataPartPtr & left, const DataPartPtr & right, String & out_reason) { - if (left->getProjectionParts().size() != right->getProjectionParts().size()) + auto remove_broken_parts_from_consideration = [](auto & parts) + { + std::set broken_projection_parts; + for (const auto & [name, part] : parts) + { + if (part->is_broken) + broken_projection_parts.emplace(name); + } + for (const auto & name : broken_projection_parts) + parts.erase(name); + }; + + auto left_projection_parts = left->getProjectionParts(); + auto right_projection_parts = right->getProjectionParts(); + + remove_broken_parts_from_consideration(left_projection_parts); + remove_broken_parts_from_consideration(right_projection_parts); + + if (left_projection_parts.size() != right_projection_parts.size()) { out_reason = fmt::format( "Parts have different number of projections: {} in part '{}' and {} in part '{}'", - left->getProjectionParts().size(), + left_projection_parts.size(), left->name, - right->getProjectionParts().size(), + right_projection_parts.size(), right->name ); return false; } - for (const auto & [name, _] : left->getProjectionParts()) + for (const auto & [name, _] : left_projection_parts) { - if (!right->hasProjection(name)) + if (!right_projection_parts.contains(name)) { out_reason = fmt::format( "The part '{}' doesn't have projection '{}' while part '{}' does", right->name, name, left->name @@ -7989,7 +8018,7 @@ bool MergeTreeData::insertQueryIdOrThrowNoLock(const String & query_id, size_t m throw Exception( ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, "Too many simultaneous queries for table {}. 
Maximum is: {}", - *std::atomic_load(&log_name), + log.loadName(), max_queries); query_id_set.insert(query_id); return true; @@ -8181,7 +8210,7 @@ ReservationPtr MergeTreeData::balancedReservation( } // Record submerging big parts in the tagger to clean them up. - tagger_ptr->emplace(*this, part_name, std::move(covered_parts), log); + tagger_ptr->emplace(*this, part_name, std::move(covered_parts), log.load()); } } } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index caef247500a..1de79ed17ca 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include @@ -462,14 +462,19 @@ public: /// Load the set of data parts from disk. Call once - immediately after the object is created. void loadDataParts(bool skip_sanity_checks, std::optional> expected_parts); - String getLogName() const { return *std::atomic_load(&log_name); } + String getLogName() const { return log.loadName(); } Int64 getMaxBlockNumber() const; struct ProjectionPartsVector { - DataPartsVector projection_parts; DataPartsVector data_parts; + + DataPartsVector projection_parts; + DataPartStateVector projection_parts_states; + + DataPartsVector broken_projection_parts; + DataPartStateVector broken_projection_parts_states; }; /// Returns a copy of the list so that the caller shouldn't worry about locks. @@ -484,7 +489,7 @@ public: const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const; /// Same as above but only returns projection parts ProjectionPartsVector getProjectionPartsVectorForInternalUsage( - const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const; + const DataPartStates & affordable_states, MergeTreeData::DataPartStateVector * out_states) const; /// Returns absolutely all parts (and snapshot of their states) @@ -1115,10 +1120,7 @@ protected: /// Engine-specific methods BrokenPartCallback broken_part_callback; - /// log_name will change during table RENAME. Use atomic_shared_ptr to allow concurrent RW. - /// NOTE clang-14 doesn't have atomic_shared_ptr yet. Use std::atomic* operations for now. - std::shared_ptr log_name; - LoggerPtr log; + AtomicLogger log; /// Storage settings. /// Use get and set to receive readonly versions. 
diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 58fddde7b54..1bf1d4a3c29 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -85,7 +85,7 @@ UInt64 MergeTreeDataMergerMutator::getMaxSourcePartsSizeForMerge(size_t max_coun if (scheduled_tasks_count > max_count) { throw Exception(ErrorCodes::LOGICAL_ERROR, - "Logical error: invalid argument passed to getMaxSourcePartsSize: scheduled_tasks_count = {} > max_count = {}", + "Invalid argument passed to getMaxSourcePartsSize: scheduled_tasks_count = {} > max_count = {}", scheduled_tasks_count, max_count); } @@ -511,7 +511,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMergeFromRanges( /// Do not allow to "merge" part with itself for regular merges, unless it is a TTL-merge where it is ok to remove some values with expired ttl if (parts_to_merge.size() == 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: merge selector returned only one part to merge"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Merge selector returned only one part to merge"); if (parts_to_merge.empty()) { diff --git a/src/Storages/MergeTree/MergeTreeDataPartChecksum.h b/src/Storages/MergeTree/MergeTreeDataPartChecksum.h index 837b940e354..d4980a67a43 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartChecksum.h +++ b/src/Storages/MergeTree/MergeTreeDataPartChecksum.h @@ -54,6 +54,8 @@ struct MergeTreeDataPartChecksums bool has(const String & file_name) const { return files.find(file_name) != files.end(); } + bool remove(const String & file_name) { return files.erase(file_name); } + bool empty() const { return files.empty(); } /// Checks that the set of columns and their checksums are the same. If not, throws an exception. 
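The MergeTreeDataPartWriterOnDisk hunks that follow wrap each skip-index and statistics update in a ProfileEventTimeIncrement watch and accumulate per-object totals in a new ExecutionStatistics struct, logging them when the part is finished. The underlying scoped-accumulator pattern, sketched without ClickHouse's profiling classes (ScopedMicrosecondAdder is an invented name):

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <vector>

/// RAII helper: on destruction, adds the elapsed wall time in microseconds to a counter.
class ScopedMicrosecondAdder
{
public:
    explicit ScopedMicrosecondAdder(uint64_t & counter_)
        : counter(counter_), start(std::chrono::steady_clock::now()) {}

    ~ScopedMicrosecondAdder()
    {
        auto elapsed = std::chrono::steady_clock::now() - start;
        counter += std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
    }

private:
    uint64_t & counter;
    std::chrono::steady_clock::time_point start;
};

int main()
{
    std::vector<uint64_t> per_index_us(3, 0);   /// one counter per skip index

    for (int block = 0; block < 100; ++block)
    {
        for (size_t i = 0; i < per_index_us.size(); ++i)
        {
            ScopedMicrosecondAdder watch(per_index_us[i]);
            /// ... update skip index i with the current block ...
        }
    }

    for (size_t i = 0; i < per_index_us.size(); ++i)
        std::printf("index %zu: %llu us\n", i, static_cast<unsigned long long>(per_index_us[i]));
    return 0;
}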
diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 6e544b4a35a..fd83d2ebfe9 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -1,6 +1,14 @@ #include #include +#include #include +#include + +namespace ProfileEvents +{ +extern const Event MergeTreeDataWriterSkipIndicesCalculationMicroseconds; +extern const Event MergeTreeDataWriterStatisticsCalculationMicroseconds; +} namespace DB { @@ -148,6 +156,8 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( , default_codec(default_codec_) , compute_granularity(index_granularity.empty()) , compress_primary_key(settings.compress_primary_key) + , execution_stats(skip_indices.size(), stats.size()) + , log(getLogger(storage.getLogName() + " (DataPartWriter)")) { if (settings.blocks_are_granules_size && !index_granularity.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, @@ -329,9 +339,12 @@ void MergeTreeDataPartWriterOnDisk::calculateAndSerializePrimaryIndex(const Bloc void MergeTreeDataPartWriterOnDisk::calculateAndSerializeStatistics(const Block & block) { - for (const auto & stat_ptr : stats) + for (size_t i = 0; i < stats.size(); ++i) { + const auto & stat_ptr = stats[i]; + ProfileEventTimeIncrement watch(ProfileEvents::MergeTreeDataWriterStatisticsCalculationMicroseconds); stat_ptr->update(block.getByName(stat_ptr->columnName()).column); + execution_stats.statistics_build_us[i] += watch.elapsed(); } } @@ -378,10 +391,14 @@ void MergeTreeDataPartWriterOnDisk::calculateAndSerializeSkipIndices(const Block writeBinaryLittleEndian(1UL, marks_out); } + ProfileEventTimeIncrement watch(ProfileEvents::MergeTreeDataWriterSkipIndicesCalculationMicroseconds); + size_t pos = granule.start_row; skip_indices_aggregators[i]->update(skip_indexes_block, &pos, granule.rows_to_write); if (granule.is_complete) ++skip_index_accumulated_marks[i]; + + execution_stats.skip_indices_build_us[i] += watch.elapsed(); } } } @@ -481,6 +498,9 @@ void MergeTreeDataPartWriterOnDisk::finishStatisticsSerialization(bool sync) if (sync) stream->sync(); } + + for (size_t i = 0; i < stats.size(); ++i) + LOG_DEBUG(log, "Spent {} ms calculating statistics {} for the part {}", execution_stats.statistics_build_us[i] / 1000, stats[i]->columnName(), data_part->name); } void MergeTreeDataPartWriterOnDisk::fillStatisticsChecksums(MergeTreeData::DataPart::Checksums & checksums) @@ -504,6 +524,10 @@ void MergeTreeDataPartWriterOnDisk::finishSkipIndicesSerialization(bool sync) } for (auto & store: gin_index_stores) store.second->finalize(); + + for (size_t i = 0; i < skip_indices.size(); ++i) + LOG_DEBUG(log, "Spent {} ms calculating index {} for the part {}", execution_stats.skip_indices_build_us[i] / 1000, skip_indices[i]->index.name, data_part->name); + gin_index_stores.clear(); skip_indices_streams.clear(); skip_indices_aggregators.clear(); diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index 4d081778e68..9f2cc3970fa 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -190,6 +190,20 @@ private: void initStatistics(); virtual void fillIndexGranularity(size_t index_granularity_for_block, size_t rows_in_block) = 0; + + struct ExecutionStatistics + { + ExecutionStatistics(size_t skip_indices_cnt, size_t stats_cnt) + : skip_indices_build_us(skip_indices_cnt, 0), 
statistics_build_us(stats_cnt, 0) + { + } + + std::vector skip_indices_build_us; // [i] corresponds to the i-th index + std::vector statistics_build_us; // [i] corresponds to the i-th stat + }; + ExecutionStatistics execution_stats; + + LoggerPtr log; }; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index a76d370d057..1ba28713680 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1087,7 +1087,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( MarkRanges res; size_t marks_count = part->index_granularity.getMarksCount(); - const auto & index = part->index; + const auto & index = part->getIndex(); if (marks_count == 0) return res; diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index ce3015c5dcb..ebf887f5e9e 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -1,21 +1,22 @@ -#include -#include -#include #include -#include -#include -#include +#include +#include +#include #include +#include +#include #include #include #include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include #include @@ -35,11 +36,16 @@ namespace ProfileEvents extern const Event MergeTreeDataWriterRows; extern const Event MergeTreeDataWriterUncompressedBytes; extern const Event MergeTreeDataWriterCompressedBytes; + extern const Event MergeTreeDataWriterSortingBlocksMicroseconds; + extern const Event MergeTreeDataWriterMergingBlocksMicroseconds; + extern const Event MergeTreeDataWriterProjectionsCalculationMicroseconds; extern const Event MergeTreeDataProjectionWriterBlocks; extern const Event MergeTreeDataProjectionWriterBlocksAlreadySorted; extern const Event MergeTreeDataProjectionWriterRows; extern const Event MergeTreeDataProjectionWriterUncompressedBytes; extern const Event MergeTreeDataProjectionWriterCompressedBytes; + extern const Event MergeTreeDataProjectionWriterSortingBlocksMicroseconds; + extern const Event MergeTreeDataProjectionWriterMergingBlocksMicroseconds; extern const Event RejectedInserts; } @@ -126,13 +132,18 @@ void buildScatterSelector( /// Computes ttls and updates ttl infos void updateTTL( + const ContextPtr context, const TTLDescription & ttl_entry, IMergeTreeDataPart::TTLInfos & ttl_infos, DB::MergeTreeDataPartTTLInfo & ttl_info, const Block & block, bool update_part_min_max_ttls) { - auto ttl_column = ITTLAlgorithm::executeExpressionAndGetColumn(ttl_entry.expression, block, ttl_entry.result_column); + auto expr_and_set = ttl_entry.buildExpression(context); + for (auto & subquery : expr_and_set.sets->getSubqueries()) + subquery->buildSetInplace(context); + + auto ttl_column = ITTLAlgorithm::executeExpressionAndGetColumn(expr_and_set.expression, block, ttl_entry.result_column); if (const ColumnUInt16 * column_date = typeid_cast(ttl_column.get())) { @@ -373,13 +384,13 @@ Block MergeTreeDataWriter::mergeBlock( /// Check that after first merge merging_algorithm is waiting for data from input 0. if (status.required_source != 0) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: required source after the first merge is not 0. 
Chunk rows: {}, is_finished: {}, required_source: {}, algorithm: {}", status.chunk.getNumRows(), status.is_finished, status.required_source, merging_algorithm->getName()); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Required source after the first merge is not 0. Chunk rows: {}, is_finished: {}, required_source: {}, algorithm: {}", status.chunk.getNumRows(), status.is_finished, status.required_source, merging_algorithm->getName()); status = merging_algorithm->merge(); /// Check that merge is finished. if (!status.is_finished) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: merge is not finished after the second merge."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Merge is not finished after the second merge."); /// Merged Block is sorted and we don't need to use permutation anymore permutation = nullptr; @@ -428,7 +439,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( auto max_month = date_lut.toNumYYYYMM(max_date); if (min_month != max_month) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: part spans more than one month."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Part spans more than one month."); part_name = new_part_info.getPartNameV0(min_date, max_date); } @@ -472,6 +483,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( IColumn::Permutation perm; if (!sort_description.empty()) { + ProfileEventTimeIncrement watch(ProfileEvents::MergeTreeDataWriterSortingBlocksMicroseconds); + if (!isAlreadySorted(block, sort_description)) { stableGetPermutation(block, sort_description, perm); @@ -483,7 +496,10 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( Names partition_key_columns = metadata_snapshot->getPartitionKey().column_names; if (context->getSettingsRef().optimize_on_insert) + { + ProfileEventTimeIncrement watch(ProfileEvents::MergeTreeDataWriterMergingBlocksMicroseconds); block = mergeBlock(block, sort_description, partition_key_columns, perm_ptr, data.merging_params); + } /// Size of part would not be greater than block.bytes() + epsilon size_t expected_size = block.bytes(); @@ -496,7 +512,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( DB::IMergeTreeDataPart::TTLInfos move_ttl_infos; const auto & move_ttl_entries = metadata_snapshot->getMoveTTLs(); for (const auto & ttl_entry : move_ttl_entries) - updateTTL(ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); + updateTTL(context, ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); ReservationPtr reservation = data.reserveSpacePreferringTTLRules(metadata_snapshot, expected_size, move_ttl_infos, time(nullptr), 0, true); VolumePtr volume = data.getStoragePolicy()->getVolume(0); @@ -551,20 +567,20 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( } if (metadata_snapshot->hasRowsTTL()) - updateTTL(metadata_snapshot->getRowsTTL(), new_data_part->ttl_infos, new_data_part->ttl_infos.table_ttl, block, true); + updateTTL(context, metadata_snapshot->getRowsTTL(), new_data_part->ttl_infos, new_data_part->ttl_infos.table_ttl, block, true); for (const auto & ttl_entry : metadata_snapshot->getGroupByTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.group_by_ttl[ttl_entry.result_column], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.group_by_ttl[ttl_entry.result_column], block, true); for (const auto & 
ttl_entry : metadata_snapshot->getRowsWhereTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.rows_where_ttl[ttl_entry.result_column], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.rows_where_ttl[ttl_entry.result_column], block, true); for (const auto & [name, ttl_entry] : metadata_snapshot->getColumnTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.columns_ttl[name], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.columns_ttl[name], block, true); const auto & recompression_ttl_entries = metadata_snapshot->getRecompressionTTLs(); for (const auto & ttl_entry : recompression_ttl_entries) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.recompression_ttl[ttl_entry.result_column], block, false); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.recompression_ttl[ttl_entry.result_column], block, false); new_data_part->ttl_infos.update(move_ttl_infos); @@ -588,7 +604,13 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( for (const auto & projection : metadata_snapshot->getProjections()) { - auto projection_block = projection.calculate(block, context); + Block projection_block; + { + ProfileEventTimeIncrement watch(ProfileEvents::MergeTreeDataWriterProjectionsCalculationMicroseconds); + projection_block = projection.calculate(block, context); + LOG_DEBUG(log, "Spent {} ms calculating projection {} for the part {}", watch.elapsed() / 1000, projection.name, new_data_part->name); + } + if (projection_block.rows()) { auto proj_temp_part = writeProjectionPart(data, log, projection_block, projection, new_data_part.get()); @@ -685,6 +707,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( IColumn::Permutation perm; if (!sort_description.empty()) { + ProfileEventTimeIncrement watch(ProfileEvents::MergeTreeDataProjectionWriterSortingBlocksMicroseconds); + if (!isAlreadySorted(block, sort_description)) { stableGetPermutation(block, sort_description, perm); @@ -696,6 +720,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( if (projection.type == ProjectionDescription::Type::Aggregate) { + ProfileEventTimeIncrement watch(ProfileEvents::MergeTreeDataProjectionWriterMergingBlocksMicroseconds); + MergeTreeData::MergingParams projection_merging_params; projection_merging_params.mode = MergeTreeData::MergingParams::Aggregating; block = mergeBlock(block, sort_description, {}, perm_ptr, projection_merging_params); diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp index da49814b83a..f506230b5ea 100644 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp @@ -59,7 +59,7 @@ bool maybeTrueOnBloomFilter(const IColumn * hash_column, const BloomFilterPtr & const auto * non_const_column = typeid_cast(hash_column); if (!const_column && !non_const_column) - throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: hash column must be Const Column or UInt64 Column."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Hash column must be Const or UInt64."); if (const_column) { diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h index db85c804d8d..8029d6d405b 100644 --- 
a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h @@ -53,7 +53,7 @@ public: if (const auto & bf_granule = typeid_cast(granule.get())) return mayBeTrueOnGranule(bf_granule); - throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: require bloom filter index granule."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Requires bloom filter index granule."); } private: diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp index 4e339964de3..da89d52a9ff 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp @@ -54,9 +54,9 @@ MarkType::MarkType(bool adaptive_, bool compressed_, MergeTreeDataPartType::Valu : adaptive(adaptive_), compressed(compressed_), part_type(part_type_) { if (!adaptive && part_type != MergeTreeDataPartType::Wide) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: non-Wide data part type with non-adaptive granularity"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Non-Wide data part type with non-adaptive granularity"); if (part_type == MergeTreeDataPartType::Unknown) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: unknown data part type"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown data part type"); } bool MarkType::isMarkFileExtension(std::string_view extension) @@ -71,7 +71,7 @@ std::string MarkType::getFileExtension() const if (!adaptive) { if (part_type != MergeTreeDataPartType::Wide) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: non-Wide data part type with non-adaptive granularity"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Non-Wide data part type with non-adaptive granularity"); return res; } @@ -84,7 +84,7 @@ std::string MarkType::getFileExtension() const case MergeTreeDataPartType::InMemory: return ""; case MergeTreeDataPartType::Unknown: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: unknown data part type"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown data part type"); } } diff --git a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp index b1f8e09be9f..20dfed8cf8f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp @@ -15,6 +15,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } @@ -217,7 +218,20 @@ MergeTreeIndexPtr minmaxIndexCreator( return std::make_shared(index); } -void minmaxIndexValidator(const IndexDescription & /* index */, bool /* attach */) +void minmaxIndexValidator(const IndexDescription & index, bool attach) { + if (attach) + return; + + for (const auto & column : index.sample_block) + { + if (!column.type->isComparable()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Data type of argument for minmax index must be comparable, got {} type for column {} instead", + column.type->getName(), column.name); + } + } } + } diff --git a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp index 7531c03a011..8250050412f 100644 --- a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp +++ b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp @@ -1,12 +1,11 @@ #include #include -#include +#include #include #include #include #include #include -#include #include #include diff --git a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp 
b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp index 47c2fe07bb4..8d8b0f1cc79 100644 --- a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include diff --git a/src/Storages/MergeTree/MergeTreeReadTask.cpp b/src/Storages/MergeTree/MergeTreeReadTask.cpp index dcfed700fac..41c7531b6a6 100644 --- a/src/Storages/MergeTree/MergeTreeReadTask.cpp +++ b/src/Storages/MergeTree/MergeTreeReadTask.cpp @@ -184,7 +184,11 @@ MergeTreeReadTask::BlockAndProgress MergeTreeReadTask::read(const BlockSizeParam Block block; if (read_result.num_rows != 0) + { + for (const auto & column : read_result.columns) + column->assumeMutableRef().shrinkToFit(); block = sample_block.cloneWithColumns(read_result.columns); + } BlockAndProgress res = { .block = std::move(block), diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index aeff438f509..4e93bd267ec 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -449,8 +449,8 @@ Block MergeTreeSelectProcessor::applyPrewhereActions(Block block, const Prewhere Block MergeTreeSelectProcessor::transformHeader( Block block, const PrewhereInfoPtr & prewhere_info, const DataTypePtr & partition_value_type, const Names & virtual_columns) { + injectVirtualColumns(block, 0, nullptr, partition_value_type, virtual_columns); auto transformed = applyPrewhereActions(std::move(block), prewhere_info); - injectVirtualColumns(transformed, 0, nullptr, partition_value_type, virtual_columns); return transformed; } diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 29af7fb4820..d0fbc316024 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -140,6 +140,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( if (storage.supportsSubcolumns()) options.withSubcolumns(); + columns_for_reader = storage_snapshot->getColumnsByNames(options, columns_to_read); } else @@ -156,6 +157,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( read_settings.local_fs_method = LocalFSReadMethod::pread; if (read_with_direct_io) read_settings.direct_io_threshold = 1; + /// Configure throttling switch (type) { @@ -224,7 +226,10 @@ try for (size_t i = 0; i < num_columns; ++i) { if (header.has(it->name)) + { + columns[i]->assumeMutableRef().shrinkToFit(); res_columns.emplace_back(std::move(columns[i])); + } ++it; } diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 96cab9c0293..b64632b6139 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -201,6 +201,7 @@ struct Settings; M(String, primary_key_compression_codec, "ZSTD(3)", "Compression encoding used by primary, primary key is small enough and cached, so the default compression is ZSTD(3).", 0) \ M(UInt64, marks_compress_block_size, 65536, "Mark compress block size, the actual size of the block to compress.", 0) \ M(UInt64, primary_key_compress_block_size, 65536, "Primary compress block size, the actual size of the block to compress.", 0) \ + M(Bool, primary_key_lazy_load, true, "Load primary key in memory on first use instead of on table initialization. 
This can save memory in the presence of a large number of tables.", 0) \ \ /** Projection settings. */ \ M(UInt64, max_projections, 25, "The maximum number of merge tree projections.", 0) \ diff --git a/src/Storages/MergeTree/MergeTreeSource.cpp b/src/Storages/MergeTree/MergeTreeSource.cpp index a450505f7a8..e1d1d0951e4 100644 --- a/src/Storages/MergeTree/MergeTreeSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSource.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 4aecf85ac2a..d9a89b9d4ef 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -112,7 +112,7 @@ void MergeTreeWhereOptimizer::optimize(SelectQueryInfo & select_query_info, cons LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition \"{}\" moved to PREWHERE", select.prewhere()->formatForLogging(log_queries_cut_to_length)); } -std::optional MergeTreeWhereOptimizer::optimize(const ActionsDAGPtr & filter_dag, +MergeTreeWhereOptimizer::FilterActionsOptimizeResult MergeTreeWhereOptimizer::optimize(const ActionsDAGPtr & filter_dag, const std::string & filter_column_name, const ContextPtr & context, bool is_final) @@ -132,11 +132,11 @@ std::optional MergeTreeWhe if (!optimize_result) return {}; - auto filter_actions = reconstructDAG(optimize_result->where_conditions); - auto prewhere_filter_actions = reconstructDAG(optimize_result->prewhere_conditions); + std::unordered_set prewhere_conditions; + for (const auto & condition : optimize_result->prewhere_conditions) + prewhere_conditions.insert(condition.node.getDAGNode()); - FilterActionsOptimizeResult result = { std::move(filter_actions), std::move(prewhere_filter_actions) }; - return result; + return {.prewhere_nodes = std::move(prewhere_conditions), .fully_moved_to_prewhere = optimize_result->where_conditions.empty()}; } static void collectColumns(const RPNBuilderTreeNode & node, const NameSet & columns_names, NameSet & result_set, bool & has_invalid_column) @@ -343,20 +343,6 @@ ASTPtr MergeTreeWhereOptimizer::reconstructAST(const Conditions & conditions) return function; } -ActionsDAGPtr MergeTreeWhereOptimizer::reconstructDAG(const Conditions & conditions) -{ - if (conditions.empty()) - return {}; - - ActionsDAG::NodeRawConstPtrs filter_nodes; - filter_nodes.reserve(conditions.size()); - - for (const auto & condition : conditions) - filter_nodes.push_back(condition.node.getDAGNode()); - - return ActionsDAG::buildFilterActionsDAG(filter_nodes); -} - std::optional MergeTreeWhereOptimizer::optimizeImpl(const RPNBuilderTreeNode & node, const WhereOptimizerContext & where_optimizer_context) const { diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index b56219e3c59..84afa4cda17 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -47,11 +47,11 @@ public: struct FilterActionsOptimizeResult { - ActionsDAGPtr filter_actions; - ActionsDAGPtr prewhere_filter_actions; + std::unordered_set prewhere_nodes; + bool fully_moved_to_prewhere = false; }; - std::optional optimize(const ActionsDAGPtr & filter_dag, + FilterActionsOptimizeResult optimize(const ActionsDAGPtr & filter_dag, const std::string & filter_column_name, const ContextPtr & context, bool is_final); @@ -122,9 +122,6 @@ private: /// Reconstruct AST from conditions static ASTPtr 
reconstructAST(const Conditions & conditions); - /// Reconstruct DAG from conditions - static ActionsDAGPtr reconstructDAG(const Conditions & conditions); - void optimizeArbitrary(ASTSelectQuery & select) const; UInt64 getColumnsSize(const NameSet & columns) const; diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 1d10a1433ef..f2fe2e0f255 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -181,7 +181,7 @@ MergedBlockOutputStream::Finalizer MergedBlockOutputStream::finalizePartAsync( new_part->rows_count = rows_count; new_part->modification_time = time(nullptr); - new_part->index = writer->releaseIndexColumns(); + new_part->setIndex(writer->releaseIndexColumns()); new_part->checksums = checksums; new_part->setBytesOnDisk(checksums.getTotalSizeOnDisk()); new_part->setBytesUncompressedOnDisk(checksums.getTotalSizeUncompressedOnDisk()); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 6bcdfe34296..6bacce9e2c5 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -16,16 +17,24 @@ #include #include #include +#include #include #include #include #include #include +#include #include +#include #include #include +namespace ProfileEvents +{ +extern const Event MutateTaskProjectionsCalculationMicroseconds; +} + namespace CurrentMetrics { extern const Metric PartMutation; @@ -64,6 +73,7 @@ static void splitAndModifyMutationCommands( LoggerPtr log) { auto part_columns = part->getColumnsDescription(); + const auto & table_columns = metadata_snapshot->getColumns(); if (!isWidePart(part) || !isFullPartStorage(part->getDataPartStorage())) { @@ -72,9 +82,19 @@ static void splitAndModifyMutationCommands( for (const auto & command : commands) { + if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) + { + /// For ordinary column with default or materialized expression, MATERIALIZE COLUMN should not override past values + /// So we only mutate column if `command.column_name` is a default/materialized column or if the part does not have physical column file + auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); + if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + { + for_interpreter.push_back(command); + mutated_columns.emplace(command.column_name); + } + } if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC - || command.type == MutationCommand::Type::MATERIALIZE_COLUMN || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL || command.type == MutationCommand::Type::DELETE @@ -84,9 +104,6 @@ static void splitAndModifyMutationCommands( for_interpreter.push_back(command); for (const auto & [column_name, expr] : command.column_to_update_expression) mutated_columns.emplace(column_name); - - if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) - mutated_columns.emplace(command.column_name); } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION @@ -196,8 +213,15 @@ static void splitAndModifyMutationCommands( { for (const auto & command : commands) { - if (command.type == 
MutationCommand::Type::MATERIALIZE_INDEX - || command.type == MutationCommand::Type::MATERIALIZE_COLUMN + if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) + { + /// For ordinary column with default or materialized expression, MATERIALIZE COLUMN should not override past values + /// So we only mutate column if `command.column_name` is a default/materialized column or if the part does not have physical column file + auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); + if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + for_interpreter.push_back(command); + } + else if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL @@ -308,6 +332,15 @@ getColumnsForNewDataPart( } } + if (!storage_columns_set.contains(BlockNumberColumn::name)) + { + if (source_part->tryGetSerialization(BlockNumberColumn::name) != nullptr) + { + storage_columns.push_back({BlockNumberColumn::name, BlockNumberColumn::type}); + storage_columns_set.insert(BlockNumberColumn::name); + } + } + SerializationInfoByName new_serialization_infos; for (const auto & [name, old_info] : serialization_infos) { @@ -540,7 +573,9 @@ static std::set getProjectionsToRecalculate( { bool need_recalculate = materialized_projections.contains(projection.name) - || (!is_full_part_storage && source_part->hasProjection(projection.name)); + || (!is_full_part_storage + && source_part->hasProjection(projection.name) + && !source_part->hasBrokenProjection(projection.name)); if (need_recalculate) projections_to_recalc.insert(&projection); @@ -674,15 +709,25 @@ static NameToNameVector collectFilesForRenames( { if (command.type == MutationCommand::Type::DROP_INDEX) { - if (source_part->checksums.has(INDEX_FILE_PREFIX + command.column_name + ".idx2")) + static const std::array suffixes = {".idx2", ".idx"}; + static const std::array gin_suffixes = {".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"}; /// .gin_* is inverted index + + for (const auto & suffix : suffixes) { - add_rename(INDEX_FILE_PREFIX + command.column_name + ".idx2", ""); - add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); + const String filename = INDEX_FILE_PREFIX + command.column_name + suffix; + const String filename_mrk = INDEX_FILE_PREFIX + command.column_name + mrk_extension; + + if (source_part->checksums.has(filename)) + { + add_rename(filename, ""); + add_rename(filename_mrk, ""); + } } - else if (source_part->checksums.has(INDEX_FILE_PREFIX + command.column_name + ".idx")) + for (const auto & gin_suffix : gin_suffixes) { - add_rename(INDEX_FILE_PREFIX + command.column_name + ".idx", ""); - add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); + const String filename = INDEX_FILE_PREFIX + command.column_name + gin_suffix; + if (source_part->checksums.has(filename)) + add_rename(filename, ""); } } else if (command.type == MutationCommand::Type::DROP_PROJECTION) @@ -869,12 +914,13 @@ void finalizeMutatedPart( new_data_part->rows_count = source_part->rows_count; new_data_part->index_granularity = source_part->index_granularity; - new_data_part->index = source_part->index; + new_data_part->setIndex(source_part->getIndex()); new_data_part->minmax_idx = source_part->minmax_idx; new_data_part->modification_time = time(nullptr); /// Load rest projections 
which are hardlinked - new_data_part->loadProjections(false, false, true /* if_not_loaded */); + bool noop; + new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */); /// All information about sizes is stored in checksums. /// It doesn't make sense to touch filesystem for sizes. @@ -1218,7 +1264,13 @@ bool PartMergerWriter::mutateOriginalPartAndPrepareProjections() for (size_t i = 0, size = ctx->projections_to_build.size(); i < size; ++i) { const auto & projection = *ctx->projections_to_build[i]; - auto projection_block = projection_squashes[i].add(projection.calculate(cur_block, ctx->context)); + + Block projection_block; + { + ProfileEventTimeIncrement watch(ProfileEvents::MutateTaskProjectionsCalculationMicroseconds); + projection_block = projection_squashes[i].add(projection.calculate(cur_block, ctx->context)); + } + if (projection_block) { auto tmp_part = MergeTreeDataWriter::writeTempProjectionPart( @@ -1451,7 +1503,9 @@ private: bool need_recalculate = ctx->materialized_projections.contains(projection.name) - || (!is_full_part_storage && ctx->source_part->hasProjection(projection.name)); + || (!is_full_part_storage + && ctx->source_part->hasProjection(projection.name) + && !ctx->source_part->hasBrokenProjection(projection.name)); if (need_recalculate) { @@ -1515,21 +1569,34 @@ private: if (!ctx->mutating_pipeline_builder.initialized()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot mutate part columns with uninitialized mutations stream. It's a bug"); - QueryPipelineBuilder builder(std::move(ctx->mutating_pipeline_builder)); + auto builder = std::make_unique(std::move(ctx->mutating_pipeline_builder)); if (ctx->metadata_snapshot->hasPrimaryKey() || ctx->metadata_snapshot->hasSecondaryIndices()) { - builder.addTransform(std::make_shared( - builder.getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); + builder->addTransform(std::make_shared( + builder->getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); - builder.addTransform(std::make_shared(builder.getHeader())); + builder->addTransform(std::make_shared(builder->getHeader())); } + PreparedSets::Subqueries subqueries; + if (ctx->execute_ttl_type == ExecuteTTLType::NORMAL) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } if (ctx->execute_ttl_type == ExecuteTTLType::RECALCULATE) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } + + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), ctx->context); ctx->minmax_idx = std::make_shared(); @@ -1563,7 +1630,7 @@ private: ctx->context->getWriteSettings(), computed_granularity); - ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); + ctx->mutating_pipeline = 
QueryPipelineBuilder::getPipeline(std::move(*builder)); ctx->mutating_pipeline.setProgressCallback(ctx->progress_callback); /// Is calculated inside MergeProgressCallback. ctx->mutating_pipeline.disableProfileEventUpdate(); @@ -1575,8 +1642,9 @@ private: void finalize() { + bool noop; ctx->new_data_part->minmax_idx = std::move(ctx->minmax_idx); - ctx->new_data_part->loadProjections(false, false, true /* if_not_loaded */); + ctx->new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */); ctx->mutating_executor.reset(); ctx->mutating_pipeline.reset(); @@ -1758,13 +1826,25 @@ private: if (ctx->mutating_pipeline_builder.initialized()) { - QueryPipelineBuilder builder(std::move(ctx->mutating_pipeline_builder)); + auto builder = std::make_unique(std::move(ctx->mutating_pipeline_builder)); + PreparedSets::Subqueries subqueries; if (ctx->execute_ttl_type == ExecuteTTLType::NORMAL) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } if (ctx->execute_ttl_type == ExecuteTTLType::RECALCULATE) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } + + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), ctx->context); ctx->out = std::make_shared( ctx->new_data_part, @@ -1778,7 +1858,7 @@ private: &ctx->source_part->index_granularity_info ); - ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); + ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); ctx->mutating_pipeline.setProgressCallback(ctx->progress_callback); /// Is calculated inside MergeProgressCallback. ctx->mutating_pipeline.disableProfileEventUpdate(); @@ -1921,7 +2001,7 @@ static bool canSkipConversionToNullable(const MergeTreeDataPartPtr & part, const if (!part_column) return false; - /// For ALTER MODIFY COLUMN from 'Type' to 'Nullable(Type)' we can skip mutatation and + /// For ALTER MODIFY COLUMN from 'Type' to 'Nullable(Type)' we can skip mutation and /// apply only metadata conversion. But it doesn't work for custom serialization. const auto * to_nullable = typeid_cast(command.data_type.get()); if (!to_nullable) @@ -1937,6 +2017,20 @@ static bool canSkipConversionToNullable(const MergeTreeDataPartPtr & part, const return true; } +static bool canSkipConversionToVariant(const MergeTreeDataPartPtr & part, const MutationCommand & command) +{ + if (command.type != MutationCommand::READ_COLUMN) + return false; + + auto part_column = part->tryGetColumn(command.column_name); + if (!part_column) + return false; + + /// For ALTER MODIFY COLUMN with Variant extension (like 'Variant(T1, T2)' to 'Variant(T1, T2, T3, ...)') + /// we can skip mutation and apply only metadata conversion. 
+ return isVariantExtension(part_column->type, command.data_type); +} + static bool canSkipMutationCommandForPart(const MergeTreeDataPartPtr & part, const MutationCommand & command, const ContextPtr & context) { if (command.partition) @@ -1952,6 +2046,9 @@ static bool canSkipMutationCommandForPart(const MergeTreeDataPartPtr & part, con if (canSkipConversionToNullable(part, command)) return true; + if (canSkipConversionToVariant(part, command)) + return true; + return false; } diff --git a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp index abc51bde3fb..2fe237efdc7 100644 --- a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp +++ b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp @@ -97,11 +97,9 @@ extern const Event ParallelReplicasCollectingOwnedSegmentsMicroseconds; extern const Event ParallelReplicasReadAssignedMarks; extern const Event ParallelReplicasReadUnassignedMarks; extern const Event ParallelReplicasReadAssignedForStealingMarks; -} -namespace ProfileEvents -{ - extern const Event ParallelReplicasUsedCount; +extern const Event ParallelReplicasUsedCount; +extern const Event ParallelReplicasUnavailableCount; } namespace DB @@ -1025,6 +1023,8 @@ ParallelReadResponse ParallelReplicasReadingCoordinator::handleRequest(ParallelR void ParallelReplicasReadingCoordinator::markReplicaAsUnavailable(size_t replica_number) { + ProfileEvents::increment(ProfileEvents::ParallelReplicasUnavailableCount); + std::lock_guard lock(mutex); if (!pimpl) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index 156c41563ec..bc0b4f73a31 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -63,7 +63,7 @@ void ReplicatedMergeTreePartCheckThread::enqueuePart(const String & name, time_t if (parts_set.contains(name)) return; - LOG_TRACE(log, "Enqueueing {} for check after after {}s", name, delay_to_check_seconds); + LOG_TRACE(log, "Enqueueing {} for check after {}s", name, delay_to_check_seconds); parts_queue.emplace_back(name, std::chrono::steady_clock::now() + std::chrono::seconds(delay_to_check_seconds)); parts_set.insert(name); task->schedule(); @@ -274,7 +274,7 @@ std::pair ReplicatedMergeTreePartCheckThread::findLo return std::make_pair(exists_in_zookeeper, part); } -ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const String & part_name) +ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const String & part_name, bool throw_on_broken_projection) { ReplicatedCheckResult result; auto [exists_in_zookeeper, part] = findLocalPart(part_name); @@ -341,6 +341,7 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St /// before the ReplicatedMergeTreePartHeader was introduced. 
String part_path = storage.replica_path + "/parts/" + part_name; String part_znode = zookeeper->get(part_path); + bool is_broken_projection = false; try { @@ -362,8 +363,10 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St checkDataPart( part, - true, - [this] { return need_stop.load(); }); + /* require_checksums */true, + is_broken_projection, + [this] { return need_stop.load(); }, + throw_on_broken_projection); if (need_stop) { @@ -382,14 +385,27 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St if (isRetryableException(std::current_exception())) throw; - tryLogCurrentException(log, __PRETTY_FUNCTION__); + PreformattedMessage message; + if (is_broken_projection) + { + WriteBufferFromOwnString wb; + message = PreformattedMessage::create( + "Part {} has a broken projection. It will be ignored. Broken projections info: {}", + part_name, getCurrentExceptionMessage(false)); + LOG_DEBUG(log, message); + result.action = ReplicatedCheckResult::DoNothing; + } + else + { + tryLogCurrentException(log, __PRETTY_FUNCTION__); - auto message = PreformattedMessage::create("Part {} looks broken. Removing it and will try to fetch.", part_name); - LOG_ERROR(log, message); + message = PreformattedMessage::create("Part {} looks broken. Removing it and will try to fetch.", part_name); + LOG_ERROR(log, message); + result.action = ReplicatedCheckResult::TryFetchMissing; + } /// Part is broken, let's try to find it and fetch. result.status = {part_name, false, message}; - result.action = ReplicatedCheckResult::TryFetchMissing; return result; } @@ -419,12 +435,12 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St } -CheckResult ReplicatedMergeTreePartCheckThread::checkPartAndFix(const String & part_name, std::optional * recheck_after) +CheckResult ReplicatedMergeTreePartCheckThread::checkPartAndFix(const String & part_name, std::optional * recheck_after, bool throw_on_broken_projection) { LOG_INFO(log, "Checking part {}", part_name); ProfileEvents::increment(ProfileEvents::ReplicatedPartChecks); - ReplicatedCheckResult result = checkPartImpl(part_name); + ReplicatedCheckResult result = checkPartImpl(part_name, throw_on_broken_projection); switch (result.action) { case ReplicatedCheckResult::None: UNREACHABLE(); @@ -577,7 +593,7 @@ void ReplicatedMergeTreePartCheckThread::run() } std::optional recheck_after; - checkPartAndFix(selected->name, &recheck_after); + checkPartAndFix(selected->name, &recheck_after, /* throw_on_broken_projection */false); if (need_stop) return; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h index f2e26b3d324..9091f698546 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h @@ -65,9 +65,9 @@ public: size_t size() const; /// Check part by name - CheckResult checkPartAndFix(const String & part_name, std::optional * recheck_after = nullptr); + CheckResult checkPartAndFix(const String & part_name, std::optional * recheck_after = nullptr, bool throw_on_broken_projection = true); - ReplicatedCheckResult checkPartImpl(const String & part_name); + ReplicatedCheckResult checkPartImpl(const String & part_name, bool throw_on_broken_projection); std::unique_lock pausePartsCheck(); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 8d921bdcb1c..e26a36202dd
100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -860,6 +860,9 @@ ActiveDataPartSet getPartNamesToMutate( int32_t ReplicatedMergeTreeQueue::updateMutations(zkutil::ZooKeeperPtr zookeeper, Coordination::WatchCallbackPtr watch_callback) { + if (pull_log_blocker.isCancelled()) + throw Exception(ErrorCodes::ABORTED, "Log pulling is cancelled"); + std::lock_guard lock(update_mutations_mutex); Coordination::Stat mutations_stat; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index bc23204e7d3..29f3183be64 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index 8ae9b54b6e9..0b545beb116 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -43,6 +43,7 @@ namespace ErrorCodes extern const int NO_FILE_IN_DATA_PART; extern const int NETWORK_ERROR; extern const int SOCKET_TIMEOUT; + extern const int BROKEN_PROJECTION; } @@ -117,7 +118,9 @@ static IMergeTreeDataPart::Checksums checkDataPart( const NameSet & files_without_checksums, const ReadSettings & read_settings, bool require_checksums, - std::function is_cancelled) + std::function is_cancelled, + bool & is_broken_projection, + bool throw_on_broken_projection) { /** Responsibility: * - read list of columns from columns.txt; @@ -126,6 +129,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( */ CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedChecks}; + Poco::Logger * log = &Poco::Logger::get("checkDataPart"); NamesAndTypesList columns_txt; @@ -275,17 +279,55 @@ static IMergeTreeDataPart::Checksums checkDataPart( } } + std::string broken_projections_message; for (const auto & [name, projection] : data_part->getProjectionParts()) { if (is_cancelled()) return {}; auto projection_file = name + ".proj"; - auto projection_checksums = checkDataPart( - projection, *data_part_storage.getProjection(projection_file), - projection->getColumns(), projection->getType(), - projection->getFileNamesWithoutChecksums(), - read_settings, require_checksums, is_cancelled); + if (!throw_on_broken_projection && projection->is_broken) + { + projections_on_disk.erase(projection_file); + checksums_txt.remove(projection_file); + } + + IMergeTreeDataPart::Checksums projection_checksums; + try + { + bool noop; + projection_checksums = checkDataPart( + projection, *data_part_storage.getProjection(projection_file), + projection->getColumns(), projection->getType(), + projection->getFileNamesWithoutChecksums(), + read_settings, require_checksums, is_cancelled, noop, /* throw_on_broken_projection */false); + } + catch (...) 
+ { + if (isRetryableException(std::current_exception())) + throw; + + if (!projection->is_broken) + { + LOG_TEST(log, "Marking projection {} as broken ({})", name, projection_file); + projection->setBrokenReason(getCurrentExceptionMessage(false), getCurrentExceptionCode()); + } + + is_broken_projection = true; + if (throw_on_broken_projection) + { + if (!broken_projections_message.empty()) + broken_projections_message += "\n"; + + broken_projections_message += fmt::format( + "Part {} has a broken projection {} (error: {})", + data_part->name, name, getCurrentExceptionMessage(false)); + continue; + } + + projections_on_disk.erase(projection_file); + checksums_txt.remove(projection_file); + } checksums_data.files[projection_file] = IMergeTreeDataPart::Checksums::Checksum( projection_checksums.getTotalSizeOnDisk(), @@ -294,6 +336,11 @@ static IMergeTreeDataPart::Checksums checkDataPart( projections_on_disk.erase(projection_file); } + if (throw_on_broken_projection && !broken_projections_message.empty()) + { + throw Exception(ErrorCodes::BROKEN_PROJECTION, "{}", broken_projections_message); + } + if (require_checksums && !projections_on_disk.empty()) { throw Exception(ErrorCodes::UNEXPECTED_FILE_IN_DATA_PART, @@ -321,7 +368,9 @@ IMergeTreeDataPart::Checksums checkDataPartInMemory(const DataPartInMemoryPtr & IMergeTreeDataPart::Checksums checkDataPart( MergeTreeData::DataPartPtr data_part, bool require_checksums, - std::function is_cancelled) + bool & is_broken_projection, + std::function is_cancelled, + bool throw_on_broken_projection) { if (auto part_in_memory = asInMemoryPart(data_part)) return checkDataPartInMemory(part_in_memory); @@ -363,7 +412,9 @@ IMergeTreeDataPart::Checksums checkDataPart( data_part->getFileNamesWithoutChecksums(), read_settings, require_checksums, - is_cancelled); + is_cancelled, + is_broken_projection, + throw_on_broken_projection); }; try @@ -377,7 +428,9 @@ IMergeTreeDataPart::Checksums checkDataPart( data_part->getFileNamesWithoutChecksums(), read_settings, require_checksums, - is_cancelled); + is_cancelled, + is_broken_projection, + throw_on_broken_projection); } catch (...) { diff --git a/src/Storages/MergeTree/checkDataPart.h b/src/Storages/MergeTree/checkDataPart.h index d0e48b6f80a..a01978f4efe 100644 --- a/src/Storages/MergeTree/checkDataPart.h +++ b/src/Storages/MergeTree/checkDataPart.h @@ -10,7 +10,9 @@ namespace DB IMergeTreeDataPart::Checksums checkDataPart( MergeTreeData::DataPartPtr data_part, bool require_checksums, - std::function is_cancelled = []{ return false; }); + bool & is_broken_projection, + std::function is_cancelled = []{ return false; }, + bool throw_on_broken_projection = false); bool isNotEnoughMemoryErrorCode(int code); bool isRetryableException(const std::exception_ptr exception_ptr); diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 8e646e48f16..e2f89067b34 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -194,7 +194,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) auto add_optional_param = [&](const char * desc) { ++max_num_params; - needed_params += needed_params.empty() ? "\n" : ",\n["; + needed_params += needed_params.empty() ? 
"\n[" : ",\n["; needed_params += desc; needed_params += "]"; }; @@ -315,7 +315,8 @@ static StoragePtr create(const StorageFactory::Arguments & args) DatabaseCatalog::instance().getDatabase(args.table_id.database_name)->getEngineName() == "Replicated"; /// Allow implicit {uuid} macros only for zookeeper_path in ON CLUSTER queries - bool allow_uuid_macro = is_on_cluster || is_replicated_database || args.query.attach; + /// and if UUID was explicitly passed in CREATE TABLE (like for ATTACH) + bool allow_uuid_macro = is_on_cluster || is_replicated_database || args.query.attach || args.query.has_uuid; auto expand_macro = [&] (ASTLiteral * ast_zk_path, ASTLiteral * ast_replica_name) { @@ -404,10 +405,10 @@ static StoragePtr create(const StorageFactory::Arguments & args) { /// Try use default values if arguments are not specified. /// Note: {uuid} macro works for ON CLUSTER queries when database engine is Atomic. - const auto & config = args.getContext()->getConfigRef(); - zookeeper_path = StorageReplicatedMergeTree::getDefaultZooKeeperPath(config); + const auto & server_settings = args.getContext()->getServerSettings(); + zookeeper_path = server_settings.default_replica_path; /// TODO maybe use hostname if {replica} is not defined? - replica_name = StorageReplicatedMergeTree::getDefaultReplicaName(config); + replica_name = server_settings.default_replica_name; /// Modify query, so default values will be written to metadata assert(arg_num == 0); diff --git a/src/Storages/NATS/NATSConnection.cpp b/src/Storages/NATS/NATSConnection.cpp index d7ad0cf8219..4d30d6b2360 100644 --- a/src/Storages/NATS/NATSConnection.cpp +++ b/src/Storages/NATS/NATSConnection.cpp @@ -91,6 +91,8 @@ void NATSConnectionManager::connectImpl() natsOptions_SetUserInfo(options, configuration.username.c_str(), configuration.password.c_str()); if (!configuration.token.empty()) natsOptions_SetToken(options, configuration.token.c_str()); + if (!configuration.credential_file.empty()) + natsOptions_SetUserCredentialsFromFiles(options, configuration.credential_file.c_str(), nullptr); if (configuration.secure) { diff --git a/src/Storages/NATS/NATSConnection.h b/src/Storages/NATS/NATSConnection.h index c350f395a92..859fcb72022 100644 --- a/src/Storages/NATS/NATSConnection.h +++ b/src/Storages/NATS/NATSConnection.h @@ -14,6 +14,7 @@ struct NATSConfiguration String username; String password; String token; + String credential_file; int max_reconnect; int reconnect_wait; diff --git a/src/Storages/NATS/NATSSettings.h b/src/Storages/NATS/NATSSettings.h index 3e3ed739d82..3273a5ff065 100644 --- a/src/Storages/NATS/NATSSettings.h +++ b/src/Storages/NATS/NATSSettings.h @@ -25,6 +25,7 @@ class ASTStorage; M(String, nats_username, "", "NATS username", 0) \ M(String, nats_password, "", "NATS password", 0) \ M(String, nats_token, "", "NATS token", 0) \ + M(String, nats_credential_file, "", "Path to a NATS credentials file", 0) \ M(UInt64, nats_startup_connect_tries, 5, "Number of connect tries at startup", 0) \ M(UInt64, nats_max_rows_per_message, 1, "The maximum number of rows produced in one message for row-based formats.", 0) \ M(StreamingHandleErrorMode, nats_handle_error_mode, StreamingHandleErrorMode::DEFAULT, "How to handle errors for NATS engine. 
Possible values: default (throw an exception after nats_skip_broken_messages broken messages), stream (save broken messages and errors in virtual columns _raw_message, _error).", 0) \ diff --git a/src/Storages/NATS/StorageNATS.cpp b/src/Storages/NATS/StorageNATS.cpp index 2af9a9f974f..4b6ff1d8f2a 100644 --- a/src/Storages/NATS/StorageNATS.cpp +++ b/src/Storages/NATS/StorageNATS.cpp @@ -67,6 +67,7 @@ StorageNATS::StorageNATS( auto nats_username = getContext()->getMacros()->expand(nats_settings->nats_username); auto nats_password = getContext()->getMacros()->expand(nats_settings->nats_password); auto nats_token = getContext()->getMacros()->expand(nats_settings->nats_token); + auto nats_credential_file = getContext()->getMacros()->expand(nats_settings->nats_credential_file); configuration = { @@ -75,6 +76,7 @@ StorageNATS::StorageNATS( .username = nats_username.empty() ? getContext()->getConfigRef().getString("nats.user", "") : nats_username, .password = nats_password.empty() ? getContext()->getConfigRef().getString("nats.password", "") : nats_password, .token = nats_token.empty() ? getContext()->getConfigRef().getString("nats.token", "") : nats_token, + .credential_file = nats_credential_file.empty() ? getContext()->getConfigRef().getString("nats.credential_file", "") : nats_credential_file, .max_reconnect = static_cast(nats_settings->nats_max_reconnect.value), .reconnect_wait = static_cast(nats_settings->nats_reconnect_wait.value), .secure = nats_settings->nats_secure.value diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp index f99ebf51792..ba3cc6f58d0 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp @@ -313,7 +313,7 @@ void MaterializedPostgreSQLConsumer::readTupleData( Int32 col_len = readInt32(message, pos, size); String value; for (Int32 i = 0; i < col_len; ++i) - value += readInt8(message, pos, size); + value += static_cast(readInt8(message, pos, size)); insertValue(storage_data, value, column_idx); break; diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 025f421db59..868f48d0b7d 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -69,7 +69,7 @@ StorageRabbitMQ::StorageRabbitMQ( ContextPtr context_, const ColumnsDescription & columns_, std::unique_ptr rabbitmq_settings_, - bool is_attach_) + bool is_attach) : IStorage(table_id_) , WithContext(context_->getGlobalContext()) , rabbitmq_settings(std::move(rabbitmq_settings_)) @@ -91,7 +91,6 @@ StorageRabbitMQ::StorageRabbitMQ( , unique_strbase(getRandomName()) , queue_size(std::max(QUEUE_SIZE, static_cast(getMaxBlockSize()))) , milliseconds_to_wait(rabbitmq_settings->rabbitmq_empty_queue_backoff_start_ms) - , is_attach(is_attach_) { const auto & config = getContext()->getConfigRef(); @@ -318,10 +317,11 @@ void StorageRabbitMQ::connectionFunc() try { if (connection->reconnect()) + { initRabbitMQ(); - - streaming_task->scheduleAfter(RESCHEDULE_MS); - return; + streaming_task->scheduleAfter(RESCHEDULE_MS); + return; + } } catch (...) 
{ @@ -373,57 +373,37 @@ void StorageRabbitMQ::initRabbitMQ() } else { - try + auto rabbit_channel = connection->createChannel(); + + /// Main exchange -> Bridge exchange -> ( Sharding exchange ) -> Queues -> Consumers + + initExchange(*rabbit_channel); + bindExchange(*rabbit_channel); + + for (const auto i : collections::range(0, num_queues)) + bindQueue(i + 1, *rabbit_channel); + + if (queues.size() != num_queues) { - auto rabbit_channel = connection->createChannel(); - - /// Main exchange -> Bridge exchange -> ( Sharding exchange ) -> Queues -> Consumers - - initExchange(*rabbit_channel); - bindExchange(*rabbit_channel); - - for (const auto i : collections::range(0, num_queues)) - bindQueue(i + 1, *rabbit_channel); - - if (queues.size() != num_queues) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Expected all queues to be initialized (but having {}/{})", - queues.size(), num_queues); - } - - LOG_TRACE(log, "RabbitMQ setup completed"); - rabbit_channel->close(); - } - catch (...) - { - tryLogCurrentException(log); - if (is_attach) - return; /// A user will have to reattach the table. - throw; + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Expected all queues to be initialized (but having {}/{})", + queues.size(), num_queues); } + + LOG_TRACE(log, "RabbitMQ setup completed"); + rabbit_channel->close(); } LOG_TRACE(log, "Registering {} conumers", num_consumers); for (size_t i = 0; i < num_consumers; ++i) { - try - { - auto consumer = createConsumer(); - consumer->updateChannel(*connection); - consumers_ref.push_back(consumer); - pushConsumer(consumer); - ++num_created_consumers; - } - catch (...) - { - if (!is_attach) - throw; - - tryLogCurrentException(log); - } + auto consumer = createConsumer(); + consumer->updateChannel(*connection); + consumers_ref.push_back(consumer); + pushConsumer(consumer); + ++num_created_consumers; } LOG_TRACE(log, "Registered {}/{} conumers", num_created_consumers, num_consumers); diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.h b/src/Storages/RabbitMQ/StorageRabbitMQ.h index be46caf9798..696734617be 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.h +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.h @@ -27,7 +27,7 @@ public: ContextPtr context_, const ColumnsDescription & columns_, std::unique_ptr rabbitmq_settings_, - bool is_attach_); + bool is_attach); std::string getName() const override { return "RabbitMQ"; } @@ -158,10 +158,9 @@ private: size_t read_attempts = 0; mutable bool drop_table = false; - bool is_attach; RabbitMQConsumerPtr createConsumer(); - bool initialized = false; + std::atomic initialized = false; /// Functions working in the background void streamingToViewsFunc(); diff --git a/src/Storages/S3Queue/S3QueueFilesMetadata.cpp b/src/Storages/S3Queue/S3QueueFilesMetadata.cpp index a2b41eb4685..8583de27e00 100644 --- a/src/Storages/S3Queue/S3QueueFilesMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueFilesMetadata.cpp @@ -129,9 +129,12 @@ S3QueueFilesMetadata::S3QueueFilesMetadata(const fs::path & zookeeper_path_, con , max_loading_retries(settings_.s3queue_loading_retries.value) , min_cleanup_interval_ms(settings_.s3queue_cleanup_interval_min_ms.value) , max_cleanup_interval_ms(settings_.s3queue_cleanup_interval_max_ms.value) + , shards_num(settings_.s3queue_total_shards_num) + , threads_per_shard(settings_.s3queue_processing_threads_num) , zookeeper_processing_path(zookeeper_path_ / "processing") , zookeeper_processed_path(zookeeper_path_ / "processed") , zookeeper_failed_path(zookeeper_path_ / "failed") + , 
zookeeper_shards_path(zookeeper_path_ / "shards") , zookeeper_cleanup_lock_path(zookeeper_path_ / "cleanup_lock") , log(getLogger("S3QueueFilesMetadata")) { @@ -197,6 +200,126 @@ S3QueueFilesMetadata::NodeMetadata S3QueueFilesMetadata::createNodeMetadata( return metadata; } +bool S3QueueFilesMetadata::isShardedProcessing() const +{ + return getProcessingIdsNum() > 1 && mode == S3QueueMode::ORDERED; +} + +size_t S3QueueFilesMetadata::registerNewShard() +{ + if (!isShardedProcessing()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot register a new shard, because processing is not sharded"); + } + + const auto zk_client = getZooKeeper(); + zk_client->createIfNotExists(zookeeper_shards_path, ""); + + std::string shard_node_path; + size_t shard_id = 0; + for (size_t i = 0; i < shards_num; ++i) + { + const auto node_path = getZooKeeperPathForShard(i); + auto err = zk_client->tryCreate(node_path, "", zkutil::CreateMode::Persistent); + if (err == Coordination::Error::ZOK) + { + shard_node_path = node_path; + shard_id = i; + break; + } + else if (err == Coordination::Error::ZNODEEXISTS) + continue; + else + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Unexpected error: {}", magic_enum::enum_name(err)); + } + + if (shard_node_path.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to register a new shard"); + + LOG_TRACE(log, "Using shard {} (zk node: {})", shard_id, shard_node_path); + return shard_id; +} + +std::string S3QueueFilesMetadata::getZooKeeperPathForShard(size_t shard_id) const +{ + return zookeeper_shards_path / ("shard" + toString(shard_id)); +} + +void S3QueueFilesMetadata::registerNewShard(size_t shard_id) +{ + if (!isShardedProcessing()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot register a new shard, because processing is not sharded"); + } + + const auto zk_client = getZooKeeper(); + const auto node_path = getZooKeeperPathForShard(shard_id); + zk_client->createAncestors(node_path); + + auto err = zk_client->tryCreate(node_path, "", zkutil::CreateMode::Persistent); + if (err != Coordination::Error::ZOK) + { + if (err == Coordination::Error::ZNODEEXISTS) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot register shard {}: already exists", shard_id); + else + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Unexpected error: {}", magic_enum::enum_name(err)); + } +} + +bool S3QueueFilesMetadata::isShardRegistered(size_t shard_id) +{ + const auto zk_client = getZooKeeper(); + const auto node_path = getZooKeeperPathForShard(shard_id); + return zk_client->exists(node_path); +} + +void S3QueueFilesMetadata::unregisterShard(size_t shard_id) +{ + if (!isShardedProcessing()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot unregister a shard, because processing is not sharded"); + } + + const auto zk_client = getZooKeeper(); + const auto node_path = getZooKeeperPathForShard(shard_id); + auto error_code = zk_client->tryRemove(node_path); + if (error_code != Coordination::Error::ZOK + && error_code != Coordination::Error::ZNONODE) + throw zkutil::KeeperException::fromPath(error_code, node_path); +} + +size_t S3QueueFilesMetadata::getProcessingIdsNum() const +{ + return shards_num * threads_per_shard; +} + +std::vector S3QueueFilesMetadata::getProcessingIdsForShard(size_t shard_id) const +{ + std::vector res(threads_per_shard); + std::iota(res.begin(), res.end(), shard_id * threads_per_shard); + return res; +} + +bool S3QueueFilesMetadata::isProcessingIdBelongsToShard(size_t id, size_t shard_id) const +{ + return shard_id * threads_per_shard <= 
id && id < (shard_id + 1) * threads_per_shard; +} + +size_t S3QueueFilesMetadata::getIdForProcessingThread(size_t thread_id, size_t shard_id) const +{ + return shard_id * threads_per_shard + thread_id; +} + +size_t S3QueueFilesMetadata::getProcessingIdForPath(const std::string & path) const +{ + return sipHash64(path) % getProcessingIdsNum(); +} + S3QueueFilesMetadata::ProcessingNodeHolderPtr S3QueueFilesMetadata::trySetFileAsProcessing(const std::string & path) { auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::S3QueueSetFileProcessingMicroseconds); @@ -212,16 +335,24 @@ S3QueueFilesMetadata::ProcessingNodeHolderPtr S3QueueFilesMetadata::trySetFileAs std::lock_guard lock(file_status->metadata_lock); switch (file_status->state) { - case FileStatus::State::Processing: [[fallthrough]]; + case FileStatus::State::Processing: + { + LOG_TEST(log, "File {} is already processing", path); + return {}; + } case FileStatus::State::Processed: { + LOG_TEST(log, "File {} is already processed", path); return {}; } case FileStatus::State::Failed: { /// If max_loading_retries == 0, file is not retriable. if (max_loading_retries == 0) + { + LOG_TEST(log, "File {} has failed and processing retries are disabled", path); return {}; + } /// Otherwise file_status->retries is also cached. /// In case file_status->retries >= max_loading_retries we can fully rely that it is true @@ -230,7 +361,10 @@ S3QueueFilesMetadata::ProcessingNodeHolderPtr S3QueueFilesMetadata::trySetFileAs /// (another server could have done a try after we cached retries value), /// so check with zookeeper here. if (file_status->retries >= max_loading_retries) + { + LOG_TEST(log, "File {} has failed and processing retries are exceeded", path); return {}; + } break; } @@ -284,35 +418,31 @@ S3QueueFilesMetadata::ProcessingNodeHolderPtr S3QueueFilesMetadata::trySetFileAs if (!file_status->processing_start_time) file_status->processing_start_time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); - break; + return processing_node_holder; } case SetFileProcessingResult::AlreadyProcessed: { std::lock_guard lock(file_status->metadata_lock); file_status->state = FileStatus::State::Processed; - break; + return {}; } case SetFileProcessingResult::AlreadyFailed: { std::lock_guard lock(file_status->metadata_lock); file_status->state = FileStatus::State::Failed; - break; + return {}; } case SetFileProcessingResult::ProcessingByOtherNode: { /// We cannot save any local state here, see comment above. - break; + return {}; } } - - if (result == SetFileProcessingResult::Success) - return processing_node_holder; - - return {}; } std::pair S3QueueFilesMetadata::trySetFileAsProcessingForUnorderedMode(const std::string & path, const FileStatusPtr & file_status) + S3QueueFilesMetadata::ProcessingNodeHolderPtr> +S3QueueFilesMetadata::trySetFileAsProcessingForUnorderedMode(const std::string & path, const FileStatusPtr & file_status) { /// In one zookeeper transaction do the following: /// 1.
check that corresponding persistent nodes do not exist in processed/ and failed/; @@ -339,7 +469,8 @@ std::pair(node_metadata.processing_id, path, zookeeper_processing_path / node_name, file_status, zk_client); + auto holder = std::make_unique( + node_metadata.processing_id, path, zookeeper_processing_path / node_name, file_status, zk_client); return std::pair{SetFileProcessingResult::Success, std::move(holder)}; } @@ -362,7 +493,8 @@ std::pair S3QueueFilesMetadata::trySetFileAsProcessingForOrderedMode(const std::string & path, const FileStatusPtr & file_status) + S3QueueFilesMetadata::ProcessingNodeHolderPtr> +S3QueueFilesMetadata::trySetFileAsProcessingForOrderedMode(const std::string & path, const FileStatusPtr & file_status) { /// Same as for Unordered mode. /// The only difference is the check if the file is already processed. @@ -385,10 +517,15 @@ std::pairget(zookeeper_processed_path, &processed_node_stat); + auto processed_node = isShardedProcessing() + ? zookeeper_processed_path / toString(getProcessingIdForPath(path)) + : zookeeper_processed_path; + NodeMetadata processed_node_metadata; - if (!data.empty()) + Coordination::Stat processed_node_stat; + std::string data; + auto processed_node_exists = zk_client->tryGet(processed_node, data, &processed_node_stat); + if (processed_node_exists && !data.empty()) processed_node_metadata = NodeMetadata::fromString(data); auto max_processed_file_path = processed_node_metadata.file_path; @@ -403,13 +540,25 @@ std::pairtryMulti(requests, responses); if (code == Coordination::Error::ZOK) { - auto holder = std::make_unique(node_metadata.processing_id, path, zookeeper_processing_path / node_name, file_status, zk_client); + auto holder = std::make_unique( + node_metadata.processing_id, path, zookeeper_processing_path / node_name, file_status, zk_client); + + LOG_TEST(log, "File {} is ready to be processed", path); return std::pair{SetFileProcessingResult::Success, std::move(holder)}; } @@ -491,20 +640,31 @@ void S3QueueFilesMetadata::setFileProcessedForUnorderedMode(ProcessingNodeHolder "this could be a result of expired zookeeper session", path); } + void S3QueueFilesMetadata::setFileProcessedForOrderedMode(ProcessingNodeHolderPtr holder) +{ + auto processed_node_path = isShardedProcessing() + ? zookeeper_processed_path / toString(getProcessingIdForPath(holder->path)) + : zookeeper_processed_path; + + return setFileProcessedForOrderedModeImpl(holder->path, holder, processed_node_path); +} + +void S3QueueFilesMetadata::setFileProcessedForOrderedModeImpl( + const std::string & path, ProcessingNodeHolderPtr holder, const std::string & processed_node_path) { /// Update a persistent node in /processed and remove ephemeral node from /processing. - const auto & path = holder->path; const auto node_name = getNodeName(path); const auto node_metadata = createNodeMetadata(path).toString(); const auto zk_client = getZooKeeper(); + LOG_TEST(log, "Setting file `{}` as processed (at {})", path, processed_node_path); while (true) { std::string res; Coordination::Stat stat; - bool exists = zk_client->tryGet(zookeeper_processed_path, res, &stat); + bool exists = zk_client->tryGet(processed_node_path, res, &stat); Coordination::Requests requests; if (exists) { @@ -513,39 +673,41 @@ void S3QueueFilesMetadata::setFileProcessedForOrderedMode(ProcessingNodeHolderPt auto metadata = NodeMetadata::fromString(res); if (metadata.file_path >= path) { - /// Here we get in the case that maximum processed file is bigger than ours. 
- /// This is possible to achieve in case of parallel processing - /// but for local processing we explicitly disable parallel mode and do everything in a single thread - /// (see constructor of StorageS3Queue where s3queue_processing_threads_num is explicitly set to 1 in case of Ordered mode). - /// Nevertheless, in case of distributed processing we cannot do anything with parallelism. - /// What this means? - /// It means that in scenario "distributed processing + Ordered mode" - /// a setting s3queue_loading_retries will not work. It is possible to fix, it is in TODO. - - /// Return because there is nothing to change, - /// the max processed file is already bigger than ours. + LOG_TRACE(log, "File {} is already processed, current max processed file: {}", path, metadata.file_path); return; } } - requests.push_back(zkutil::makeSetRequest(zookeeper_processed_path, node_metadata, stat.version)); + requests.push_back(zkutil::makeSetRequest(processed_node_path, node_metadata, stat.version)); } else { - requests.push_back(zkutil::makeCreateRequest(zookeeper_processed_path, node_metadata, zkutil::CreateMode::Persistent)); + requests.push_back(zkutil::makeCreateRequest(processed_node_path, node_metadata, zkutil::CreateMode::Persistent)); } Coordination::Responses responses; - if (holder->remove(&requests, &responses)) + if (holder) { - LOG_TEST(log, "Moved file `{}` to processed", path); - if (max_loading_retries) - zk_client->tryRemove(zookeeper_failed_path / (node_name + ".retriable"), -1); - return; + if (holder->remove(&requests, &responses)) + { + LOG_TEST(log, "Moved file `{}` to processed", path); + if (max_loading_retries) + zk_client->tryRemove(zookeeper_failed_path / (node_name + ".retriable"), -1); + return; + } + } + else + { + auto code = zk_client->tryMulti(requests, responses); + if (code == Coordination::Error::ZOK) + return; } /// Failed to update max processed node, retry. if (!responses.empty() && responses[0]->error != Coordination::Error::ZOK) + { + LOG_TRACE(log, "Failed to update processed node ({}). 
Will retry.", magic_enum::enum_name(responses[0]->error)); continue; + } LOG_WARNING(log, "Cannot set file ({}) as processed since processing node " "does not exist with expected processing id does not exist, " @@ -554,6 +716,22 @@ void S3QueueFilesMetadata::setFileProcessedForOrderedMode(ProcessingNodeHolderPt } } +void S3QueueFilesMetadata::setFileProcessed(const std::string & path, size_t shard_id) +{ + if (mode != S3QueueMode::ORDERED) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Can set file as preprocessed only for Ordered mode"); + + if (isShardedProcessing()) + { + for (const auto & processor : getProcessingIdsForShard(shard_id)) + setFileProcessedForOrderedModeImpl(path, nullptr, zookeeper_processed_path / toString(processor)); + } + else + { + setFileProcessedForOrderedModeImpl(path, nullptr, zookeeper_processed_path); + } +} + void S3QueueFilesMetadata::setFileFailed(ProcessingNodeHolderPtr holder, const String & exception_message) { auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::S3QueueSetFileFailedMicroseconds); diff --git a/src/Storages/S3Queue/S3QueueFilesMetadata.h b/src/Storages/S3Queue/S3QueueFilesMetadata.h index 390cb5a64ab..9301ea7ceb8 100644 --- a/src/Storages/S3Queue/S3QueueFilesMetadata.h +++ b/src/Storages/S3Queue/S3QueueFilesMetadata.h @@ -42,6 +42,7 @@ public: ~S3QueueFilesMetadata(); void setFileProcessed(ProcessingNodeHolderPtr holder); + void setFileProcessed(const std::string & path, size_t shard_id); void setFileFailed(ProcessingNodeHolderPtr holder, const std::string & exception_message); @@ -80,6 +81,38 @@ public: void deactivateCleanupTask(); + /// Should the table use sharded processing? + /// We use sharded processing for Ordered mode of S3Queue table. + /// It allows to parallelize processing within a single server + /// and to allow distributed processing. + bool isShardedProcessing() const; + + /// Register a new shard for processing. + /// Return a shard id of registered shard. + size_t registerNewShard(); + /// Register a new shard for processing by given id. + /// Throws exception if shard by this id is already registered. + void registerNewShard(size_t shard_id); + /// Unregister shard from keeper. + void unregisterShard(size_t shard_id); + bool isShardRegistered(size_t shard_id); + + /// Total number of processing ids. + /// A processing id identifies a single processing thread. + /// There might be several processing ids per shard. + size_t getProcessingIdsNum() const; + /// Get processing ids identified with requested shard. + std::vector getProcessingIdsForShard(size_t shard_id) const; + /// Check if given processing id belongs to a given shard. + bool isProcessingIdBelongsToShard(size_t id, size_t shard_id) const; + /// Get a processing id for processing thread by given thread id. + /// thread id is a value in range [0, threads_per_shard]. + size_t getIdForProcessingThread(size_t thread_id, size_t shard_id) const; + + /// Calculate which processing id corresponds to a given file path. + /// The file will be processed by a thread related to this processing id. 
+ size_t getProcessingIdForPath(const std::string & path) const; + private: const S3QueueMode mode; const UInt64 max_set_size; @@ -87,10 +120,13 @@ private: const UInt64 max_loading_retries; const size_t min_cleanup_interval_ms; const size_t max_cleanup_interval_ms; + const size_t shards_num; + const size_t threads_per_shard; const fs::path zookeeper_processing_path; const fs::path zookeeper_processed_path; const fs::path zookeeper_failed_path; + const fs::path zookeeper_shards_path; const fs::path zookeeper_cleanup_lock_path; LoggerPtr log; @@ -104,6 +140,10 @@ private: void setFileProcessedForOrderedMode(ProcessingNodeHolderPtr holder); void setFileProcessedForUnorderedMode(ProcessingNodeHolderPtr holder); + std::string getZooKeeperPathForShard(size_t shard_id) const; + + void setFileProcessedForOrderedModeImpl( + const std::string & path, ProcessingNodeHolderPtr holder, const std::string & processed_node_path); enum class SetFileProcessingResult { @@ -117,8 +157,7 @@ private: struct NodeMetadata { - std::string file_path; - UInt64 last_processed_timestamp = 0; + std::string file_path; UInt64 last_processed_timestamp = 0; std::string last_exception; UInt64 retries = 0; std::string processing_id; /// For ephemeral processing node. diff --git a/src/Storages/S3Queue/S3QueueSettings.h b/src/Storages/S3Queue/S3QueueSettings.h index 66fe9b4ce31..c26e973a1c0 100644 --- a/src/Storages/S3Queue/S3QueueSettings.h +++ b/src/Storages/S3Queue/S3QueueSettings.h @@ -22,6 +22,7 @@ class ASTStorage; M(UInt32, s3queue_loading_retries, 0, "Retry loading up to specified number of times", 0) \ M(UInt32, s3queue_processing_threads_num, 1, "Number of processing threads", 0) \ M(UInt32, s3queue_enable_logging_to_s3queue_log, 1, "Enable logging to system table system.s3queue_log", 0) \ + M(String, s3queue_last_processed_path, "", "For Ordered mode. Files that have lexicographically smaller file name are considered already processed", 0) \ M(UInt32, s3queue_tracked_file_ttl_sec, 0, "Maximum number of seconds to store processed files in ZooKeeper node (store forever by default)", 0) \ M(UInt32, s3queue_polling_min_timeout_ms, 1000, "Minimal timeout before next polling", 0) \ M(UInt32, s3queue_polling_max_timeout_ms, 10000, "Maximum timeout before next polling", 0) \ @@ -29,6 +30,8 @@ class ASTStorage; M(UInt32, s3queue_tracked_files_limit, 1000, "For unordered mode. Max set size for tracking processed files in ZooKeeper", 0) \ M(UInt32, s3queue_cleanup_interval_min_ms, 60000, "For unordered mode. Polling backoff min for cleanup", 0) \ M(UInt32, s3queue_cleanup_interval_max_ms, 60000, "For unordered mode. 
Polling backoff max for cleanup", 0) \ + M(UInt32, s3queue_total_shards_num, 1, "Value 0 means disabled", 0) \ + M(UInt32, s3queue_current_shard_num, 0, "", 0) \ #define LIST_OF_S3QUEUE_SETTINGS(M, ALIAS) \ S3QUEUE_RELATED_SETTINGS(M, ALIAS) \ diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 54155ad3ea7..b4f5f957f76 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -28,6 +28,7 @@ namespace ErrorCodes { extern const int S3_ERROR; extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; } StorageS3QueueSource::S3QueueKeyWithInfo::S3QueueKeyWithInfo( @@ -42,33 +43,112 @@ StorageS3QueueSource::S3QueueKeyWithInfo::S3QueueKeyWithInfo( StorageS3QueueSource::FileIterator::FileIterator( std::shared_ptr metadata_, std::unique_ptr glob_iterator_, + size_t current_shard_, std::atomic & shutdown_called_) : metadata(metadata_) , glob_iterator(std::move(glob_iterator_)) , shutdown_called(shutdown_called_) + , log(&Poco::Logger::get("StorageS3QueueSource")) + , sharded_processing(metadata->isShardedProcessing()) + , current_shard(current_shard_) { + if (sharded_processing) + { + for (const auto & id : metadata->getProcessingIdsForShard(current_shard)) + sharded_keys.emplace(id, std::deque{}); + } } -StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next() +StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next(size_t idx) { while (!shutdown_called) { - KeyWithInfoPtr val = glob_iterator->next(); + KeyWithInfoPtr val{nullptr}; + + { + std::unique_lock lk(sharded_keys_mutex, std::defer_lock); + if (sharded_processing) + { + /// To make sure order on keys in each shard in sharded_keys + /// we need to check sharded_keys and to next() under lock. 
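A compilable sketch (not part of this patch, with illustrative names) of the routing that the comment above describes and the lines that follow implement: each worker drains its own deque first, then pulls from the shared iterator under one mutex; keys owned by a sibling worker of the same shard are parked in that worker's deque, and keys owned by other shards are skipped.

#include <cstddef>
#include <deque>
#include <functional>
#include <iostream>
#include <mutex>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

class RoutedIterator
{
public:
    RoutedIterator(std::vector<std::string> source_, const std::vector<size_t> & local_ids, size_t total_ids_)
        : source(std::move(source_)), total_ids(total_ids_)
    {
        for (size_t id : local_ids)
            queues.emplace(id, std::deque<std::string>{});
    }

    std::optional<std::string> next(size_t idx)
    {
        std::lock_guard lock(mutex);
        while (true)
        {
            auto & mine = queues.at(idx);
            if (!mine.empty())
            {
                auto key = mine.front();
                mine.pop_front();
                return key;
            }
            if (pos == source.size())
                return std::nullopt;

            auto key = source[pos++];
            size_t owner = std::hash<std::string>{}(key) % total_ids;
            if (owner == idx)
                return key;
            if (auto it = queues.find(owner); it != queues.end())
                it->second.push_back(key);      // park it for a sibling worker of this shard
            // otherwise the key belongs to another shard and is skipped here
        }
    }

private:
    std::vector<std::string> source;
    size_t pos = 0;
    size_t total_ids;
    std::unordered_map<size_t, std::deque<std::string>> queues;
    std::mutex mutex;
};

int main()
{
    // Two local workers (ids 0 and 1) out of four processing ids in total.
    RoutedIterator it({"a.csv", "b.csv", "c.csv", "d.csv"}, {0, 1}, 4);
    while (auto key = it.next(0))
        std::cout << "worker 0 got " << *key << '\n';
}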
+ lk.lock(); + + if (auto it = sharded_keys.find(idx); it != sharded_keys.end()) + { + auto & keys = it->second; + if (!keys.empty()) + { + val = keys.front(); + keys.pop_front(); + } + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Processing id {} does not exist (Expected ids: {})", + idx, fmt::join(metadata->getProcessingIdsForShard(current_shard), ", ")); + } + } + + if (!val) + { + val = glob_iterator->next(); + if (val && sharded_processing) + { + const auto processing_id_for_key = metadata->getProcessingIdForPath(val->key); + if (idx != processing_id_for_key) + { + if (metadata->isProcessingIdBelongsToShard(processing_id_for_key, current_shard)) + { + LOG_TEST(log, "Putting key {} into queue of processor {} (total: {})", + val->key, processing_id_for_key, sharded_keys.size()); + + if (auto it = sharded_keys.find(idx); it != sharded_keys.end()) + { + it->second.push_back(val); + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Processing id {} does not exist (Expected ids: {})", + idx, fmt::join(metadata->getProcessingIdsForShard(current_shard), ", ")); + } + } + continue; + } + } + } + } if (!val) return {}; if (shutdown_called) { - LOG_TEST(getLogger("StorageS3QueueSource"), "Shutdown was called, stopping file iterator"); + LOG_TEST(log, "Shutdown was called, stopping file iterator"); return {}; } - if (auto processing_holder = metadata->trySetFileAsProcessing(val->key); - processing_holder && !shutdown_called) + auto processing_holder = metadata->trySetFileAsProcessing(val->key); + if (shutdown_called) + { + LOG_TEST(log, "Shutdown was called, stopping file iterator"); + return {}; + } + + LOG_TEST(log, "Checking if can process key {} for processing_id {}", val->key, idx); + + if (processing_holder) { return std::make_shared(val->key, val->info, processing_holder); } + else if (sharded_processing + && metadata->getFileStatus(val->key)->state == S3QueueFilesMetadata::FileStatus::State::Processing) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "File {} is processing by someone else in sharded processing. " + "It is a bug", val->key); + } } return {}; } @@ -83,6 +163,7 @@ StorageS3QueueSource::StorageS3QueueSource( const Block & header_, std::unique_ptr internal_source_, std::shared_ptr files_metadata_, + size_t processing_id_, const S3QueueAction & action_, RemoveFileFunc remove_file_func_, const NamesAndTypesList & requested_virtual_columns_, @@ -96,6 +177,7 @@ StorageS3QueueSource::StorageS3QueueSource( , WithContext(context_) , name(std::move(name_)) , action(action_) + , processing_id(processing_id_) , files_metadata(files_metadata_) , internal_source(std::move(internal_source_)) , requested_virtual_columns(requested_virtual_columns_) @@ -123,7 +205,7 @@ void StorageS3QueueSource::lazyInitialize() if (initialized) return; - internal_source->lazyInitialize(); + internal_source->lazyInitialize(processing_id); reader = std::move(internal_source->reader); if (reader) reader_future = std::move(internal_source->reader_future); @@ -249,7 +331,7 @@ Chunk StorageS3QueueSource::generate() /// Even if task is finished the thread may be not freed in pool. /// So wait until it will be freed before scheduling a new task. 
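A small illustration (not ClickHouse code) of the pattern behind the two lines that follow: wait until the previous asynchronous reader task has fully released its thread before scheduling the next one for the same processing id. std::async stands in for the internal reader thread pool.

#include <cstddef>
#include <future>
#include <iostream>
#include <string>

// Stand-in for "create the next reader asynchronously for a given processing id".
std::future<std::string> createReaderAsync(size_t processing_id)
{
    return std::async(std::launch::async, [processing_id]
    {
        return "reader for processing id " + std::to_string(processing_id);
    });
}

int main()
{
    const size_t processing_id = 3;              // one fixed id per source, as in the patch
    auto reader_future = createReaderAsync(processing_id);

    // Finish (and release) the previous task before scheduling a new one,
    // mirroring the create_reader_pool.wait() call below.
    std::cout << reader_future.get() << '\n';
    reader_future = createReaderAsync(processing_id);
    std::cout << reader_future.get() << '\n';
}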
internal_source->create_reader_pool.wait(); - reader_future = internal_source->createReaderAsync(); + reader_future = internal_source->createReaderAsync(processing_id); } return {}; diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 82e75020efb..8fc7305ea08 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -38,12 +38,16 @@ public: class FileIterator : public IIterator { public: - FileIterator(std::shared_ptr metadata_, std::unique_ptr glob_iterator_, std::atomic & shutdown_called_); + FileIterator( + std::shared_ptr metadata_, + std::unique_ptr glob_iterator_, + size_t current_shard_, + std::atomic & shutdown_called_); /// Note: /// List results in s3 are always returned in UTF-8 binary order. /// (https://docs.aws.amazon.com/AmazonS3/latest/userguide/ListingKeysUsingAPIs.html) - KeyWithInfoPtr next() override; + KeyWithInfoPtr next(size_t idx) override; size_t estimatedKeysCount() override; @@ -52,6 +56,12 @@ public: const std::unique_ptr glob_iterator; std::atomic & shutdown_called; std::mutex mutex; + Poco::Logger * log; + + const bool sharded_processing; + const size_t current_shard; + std::unordered_map> sharded_keys; + std::mutex sharded_keys_mutex; }; StorageS3QueueSource( @@ -59,6 +69,7 @@ public: const Block & header_, std::unique_ptr internal_source_, std::shared_ptr files_metadata_, + size_t processing_id_, const S3QueueAction & action_, RemoveFileFunc remove_file_func_, const NamesAndTypesList & requested_virtual_columns_, @@ -80,6 +91,7 @@ public: private: const String name; const S3QueueAction action; + const size_t processing_id; const std::shared_ptr files_metadata; const std::shared_ptr internal_source; const NamesAndTypesList requested_virtual_columns; diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.cpp b/src/Storages/S3Queue/S3QueueTableMetadata.cpp index 104f70224b6..1830bac4743 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueTableMetadata.cpp @@ -16,8 +16,22 @@ namespace DB namespace ErrorCodes { extern const int METADATA_MISMATCH; + extern const int BAD_ARGUMENTS; } +namespace +{ + S3QueueMode modeFromString(const std::string & mode) + { + if (mode == "ordered") + return S3QueueMode::ORDERED; + if (mode == "unordered") + return S3QueueMode::UNORDERED; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected S3Queue mode: {}", mode); + } +} + + S3QueueTableMetadata::S3QueueTableMetadata( const StorageS3::Configuration & configuration, const S3QueueSettings & engine_settings, @@ -28,10 +42,11 @@ S3QueueTableMetadata::S3QueueTableMetadata( mode = engine_settings.mode.toString(); s3queue_tracked_files_limit = engine_settings.s3queue_tracked_files_limit; s3queue_tracked_file_ttl_sec = engine_settings.s3queue_tracked_file_ttl_sec; + s3queue_total_shards_num = engine_settings.s3queue_total_shards_num; + s3queue_processing_threads_num = engine_settings.s3queue_processing_threads_num; columns = storage_metadata.getColumns().toString(); } - String S3QueueTableMetadata::toString() const { Poco::JSON::Object json; @@ -39,6 +54,8 @@ String S3QueueTableMetadata::toString() const json.set("mode", mode); json.set("s3queue_tracked_files_limit", s3queue_tracked_files_limit); json.set("s3queue_tracked_file_ttl_sec", s3queue_tracked_file_ttl_sec); + json.set("s3queue_total_shards_num", s3queue_total_shards_num); + json.set("s3queue_processing_threads_num", s3queue_processing_threads_num); json.set("format_name", format_name); json.set("columns", 
columns); @@ -52,12 +69,23 @@ void S3QueueTableMetadata::read(const String & metadata_str) { Poco::JSON::Parser parser; auto json = parser.parse(metadata_str).extract(); + after_processing = json->getValue("after_processing"); mode = json->getValue("mode"); s3queue_tracked_files_limit = json->getValue("s3queue_tracked_files_limit"); s3queue_tracked_file_ttl_sec = json->getValue("s3queue_tracked_file_ttl_sec"); format_name = json->getValue("format_name"); columns = json->getValue("columns"); + + if (json->has("s3queue_total_shards_num")) + s3queue_total_shards_num = json->getValue("s3queue_total_shards_num"); + else + s3queue_total_shards_num = 1; + + if (json->has("s3queue_processing_threads_num")) + s3queue_processing_threads_num = json->getValue("s3queue_processing_threads_num"); + else + s3queue_processing_threads_num = 1; } S3QueueTableMetadata S3QueueTableMetadata::parse(const String & metadata_str) @@ -67,7 +95,6 @@ S3QueueTableMetadata S3QueueTableMetadata::parse(const String & metadata_str) return metadata; } - void S3QueueTableMetadata::checkImmutableFieldsEquals(const S3QueueTableMetadata & from_zk) const { if (after_processing != from_zk.after_processing) @@ -83,8 +110,8 @@ void S3QueueTableMetadata::checkImmutableFieldsEquals(const S3QueueTableMetadata ErrorCodes::METADATA_MISMATCH, "Existing table metadata in ZooKeeper differs in engine mode. " "Stored in ZooKeeper: {}, local: {}", - DB::toString(from_zk.mode), - DB::toString(mode)); + from_zk.mode, + mode); if (s3queue_tracked_files_limit != from_zk.s3queue_tracked_files_limit) throw Exception( @@ -109,6 +136,28 @@ void S3QueueTableMetadata::checkImmutableFieldsEquals(const S3QueueTableMetadata "Stored in ZooKeeper: {}, local: {}", from_zk.format_name, format_name); + + if (modeFromString(mode) == S3QueueMode::ORDERED) + { + if (s3queue_processing_threads_num != from_zk.s3queue_processing_threads_num) + { + throw Exception( + ErrorCodes::METADATA_MISMATCH, + "Existing table metadata in ZooKeeper differs in s3queue_processing_threads_num setting. " + "Stored in ZooKeeper: {}, local: {}", + from_zk.s3queue_processing_threads_num, + s3queue_processing_threads_num); + } + if (s3queue_total_shards_num != from_zk.s3queue_total_shards_num) + { + throw Exception( + ErrorCodes::METADATA_MISMATCH, + "Existing table metadata in ZooKeeper differs in s3queue_total_shards_num setting. 
" + "Stored in ZooKeeper: {}, local: {}", + from_zk.s3queue_total_shards_num, + s3queue_total_shards_num); + } + } } void S3QueueTableMetadata::checkEquals(const S3QueueTableMetadata & from_zk) const diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.h b/src/Storages/S3Queue/S3QueueTableMetadata.h index f15665692c4..84087f72a6a 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.h +++ b/src/Storages/S3Queue/S3QueueTableMetadata.h @@ -21,8 +21,10 @@ struct S3QueueTableMetadata String columns; String after_processing; String mode; - UInt64 s3queue_tracked_files_limit; - UInt64 s3queue_tracked_file_ttl_sec; + UInt64 s3queue_tracked_files_limit = 0; + UInt64 s3queue_tracked_file_ttl_sec = 0; + UInt64 s3queue_total_shards_num = 1; + UInt64 s3queue_processing_threads_num = 1; S3QueueTableMetadata() = default; S3QueueTableMetadata(const StorageS3::Configuration & configuration, const S3QueueSettings & engine_settings, const StorageInMemoryMetadata & storage_metadata); diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index a8741aed3c5..4f73f4ee205 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -75,14 +75,8 @@ namespace return zkutil::extractZooKeeperPath(result_zk_path, true); } - void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings, LoggerPtr log) + void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings) { - if (s3queue_settings.mode == S3QueueMode::ORDERED && s3queue_settings.s3queue_processing_threads_num > 1) - { - LOG_WARNING(log, "Parallel processing is not yet supported for Ordered mode"); - s3queue_settings.s3queue_processing_threads_num = 1; - } - if (!s3queue_settings.s3queue_processing_threads_num) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `s3queue_processing_threads_num` cannot be set to zero"); @@ -110,7 +104,8 @@ StorageS3Queue::StorageS3Queue( const ConstraintsDescription & constraints_, const String & comment, ContextPtr context_, - std::optional format_settings_) + std::optional format_settings_, + ASTStorage * engine_args) : IStorage(table_id_) , WithContext(context_) , s3queue_settings(std::move(s3queue_settings_)) @@ -134,7 +129,7 @@ StorageS3Queue::StorageS3Queue( throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue url must either end with '/' or contain globs"); } - checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef(), log); + checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef()); configuration.update(context_); FormatFactory::instance().checkFormatName(configuration.format); @@ -143,11 +138,17 @@ StorageS3Queue::StorageS3Queue( StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = StorageS3::getTableStructureFromDataImpl(configuration, format_settings, context_); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = StorageS3::getTableStructureAndFormatFromData(configuration, format_settings, context_); + else + columns = StorageS3::getTableStructureFromData(configuration, format_settings, context_); storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = StorageS3::getTableStructureAndFormatFromData(configuration, format_settings, context_).second; storage_metadata.setColumns(columns_); } @@ -160,19 +161,36 @@ StorageS3Queue::StorageS3Queue( LOG_INFO(log, "Using zookeeper path: {}", zk_path.string()); task = 
getContext()->getSchedulePool().createTask("S3QueueStreamingTask", [this] { threadFunc(); }); - /// Get metadata manager from S3QueueMetadataFactory, - /// it will increase the ref count for the metadata object. - /// The ref count is decreased when StorageS3Queue::drop() method is called. - files_metadata = S3QueueMetadataFactory::instance().getOrCreate(zk_path, *s3queue_settings); try { createOrCheckMetadata(storage_metadata); } catch (...) { - S3QueueMetadataFactory::instance().remove(zk_path); throw; } + + /// Get metadata manager from S3QueueMetadataFactory, + /// it will increase the ref count for the metadata object. + /// The ref count is decreased when StorageS3Queue::drop() method is called. + files_metadata = S3QueueMetadataFactory::instance().getOrCreate(zk_path, *s3queue_settings); + + if (files_metadata->isShardedProcessing()) + { + if (!s3queue_settings->s3queue_current_shard_num.changed) + { + s3queue_settings->s3queue_current_shard_num = static_cast(files_metadata->registerNewShard()); + engine_args->settings->changes.setSetting("s3queue_current_shard_num", s3queue_settings->s3queue_current_shard_num.value); + } + else if (!files_metadata->isShardRegistered(s3queue_settings->s3queue_current_shard_num)) + { + files_metadata->registerNewShard(s3queue_settings->s3queue_current_shard_num); + } + } + if (s3queue_settings->mode == S3QueueMode::ORDERED && !s3queue_settings->s3queue_last_processed_path.value.empty()) + { + files_metadata->setFileProcessed(s3queue_settings->s3queue_last_processed_path.value, s3queue_settings->s3queue_current_shard_num); + } } void StorageS3Queue::startup() @@ -186,6 +204,7 @@ void StorageS3Queue::shutdown(bool is_drop) table_is_being_dropped = is_drop; shutdown_called = true; + LOG_TRACE(log, "Shutting down storage..."); if (task) { task->deactivate(); @@ -194,8 +213,16 @@ void StorageS3Queue::shutdown(bool is_drop) if (files_metadata) { files_metadata->deactivateCleanupTask(); + + if (is_drop && files_metadata->isShardedProcessing()) + { + files_metadata->unregisterShard(s3queue_settings->s3queue_current_shard_num); + LOG_TRACE(log, "Unregistered shard {} from zookeeper", s3queue_settings->s3queue_current_shard_num); + } + files_metadata.reset(); } + LOG_TRACE(log, "Shut down storage"); } void StorageS3Queue::drop() @@ -220,14 +247,12 @@ public: ReadFromFormatInfo info_, std::shared_ptr storage_, ContextPtr context_, - size_t max_block_size_, - size_t num_streams_) + size_t max_block_size_) : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}) , info(std::move(info_)) , storage(std::move(storage_)) , context(std::move(context_)) , max_block_size(max_block_size_) - , num_streams(num_streams_) { } @@ -236,7 +261,6 @@ private: std::shared_ptr storage; ContextPtr context; size_t max_block_size; - size_t num_streams; std::shared_ptr iterator; @@ -270,7 +294,7 @@ void StorageS3Queue::read( ContextPtr local_context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, - size_t num_streams) + size_t) { if (!local_context->getSettingsRef().stream_like_engine_allow_direct_select) { @@ -292,8 +316,7 @@ void StorageS3Queue::read( read_from_format_info, std::move(this_ptr), local_context, - max_block_size, - num_streams); + max_block_size); query_plan.addStep(std::move(reading)); } @@ -301,11 +324,15 @@ void StorageS3Queue::read( void ReadFromS3Queue::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { Pipes pipes; - const size_t adjusted_num_streams = std::min(num_streams, 
storage->s3queue_settings->s3queue_processing_threads_num); + const size_t adjusted_num_streams = storage->s3queue_settings->s3queue_processing_threads_num; createIterator(nullptr); for (size_t i = 0; i < adjusted_num_streams; ++i) - pipes.emplace_back(storage->createSource(info, iterator, max_block_size, context)); + pipes.emplace_back(storage->createSource( + info, + iterator, + storage->files_metadata->getIdForProcessingThread(i, storage->s3queue_settings->s3queue_current_shard_num), + max_block_size, context)); auto pipe = Pipe::unitePipes(std::move(pipes)); if (pipe.empty()) @@ -320,6 +347,7 @@ void ReadFromS3Queue::initializePipeline(QueryPipelineBuilder & pipeline, const std::shared_ptr StorageS3Queue::createSource( const ReadFromFormatInfo & info, std::shared_ptr file_iterator, + size_t processing_id, size_t max_block_size, ContextPtr local_context) { @@ -359,7 +387,7 @@ std::shared_ptr StorageS3Queue::createSource( auto s3_queue_log = s3queue_settings->s3queue_enable_logging_to_s3queue_log ? local_context->getS3QueueLog() : nullptr; return std::make_shared( getName(), info.source_header, std::move(internal_source), - files_metadata, after_processing, file_deleter, info.requested_virtual_columns, + files_metadata, processing_id, after_processing, file_deleter, info.requested_virtual_columns, local_context, shutdown_called, table_is_being_dropped, s3_queue_log, getStorageID(), log); } @@ -463,7 +491,8 @@ bool StorageS3Queue::streamToViews() for (size_t i = 0; i < s3queue_settings->s3queue_processing_threads_num; ++i) { auto source = createSource( - read_from_format_info, file_iterator, DBMS_DEFAULT_BUFFER_SIZE, s3queue_context); + read_from_format_info, file_iterator, files_metadata->getIdForProcessingThread(i, s3queue_settings->s3queue_current_shard_num), + DBMS_DEFAULT_BUFFER_SIZE, s3queue_context); pipes.emplace_back(std::move(source)); } @@ -566,7 +595,7 @@ std::shared_ptr StorageS3Queue::createFileIterator auto glob_iterator = std::make_unique( *configuration.client, configuration.url, predicate, virtual_columns, local_context, /* read_keys */nullptr, configuration.request_settings); - return std::make_shared(files_metadata, std::move(glob_iterator), shutdown_called); + return std::make_shared(files_metadata, std::move(glob_iterator), s3queue_settings->s3queue_current_shard_num, shutdown_called); } void registerStorageS3QueueImpl(const String & name, StorageFactory & factory) @@ -624,7 +653,8 @@ void registerStorageS3QueueImpl(const String & name, StorageFactory & factory) args.constraints, args.comment, args.getContext(), - format_settings); + format_settings, + args.storage_def); }, { .supports_settings = true, diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index 5d2be610d58..fd3b4bb4914 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -11,6 +11,7 @@ #include #include #include +#include namespace Aws::S3 @@ -35,7 +36,8 @@ public: const ConstraintsDescription & constraints_, const String & comment, ContextPtr context_, - std::optional format_settings_); + std::optional format_settings_, + ASTStorage * engine_args); String getName() const override { return "S3Queue"; } @@ -91,6 +93,7 @@ private: std::shared_ptr createSource( const ReadFromFormatInfo & info, std::shared_ptr file_iterator, + size_t processing_id, size_t max_block_size, ContextPtr local_context); diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 662a5c0ef5a..4cb88a6d3fc 100644 --- 
a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -161,6 +161,8 @@ struct SelectQueryInfo /// It's guaranteed to be present in JOIN TREE of `query_tree` QueryTreeNodePtr table_expression; + bool analyzer_can_use_parallel_replicas_on_follower = false; + /// Table expression modifiers for storage std::optional table_expression_modifiers; diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 01c31eab2b1..f5fcf01c59e 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -65,6 +65,7 @@ namespace ErrorCodes extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_COMPILE_REGEXP; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; @@ -127,7 +128,7 @@ void StorageAzureBlob::processNamedCollectionResult(StorageAzureBlob::Configurat } -StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, ContextPtr local_context) +StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, const ContextPtr & local_context) { StorageAzureBlob::Configuration configuration; @@ -143,7 +144,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); return configuration; } @@ -236,13 +237,13 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); return configuration; } -AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(ContextPtr local_context) +AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(const ContextPtr & local_context) { const auto & context_settings = local_context->getSettingsRef(); auto settings_ptr = std::make_unique(); @@ -297,7 +298,7 @@ void registerStorageAzureBlob(StorageFactory & factory) return std::make_shared( std::move(configuration), - std::make_unique("AzureBlobStorage", std::move(client), std::move(settings)), + std::make_unique("AzureBlobStorage", std::move(client), std::move(settings),configuration.container), args.getContext(), args.table_id, args.columns, @@ -447,7 +448,7 @@ Poco::URI StorageAzureBlob::Configuration::getConnectionURL() const StorageAzureBlob::StorageAzureBlob( const Configuration & configuration_, std::unique_ptr && object_storage_, - ContextPtr context, + const ContextPtr & context, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -463,17 +464,25 @@ StorageAzureBlob::StorageAzureBlob( , format_settings(format_settings_) , partition_by(partition_by_) { - FormatFactory::instance().checkFormatName(configuration.format); + if (configuration.format != "auto") + FormatFactory::instance().checkFormatName(configuration.format); context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.getConnectionURL()); StorageInMemoryMetadata 
storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromData(object_storage.get(), configuration, format_settings, context, distributed_processing); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = getTableStructureAndFormatFromData(object_storage.get(), configuration, format_settings, context); + else + columns = getTableStructureFromData(object_storage.get(), configuration, format_settings, context); storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = getTableStructureAndFormatFromData(object_storage.get(), configuration, format_settings, context).second; + /// We don't allow special columns in File storage. if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine AzureBlobStorage doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -517,7 +526,7 @@ public: StorageAzureBlobSink( const String & format, const Block & sample_block_, - ContextPtr context, + const ContextPtr & context, std::optional format_settings_, const CompressionMethod compression_method, AzureObjectStorage * object_storage, @@ -607,22 +616,21 @@ private: std::mutex cancel_mutex; }; -class PartitionedStorageAzureBlobSink : public PartitionedSink +class PartitionedStorageAzureBlobSink : public PartitionedSink, WithContext { public: PartitionedStorageAzureBlobSink( const ASTPtr & partition_by, const String & format_, const Block & sample_block_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, const CompressionMethod compression_method_, AzureObjectStorage * object_storage_, const String & blob_) - : PartitionedSink(partition_by, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_), WithContext(context_) , format(format_) , sample_block(sample_block_) - , context(context_) , compression_method(compression_method_) , object_storage(object_storage_) , blob(blob_) @@ -638,7 +646,7 @@ public: return std::make_shared( format, sample_block, - context, + getContext(), format_settings, compression_method, object_storage, @@ -649,7 +657,6 @@ public: private: const String format; const Block sample_block; - const ContextPtr context; const CompressionMethod compression_method; AzureObjectStorage * object_storage; const String blob; @@ -913,7 +920,7 @@ StorageAzureBlobSource::GlobIterator::GlobIterator( String blob_path_with_globs_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs_, std::function file_progress_callback_) : IIterator(context_) @@ -1028,7 +1035,7 @@ StorageAzureBlobSource::KeysIterator::KeysIterator( const Strings & keys_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs, std::function file_progress_callback) : IIterator(context_) @@ -1147,7 +1154,7 @@ StorageAzureBlobSource::StorageAzureBlobSource( const ReadFromFormatInfo & info, const String & format_, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, String compression_hint_, @@ -1290,6 +1297,7 @@ namespace ReadBufferIterator( const std::shared_ptr & file_iterator_, AzureObjectStorage * object_storage_, + std::optional format_, const StorageAzureBlob::Configuration & configuration_, 
const std::optional & format_settings_, const RelativePathsWithMetadata & read_keys_, @@ -1298,19 +1306,38 @@ namespace , file_iterator(file_iterator_) , object_storage(object_storage_) , configuration(configuration_) + , format(std::move(format_)) , format_settings(format_settings_) , read_keys(read_keys_) , prev_read_keys_size(read_keys_.size()) { } - std::pair, std::optional> next() override + Data next() override { /// For default mode check cached columns for currently read keys on first iteration. - if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (first) { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; + /// If format is unknown we iterate through all currently read keys on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & key : read_keys) + { + if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(key.relative_path)) + { + format = format_from_path; + break; + } + } + } + + /// For default mode check cached columns for currently read keys on first iteration. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns, format}; + } } current_path_with_metadata = file_iterator->next(); @@ -1318,29 +1345,55 @@ namespace if (current_path_with_metadata.relative_path.empty()) { if (first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in AzureBlobStorage. You must specify table structure manually", configuration.format); + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path " + "in AzureBlobStorage. You can specify table structure manually", *format); - return {nullptr, std::nullopt}; + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "in AzureBlobStorage. You can specify table structure manually"); + } + + return {nullptr, std::nullopt, format}; } first = false; - /// AzureBlobStorage file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. - if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size) + /// AzureBlobStorage file iterator could get new keys after new iteration. + if (read_keys.size() > prev_read_keys_size) { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + /// If format is unknown we can try to determine it by new file names. + if (!format) + { + for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it).relative_path)) + { + format = format_from_file_name; + break; + } + } + } + /// Check new files in schema cache if schema inference mode is default. 
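A self-contained sketch (not part of this patch) of the file-name based detection used by the tryGetFormatFromFileName calls above: map the file extension to a format name and fall back to "unknown" when nothing matches. The extension table here is illustrative; the real mapping lives in FormatFactory.

#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

std::optional<std::string> tryFormatFromFileName(const std::string & path)
{
    // Illustrative extension -> format table.
    static const std::unordered_map<std::string, std::string> by_extension =
    {
        {"csv", "CSV"}, {"tsv", "TSV"}, {"json", "JSONEachRow"}, {"parquet", "Parquet"},
    };
    auto dot = path.find_last_of('.');
    if (dot == std::string::npos)
        return std::nullopt;
    auto it = by_extension.find(path.substr(dot + 1));
    if (it == by_extension.end())
        return std::nullopt;
    return it->second;
}

int main()
{
    std::optional<std::string> format;   // starts unknown, as with format = "auto"
    for (const auto & key : std::vector<std::string>{"dir/a.bin", "dir/b.parquet", "dir/c"})
        if (!format)
            format = tryFormatFromFileName(key);
    std::cout << "detected format: " << format.value_or("still unknown") << '\n';
}

When no file name gives the format away, the surrounding code falls back to probing the schema cache for every known input format and, after that, to detecting the format from file contents.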
+ if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + if (columns_from_cache) + return {nullptr, columns_from_cache, format}; + } + prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache}; } - else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) + + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { RelativePathsWithMetadata paths = {current_path_with_metadata}; if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) - return {nullptr, columns_from_cache}; + return {nullptr, columns_from_cache, format}; } first = false; @@ -1348,7 +1401,7 @@ namespace return {wrapReadBufferWithCompressionMethod( object_storage->readObject(StoredObject(current_path_with_metadata.relative_path), getContext()->getReadSettings(), {}, current_path_with_metadata.metadata.size_bytes), chooseCompressionMethod(current_path_with_metadata.relative_path, configuration.compression_method), - zstd_window_log_max), std::nullopt}; + zstd_window_log_max), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override @@ -1357,7 +1410,7 @@ namespace return; String source = fs::path(configuration.connection_url) / configuration.container / current_path_with_metadata.relative_path; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -1368,7 +1421,7 @@ namespace return; String source = fs::path(configuration.connection_url) / configuration.container / current_path_with_metadata.relative_path; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addColumns(key, columns); } @@ -1382,16 +1435,36 @@ namespace Strings sources; sources.reserve(read_keys.size()); std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket + '/' + elem.relative_path; }); - auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(sources, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return current_path_with_metadata.relative_path; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); + return wrapReadBufferWithCompressionMethod( + object_storage->readObject(StoredObject(current_path_with_metadata.relative_path), getContext()->getReadSettings(), {}, current_path_with_metadata.metadata.size_bytes), + chooseCompressionMethod(current_path_with_metadata.relative_path, configuration.compression_method), + zstd_window_log_max); + } + private: std::optional tryGetColumnsFromCache(const RelativePathsWithMetadata::const_iterator & begin, 
const RelativePathsWithMetadata::const_iterator & end) { - auto & schema_cache = StorageAzureBlob::getSchemaCache(getContext()); + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_azure) + return std::nullopt; + + auto & schema_cache = StorageAzureBlob::getSchemaCache(context); for (auto it = begin; it < end; ++it) { auto get_last_mod_time = [&] -> std::optional @@ -1403,10 +1476,28 @@ namespace auto host_and_bucket = configuration.connection_url + '/' + configuration.container; String source = host_and_bucket + '/' + it->relative_path; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(source, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -1415,6 +1506,7 @@ namespace std::shared_ptr file_iterator; AzureObjectStorage * object_storage; const StorageAzureBlob::Configuration & configuration; + std::optional format; const std::optional & format_settings; const RelativePathsWithMetadata & read_keys; size_t prev_read_keys_size; @@ -1423,21 +1515,16 @@ namespace }; } -ColumnsDescription StorageAzureBlob::getTableStructureFromData( +std::pair StorageAzureBlob::getTableStructureAndFormatFromDataImpl( + std::optional format, AzureObjectStorage * object_storage, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx, - bool distributed_processing) + const ContextPtr & ctx) { RelativePathsWithMetadata read_keys; std::shared_ptr file_iterator; - if (distributed_processing) - { - file_iterator = std::make_shared(ctx, - ctx->getReadTaskCallback()); - } - else if (configuration.withGlobs()) + if (configuration.withGlobs()) { file_iterator = std::make_shared( object_storage, configuration.container, configuration.blob_path, nullptr, NamesAndTypesList{}, ctx, &read_keys); @@ -1448,8 +1535,28 @@ ColumnsDescription StorageAzureBlob::getTableStructureFromData( object_storage, configuration.container, configuration.blobs_paths, nullptr, NamesAndTypesList{}, ctx, &read_keys); } - ReadBufferIterator read_buffer_iterator(file_iterator, object_storage, configuration, format_settings, read_keys, ctx); - return readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); + ReadBufferIterator read_buffer_iterator(file_iterator, object_storage, format, configuration, format_settings, read_keys, ctx); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, ctx); +} + +std::pair 
StorageAzureBlob::getTableStructureAndFormatFromData( + DB::AzureObjectStorage * object_storage, + const DB::StorageAzureBlob::Configuration & configuration, + const std::optional & format_settings, + const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, object_storage, configuration, format_settings, ctx); +} + +ColumnsDescription StorageAzureBlob::getTableStructureFromData( + DB::AzureObjectStorage * object_storage, + const DB::StorageAzureBlob::Configuration & configuration, + const std::optional & format_settings, + const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(configuration.format, object_storage, configuration, format_settings, ctx).first; } SchemaCache & StorageAzureBlob::getSchemaCache(const ContextPtr & ctx) diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index 6fc3c5ce592..2ab96c84e49 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -31,9 +31,9 @@ public: String getPath() const { return blob_path; } - bool update(ContextPtr context); + bool update(const ContextPtr & context); - void connect(ContextPtr context); + void connect(const ContextPtr & context); bool withGlobs() const { return blob_path.find_first_of("*?{") != std::string::npos; } @@ -59,7 +59,7 @@ public: StorageAzureBlob( const Configuration & configuration_, std::unique_ptr && object_storage_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -68,10 +68,10 @@ public: bool distributed_processing_, ASTPtr partition_by_); - static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); + static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context); static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration, bool is_read_only); - static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); + static AzureObjectStorage::SettingsPtr createSettings(const ContextPtr & local_context); static void processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection); @@ -115,10 +115,22 @@ public: AzureObjectStorage * object_storage, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx, - bool distributed_processing = false); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + AzureObjectStorage * object_storage, + const Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + AzureObjectStorage * object_storage, + const Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); + friend class ReadFromAzureBlob; std::string name; @@ -137,7 +149,7 @@ public: class IIterator : public WithContext { public: - IIterator(ContextPtr context_):WithContext(context_) {} + IIterator(const ContextPtr & context_):WithContext(context_) {} virtual ~IIterator() = default; virtual RelativePathWithMetadata next() = 0; @@ -153,7 +165,7 @@ public: String blob_path_with_globs_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs_, std::function 
file_progress_callback_ = {}); @@ -186,7 +198,7 @@ public: class ReadIterator : public IIterator { public: - explicit ReadIterator(ContextPtr context_, + explicit ReadIterator(const ContextPtr & context_, const ReadTaskCallback & callback_) : IIterator(context_), callback(callback_) { } RelativePathWithMetadata next() override @@ -207,7 +219,7 @@ public: const Strings & keys_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs, std::function file_progress_callback = {}); @@ -229,7 +241,7 @@ public: const ReadFromFormatInfo & info, const String & format_, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, String compression_hint_, diff --git a/src/Storages/StorageAzureBlobCluster.cpp b/src/Storages/StorageAzureBlobCluster.cpp index 1d587512f38..32445556611 100644 --- a/src/Storages/StorageAzureBlobCluster.cpp +++ b/src/Storages/StorageAzureBlobCluster.cpp @@ -36,23 +36,30 @@ StorageAzureBlobCluster::StorageAzureBlobCluster( const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageAzureBlobCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ContextPtr & context) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageAzureBlobCluster (" + table_id_.table_name + ")")) , configuration{configuration_} , object_storage(std::move(object_storage_)) { - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.getConnectionURL()); + context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.getConnectionURL()); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { + ColumnsDescription columns; /// `format_settings` is set to std::nullopt, because StorageAzureBlobCluster is used only as table function - auto columns = StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context_, false); + if (configuration.format == "auto") + std::tie(columns, configuration.format) = StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context); + else + columns = StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context); storage_metadata.setColumns(columns); } else + { + if (configuration.format == "auto") + configuration.format = StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context).second; storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -60,13 +67,14 @@ StorageAzureBlobCluster::StorageAzureBlobCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageAzureBlobCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageAzureBlobCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = 
extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function s3Cluster, got '{}'", queryToString(query)); - TableFunctionAzureBlobStorageCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionAzureBlobStorageCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), configuration.format, context); } RemoteQueryExecutor::Extension StorageAzureBlobCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageAzureBlobCluster.h b/src/Storages/StorageAzureBlobCluster.h index 2831b94f825..476f21c6742 100644 --- a/src/Storages/StorageAzureBlobCluster.h +++ b/src/Storages/StorageAzureBlobCluster.h @@ -27,8 +27,7 @@ public: const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); + const ContextPtr & context); std::string getName() const override { return "AzureBlobStorageCluster"; } @@ -43,7 +42,7 @@ public: private: void updateBeforeRead(const ContextPtr & /*context*/) override {} - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; StorageAzureBlob::Configuration configuration; NamesAndTypesList virtual_columns; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index d5c135bb81d..2925038ec8e 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -1,40 +1,41 @@ -#include -#include #include +#include +#include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include #include +#include +#include +#include #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace ProfileEvents @@ -56,6 +57,9 @@ namespace CurrentMetrics { extern const Metric StorageBufferRows; extern const Metric StorageBufferBytes; + extern const Metric StorageBufferFlushThreads; + extern const Metric StorageBufferFlushThreadsActive; + extern const Metric StorageBufferFlushThreadsScheduled; } @@ -153,6 +157,12 @@ StorageBuffer::StorageBuffer( storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); + if (num_shards > 1) + { + flush_pool = std::make_unique( + CurrentMetrics::StorageBufferFlushThreads, CurrentMetrics::StorageBufferFlushThreadsActive, CurrentMetrics::StorageBufferFlushThreadsScheduled, + num_shards, 0, num_shards); + } flush_handle = bg_pool.createTask(log->name() + "/Bg", [this]{ backgroundFlush(); }); } @@ -802,7 +812,22 @@ bool StorageBuffer::checkThresholdsImpl(bool direct, size_t rows, size_t bytes, void StorageBuffer::flushAllBuffers(bool check_thresholds) { for (auto & buf : buffers) - flushBuffer(buf, check_thresholds, false); + { + if (flush_pool) + { + 
scheduleFromThreadPool([&] () + { + flushBuffer(buf, check_thresholds, false); + }, *flush_pool, "BufferFlush"); + } + else + { + flushBuffer(buf, check_thresholds, false); + } + } + + if (flush_pool) + flush_pool->wait(); } diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h index 47f6239b173..6c15c7e0238 100644 --- a/src/Storages/StorageBuffer.h +++ b/src/Storages/StorageBuffer.h @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -149,6 +150,7 @@ private: /// There are `num_shards` of independent buffers. const size_t num_shards; + std::unique_ptr flush_pool; std::vector buffers; const Thresholds min_thresholds; diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index afd9e4aad76..86ed1d03b94 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -773,24 +773,18 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, table_function_node->setTableExpressionModifiers(*table_expression_modifiers); QueryAnalysisPass query_analysis_pass; - query_analysis_pass.run(table_function_node, query_context); + QueryTreeNodePtr node = table_function_node; + query_analysis_pass.run(node, query_context); replacement_table_expression = std::move(table_function_node); } else { - auto resolved_remote_storage_id = remote_storage_id; - // In case of cross-replication we don't know what database is used for the table. - // `storage_id.hasDatabase()` can return false only on the initiator node. - // Each shard will use the default database (in the case of cross-replication shards may have different defaults). - if (remote_storage_id.hasDatabase()) - resolved_remote_storage_id = query_context->resolveStorageID(remote_storage_id); - auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); auto column_names_and_types = distributed_storage_snapshot->getColumns(get_column_options); - auto storage = std::make_shared(resolved_remote_storage_id, ColumnsDescription{column_names_and_types}); + auto storage = std::make_shared(remote_storage_id, ColumnsDescription{column_names_and_types}); auto table_node = std::make_shared(std::move(storage), query_context); if (table_expression_modifiers) @@ -803,7 +797,7 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, auto query_tree_to_modify = query_info.query_tree->cloneAndReplace(query_info.table_expression, std::move(replacement_table_expression)); - return buildQueryTreeForShard(query_info, query_tree_to_modify); + return buildQueryTreeForShard(query_info.planner_context, query_tree_to_modify); } } @@ -837,7 +831,7 @@ void StorageDistributed::read( */ for (auto & column : header) column.column = column.column->convertToFullColumnIfConst(); - query_ast = queryNodeToSelectQuery(query_tree_distributed); + query_ast = queryNodeToDistributedSelectQuery(query_tree_distributed); } else { @@ -982,8 +976,10 @@ std::optional StorageDistributed::distributedWriteBetweenDistribu new_query->select = select_with_union_query; } - const Cluster::AddressesWithFailover & src_addresses = src_distributed.getCluster()->getShardsAddresses(); - const Cluster::AddressesWithFailover & dst_addresses = getCluster()->getShardsAddresses(); + const auto src_cluster = src_distributed.getCluster(); + const auto dst_cluster = getCluster(); + const Cluster::AddressesWithFailover & src_addresses = src_cluster->getShardsAddresses(); + const Cluster::AddressesWithFailover & dst_addresses = 
dst_cluster->getShardsAddresses(); /// Compare addresses instead of cluster name, to handle remote()/cluster(). /// (since for remote()/cluster() the getClusterName() is empty string) if (src_addresses != dst_addresses) @@ -1012,8 +1008,7 @@ std::optional StorageDistributed::distributedWriteBetweenDistribu new_query->table_function.reset(); } - const auto & cluster = getCluster(); - const auto & shards_info = cluster->getShardsInfo(); + const auto & shards_info = dst_cluster->getShardsInfo(); String new_query_str; { @@ -1144,7 +1139,8 @@ std::optional StorageDistributed::distributedWriteFromClusterStor auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); /// Here we take addresses from destination cluster and assume source table exists on these nodes - for (const auto & replicas : getCluster()->getShardsInfo()) + const auto cluster = getCluster(); + for (const auto & replicas : cluster->getShardsInfo()) { /// Skip unavailable hosts if necessary auto try_results = replicas.pool->getMany(timeouts, current_settings, PoolMode::GET_MANY, /*async_callback*/ {}, /*skip_unavailable_endpoints*/ true); @@ -1540,10 +1536,7 @@ ClusterPtr StorageDistributed::getOptimizedCluster( IColumn::Selector StorageDistributed::createSelector(const ClusterPtr cluster, const ColumnWithTypeAndName & result) { const auto & slot_to_shard = cluster->getSlotToShard(); - const IColumn * column = result.column.get(); - if (const auto * col_const = typeid_cast(column)) - column = &col_const->getDataColumn(); // If result.type is DataTypeLowCardinality, do shard according to its dictionaryType #define CREATE_FOR_TYPE(TYPE) \ diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 8b8a151fb1d..595573b566d 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -89,6 +90,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int CANNOT_APPEND_TO_FILE; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int CANNOT_COMPILE_REGEXP; } @@ -327,7 +329,7 @@ std::unique_ptr createReadBuffer( } -Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read) +Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, const ContextPtr & context, size_t & total_bytes_to_read) { fs::path user_files_absolute_path = fs::weakly_canonical(user_files_path); fs::path fs_table_path(table_path); @@ -374,27 +376,44 @@ namespace public: ReadBufferFromFileIterator( const std::vector & paths_, - const String & format_, + std::optional format_, const String & compression_method_, const std::optional & format_settings_, - ContextPtr context_) + const ContextPtr & context_) : WithContext(context_) , paths(paths_) - , format(format_) + , format(std::move(format_)) , compression_method(compression_method_) , format_settings(format_settings_) { } - std::pair, std::optional> next() override + Data next() override { bool is_first = current_index == 0; - /// For default mode check cached columns for all paths on first iteration. - /// If we have cached columns, next() won't be called again. 
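A minimal sketch (not part of this patch, illustrative names only) of the cached-columns check referred to in the comment above: a schema cache keyed by source and format that returns a hit only if the file has not been modified since the schema was inferred.

#include <ctime>
#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>

struct CachedSchema
{
    std::string columns;        // e.g. "a UInt64, b String"
    std::time_t inferred_at{};  // when the schema was inferred
};

class SchemaCache
{
public:
    void add(const std::string & key, CachedSchema schema) { entries[key] = std::move(schema); }

    // Return the cached schema only if the file was not modified after inference.
    std::optional<std::string> tryGet(const std::string & key, std::time_t last_modified) const
    {
        auto it = entries.find(key);
        if (it == entries.end() || last_modified > it->second.inferred_at)
            return std::nullopt;
        return it->second.columns;
    }

private:
    std::unordered_map<std::string, CachedSchema> entries;
};

int main()
{
    SchemaCache cache;
    // Keys in the real code combine the source path, the format name and the format settings.
    cache.add("/data/events.csv:CSV", {"a UInt64, b String", 1700000000});
    std::cout << cache.tryGet("/data/events.csv:CSV", 1690000000).value_or("miss") << '\n';  // unchanged file: hit
    std::cout << cache.tryGet("/data/events.csv:CSV", 1710000000).value_or("miss") << '\n';  // modified file: miss
}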
- if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (is_first) { - if (auto cached_columns = tryGetColumnsFromCache(paths)) - return {nullptr, cached_columns}; + /// If format is unknown we iterate through all paths on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & path : paths) + { + if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(path)) + { + format = format_from_path; + break; + } + } + } + + /// For default mode check cached columns for all paths on first iteration. + /// If we have cached columns, next() won't be called again. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(paths)) + return {nullptr, cached_columns, format}; + } } String path; @@ -405,11 +424,18 @@ namespace if (current_index == paths.size()) { if (is_first) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. You can specify the format manually", + *format); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", - format); - return {nullptr, std::nullopt}; + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. You can specify table structure manually"); + } + return {nullptr, std::nullopt, std::nullopt}; } path = paths[current_index++]; @@ -420,10 +446,10 @@ namespace if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { if (auto cached_columns = tryGetColumnsFromCache({path})) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } - return {createReadBuffer(path, file_stat, false, -1, compression_method, getContext()), std::nullopt}; + return {createReadBuffer(path, file_stat, false, -1, compression_method, getContext()), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override @@ -431,7 +457,7 @@ namespace if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return; - auto key = getKeyForSchemaCache(paths[current_index - 1], format, format_settings, getContext()); + auto key = getKeyForSchemaCache(paths[current_index - 1], *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -443,7 +469,7 @@ namespace /// For union mode, schema can be different for different files, so we need to /// cache last inferred schema only for last processed file. - auto cache_key = getKeyForSchemaCache(paths[current_index - 1], format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(paths[current_index - 1], *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addColumns(cache_key, columns); } @@ -454,7 +480,7 @@ namespace return; /// For default mode we cache resulting schema for all paths. 
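A tiny sketch (illustrative types, not ClickHouse code) of the two caching scopes contrasted in the comments above: DEFAULT mode infers one schema for the whole file set and stores it under a key per path, while UNION mode caches only the schema of the file that was just read.

#include <string>
#include <unordered_map>
#include <vector>

using Schema = std::string;
using Cache = std::unordered_map<std::string, Schema>;

// DEFAULT mode: the single resulting schema is cached for every path.
void cacheForAllPaths(Cache & cache, const std::vector<std::string> & paths, const Schema & schema)
{
    for (const auto & path : paths)
        cache[path] = schema;
}

// UNION mode: schemas can differ per file, so only the last processed file is cached.
void cacheForLastFile(Cache & cache, const std::string & last_path, const Schema & schema)
{
    cache[last_path] = schema;
}

int main()
{
    Cache cache;
    cacheForAllPaths(cache, {"a.csv", "b.csv"}, "x UInt8");
    cacheForLastFile(cache, "c.csv", "y String");
    return cache.size() == 3 ? 0 : 1;
}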
- auto cache_keys = getKeysForSchemaCache(paths, format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(paths, *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } @@ -465,14 +491,30 @@ namespace return ""; } + void setFormatName(const String & format_name) override + { + format = format_name; + } + + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= paths.size()); + auto path = paths[current_index - 1]; + auto file_stat = getFileStat(path, false, -1, "File"); + return createReadBuffer(path, file_stat, false, -1, compression_method, getContext()); + } + private: std::optional tryGetColumnsFromCache(const Strings & paths_) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_file) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_file) return std::nullopt; /// Check if the cache contains one of the paths. - auto & schema_cache = StorageFile::getSchemaCache(getContext()); + auto & schema_cache = StorageFile::getSchemaCache(context); struct stat file_stat{}; for (const auto & path : paths_) { @@ -484,10 +526,28 @@ namespace return file_stat.st_mtime; }; - auto cache_key = getKeyForSchemaCache(path, format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(path, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(path, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -496,7 +556,7 @@ namespace const std::vector & paths; size_t current_index = 0; - String format; + std::optional format; String compression_method; const std::optional & format_settings; }; @@ -506,17 +566,17 @@ namespace public: ReadBufferFromArchiveIterator( const StorageFile::ArchiveInfo & archive_info_, - const String & format_, + std::optional format_, const std::optional & format_settings_, - ContextPtr context_) + const ContextPtr & context_) : WithContext(context_) , archive_info(archive_info_) - , format(format_) + , format(std::move(format_)) , format_settings(format_settings_) { } - std::pair, std::optional> next() override + Data next() override { /// For default mode check cached columns for all initial archive paths (maybe with globs) on first iteration. /// If we have cached columns, next() won't be called again. 
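When the format is still unknown, the cache lookup above probes the schema cache once per registered input format and lets the first hit fix both the schema and the format. The same idea in standalone form, with a plain map standing in for the schema cache keyed by (path, format):

    #include <map>
    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    using SchemaCache = std::map<std::pair<std::string, std::string>, std::string>; // (path, format) -> schema

    // If the format is known, probe a single key; otherwise probe (path, format)
    // for every candidate format and let the first hit also decide the format.
    std::optional<std::string> lookupSchema(
        const SchemaCache & cache,
        const std::string & path,
        std::optional<std::string> & format,
        const std::vector<std::string> & all_input_formats)
    {
        if (format)
        {
            auto it = cache.find({path, *format});
            if (it != cache.end())
                return it->second;
            return std::nullopt;
        }

        for (const auto & candidate : all_input_formats)
        {
            auto it = cache.find({path, candidate});
            if (it != cache.end())
            {
                format = candidate; // the detected format is then reused for the remaining files
                return it->second;
            }
        }
        return std::nullopt;
    }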
@@ -524,8 +584,8 @@ namespace { for (const auto & archive : archive_info.paths_to_archives) { - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, archive_info.path_in_archive)) - return {nullptr, cached_columns}; + if (auto cached_schema = tryGetSchemaFromCache(archive, fmt::format("{}::{}", archive, archive_info.path_in_archive))) + return {nullptr, cached_schema, format}; } } @@ -535,12 +595,19 @@ namespace if (current_archive_index == archive_info.paths_to_archives.size()) { if (is_first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", - format); + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. You can specify table structure manually", + *format); - return {nullptr, std::nullopt}; + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. You can specify the format manually"); + } + + return {nullptr, std::nullopt, format}; } const auto & archive = archive_info.paths_to_archives[current_archive_index]; @@ -554,11 +621,18 @@ namespace continue; } + if (format) + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The table structure cannot be extracted from a {} format file, because the archive {} is empty. " + "You can specify table structure manually", + *format, + archive); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because the archive {} is empty. " - "You must specify table structure manually", - format, + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because the archive {} is empty. " + "You can specify the format manually", archive); } @@ -574,8 +648,8 @@ namespace last_read_file_path = paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive_reader->getPath(), archive_info.path_in_archive)); is_first = false; - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, last_read_file_path)) - return {nullptr, cached_columns}; + if (auto cached_schema = tryGetSchemaFromCache(archive, last_read_file_path)) + return {nullptr, cached_schema, format}; } else { @@ -611,13 +685,20 @@ namespace last_read_file_path = paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive_reader->getPath(), *filename)); is_first = false; - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, last_read_file_path)) + /// If format is unknown we can try to determine it by the file name. + if (!format) + { + if (auto format_from_file = FormatFactory::instance().tryGetFormatFromFileName(*filename)) + format = format_from_file; + } + + if (auto cached_schema = tryGetSchemaFromCache(archive, last_read_file_path)) { /// For union mode next() will be called again even if we found cached columns, /// so we need to remember last_read_buffer to continue iterating through files in archive. 
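The reworked error reporting depends on whether the format is already known: with a known format the complaint is about extracting the table structure, without one it is about detecting the format. A compact standalone model of that decision; std::runtime_error stands in for the two distinct ClickHouse error codes (CANNOT_EXTRACT_TABLE_STRUCTURE vs CANNOT_DETECT_FORMAT).

    #include <optional>
    #include <stdexcept>
    #include <string>

    // Pick the complaint based on what is still unknown when the input is empty.
    [[noreturn]] void failOnEmptyInput(const std::optional<std::string> & format, const std::string & what_is_empty)
    {
        if (format)
            throw std::runtime_error(
                "The table structure cannot be extracted from a " + *format + " format file, because "
                + what_is_empty + " is empty. You can specify table structure manually");

        throw std::runtime_error(
            "The data format cannot be detected by the contents of the files, because "
            + what_is_empty + " is empty. You can specify the format manually");
    }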
if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) last_read_buffer = archive_reader->readFile(std::move(file_enumerator)); - return {nullptr, cached_columns}; + return {nullptr, cached_schema, format}; } read_buf = archive_reader->readFile(std::move(file_enumerator)); @@ -626,7 +707,7 @@ namespace break; } - return {std::move(read_buf), std::nullopt}; + return {std::move(read_buf), std::nullopt, format}; } void setPreviousReadBuffer(std::unique_ptr buffer) override @@ -640,7 +721,7 @@ namespace if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return; - auto key = getKeyForSchemaCache(last_read_file_path, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(last_read_file_path, *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -653,7 +734,7 @@ namespace /// For union mode, schema can be different for different files in archive, so we need to /// cache last inferred schema only for last processed file. auto & schema_cache = StorageFile::getSchemaCache(getContext()); - auto cache_key = getKeyForSchemaCache(last_read_file_path, format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(last_read_file_path, *format, format_settings, getContext()); schema_cache.addColumns(cache_key, columns); } @@ -669,17 +750,42 @@ namespace for (const auto & archive : archive_info.paths_to_archives) paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive, archive_info.path_in_archive)); auto & schema_cache = StorageFile::getSchemaCache(getContext()); - auto cache_keys = getKeysForSchemaCache(paths_for_schema_cache, format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(paths_for_schema_cache, *format, format_settings, getContext()); schema_cache.addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return last_read_file_path; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + if (archive_info.isSingleFileRead()) + { + chassert(current_archive_index > 0 && current_archive_index <= archive_info.paths_to_archives.size()); + const auto & archive = archive_info.paths_to_archives[current_archive_index - 1]; + auto archive_reader = createArchiveReader(archive); + return archive_reader->readFile(archive_info.path_in_archive, false); + } + + chassert(current_archive_index >= 0 && current_archive_index < archive_info.paths_to_archives.size()); + const auto & archive = archive_info.paths_to_archives[current_archive_index]; + auto archive_reader = createArchiveReader(archive); + chassert(last_read_buffer); + file_enumerator = archive_reader->currentFile(std::move(last_read_buffer)); + return archive_reader->readFile(std::move(file_enumerator)); + } + private: - std::optional tryGetColumnsFromSchemaCache(const std::string & archive_path, const std::string & full_path) + std::optional tryGetSchemaFromCache(const std::string & archive_path, const std::string & full_path) { auto context = getContext(); if (!context->getSettingsRef().schema_inference_use_cache_for_file) @@ -695,11 +801,28 @@ namespace return file_stat.st_mtime; }; - auto cache_key = getKeyForSchemaCache(full_path, format, format_settings, context); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - - if (columns) - return columns; + if 
(format) + { + auto cache_key = getKeyForSchemaCache(full_path, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(full_path, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } return std::nullopt; } @@ -715,13 +838,13 @@ namespace std::unique_ptr file_enumerator; std::unique_ptr last_read_buffer; - String format; + std::optional format; const std::optional & format_settings; std::vector paths_for_schema_cache; }; } -ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr context) +std::pair StorageFile::getTableStructureAndFormatFromFileDescriptor(std::optional format, const ContextPtr & context) { /// If we want to read schema from file descriptor we should create /// a read buffer from fd, create a checkpoint, read some data required @@ -738,22 +861,29 @@ ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr c read_buf->setCheckpoint(); auto read_buffer_iterator = SingleReadBufferIterator(std::move(read_buf)); - auto columns = readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, false, context, peekable_read_buffer_from_fd); + ColumnsDescription columns; + if (format) + columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context); + else + std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + + peekable_read_buffer_from_fd = read_buffer_iterator.releaseBuffer(); if (peekable_read_buffer_from_fd) { /// If we have created read buffer in readSchemaFromFormat we should rollback to checkpoint. 
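Reading the schema from a file descriptor consumes bytes that the query itself still needs, which is why the buffer is checkpointed before inference and rolled back afterwards. A toy standalone model of that checkpoint/rollback pattern (not the real PeekableReadBuffer API):

    #include <cassert>
    #include <cstddef>
    #include <string>

    // Remember a position before schema inference, then rewind so the query
    // re-reads the same bytes from a non-seekable source (stdin / fd).
    class PeekableBuffer
    {
    public:
        explicit PeekableBuffer(std::string data_) : data(std::move(data_)) {}

        void setCheckpoint() { checkpoint = pos; }
        void rollbackToCheckpoint() { assert(checkpoint <= data.size()); pos = checkpoint; }

        std::string read(size_t n)
        {
            auto chunk = data.substr(pos, n);
            pos += chunk.size();
            return chunk;
        }

    private:
        std::string data;
        size_t pos = 0;
        size_t checkpoint = 0;
    };

    int main()
    {
        PeekableBuffer buf("a,b\n1,2\n3,4\n");
        buf.setCheckpoint();
        auto sample = buf.read(8);   // consumed by schema inference
        buf.rollbackToCheckpoint();  // the same bytes are available again for the query
        return sample.empty() ? 1 : 0;
    }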
assert_cast(peekable_read_buffer_from_fd.get())->rollbackToCheckpoint(); has_peekable_read_buffer_from_fd = true; } - return columns; + + return {columns, *format}; } -ColumnsDescription StorageFile::getTableStructureFromFile( - const String & format, +std::pair StorageFile::getTableStructureAndFormatFromFileImpl( + std::optional format, const std::vector & paths, const String & compression_method, const std::optional & format_settings, - ContextPtr context, + const ContextPtr & context, const std::optional & archive_info) { if (format == "Distributed") @@ -761,29 +891,60 @@ ColumnsDescription StorageFile::getTableStructureFromFile( if (paths.empty()) throw Exception(ErrorCodes::INCORRECT_FILE_NAME, "Cannot get table structure from file, because no files match specified name"); - return ColumnsDescription(DistributedAsyncInsertSource(paths[0]).getOutputs().front().getHeader().getNamesAndTypesList()); + return {ColumnsDescription(DistributedAsyncInsertSource(paths[0]).getOutputs().front().getHeader().getNamesAndTypesList()), *format}; } if (((archive_info && archive_info->paths_to_archives.empty()) || (!archive_info && paths.empty())) - && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) + && (!format || !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format))) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path. " + "You can specify table structure manually", *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path. " - "You must specify table structure manually", format); + "The data format cannot be detected by the contents of the files, because there are no files with provided path. 
" + "You can specify the format manually"); + + } if (archive_info) { ReadBufferFromArchiveIterator read_buffer_iterator(*archive_info, format, format_settings, context); - return readSchemaFromFormat( - format, - format_settings, - read_buffer_iterator, - /*retry=*/archive_info->paths_to_archives.size() > 1 || !archive_info->isSingleFileRead(), - context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); } ReadBufferFromFileIterator read_buffer_iterator(paths, format, compression_method, format_settings, context); - return readSchemaFromFormat(format, format_settings, read_buffer_iterator, paths.size() > 1, context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); +} + +ColumnsDescription StorageFile::getTableStructureFromFile( + const DB::String & format, + const std::vector & paths, + const DB::String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info) +{ + return getTableStructureAndFormatFromFileImpl(format, paths, compression_method, format_settings, context, archive_info).first; +} + +std::pair StorageFile::getTableStructureAndFormatFromFile( + const std::vector & paths, + const DB::String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info) +{ + return getTableStructureAndFormatFromFileImpl(std::nullopt, paths, compression_method, format_settings, context, archive_info); } bool StorageFile::supportsSubsetOfColumns(const ContextPtr & context) const @@ -874,7 +1035,7 @@ StorageFile::StorageFile(CommonArguments args) , compression_method(args.compression_method) , base_path(args.getContext()->getPath()) { - if (format_name != "Distributed") + if (format_name != "Distributed" && format_name != "auto") FormatFactory::instance().checkFormatName(format_name); } @@ -886,16 +1047,19 @@ void StorageFile::setStorageMetadata(CommonArguments args) { ColumnsDescription columns; if (use_table_fd) - columns = getTableStructureFromFileDescriptor(args.getContext()); + { + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromFileDescriptor(std::nullopt, args.getContext()); + else + columns = getTableStructureAndFormatFromFileDescriptor(format_name, args.getContext()).first; + } else { - columns = getTableStructureFromFile( - format_name, - paths, - compression_method, - format_settings, - args.getContext(), - archive_info); + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromFile(paths, compression_method, format_settings, args.getContext(), archive_info); + else + columns = getTableStructureFromFile(format_name, paths, compression_method, format_settings, args.getContext(), archive_info); + if (!args.columns.empty() && args.columns != columns) throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Table structure and file structure are different"); } @@ -903,6 +1067,8 @@ void StorageFile::setStorageMetadata(CommonArguments args) } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromFile(paths, compression_method, format_settings, args.getContext(), archive_info).second; /// We don't allow special columns in File storage. 
if (!args.columns.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine File doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -917,7 +1083,7 @@ void StorageFile::setStorageMetadata(CommonArguments args) } -static std::chrono::seconds getLockTimeout(ContextPtr context) +static std::chrono::seconds getLockTimeout(const ContextPtr & context) { const Settings & settings = context->getSettingsRef(); Int64 lock_timeout = settings.lock_acquire_timeout.totalSeconds(); @@ -933,9 +1099,9 @@ StorageFileSource::FilesIterator::FilesIterator( std::optional archive_info_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context_, + const ContextPtr & context_, bool distributed_processing_) - : files(files_), archive_info(std::move(archive_info_)), distributed_processing(distributed_processing_), context(context_) + : WithContext(context_), files(files_), archive_info(std::move(archive_info_)), distributed_processing(distributed_processing_) { ActionsDAGPtr filter_dag; if (!distributed_processing && !archive_info && !files.empty()) @@ -948,7 +1114,7 @@ StorageFileSource::FilesIterator::FilesIterator( String StorageFileSource::FilesIterator::next() { if (distributed_processing) - return context->getReadTaskCallback()(); + return getContext()->getReadTaskCallback()(); else { const auto & fs = isReadFromArchive() ? archive_info->paths_to_archives : files; @@ -972,12 +1138,12 @@ const String & StorageFileSource::FilesIterator::getFileNameInArchive() StorageFileSource::StorageFileSource( const ReadFromFormatInfo & info, std::shared_ptr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, FilesIteratorPtr files_iterator_, std::unique_ptr read_buf_, bool need_only_count_) - : SourceWithKeyCondition(info.source_header, false) + : SourceWithKeyCondition(info.source_header, false), WithContext(context_) , storage(std::move(storage_)) , files_iterator(std::move(files_iterator_)) , read_buf(std::move(read_buf_)) @@ -985,13 +1151,12 @@ StorageFileSource::StorageFileSource( , requested_columns(info.requested_columns) , requested_virtual_columns(info.requested_virtual_columns) , block_for_format(info.format_header) - , context(context_) , max_block_size(max_block_size_) , need_only_count(need_only_count_) { if (!storage->use_table_fd) { - shared_lock = std::shared_lock(storage->rwlock, getLockTimeout(context)); + shared_lock = std::shared_lock(storage->rwlock, getLockTimeout(getContext())); if (!shared_lock) throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Lock timeout exceeded"); storage->readers_counter.fetch_add(1, std::memory_order_release); @@ -1008,7 +1173,7 @@ void StorageFileSource::beforeDestroy() if (std::uncaught_exceptions() == 0 && cnt == 1 && !storage->was_renamed) { shared_lock.unlock(); - auto exclusive_lock = std::unique_lock{storage->rwlock, getLockTimeout(context)}; + auto exclusive_lock = std::unique_lock{storage->rwlock, getLockTimeout(getContext())}; if (!exclusive_lock) return; @@ -1027,7 +1192,7 @@ void StorageFileSource::beforeDestroy() file_path = file_path.lexically_normal(); // Checking access rights - checkCreationIsAllowed(context, context->getUserFilesPath(), file_path, true); + checkCreationIsAllowed(getContext(), getContext()->getUserFilesPath(), file_path, true); // Checking an existing of new file if (fs::exists(file_path)) @@ -1060,7 +1225,7 @@ void StorageFileSource::setKeyCondition(const ActionsDAG::NodeRawConstPtrs & nod bool 
StorageFileSource::tryGetCountFromCache(const struct stat & file_stat) { - if (!context->getSettingsRef().use_cache_for_count_from_files) + if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return false; auto num_rows_from_cache = tryGetNumRowsFromCache(current_path, file_stat.st_mtime); @@ -1102,7 +1267,7 @@ Chunk StorageFileSource::generate() return {}; auto file_stat = getFileStat(archive, storage->use_table_fd, storage->table_fd, storage->getName()); - if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) continue; archive_reader = createArchiveReader(archive); @@ -1116,7 +1281,7 @@ Chunk StorageFileSource::generate() if (!read_buf) continue; - if (auto progress_callback = context->getFileProgressCallback()) + if (auto progress_callback = getContext()->getFileProgressCallback()) progress_callback(FileProgress(0, tryGetFileSizeFromReadBuffer(*read_buf).value_or(0))); } else @@ -1130,7 +1295,7 @@ Chunk StorageFileSource::generate() return {}; current_archive_stat = getFileStat(archive, storage->use_table_fd, storage->table_fd, storage->getName()); - if (context->getSettingsRef().engine_file_skip_empty_files && current_archive_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && current_archive_stat.st_size == 0) continue; archive_reader = createArchiveReader(archive); @@ -1164,7 +1329,7 @@ Chunk StorageFileSource::generate() continue; read_buf = archive_reader->readFile(std::move(file_enumerator)); - if (auto progress_callback = context->getFileProgressCallback()) + if (auto progress_callback = getContext()->getFileProgressCallback()) progress_callback(FileProgress(0, tryGetFileSizeFromReadBuffer(*read_buf).value_or(0))); } } @@ -1190,16 +1355,16 @@ Chunk StorageFileSource::generate() file_stat = getFileStat(current_path, storage->use_table_fd, storage->table_fd, storage->getName()); current_file_size = file_stat.st_size; - if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) continue; if (need_only_count && tryGetCountFromCache(file_stat)) continue; - read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, context); + read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, getContext()); } - const Settings & settings = context->getSettingsRef(); + const Settings & settings = getContext()->getSettingsRef(); size_t file_num = 0; if (storage->archive_info) @@ -1211,7 +1376,7 @@ Chunk StorageFileSource::generate() const auto max_parsing_threads = std::max(settings.max_threads / file_num, 1UL); input_format = FormatFactory::instance().getInput( - storage->format_name, *read_buf, block_for_format, context, max_block_size, storage->format_settings, + storage->format_name, *read_buf, block_for_format, getContext(), max_block_size, storage->format_settings, max_parsing_threads, std::nullopt, /*is_remote_fs*/ false, CompressionMethod::None, need_only_count); if (key_condition) @@ -1227,7 +1392,7 @@ Chunk StorageFileSource::generate() { builder.addSimpleTransform([&](const Block & header) { - return std::make_shared(header, columns_description, *input_format, context); + return std::make_shared(header, columns_description, *input_format, getContext()); }); } @@ 
-1264,7 +1429,7 @@ Chunk StorageFileSource::generate() if (storage->use_table_fd) finished_generate = true; - if (input_format && storage->format_name != "Distributed" && context->getSettingsRef().use_cache_for_count_from_files) + if (input_format && storage->format_name != "Distributed" && getContext()->getSettingsRef().use_cache_for_count_from_files) addNumRowsToCache(current_path, total_rows_in_file); total_rows_in_file = 0; @@ -1295,14 +1460,14 @@ Chunk StorageFileSource::generate() void StorageFileSource::addNumRowsToCache(const String & path, size_t num_rows) const { - auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, context); - StorageFile::getSchemaCache(context).addNumRows(key, num_rows); + auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, getContext()); + StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } std::optional StorageFileSource::tryGetNumRowsFromCache(const String & path, time_t last_mod_time) const { - auto & schema_cache = StorageFile::getSchemaCache(context); - auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, context); + auto & schema_cache = StorageFile::getSchemaCache(getContext()); + auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, getContext()); auto get_last_mod_time = [&]() -> std::optional { return last_mod_time; @@ -1311,7 +1476,7 @@ std::optional StorageFileSource::tryGetNumRowsFromCache(const String & p return schema_cache.tryGetNumRows(key, get_last_mod_time); } -class ReadFromFile : public SourceStepWithFilter +class ReadFromFile : public SourceStepWithFilter, WithContext { public: std::string getName() const override { return "ReadFromFile"; } @@ -1323,14 +1488,13 @@ public: std::shared_ptr storage_, ReadFromFormatInfo info_, const bool need_only_count_, - ContextPtr context_, + const ContextPtr & context_, size_t max_block_size_, size_t num_streams_) - : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}) + : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}), WithContext(context_) , storage(std::move(storage_)) , info(std::move(info_)) , need_only_count(need_only_count_) - , context(std::move(context_)) , max_block_size(max_block_size_) , max_num_streams(num_streams_) { @@ -1341,7 +1505,6 @@ private: ReadFromFormatInfo info; const bool need_only_count; - ContextPtr context; size_t max_block_size; const size_t max_num_streams; @@ -1422,7 +1585,7 @@ void ReadFromFile::createIterator(const ActionsDAG::Node * predicate) storage->archive_info, predicate, storage->virtual_columns, - context, + getContext(), storage->distributed_processing); } @@ -1444,8 +1607,10 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui Pipes pipes; pipes.reserve(num_streams); + auto ctx = getContext(); + /// Set total number of bytes to process. For progress bar. 
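A large share of this part of the patch is mechanical: classes stop carrying their own ContextPtr member and instead inherit from WithContext and call getContext(), while plain ContextPtr parameters become const ContextPtr &. A minimal standalone illustration of that mixin pattern; the class names are stand-ins, not the ClickHouse definitions.

    #include <memory>
    #include <string>

    struct Context { std::string user_files_path; };
    using ContextPtr = std::shared_ptr<const Context>;

    // Mixin in the spirit of DB::WithContext: owns the context once, exposes a getter.
    class WithContextLike
    {
    public:
        explicit WithContextLike(ContextPtr context_) : context(std::move(context_)) {}
        const ContextPtr & getContext() const { return context; }

    private:
        ContextPtr context;
    };

    // Before: each reader/sink kept its own ContextPtr member.
    // After: it inherits the mixin and calls getContext() where needed.
    class FileSourceLike : public WithContextLike
    {
    public:
        explicit FileSourceLike(const ContextPtr & context_) : WithContextLike(context_) {}
        std::string userFilesPath() const { return getContext()->user_files_path; }
    };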
- auto progress_callback = context->getFileProgressCallback(); + auto progress_callback = ctx->getFileProgressCallback(); if (progress_callback && !storage->archive_info) progress_callback(FileProgress(0, storage->total_bytes_to_read)); @@ -1463,20 +1628,20 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui auto source = std::make_shared( info, storage, - context, + ctx, max_block_size, files_iterator, std::move(read_buffer), need_only_count); - source->setKeyCondition(filter_nodes.nodes, context); + source->setKeyCondition(filter_nodes.nodes, ctx); pipes.emplace_back(std::move(source)); } auto pipe = Pipe::unitePipes(std::move(pipes)); size_t output_ports = pipe.numOutputPorts(); - const bool parallelize_output = context->getSettingsRef().parallelize_output_from_storages; - if (parallelize_output && storage->parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < max_num_streams) + const bool parallelize_output = ctx->getSettingsRef().parallelize_output_from_storages; + if (parallelize_output && storage->parallelizeOutputAfterReading(ctx) && output_ports > 0 && output_ports < max_num_streams) pipe.resize(max_num_streams); if (pipe.empty()) @@ -1489,7 +1654,7 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui } -class StorageFileSink final : public SinkToStorage +class StorageFileSink final : public SinkToStorage, WithContext { public: StorageFileSink( @@ -1502,9 +1667,9 @@ public: const CompressionMethod compression_method_, const std::optional & format_settings_, const String format_name_, - ContextPtr context_, + const ContextPtr & context_, int flags_) - : SinkToStorage(metadata_snapshot_->getSampleBlock()) + : SinkToStorage(metadata_snapshot_->getSampleBlock()), WithContext(context_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) , table_fd(table_fd_) @@ -1514,7 +1679,6 @@ public: , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) - , context(context_) , flags(flags_) { initialize(); @@ -1531,9 +1695,9 @@ public: const CompressionMethod compression_method_, const std::optional & format_settings_, const String format_name_, - ContextPtr context_, + const ContextPtr & context_, int flags_) - : SinkToStorage(metadata_snapshot_->getSampleBlock()) + : SinkToStorage(metadata_snapshot_->getSampleBlock()), WithContext(context_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) , table_fd(table_fd_) @@ -1543,7 +1707,6 @@ public: , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) - , context(context_) , flags(flags_) , lock(std::move(lock_)) { @@ -1567,7 +1730,7 @@ public: /// In case of formats with prefixes if file is not empty we have already written prefix. 
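The append case described in the comment above boils down to: if the target file already contains data, its prefix (for example a CSV header) was written by an earlier insert and must not be repeated. A small self-contained sketch of that rule using plain file streams:

    #include <fstream>
    #include <string>
    #include <vector>

    // Skip the prefix/header when the file already has data in it.
    void appendRows(const std::string & path, const std::vector<std::string> & rows, const std::string & header)
    {
        std::ifstream probe(path, std::ios::binary | std::ios::ate);
        const bool file_has_data = probe.good() && probe.tellg() > 0;

        std::ofstream out(path, std::ios::binary | std::ios::app);
        if (!file_has_data)
            out << header << '\n';
        for (const auto & row : rows)
            out << row << '\n';
    }

    int main()
    {
        appendRows("/tmp/example.csv", {"1,2", "3,4"}, "a,b"); // first call writes the header
        appendRows("/tmp/example.csv", {"5,6"}, "a,b");        // later calls append rows only
    }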
bool do_not_write_prefix = naked_buffer->size(); - const auto & settings = context->getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); write_buf = wrapWriteBufferWithCompressionMethod( std::move(naked_buffer), compression_method, @@ -1575,7 +1738,7 @@ public: static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format_name, - *write_buf, metadata_snapshot->getSampleBlock(), context, format_settings); + *write_buf, metadata_snapshot->getSampleBlock(), getContext(), format_settings); if (do_not_write_prefix) writer->doNotWritePrefix(); @@ -1658,7 +1821,6 @@ private: std::string format_name; std::optional format_settings; - ContextPtr context; int flags; std::unique_lock lock; @@ -2043,7 +2205,7 @@ StorageFile::ArchiveInfo StorageFile::getArchiveInfo( const std::string & path_to_archive, const std::string & file_in_archive, const std::string & user_files_path, - ContextPtr context, + const ContextPtr & context, size_t & total_bytes_to_read ) { diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 2955eb0f1aa..5c7a8da9f53 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -84,7 +84,7 @@ public: static Names getVirtualColumnNames(); - static Strings getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read); + static Strings getPathsList(const String & table_path, const String & user_files_path, const ContextPtr & context, size_t & total_bytes_to_read); /// Check if the format supports reading only some subset of columns. /// Is is useful because such formats could effectively skip unknown columns @@ -112,14 +112,19 @@ public: } }; - ColumnsDescription getTableStructureFromFileDescriptor(ContextPtr context); - static ColumnsDescription getTableStructureFromFile( const String & format, const std::vector & paths, const String & compression_method, const std::optional & format_settings, - ContextPtr context, + const ContextPtr & context, + const std::optional & archive_info = std::nullopt); + + static std::pair getTableStructureAndFormatFromFile( + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, const std::optional & archive_info = std::nullopt); static SchemaCache & getSchemaCache(const ContextPtr & context); @@ -130,7 +135,7 @@ public: const std::string & path_to_archive, const std::string & file_in_archive, const std::string & user_files_path, - ContextPtr context, + const ContextPtr & context, size_t & total_bytes_to_read); bool supportsTrivialCountOptimization() const override { return true; } @@ -141,6 +146,16 @@ protected: friend class ReadFromFile; private: + std::pair getTableStructureAndFormatFromFileDescriptor(std::optional format, const ContextPtr & context); + + static std::pair getTableStructureAndFormatFromFileImpl( + std::optional format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info = std::nullopt); + void setStorageMetadata(CommonArguments args); std::string format_name; @@ -187,10 +202,10 @@ private: bool distributed_processing = false; }; -class StorageFileSource : public SourceWithKeyCondition +class StorageFileSource : public SourceWithKeyCondition, WithContext { public: - class FilesIterator + class FilesIterator : WithContext { public: explicit 
FilesIterator( @@ -198,7 +213,7 @@ public: std::optional archive_info_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context_, + const ContextPtr & context_, bool distributed_processing_ = false); String next(); @@ -227,8 +242,6 @@ private: std::atomic index = 0; bool distributed_processing; - - ContextPtr context; }; using FilesIteratorPtr = std::shared_ptr; @@ -236,7 +249,7 @@ private: StorageFileSource( const ReadFromFormatInfo & info, std::shared_ptr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, FilesIteratorPtr files_iterator_, std::unique_ptr read_buf_, @@ -286,7 +299,6 @@ private: NamesAndTypesList requested_virtual_columns; Block block_for_format; - ContextPtr context; /// TODO Untangle potential issues with context lifetime. UInt64 max_block_size; bool finished_generate = false; diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index 0cc961bb464..0cc18abef5f 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -25,36 +25,39 @@ extern const int LOGICAL_ERROR; } StorageFileCluster::StorageFileCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & filename_, const String & format_name_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageFileCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ConstraintsDescription & constraints_) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageFileCluster (" + table_id_.table_name + ")")) , filename(filename_) , format_name(format_name_) - , compression_method(compression_method_) { StorageInMemoryMetadata storage_metadata; size_t total_bytes_to_read; // its value isn't used as we are not reading files (just listing them). 
But it is required by getPathsList - paths = StorageFile::getPathsList(filename_, context_->getUserFilesPath(), context_, total_bytes_to_read); + paths = StorageFile::getPathsList(filename_, context->getUserFilesPath(), context, total_bytes_to_read); if (columns_.empty()) { - auto columns = StorageFile::getTableStructureFromFile(format_name, - paths, - compression_method, - std::nullopt, - context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context); + else + columns = StorageFile::getTableStructureFromFile(format_name, paths, compression_method, std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context).second; storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -62,13 +65,14 @@ StorageFileCluster::StorageFileCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageFileCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function fileCluster, got '{}'", queryToString(query)); - TableFunctionFileCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionFileCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageFileCluster.h b/src/Storages/StorageFileCluster.h index a6e57c3bb4f..2803c8b6e5b 100644 --- a/src/Storages/StorageFileCluster.h +++ b/src/Storages/StorageFileCluster.h @@ -17,15 +17,14 @@ class StorageFileCluster : public IStorageCluster { public: StorageFileCluster( - ContextPtr context_, + const ContextPtr & context_, const String & cluster_name_, const String & filename_, const String & format_name_, const String & compression_method_, const StorageID & table_id_, const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - bool structure_argument_was_provided_); + const ConstraintsDescription & constraints_); std::string getName() const override { return "FileCluster"; } @@ -38,12 +37,11 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; Strings paths; String filename; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index 
cece70eddfa..64ff224fc10 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -200,7 +200,7 @@ TTLDescription StorageInMemoryMetadata::getRowsTTL() const bool StorageInMemoryMetadata::hasRowsTTL() const { - return table_ttl.rows_ttl.expression != nullptr; + return table_ttl.rows_ttl.expression_ast != nullptr; } TTLDescriptions StorageInMemoryMetadata::getRowsWhereTTLs() const @@ -258,9 +258,8 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( NameSet required_ttl_columns; NameSet updated_ttl_columns; - auto add_dependent_columns = [&updated_columns](const auto & expression, auto & to_set) + auto add_dependent_columns = [&updated_columns](const Names & required_columns, auto & to_set) { - auto required_columns = expression->getRequiredColumns(); for (const auto & dependency : required_columns) { if (updated_columns.contains(dependency)) @@ -276,18 +275,18 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( for (const auto & index : getSecondaryIndices()) { if (has_dependency(index.name, ColumnDependency::SKIP_INDEX)) - add_dependent_columns(index.expression, indices_columns); + add_dependent_columns(index.expression->getRequiredColumns(), indices_columns); } for (const auto & projection : getProjections()) { if (has_dependency(projection.name, ColumnDependency::PROJECTION)) - add_dependent_columns(&projection, projections_columns); + add_dependent_columns(projection.getRequiredColumns(), projections_columns); } auto add_for_rows_ttl = [&](const auto & expression, auto & to_set) { - if (add_dependent_columns(expression, to_set) && include_ttl_target) + if (add_dependent_columns(expression.getNames(), to_set) && include_ttl_target) { /// Filter all columns, if rows TTL expression have to be recalculated. for (const auto & column : getColumns().getAllPhysical()) @@ -296,25 +295,25 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( }; if (hasRowsTTL()) - add_for_rows_ttl(getRowsTTL().expression, required_ttl_columns); + add_for_rows_ttl(getRowsTTL().expression_columns, required_ttl_columns); for (const auto & entry : getRowsWhereTTLs()) - add_for_rows_ttl(entry.expression, required_ttl_columns); + add_for_rows_ttl(entry.expression_columns, required_ttl_columns); for (const auto & entry : getGroupByTTLs()) - add_for_rows_ttl(entry.expression, required_ttl_columns); + add_for_rows_ttl(entry.expression_columns, required_ttl_columns); for (const auto & entry : getRecompressionTTLs()) - add_dependent_columns(entry.expression, required_ttl_columns); + add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns); for (const auto & [name, entry] : getColumnTTLs()) { - if (add_dependent_columns(entry.expression, required_ttl_columns) && include_ttl_target) + if (add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns) && include_ttl_target) updated_ttl_columns.insert(name); } for (const auto & entry : getMoveTTLs()) - add_dependent_columns(entry.expression, required_ttl_columns); + add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns); //TODO what about rows_where_ttl and group_by_ttl ?? 
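The refactored add_dependent_columns now receives a plain list of required column names (taken from expression_columns for the TTLs and from getRequiredColumns() for indices and projections) instead of an expression object, but the collection rule stays the same. Roughly, as a standalone function:

    #include <set>
    #include <string>
    #include <vector>

    using Names = std::vector<std::string>;
    using NameSet = std::set<std::string>;

    // A TTL / index / projection depends on the update if any of its required
    // columns is being updated; in that case all its required columns are recorded.
    bool addDependentColumns(const Names & required_columns, const NameSet & updated_columns, NameSet & to_set)
    {
        for (const auto & column : required_columns)
        {
            if (updated_columns.count(column))
            {
                to_set.insert(required_columns.begin(), required_columns.end());
                return true;
            }
        }
        return false;
    }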
diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index b9e082c0b22..b122674466f 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -500,7 +500,7 @@ protected: Chunk chunk; if (!joinDispatch(join->kind, join->strictness, join->data->maps.front(), [&](auto kind, auto strictness, auto & map) { chunk = createChunk(map); })) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: unknown JOIN strictness"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown JOIN strictness"); return chunk; } diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index c7b0a9d0644..99192fe1e50 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -241,7 +241,7 @@ void LogSource::readData(const NameAndTypePair & name_and_type, ColumnPtr & colu const auto & data_file_it = storage.data_files_by_names.find(data_file_name); if (data_file_it == storage.data_files_by_names.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no information about file {} in StorageLog", data_file_name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No information about file {} in StorageLog", data_file_name); const auto & data_file = *data_file_it->second; size_t offset = stream_for_prefix ? 0 : offsets[data_file.index]; @@ -448,7 +448,7 @@ ISerialization::OutputStreamGetter LogSink::createStreamGetter(const NameAndType String data_file_name = ISerialization::getFileNameForStream(name_and_type, path); auto it = streams.find(data_file_name); if (it == streams.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: stream was not created when writing data in LogSink"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Stream was not created when writing data in LogSink"); Stream & stream = it->second; if (stream.written) @@ -473,7 +473,7 @@ void LogSink::writeData(const NameAndTypePair & name_and_type, const IColumn & c { const auto & data_file_it = storage.data_files_by_names.find(data_file_name); if (data_file_it == storage.data_files_by_names.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no information about file {} in StorageLog", data_file_name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No information about file {} in StorageLog", data_file_name); const auto & data_file = *data_file_it->second; const auto & columns = metadata_snapshot->getColumns(); diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 0011e3c57a2..02d81eda59a 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -79,11 +79,11 @@ public: for (const auto & elem : block) compressed_block.insert({ elem.column->compress(), elem.type, elem.name }); - new_blocks.emplace_back(compressed_block); + new_blocks.push_back(std::move(compressed_block)); } else { - new_blocks.emplace_back(block); + new_blocks.push_back(std::move(block)); } } @@ -472,9 +472,21 @@ void StorageMemory::restoreDataImpl(const BackupPtr & backup, const String & dat while (auto block = block_in.read()) { - new_bytes += block.bytes(); - new_rows += block.rows(); - new_blocks.push_back(std::move(block)); + if (compress) + { + Block compressed_block; + for (const auto & elem : block) + compressed_block.insert({ elem.column->compress(), elem.type, elem.name }); + + new_blocks.push_back(std::move(compressed_block)); + } + else + { + new_blocks.push_back(std::move(block)); + } + + new_bytes += new_blocks.back().bytes(); + new_rows += new_blocks.back().rows(); } } diff --git a/src/Storages/StorageMerge.cpp 
b/src/Storages/StorageMerge.cpp index 8d75382c91c..79d7b83cada 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1,6 +1,13 @@ #include #include +#include #include +#include +#include +#include +#include +#include +#include #include #include #include @@ -25,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +60,8 @@ #include #include #include +#include +#include namespace { @@ -78,13 +89,13 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; extern const int NOT_IMPLEMENTED; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int SAMPLING_NOT_SUPPORTED; extern const int ALTER_OF_COLUMN_IS_FORBIDDEN; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int LOGICAL_ERROR; } StorageMerge::DatabaseNameOrRegexp::DatabaseNameOrRegexp( @@ -379,7 +390,14 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu const auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); const auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, context); - auto modified_query_info = getModifiedQueryInfo(query_info, context, table, nested_storage_snaphsot); + Names column_names_as_aliases; + Aliases aliases; + + Names real_column_names = column_names; + if (child_plan.row_policy_data_opt) + child_plan.row_policy_data_opt->extendNames(real_column_names); + + auto modified_query_info = getModifiedQueryInfo(context, table, nested_storage_snaphsot, real_column_names, column_names_as_aliases, aliases); auto source_pipeline = createSources( child_plan.plan, @@ -512,7 +530,6 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, context); - auto modified_query_info = getModifiedQueryInfo(query_info, context, table, nested_storage_snaphsot); Names column_names_as_aliases; Names real_column_names = column_names; @@ -528,6 +545,8 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ row_policy_data_opt->extendNames(real_column_names); } + auto modified_query_info = getModifiedQueryInfo(context, table, nested_storage_snaphsot, real_column_names, column_names_as_aliases, aliases); + if (!context->getSettingsRef().allow_experimental_analyzer) { auto storage_columns = storage_metadata_snapshot->getColumns(); @@ -580,6 +599,10 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ column_names_as_aliases.push_back(ExpressionActions::getSmallestColumn(storage_metadata_snapshot->getColumns().getAllPhysical()).name); } } + else + { + + } res.back().plan = createPlanForTable( nested_storage_snaphsot, @@ -596,10 +619,210 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ return res; } -SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & query_info, - const ContextPtr & modified_context, +namespace +{ + +class ApplyAliasColumnExpressionsVisitor : public InDepthQueryTreeVisitor +{ +public: + explicit ApplyAliasColumnExpressionsVisitor(QueryTreeNodePtr replacement_table_expression_) + : replacement_table_expression(replacement_table_expression_) + {} + + void visitImpl(QueryTreeNodePtr & node) + { + if (auto * column = node->as(); column != nullptr) + { + if (column->hasExpression()) + { + node = column->getExpressionOrThrow(); + 
node->setAlias(column->getColumnName()); + } + else + column->setColumnSource(replacement_table_expression); + } + } +private: + QueryTreeNodePtr replacement_table_expression; +}; + +bool hasUnknownColumn(const QueryTreeNodePtr & node, QueryTreeNodePtr replacement_table_expression) +{ + QueryTreeNodes stack = { node }; + while (!stack.empty()) + { + auto current = stack.back(); + stack.pop_back(); + + switch (current->getNodeType()) + { + case QueryTreeNodeType::CONSTANT: + break; + case QueryTreeNodeType::COLUMN: + { + auto * column_node = current->as(); + auto source = column_node->getColumnSourceOrNull(); + if (source != replacement_table_expression) + return true; + break; + } + default: + { + for (const auto & child : current->getChildren()) + { + if (child) + stack.push_back(child); + } + } + } + } + return false; +} + +void replaceFilterExpression( + QueryTreeNodePtr & expression, + const QueryTreeNodePtr & replacement_table_expression, + const ContextPtr & context) +{ + auto * function = expression->as(); + if (!function) + return; + + if (function->getFunctionName() != "and") + { + if (hasUnknownColumn(expression, replacement_table_expression)) + expression = nullptr; + return; + } + + QueryTreeNodes conjunctions; + QueryTreeNodes processing{ expression }; + + while (!processing.empty()) + { + auto node = std::move(processing.back()); + processing.pop_back(); + + if (auto * function_node = node->as()) + { + if (function_node->getFunctionName() == "and") + std::copy( + function_node->getArguments().begin(), + function_node->getArguments().end(), + std::back_inserter(processing) + ); + else + conjunctions.push_back(node); + } + else + { + conjunctions.push_back(node); + } + } + + std::swap(processing, conjunctions); + + for (const auto & node : processing) + { + if (!hasUnknownColumn(node, replacement_table_expression)) + conjunctions.push_back(node); + } + + if (conjunctions.empty()) + { + expression = {}; + return; + } + if (conjunctions.size() == 1) + { + expression = conjunctions[0]; + return; + } + + function->getArguments().getNodes() = std::move(conjunctions); + + const auto function_impl = FunctionFactory::instance().get("and", context); + function->resolveAsFunction(function_impl->build(function->getArgumentColumns())); +} + +QueryTreeNodePtr replaceTableExpressionAndRemoveJoin( + QueryTreeNodePtr query, + QueryTreeNodePtr original_table_expression, + QueryTreeNodePtr replacement_table_expression, + const ContextPtr & context, + const Names & required_column_names) +{ + auto * query_node = query->as(); + auto join_tree_type = query_node->getJoinTree()->getNodeType(); + auto modified_query = query_node->cloneAndReplace(original_table_expression, replacement_table_expression); + + // For the case when join tree is just a table or a table function we don't need to do anything more. + if (join_tree_type == QueryTreeNodeType::TABLE || join_tree_type == QueryTreeNodeType::TABLE_FUNCTION) + return modified_query; + + // JOIN needs to be removed because StorageMerge should produce not joined data. + // GROUP BY should be removed as well. + + auto * modified_query_node = modified_query->as(); + + // Remove the JOIN statement. As a result query will have a form like: SELECT * FROM
... + modified_query = modified_query->cloneAndReplace(modified_query_node->getJoinTree(), replacement_table_expression); + modified_query_node = modified_query->as(); + + query_node = modified_query->as(); + + // For backward compatibility we need to leave all filters related to this table. + // It may lead to some incorrect result. + if (query_node->hasPrewhere()) + replaceFilterExpression(query_node->getPrewhere(), replacement_table_expression, context); + if (query_node->hasWhere()) + replaceFilterExpression(query_node->getWhere(), replacement_table_expression, context); + + query_node->getGroupBy().getNodes().clear(); + query_node->getHaving() = {}; + query_node->getOrderBy().getNodes().clear(); + + auto & projection = modified_query_node->getProjection().getNodes(); + projection.clear(); + NamesAndTypes projection_columns; + + // Select only required columns from the table, because projection list may contain: + // 1. aggregate functions + // 2. expressions referencing other tables of JOIN + for (auto const & column_name : required_column_names) + { + QueryTreeNodePtr fake_node = std::make_shared(Identifier{column_name}); + + QueryAnalysisPass query_analysis_pass(original_table_expression); + query_analysis_pass.run(fake_node, context); + + auto * resolved_column = fake_node->as(); + if (!resolved_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Required column '{}' is not resolved", column_name); + auto fake_column = resolved_column->getColumn(); + + // Identifier is resolved to ColumnNode, but we need to get rid of ALIAS columns + // and also fix references to source expression (now column is referencing original table expression). + ApplyAliasColumnExpressionsVisitor visitor(replacement_table_expression); + visitor.visit(fake_node); + + projection.push_back(fake_node); + projection_columns.push_back(fake_column); + } + + query_node->resolveProjectionColumns(std::move(projection_columns)); + + return modified_query; +} + +} + +SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, - const StorageSnapshotPtr & storage_snapshot) + const StorageSnapshotPtr & storage_snapshot, + Names required_column_names, + Names & column_names_as_aliases, + Aliases & aliases) const { const auto & [database_name, storage, storage_lock, table_name] = storage_with_lock_and_name; const StorageID current_storage_id = storage->getStorageID(); @@ -612,8 +835,7 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & quer if (query_info.table_expression_modifiers) replacement_table_expression->setTableExpressionModifiers(*query_info.table_expression_modifiers); - modified_query_info.query_tree = modified_query_info.query_tree->cloneAndReplace(modified_query_info.table_expression, - replacement_table_expression); + modified_query_info.query_tree = replaceTableExpressionAndRemoveJoin(modified_query_info.query_tree, modified_query_info.table_expression, replacement_table_expression, modified_context, required_column_names); modified_query_info.table_expression = replacement_table_expression; modified_query_info.planner_context->getOrCreateTableExpressionData(replacement_table_expression); @@ -624,10 +846,65 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & quer std::unordered_map column_name_to_node; if (!storage_snapshot->tryGetColumn(get_column_options, "_table")) - column_name_to_node.emplace("_table", std::make_shared(current_storage_id.table_name)); + { + 
auto table_name_node = std::make_shared(current_storage_id.table_name); + table_name_node->setAlias("_table"); + column_name_to_node.emplace("_table", table_name_node); + } if (!storage_snapshot->tryGetColumn(get_column_options, "_database")) - column_name_to_node.emplace("_database", std::make_shared(current_storage_id.database_name)); + { + auto database_name_node = std::make_shared(current_storage_id.database_name); + database_name_node->setAlias("_database"); + column_name_to_node.emplace("_database", database_name_node); + } + + auto storage_columns = storage_snapshot->metadata->getColumns(); + + bool with_aliases = /* common_processed_stage == QueryProcessingStage::FetchColumns && */ !storage_columns.getAliases().empty(); + if (with_aliases) + { + auto filter_actions_dag = std::make_shared(); + for (const auto & column : required_column_names) + { + const auto column_default = storage_columns.getDefault(column); + bool is_alias = column_default && column_default->kind == ColumnDefaultKind::Alias; + + QueryTreeNodePtr column_node; + + // Replace all references to ALIAS columns in the query by expressions. + if (is_alias) + { + QueryTreeNodePtr fake_node = std::make_shared(Identifier{column}); + + QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); + query_analysis_pass.run(fake_node, modified_context); + + auto * resolved_column = fake_node->as(); + + column_node = fake_node; + ApplyAliasColumnExpressionsVisitor visitor(replacement_table_expression); + visitor.visit(column_node); + + if (!resolved_column || !resolved_column->getExpression()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Alias column is not resolved"); + + column_name_to_node.emplace(column, column_node); + aliases.push_back({ .name = column, .type = resolved_column->getResultType(), .expression = column_node->toAST() }); + } + else + { + column_node = std::make_shared(NameAndTypePair{column, storage_columns.getColumn(get_column_options, column).type }, modified_query_info.table_expression); + } + + + PlannerActionsVisitor actions_visitor(modified_query_info.planner_context, false /*use_column_identifier_as_action_node_name*/); + actions_visitor.visit(filter_actions_dag, column_node); + } + column_names_as_aliases = filter_actions_dag->getRequiredColumnsNames(); + if (column_names_as_aliases.empty()) + column_names_as_aliases.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name); + } if (!column_name_to_node.empty()) { @@ -756,7 +1033,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( /// Subordinary tables could have different but convertible types, like numeric types of different width. /// We must return streams with structure equals to structure of Merge table. 
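The filter handling introduced above (hasUnknownColumn / replaceFilterExpression) is the subtle part: the top-level AND is split into conjuncts, every conjunct that references a column outside the replacement table expression is dropped, and the remainder is glued back together, or the whole filter is discarded. Below is a standalone model of that pruning over a toy expression tree; unlike the real code it does not flatten nested ANDs, and the node type is a stand-in for the analyzer's query-tree nodes.

    #include <memory>
    #include <string>
    #include <vector>

    struct Node
    {
        std::string name;                   // "and", a column name, or a literal
        bool is_and = false;
        bool refers_to_other_table = false; // stand-in for hasUnknownColumn() on a column node
        std::vector<std::shared_ptr<Node>> children;
    };

    using NodePtr = std::shared_ptr<Node>;

    bool hasUnknownColumn(const NodePtr & node)
    {
        if (node->refers_to_other_table)
            return true;
        for (const auto & child : node->children)
            if (hasUnknownColumn(child))
                return true;
        return false;
    }

    // Keep only the conjuncts that mention the replacement table alone.
    NodePtr pruneFilter(const NodePtr & filter)
    {
        if (!filter->is_and)
            return hasUnknownColumn(filter) ? nullptr : filter;

        std::vector<NodePtr> kept;
        for (const auto & conjunct : filter->children)
            if (!hasUnknownColumn(conjunct))
                kept.push_back(conjunct);

        if (kept.empty())
            return nullptr;
        if (kept.size() == 1)
            return kept.front();

        auto result = std::make_shared<Node>();
        result->name = "and";
        result->is_and = true;
        result->children = std::move(kept);
        return result;
    }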
- convertAndFilterSourceStream(header, storage_snapshot->metadata, aliases, row_policy_data_opt, modified_context, *builder, processed_stage); + convertAndFilterSourceStream(header, modified_query_info, storage_snapshot, aliases, row_policy_data_opt, modified_context, *builder, processed_stage); } return builder; @@ -1107,38 +1384,73 @@ void StorageMerge::alter( void ReadFromMerge::convertAndFilterSourceStream( const Block & header, - const StorageMetadataPtr & metadata_snapshot, + SelectQueryInfo & modified_query_info, + const StorageSnapshotPtr & snapshot, const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, - ContextPtr local_context, + ContextMutablePtr local_context, QueryPipelineBuilder & builder, QueryProcessingStage::Enum processed_stage) { Block before_block_header = builder.getHeader(); - auto storage_sample_block = metadata_snapshot->getSampleBlock(); + auto storage_sample_block = snapshot->metadata->getSampleBlock(); auto pipe_columns = builder.getHeader().getNamesAndTypesList(); - for (const auto & alias : aliases) + if (local_context->getSettingsRef().allow_experimental_analyzer) { - pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); - ASTPtr expr = alias.expression; - auto syntax_result = TreeRewriter(local_context).analyze(expr, pipe_columns); - auto expression_analyzer = ExpressionAnalyzer{alias.expression, syntax_result, local_context}; - - auto dag = std::make_shared(pipe_columns); - auto actions_dag = expression_analyzer.getActionsDAG(true, false); - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) + for (const auto & alias : aliases) { - return std::make_shared(stream_header, actions); - }); + pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); + + auto actions_dag = std::make_shared(pipe_columns); + + QueryTreeNodePtr query_tree = buildQueryTree(alias.expression, local_context); + query_tree->setAlias(alias.name); + + QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); + query_analysis_pass.run(query_tree, local_context); + + PlannerActionsVisitor actions_visitor(modified_query_info.planner_context, false /*use_column_identifier_as_action_node_name*/); + const auto & nodes = actions_visitor.visit(actions_dag, query_tree); + + if (nodes.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected to have 1 output but got {}", nodes.size()); + + actions_dag->addOrReplaceInOutputs(actions_dag->addAlias(*nodes.front(), alias.name)); + + auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); + + builder.addSimpleTransform([&](const Block & stream_header) + { + return std::make_shared(stream_header, actions); + }); + } + } + else + { + for (const auto & alias : aliases) + { + pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); + ASTPtr expr = alias.expression; + auto syntax_result = TreeRewriter(local_context).analyze(expr, pipe_columns); + auto expression_analyzer = ExpressionAnalyzer{alias.expression, syntax_result, local_context}; + + auto dag = std::make_shared(pipe_columns); + auto actions_dag = expression_analyzer.getActionsDAG(true, false); + auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); + + builder.addSimpleTransform([&](const Block & stream_header) + { + return 
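A reduced model of what the per-source transform added here does for ALIAS columns: evaluate an expression over the existing columns of every block and append the result under the alias name. Plain containers stand in for Block and ExpressionTransform, so this is only an illustration of the data flow, not the pipeline API.

```cpp
// Every block coming out of the underlying table gets an extra column
// computed from existing ones and named after the alias.
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using Column = std::vector<long>;
using Block = std::map<std::string, Column>;

// "Expression" here is just a row-wise function over the block's columns.
using RowExpression = std::function<long(const Block &, size_t row)>;

void addAliasColumn(Block & block, const std::string & alias, const RowExpression & expr)
{
    size_t rows = block.begin()->second.size();
    Column result(rows);
    for (size_t row = 0; row < rows; ++row)
        result[row] = expr(block, row);
    block[alias] = std::move(result);
}

int main()
{
    Block block{{"a", {1, 2, 3}}, {"b", {10, 20, 30}}};
    // ALIAS column `sum_ab` defined as a + b.
    addAliasColumn(block, "sum_ab", [](const Block & b, size_t row) { return b.at("a")[row] + b.at("b")[row]; });
    for (long v : block["sum_ab"])
        std::cout << v << ' ';            // 11 22 33
    std::cout << '\n';
}
```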
std::make_shared(stream_header, actions); + }); + } } ActionsDAG::MatchColumnsMode convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Name; - if (local_context->getSettingsRef().allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns) + if (local_context->getSettingsRef().allow_experimental_analyzer + && (processed_stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Position; if (row_policy_data_opt) diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 703e5db9c50..f5b6c3a7ca9 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -189,6 +189,13 @@ private: using Aliases = std::vector; + SelectQueryInfo getModifiedQueryInfo(const ContextPtr & modified_context, + const StorageWithLockAndName & storage_with_lock_and_name, + const StorageSnapshotPtr & storage_snapshot, + Names required_column_names, + Names & column_names_as_aliases, + Aliases & aliases) const; + /// An object of this helper class is created /// when processing a Merge table data source (subordinary table) /// that has row policies @@ -261,17 +268,13 @@ private: ContextMutablePtr modified_context, bool concat_streams = false) const; - static SelectQueryInfo getModifiedQueryInfo(const SelectQueryInfo & query_info, - const ContextPtr & modified_context, - const StorageWithLockAndName & storage_with_lock_and_name, - const StorageSnapshotPtr & storage_snapshot); - static void convertAndFilterSourceStream( const Block & header, - const StorageMetadataPtr & metadata_snapshot, + SelectQueryInfo & modified_query_info, + const StorageSnapshotPtr & snapshot, const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, - ContextPtr context, + ContextMutablePtr context, QueryPipelineBuilder & builder, QueryProcessingStage::Enum processed_stage); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 7e6c5ca3924..678535da732 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -65,6 +65,7 @@ namespace ErrorCodes extern const int NO_SUCH_DATA_PART; extern const int ABORTED; extern const int SUPPORT_IS_DISABLED; + extern const int TABLE_IS_READ_ONLY; } namespace ActionLocks @@ -220,11 +221,11 @@ void StorageMergeTree::read( if (local_context->getSettingsRef().allow_experimental_analyzer) { QueryTreeNodePtr modified_query_tree = query_info.query_tree->clone(); - rewriteJoinToGlobalJoin(modified_query_tree); - modified_query_tree = buildQueryTreeForShard(query_info, modified_query_tree); + rewriteJoinToGlobalJoin(modified_query_tree, local_context); + modified_query_tree = buildQueryTreeForShard(query_info.planner_context, modified_query_tree); header = InterpreterSelectQueryAnalyzer::getSampleBlock( modified_query_tree, local_context, SelectQueryOptions(processed_stage).analyze()); - modified_query_ast = queryNodeToSelectQuery(modified_query_tree); + modified_query_ast = queryNodeToDistributedSelectQuery(modified_query_tree); } else { @@ -251,7 +252,9 @@ void StorageMergeTree::read( } else { - const bool enable_parallel_reading = local_context->canUseParallelReplicasOnFollower() && local_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree; + const bool enable_parallel_reading = local_context->canUseParallelReplicasOnFollower() + && local_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree + && 
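The convert_actions_match_columns_mode switch above decides how a source block is matched against the expected header. The toy functions below (hypothetical names, not the ActionsDAG interface) contrast the two strategies: by-name reorders source columns to fit the header, by-position trusts the order and takes only the names from the header.

```cpp
#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

using Row = std::vector<long>;

struct SourceBlock
{
    std::vector<std::string> names;
    std::vector<Row> columns;   // one Row per column
};

SourceBlock convertByName(const SourceBlock & src, const std::vector<std::string> & header)
{
    SourceBlock out{header, {}};
    for (const auto & wanted : header)
    {
        auto it = std::find(src.names.begin(), src.names.end(), wanted);
        assert(it != src.names.end() && "by-name matching requires the column to exist");
        out.columns.push_back(src.columns[static_cast<size_t>(it - src.names.begin())]);
    }
    return out;
}

SourceBlock convertByPosition(const SourceBlock & src, const std::vector<std::string> & header)
{
    assert(src.columns.size() == header.size() && "by-position matching requires equal arity");
    return SourceBlock{header, src.columns};  // data kept as-is, names taken from the header
}

int main()
{
    SourceBlock src{{"b", "a"}, {{10, 20}, {1, 2}}};
    std::vector<std::string> header{"a", "b"};

    auto by_name = convertByName(src, header);
    auto by_pos = convertByPosition(src, header);
    std::cout << by_name.columns[0][0] << ' ' << by_pos.columns[0][0] << '\n';  // 1 10
}
```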
(!local_context->getSettingsRef().allow_experimental_analyzer || query_info.analyzer_can_use_parallel_replicas_on_follower); if (auto plan = reader.read( column_names, @@ -294,6 +297,8 @@ std::optional StorageMergeTree::totalBytesUncompressed(const Settings &) SinkToStoragePtr StorageMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) { + assertNotReadonly(); + const auto & settings = local_context->getSettingsRef(); return std::make_shared( *this, metadata_snapshot, settings.max_partitions_per_insert_block, local_context); @@ -319,9 +324,6 @@ void StorageMergeTree::checkTableCanBeDropped(ContextPtr query_context) const void StorageMergeTree::drop() { shutdown(true); - /// In case there is read-only disk we cannot allow to call dropAllData(), but dropping tables is allowed. - if (isStaticStorage()) - return; dropAllData(); } @@ -330,6 +332,8 @@ void StorageMergeTree::alter( ContextPtr local_context, AlterLockHolder & table_lock_holder) { + assertNotReadonly(); + if (local_context->getCurrentTransaction() && local_context->getSettingsRef().throw_on_unsupported_query_inside_transaction) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ALTER METADATA is not supported inside transactions"); @@ -620,6 +624,8 @@ void StorageMergeTree::setMutationCSN(const String & mutation_id, CSN csn) void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context) { + assertNotReadonly(); + delayMutationOrThrowIfNeeded(nullptr, query_context); /// Validate partition IDs (if any) before starting mutation @@ -683,7 +689,7 @@ std::optional StorageMergeTree::getIncompleteMutationsS const auto & mutation_entry = current_mutation_it->second; - auto txn = tryGetTransactionForMutation(mutation_entry, log); + auto txn = tryGetTransactionForMutation(mutation_entry, log.load()); /// There's no way a transaction may finish before a mutation that was started by the transaction. /// But sometimes we need to check status of an unrelated mutation, in this case we don't care about transactions. 
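The assertNotReadonly() calls added throughout this file guard every mutating entry point against static (read-only) storage. A schematic version of the pattern, with simplified names and an ordinary exception type:

```cpp
// If the underlying disk is static, the operation fails up front instead of
// partway through.
#include <iostream>
#include <stdexcept>
#include <string>

class TableIsReadOnly : public std::runtime_error
{
public:
    using std::runtime_error::runtime_error;
};

class ToyMergeTree
{
public:
    explicit ToyMergeTree(bool static_storage_) : static_storage(static_storage_) {}

    void write(const std::string & data)
    {
        assertNotReadonly();
        std::cout << "inserted: " << data << '\n';
    }

    void mutate(const std::string & command)
    {
        assertNotReadonly();
        std::cout << "mutated: " << command << '\n';
    }

private:
    void assertNotReadonly() const
    {
        if (static_storage)
            throw TableIsReadOnly("Table is in readonly mode due to static storage");
    }

    bool static_storage;
};

int main()
{
    ToyMergeTree writable(false), readonly(true);
    writable.write("row");
    try { readonly.write("row"); }
    catch (const TableIsReadOnly & e) { std::cout << "rejected: " << e.what() << '\n'; }
}
```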
assert(txn || mutation_entry.tid.isPrehistoric() || from_another_mutation); @@ -810,6 +816,8 @@ std::vector StorageMergeTree::getMutationsStatus() cons CancellationCode StorageMergeTree::killMutation(const String & mutation_id) { + assertNotReadonly(); + LOG_TRACE(log, "Killing mutation {}", mutation_id); UInt64 mutation_version = MergeTreeMutationEntry::tryParseFileName(mutation_id); if (!mutation_version) @@ -829,7 +837,7 @@ CancellationCode StorageMergeTree::killMutation(const String & mutation_id) if (!to_kill) return CancellationCode::NotFound; - if (auto txn = tryGetTransactionForMutation(*to_kill, log)) + if (auto txn = tryGetTransactionForMutation(*to_kill, log.load())) { LOG_TRACE(log, "Cancelling transaction {} which had started mutation {}", to_kill->tid, mutation_id); TransactionLog::instance().rollbackTransaction(txn); @@ -1222,7 +1230,7 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMutate( if (!part->version.isVisible(first_mutation_tid.start_csn, first_mutation_tid)) continue; - txn = tryGetTransactionForMutation(mutations_begin_it->second, log); + txn = tryGetTransactionForMutation(mutations_begin_it->second, log.load()); if (!txn) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find transaction {} that has started mutation {} " "that is going to be applied to part {}", @@ -1520,6 +1528,8 @@ bool StorageMergeTree::optimize( bool cleanup, ContextPtr local_context) { + assertNotReadonly(); + if (deduplicate) { if (deduplicate_by_columns.empty()) @@ -1765,6 +1775,8 @@ void StorageMergeTree::renameAndCommitEmptyParts(MutableDataPartsVector & new_pa void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr query_context, TableExclusiveLockHolder &) { + assertNotReadonly(); + { /// Asks to complete merges and does not allow them to start. /// This protects against "revival" of data for a removed partition after completion of merge. @@ -2039,6 +2051,8 @@ PartitionCommandsResultInfo StorageMergeTree::attachPartition( void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr local_context) { + assertNotReadonly(); + auto lock1 = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto lock2 = source_table->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto merges_blocker = stopMergesAndWait(); @@ -2284,11 +2298,12 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { /// If the checksums file is not present, calculate the checksums and write them to disk. static constexpr auto checksums_path = "checksums.txt"; + bool noop; if (part->isStoredOnDisk() && !part->getDataPartStorage().exists(checksums_path)) { try { - auto calculated_checksums = checkDataPart(part, false); + auto calculated_checksums = checkDataPart(part, false, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */true); calculated_checksums.checkEqual(part->checksums, true); auto & part_mutable = const_cast(*part); @@ -2309,7 +2324,7 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { try { - checkDataPart(part, true); + checkDataPart(part, true, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */true); return CheckResult(part->name, true, ""); } catch (...) 
@@ -2437,6 +2452,12 @@ PreparedSetsCachePtr StorageMergeTree::getPreparedSetsCache(Int64 mutation_id) return cache; } +void StorageMergeTree::assertNotReadonly() const +{ + if (isStaticStorage()) + throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to static storage"); +} + void StorageMergeTree::fillNewPartName(MutableDataPartPtr & part, DataPartsLock &) { part->info.min_block = part->info.max_block = increment.get(); diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 51bf6aa42e7..359fa1d262d 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -273,6 +273,8 @@ private: PreparedSetsCachePtr getPreparedSetsCache(Int64 mutation_id); + void assertNotReadonly() const; + friend class MergeTreeSink; friend class MergeTreeData; friend class MergePlainMergeTreeTask; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 695b78a10db..fb4e9b4ad87 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1,6 +1,5 @@ #include -#include #include #include @@ -30,14 +29,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include @@ -63,21 +60,16 @@ #include #include #include -#include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include #include #include @@ -105,9 +97,6 @@ #include #include -#include - -#include #include #include @@ -320,7 +309,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( attach, [this] (const std::string & name) { enqueuePartForCheck(name); }) , zookeeper_name(zkutil::extractZooKeeperName(zookeeper_path_)) - , zookeeper_path(zkutil::extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) + , zookeeper_path(zkutil::extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log.load())) , replica_name(replica_name_) , replica_path(fs::path(zookeeper_path) / "replicas" / replica_name_) , reader(*this) @@ -556,18 +545,6 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( } -String StorageReplicatedMergeTree::getDefaultZooKeeperPath(const Poco::Util::AbstractConfiguration & config) -{ - return config.getString("default_replica_path", "/clickhouse/tables/{uuid}/{shard}"); -} - - -String StorageReplicatedMergeTree::getDefaultReplicaName(const Poco::Util::AbstractConfiguration & config) -{ - return config.getString("default_replica_name", "{replica}"); -} - - bool StorageReplicatedMergeTree::checkFixedGranularityInZookeeper() { auto zookeeper = getZooKeeper(); @@ -812,7 +789,7 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr else { auto metadata_drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper); - if (!removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, metadata_drop_lock, log)) + if (!removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, metadata_drop_lock, log.load())) { /// Someone is recursively removing table right now, we cannot create new table until old one is removed continue; @@ -1130,7 +1107,7 @@ void StorageReplicatedMergeTree::drop() if (lost_part_count > 0) LOG_INFO(log, "Dropping table with non-zero lost_part_count equal to {}", lost_part_count); } - dropReplica(zookeeper, zookeeper_path, replica_name, log, getSettings(), &has_metadata_in_zookeeper); + dropReplica(zookeeper, zookeeper_path, replica_name, 
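An aside on the recurring `log` → `log.load()` edits in the replicated-storage code below: they read as if the logger member became an atomically swappable holder, so helpers that expect a plain LoggerPtr receive a snapshot via load(). The holder sketched here is hypothetical (mutex-based) and only meant to convey the shape of that interface.

```cpp
#include <iostream>
#include <memory>
#include <mutex>
#include <string>

struct Logger
{
    std::string name;
    void info(const std::string & msg) const { std::cout << '[' << name << "] " << msg << '\n'; }
};

using LoggerPtr = std::shared_ptr<Logger>;

class AtomicLoggerHolder
{
public:
    explicit AtomicLoggerHolder(LoggerPtr initial) : logger(std::move(initial)) {}

    /// Call sites that need a plain LoggerPtr (e.g. helpers taking LoggerPtr) use load().
    LoggerPtr load() const
    {
        std::lock_guard<std::mutex> lock(mutex);
        return logger;
    }

    /// The logger can be swapped (e.g. after a table rename) without racing readers.
    void store(LoggerPtr new_logger)
    {
        std::lock_guard<std::mutex> lock(mutex);
        logger = std::move(new_logger);
    }

private:
    mutable std::mutex mutex;
    LoggerPtr logger;
};

void helperTakingPlainLogger(const LoggerPtr & log) { log->info("called with a plain LoggerPtr"); }

int main()
{
    AtomicLoggerHolder log(std::make_shared<Logger>("StorageReplicatedMergeTree (test)"));
    helperTakingPlainLogger(log.load());
    log.store(std::make_shared<Logger>("StorageReplicatedMergeTree (renamed)"));
    helperTakingPlainLogger(log.load());
}
```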
log.load(), getSettings(), &has_metadata_in_zookeeper); } } @@ -2066,7 +2043,7 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry, bool need_to_che if (entry.quorum) { if (entry.type != LogEntry::GET_PART) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: log entry with quorum but type is not GET_PART"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Log entry with quorum but type is not GET_PART"); LOG_DEBUG(log, "No active replica has part {} which needs to be written with quorum. Will try to mark that quorum as failed.", entry.new_part_name); @@ -2129,7 +2106,7 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry, bool need_to_che auto part_info = MergeTreePartInfo::fromPartName(entry.new_part_name, format_version); if (part_info.min_block != part_info.max_block) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: log entry with quorum for part covering more than one block number"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Log entry with quorum for part covering more than one block number"); ops.emplace_back(zkutil::makeCreateRequest( fs::path(zookeeper_path) / "quorum" / "failed_parts" / entry.new_part_name, @@ -4183,7 +4160,7 @@ void StorageReplicatedMergeTree::startBeingLeader() return; } - zkutil::checkNoOldLeaders(log, *zookeeper, fs::path(zookeeper_path) / "leader_election"); + zkutil::checkNoOldLeaders(log.load(), *zookeeper, fs::path(zookeeper_path) / "leader_election"); LOG_INFO(log, "Became leader"); is_leader = true; @@ -4277,7 +4254,7 @@ void StorageReplicatedMergeTree::waitForUniquePartsToBeFetchedByOtherReplicas(St auto zookeeper = getZooKeeperIfTableShutDown(); - auto unique_parts_set = findReplicaUniqueParts(replica_name, zookeeper_path, format_version, zookeeper, log); + auto unique_parts_set = findReplicaUniqueParts(replica_name, zookeeper_path, format_version, zookeeper, log.load()); if (unique_parts_set.empty()) { LOG_INFO(log, "Will not wait for unique parts to be fetched because we don't have any unique parts"); @@ -4661,6 +4638,9 @@ bool StorageReplicatedMergeTree::fetchPart( zkutil::ZooKeeper::Ptr zookeeper_, bool try_fetch_shared) { + if (isStaticStorage()) + throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to static storage"); + auto zookeeper = zookeeper_ ? 
zookeeper_ : getZooKeeper(); const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); @@ -5388,12 +5368,12 @@ void StorageReplicatedMergeTree::readParallelReplicasImpl( if (local_context->getSettingsRef().allow_experimental_analyzer) { QueryTreeNodePtr modified_query_tree = query_info.query_tree->clone(); - rewriteJoinToGlobalJoin(modified_query_tree); - modified_query_tree = buildQueryTreeForShard(query_info, modified_query_tree); + rewriteJoinToGlobalJoin(modified_query_tree, local_context); + modified_query_tree = buildQueryTreeForShard(query_info.planner_context, modified_query_tree); header = InterpreterSelectQueryAnalyzer::getSampleBlock( modified_query_tree, local_context, SelectQueryOptions(processed_stage).analyze()); - modified_query_ast = queryNodeToSelectQuery(modified_query_tree); + modified_query_ast = queryNodeToDistributedSelectQuery(modified_query_tree); } else { @@ -5427,11 +5407,14 @@ void StorageReplicatedMergeTree::readLocalImpl( const size_t max_block_size, const size_t num_streams) { + const bool enable_parallel_reading = local_context->canUseParallelReplicasOnFollower() + && (!local_context->getSettingsRef().allow_experimental_analyzer || query_info.analyzer_can_use_parallel_replicas_on_follower); + auto plan = reader.read( column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, /* max_block_numbers_to_read= */ nullptr, - /* enable_parallel_reading= */ local_context->canUseParallelReplicasOnFollower()); + enable_parallel_reading); if (plan) query_plan = std::move(*plan); @@ -5498,6 +5481,8 @@ void StorageReplicatedMergeTree::assertNotReadonly() const { if (is_readonly) throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode (replica path: {})", replica_path); + if (isStaticStorage()) + throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to static storage"); } @@ -5506,6 +5491,8 @@ SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, con if (!initialization_done) throw Exception(ErrorCodes::NOT_INITIALIZED, "Table is not initialized yet"); + if (isStaticStorage()) + throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to static storage"); /// If table is read-only because it doesn't have metadata in zk yet, then it's not possible to insert into it /// Without this check, we'll write data parts on disk, and afterwards will remove them since we'll fail to commit them into zk /// In case of remote storage like s3, it'll generate unnecessary PUT requests @@ -6809,7 +6796,7 @@ bool StorageReplicatedMergeTree::tryWaitForReplicaToProcessLogEntry( } else { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: unexpected name of log node: {}", entry.znode_name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected name of log node: {}", entry.znode_name); } /** Second - find the corresponding entry in the queue of the specified replica. @@ -7185,7 +7172,7 @@ void StorageReplicatedMergeTree::fetchPartition( } if (best_replica.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: cannot choose best replica."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot choose best replica."); LOG_INFO(log, "Found {} replicas, {} of them are active. 
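Both read paths now gate parallel reading on the analyzer's own verdict. Restated as a small standalone predicate (field names are illustrative, taken from the settings and flags used above):

```cpp
#include <iostream>

struct ReadContext
{
    bool can_use_parallel_replicas_on_follower = false;
    bool allow_experimental_analyzer = false;
    bool analyzer_can_use_parallel_replicas_on_follower = false;
    bool parallel_replicas_for_non_replicated_merge_tree = true; // only consulted for plain MergeTree
};

bool enableParallelReading(const ReadContext & ctx, bool replicated)
{
    if (!ctx.can_use_parallel_replicas_on_follower)
        return false;
    if (!replicated && !ctx.parallel_replicas_for_non_replicated_merge_tree)
        return false;
    // With the analyzer enabled, the query plan itself must confirm that
    // reading on a follower replica is safe for this particular query.
    return !ctx.allow_experimental_analyzer || ctx.analyzer_can_use_parallel_replicas_on_follower;
}

int main()
{
    ReadContext ctx;
    ctx.can_use_parallel_replicas_on_follower = true;
    ctx.allow_experimental_analyzer = true;
    ctx.analyzer_can_use_parallel_replicas_on_follower = false;
    std::cout << std::boolalpha << enableParallelReading(ctx, /*replicated=*/true) << '\n'; // false
}
```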
Selected {} to fetch from.", replicas.size(), active_replicas.size(), best_replica); @@ -8881,12 +8868,11 @@ IStorage::DataValidationTasksPtr StorageReplicatedMergeTree::getCheckTaskList( std::optional StorageReplicatedMergeTree::checkDataNext(DataValidationTasksPtr & check_task_list) { - if (auto part = assert_cast(check_task_list.get())->next()) { try { - return CheckResult(part_check_thread.checkPartAndFix(part->name)); + return part_check_thread.checkPartAndFix(part->name, /* recheck_after */nullptr, /* throw_on_broken_projection */true); } catch (const Exception & ex) { @@ -9350,7 +9336,7 @@ StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, co return unlockSharedDataByID( part.getUniqueId(), shared_id, part.info, replica_name, - part.getDataPartStorage().getDiskType(), zookeeper, *getSettings(), log, zookeeper_path, format_version); + part.getDataPartStorage().getDiskType(), zookeeper, *getSettings(), log.load(), zookeeper_path, format_version); } namespace @@ -10303,7 +10289,7 @@ void StorageReplicatedMergeTree::backupData( bool exists = false; Strings mutation_ids; { - ZooKeeperRetriesControl retries_ctl("getMutations", log, zookeeper_retries_info, nullptr); + ZooKeeperRetriesControl retries_ctl("getMutations", log.load(), zookeeper_retries_info, nullptr); retries_ctl.retryLoop([&]() { if (!zookeeper || zookeeper->expired()) @@ -10322,7 +10308,7 @@ void StorageReplicatedMergeTree::backupData( bool mutation_id_exists = false; String mutation; - ZooKeeperRetriesControl retries_ctl("getMutation", log, zookeeper_retries_info, nullptr); + ZooKeeperRetriesControl retries_ctl("getMutation", log.load(), zookeeper_retries_info, nullptr); retries_ctl.retryLoop([&]() { if (!zookeeper || zookeeper->expired()) diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index c682b1ec88d..79d6d1dce3d 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -143,9 +143,6 @@ public: ~StorageReplicatedMergeTree() override; - static String getDefaultZooKeeperPath(const Poco::Util::AbstractConfiguration & config); - static String getDefaultReplicaName(const Poco::Util::AbstractConfiguration & config); - std::string getName() const override { return "Replicated" + merging_params.getModeName() + "MergeTree"; } bool supportsParallelInsert() const override { return true; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index aec967cc95c..2d8ef3df1c8 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -129,6 +129,7 @@ namespace ErrorCodes extern const int UNEXPECTED_EXPRESSION; extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int NOT_IMPLEMENTED; extern const int CANNOT_COMPILE_REGEXP; extern const int FILE_DOESNT_EXIST; @@ -244,7 +245,7 @@ public: fillInternalBufferAssumeLocked(); } - KeyWithInfoPtr next() + KeyWithInfoPtr next(size_t) { std::lock_guard lock(mutex); return nextAssumeLocked(); @@ -428,7 +429,7 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( const S3::URI & globbed_uri_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context, + const ContextPtr & context, KeysWithInfo * read_keys_, const S3Settings::RequestSettings & request_settings_, std::function file_progress_callback_) @@ -436,9 +437,9 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( { } 
-StorageS3Source::KeyWithInfoPtr StorageS3Source::DisclosedGlobIterator::next() +StorageS3Source::KeyWithInfoPtr StorageS3Source::DisclosedGlobIterator::next(size_t idx) /// NOLINT { - return pimpl->next(); + return pimpl->next(idx); } size_t StorageS3Source::DisclosedGlobIterator::estimatedKeysCount() @@ -471,7 +472,7 @@ public: } } - KeyWithInfoPtr next() + KeyWithInfoPtr next(size_t) { size_t current_index = index.fetch_add(1, std::memory_order_relaxed); if (current_index >= keys.size()) @@ -516,9 +517,9 @@ StorageS3Source::KeysIterator::KeysIterator( { } -StorageS3Source::KeyWithInfoPtr StorageS3Source::KeysIterator::next() +StorageS3Source::KeyWithInfoPtr StorageS3Source::KeysIterator::next(size_t idx) /// NOLINT { - return pimpl->next(); + return pimpl->next(idx); } size_t StorageS3Source::KeysIterator::estimatedKeysCount() @@ -545,7 +546,7 @@ StorageS3Source::ReadTaskIterator::ReadTaskIterator( buffer.emplace_back(std::make_shared(key_future.get(), std::nullopt)); } -StorageS3Source::KeyWithInfoPtr StorageS3Source::ReadTaskIterator::next() +StorageS3Source::KeyWithInfoPtr StorageS3Source::ReadTaskIterator::next(size_t) /// NOLINT { size_t current_index = index.fetch_add(1, std::memory_order_relaxed); if (current_index >= buffer.size()) @@ -563,7 +564,7 @@ StorageS3Source::StorageS3Source( const ReadFromFormatInfo & info, const String & format_, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, const S3Settings::RequestSettings & request_settings_, @@ -599,23 +600,23 @@ StorageS3Source::StorageS3Source( { } -void StorageS3Source::lazyInitialize() +void StorageS3Source::lazyInitialize(size_t idx) { if (initialized) return; - reader = createReader(); + reader = createReader(idx); if (reader) - reader_future = createReaderAsync(); + reader_future = createReaderAsync(idx); initialized = true; } -StorageS3Source::ReaderHolder StorageS3Source::createReader() +StorageS3Source::ReaderHolder StorageS3Source::createReader(size_t idx) { KeyWithInfoPtr key_with_info; do { - key_with_info = (*file_iterator)(); + key_with_info = file_iterator->next(idx); if (!key_with_info || key_with_info->key.empty()) return {}; @@ -689,9 +690,9 @@ StorageS3Source::ReaderHolder StorageS3Source::createReader() return ReaderHolder{key_with_info, bucket, std::move(read_buf), std::move(source), std::move(pipeline), std::move(current_reader)}; } -std::future StorageS3Source::createReaderAsync() +std::future StorageS3Source::createReaderAsync(size_t idx) { - return create_reader_scheduler([this] { return createReader(); }, Priority{}); + return create_reader_scheduler([=, this] { return createReader(idx); }, Priority{}); } std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & key, size_t object_size) @@ -841,7 +842,7 @@ public: StorageS3Sink( const String & format, const Block & sample_block_, - ContextPtr context, + const ContextPtr & context, std::optional format_settings_, const CompressionMethod compression_method, const StorageS3::Configuration & configuration_, @@ -949,23 +950,22 @@ private: }; -class PartitionedStorageS3Sink : public PartitionedSink +class PartitionedStorageS3Sink : public PartitionedSink, WithContext { public: PartitionedStorageS3Sink( const ASTPtr & partition_by, const String & format_, const Block & sample_block_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, const CompressionMethod compression_method_, const StorageS3::Configuration & configuration_, const String & 
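The iterator interface widening in this hunk, next() gaining a size_t parameter, can be summarized with a standard-C++ sketch: the stream index is threaded through even where the implementation ignores it, so createReader()/createReaderAsync() can forward their slot number.

```cpp
#include <atomic>
#include <cstddef>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

class IKeyIterator
{
public:
    virtual ~IKeyIterator() = default;
    virtual std::optional<std::string> next(size_t idx = 0) = 0;
};

class KeysIterator : public IKeyIterator
{
public:
    explicit KeysIterator(std::vector<std::string> keys_) : keys(std::move(keys_)) {}

    std::optional<std::string> next(size_t /*idx*/) override
    {
        // The stream index is accepted for interface compatibility but the
        // keys are still handed out in order, atomically.
        size_t current = index.fetch_add(1, std::memory_order_relaxed);
        if (current >= keys.size())
            return std::nullopt;
        return keys[current];
    }

private:
    std::vector<std::string> keys;
    std::atomic<size_t> index{0};
};

int main()
{
    KeysIterator it({"a.csv", "b.csv"});
    for (size_t stream = 0; stream < 3; ++stream)
        if (auto key = it.next(stream))
            std::cout << "stream " << stream << " -> " << *key << '\n';
}
```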
bucket_, const String & key_) - : PartitionedSink(partition_by, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_), WithContext(context_) , format(format_) , sample_block(sample_block_) - , context(context_) , compression_method(compression_method_) , configuration(configuration_) , bucket(bucket_) @@ -985,7 +985,7 @@ public: return std::make_shared( format, sample_block, - context, + getContext(), format_settings, compression_method, configuration, @@ -997,7 +997,6 @@ public: private: const String format; const Block sample_block; - const ContextPtr context; const CompressionMethod compression_method; const StorageS3::Configuration configuration; const String bucket; @@ -1033,7 +1032,7 @@ private: StorageS3::StorageS3( const Configuration & configuration_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -1050,18 +1049,27 @@ StorageS3::StorageS3( { updateConfiguration(context_); // NOLINT(clang-analyzer-optin.cplusplus.VirtualCall) - FormatFactory::instance().checkFormatName(configuration.format); + if (configuration.format != "auto") + FormatFactory::instance().checkFormatName(configuration.format); context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.url.uri); context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration.headers_from_ast); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromDataImpl(configuration, format_settings, context_); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = getTableStructureAndFormatFromData(configuration, format_settings, context_); + else + columns = getTableStructureFromData(configuration, format_settings, context_); + storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = getTableStructureAndFormatFromData(configuration, format_settings, context_).second; + /// We don't allow special columns in S3 storage. 
if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine S3 doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -1350,14 +1358,14 @@ void StorageS3::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, LOG_WARNING(getLogger("StorageS3"), "Failed to delete {}, error: {}", error.GetKey(), error.GetMessage()); } -StorageS3::Configuration StorageS3::updateConfigurationAndGetCopy(ContextPtr local_context) +StorageS3::Configuration StorageS3::updateConfigurationAndGetCopy(const ContextPtr & local_context) { std::lock_guard lock(configuration_update_mutex); configuration.update(local_context); return configuration; } -void StorageS3::updateConfiguration(ContextPtr local_context) +void StorageS3::updateConfiguration(const ContextPtr & local_context) { std::lock_guard lock(configuration_update_mutex); configuration.update(local_context); @@ -1375,9 +1383,9 @@ const StorageS3::Configuration & StorageS3::getConfiguration() return configuration; } -bool StorageS3::Configuration::update(ContextPtr context) +bool StorageS3::Configuration::update(const ContextPtr & context) { - auto s3_settings = context->getStorageS3Settings().getSettings(url.uri.toString()); + auto s3_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName()); request_settings = s3_settings.request_settings; request_settings.updateFromSettings(context->getSettings()); @@ -1390,7 +1398,7 @@ bool StorageS3::Configuration::update(ContextPtr context) return true; } -void StorageS3::Configuration::connect(ContextPtr context) +void StorageS3::Configuration::connect(const ContextPtr & context) { const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); const Settings & local_settings = context->getSettingsRef(); @@ -1462,7 +1470,7 @@ void StorageS3::processNamedCollectionResult(StorageS3::Configuration & configur configuration.request_settings = S3Settings::RequestSettings(collection); } -StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) +StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file) { StorageS3::Configuration configuration; @@ -1601,7 +1609,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, Context configuration.keys = {configuration.url.key}; if (configuration.format == "auto" && get_format_from_file) - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.url.key, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.url.key).value_or("auto"); return configuration; } @@ -1609,9 +1617,17 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, Context ColumnsDescription StorageS3::getTableStructureFromData( const StorageS3::Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx) + const ContextPtr & ctx) { - return getTableStructureFromDataImpl(configuration, format_settings, ctx); + return getTableStructureAndFormatFromDataImpl(configuration.format, configuration, format_settings, ctx).first; +} + +std::pair StorageS3::getTableStructureAndFormatFromData( + const StorageS3::Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, configuration, format_settings, ctx); } namespace @@ -1623,24 
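The new "auto" handling boils down to: try the file extension first, otherwise keep the format unknown and detect it from the data later. A compact model of that first step follows; the extension table is made up for the example and is not the FormatFactory registry.

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <string>

std::optional<std::string> tryGetFormatFromFileName(const std::string & key)
{
    static const std::map<std::string, std::string> by_extension = {
        {".csv", "CSV"}, {".tsv", "TSV"}, {".parquet", "Parquet"}, {".json", "JSONEachRow"}};

    auto dot = key.find_last_of('.');
    if (dot == std::string::npos)
        return std::nullopt;
    auto it = by_extension.find(key.substr(dot));
    return it == by_extension.end() ? std::nullopt : std::optional<std::string>(it->second);
}

int main()
{
    const std::string keys[] = {"data/part-0001.parquet", "exports/latest"};
    for (const auto & key : keys)
        std::cout << key << " -> " << tryGetFormatFromFileName(key).value_or("auto") << '\n';
}
```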
+1639,43 @@ namespace std::shared_ptr file_iterator_, const StorageS3Source::KeysWithInfo & read_keys_, const StorageS3::Configuration & configuration_, + std::optional format_, const std::optional & format_settings_, const ContextPtr & context_) : WithContext(context_) , file_iterator(file_iterator_) , read_keys(read_keys_) , configuration(configuration_) + , format(std::move(format_)) , format_settings(format_settings_) , prev_read_keys_size(read_keys_.size()) { } - std::pair, std::optional> next() override + Data next() override { - /// For default mode check cached columns for currently read keys on first iteration. - if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (first) { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; + /// If format is unknown we iterate through all currently read keys on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & key_with_info : read_keys) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(key_with_info->key)) + { + format = format_from_file_name; + break; + } + } + } + + /// For default mode check cached columns for currently read keys on first iteration. + if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns, format}; + } } while (true) @@ -1650,22 +1685,48 @@ namespace if (!current_key_with_info || current_key_with_info->key.empty()) { if (first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in S3 or all files are empty. You must specify table structure manually", - configuration.format); + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path " + "in S3 or all files are empty. You can specify table structure manually", + *format); - return {nullptr, std::nullopt}; + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "in S3 or all files are empty. You can specify the format manually"); + } + + return {nullptr, std::nullopt, format}; } - /// S3 file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. - if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size) + /// S3 file iterator could get new keys after new iteration + if (read_keys.size() > prev_read_keys_size) { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + /// If format is unknown we can try to determine it by new file names. + if (!format) + { + for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->key)) + { + format = format_from_file_name; + break; + } + } + } + + /// Check new files in schema cache if schema inference mode is default. 
+ if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + if (columns_from_cache) + return {nullptr, columns_from_cache, format}; + } + prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache}; } if (getContext()->getSettingsRef().s3_skip_empty_files && current_key_with_info->info && current_key_with_info->info->size == 0) @@ -1678,7 +1739,7 @@ namespace if (auto columns_from_cache = tryGetColumnsFromCache(keys.begin(), keys.end())) { first = false; - return {nullptr, columns_from_cache}; + return {nullptr, columns_from_cache, format}; } } @@ -1687,7 +1748,7 @@ namespace if (!getContext()->getSettingsRef().s3_skip_empty_files || !impl->eof()) { first = false; - return {wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max), std::nullopt}; + return {wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max), std::nullopt, format}; } } } @@ -1698,7 +1759,7 @@ namespace return; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -1709,7 +1770,7 @@ namespace return; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addColumns(cache_key, columns); } @@ -1723,10 +1784,15 @@ namespace Strings sources; sources.reserve(read_keys.size()); std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket / elem->key; }); - auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(sources, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { if (current_key_with_info) @@ -1734,15 +1800,26 @@ namespace return ""; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_key_with_info); + int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); + auto impl = std::make_unique(configuration.client, configuration.url.bucket, current_key_with_info->key, configuration.url.version_id, configuration.request_settings, getContext()->getReadSettings()); + return wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max); + } + private: std::optional 
tryGetColumnsFromCache( const StorageS3::KeysWithInfo::const_iterator & begin, const StorageS3::KeysWithInfo::const_iterator & end) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_s3) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_s3) return std::nullopt; - auto & schema_cache = StorageS3::getSchemaCache(getContext()); + auto & schema_cache = StorageS3::getSchemaCache(context); for (auto it = begin; it < end; ++it) { auto get_last_mod_time = [&] @@ -1773,10 +1850,29 @@ namespace String path = fs::path(configuration.url.bucket) / (*it)->key; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / path; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + + if (format) + { + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(source, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -1785,6 +1881,7 @@ namespace std::shared_ptr file_iterator; const StorageS3Source::KeysWithInfo & read_keys; const StorageS3::Configuration & configuration; + std::optional format; const std::optional & format_settings; StorageS3Source::KeyWithInfoPtr current_key_with_info; size_t prev_read_keys_size; @@ -1793,17 +1890,20 @@ namespace } -ColumnsDescription StorageS3::getTableStructureFromDataImpl( +std::pair StorageS3::getTableStructureAndFormatFromDataImpl( + std::optional format, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx) + const ContextPtr & ctx) { KeysWithInfo read_keys; auto file_iterator = createFileIterator(configuration, false, ctx, {}, {}, &read_keys); - ReadBufferIterator read_buffer_iterator(file_iterator, read_keys, configuration, format_settings, ctx); - return readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); + ReadBufferIterator read_buffer_iterator(file_iterator, read_keys, configuration, format, format_settings, ctx); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, ctx); } void registerStorageS3Impl(const String & name, StorageFactory & factory) diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 8d020c5e9a2..587145cd1a7 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -61,7 +61,7 @@ public: { public: virtual ~IIterator() = default; - virtual KeyWithInfoPtr next() = 0; + virtual KeyWithInfoPtr next(size_t idx = 0) = 0; 
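The cache lookup above has two regimes: with a known format there is exactly one key to try, while with an unknown format every registered input format is probed and the first hit also fixes the format. A simplified restatement with toy types:

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>

using Columns = std::vector<std::string>;            // stand-in for ColumnsDescription
using SchemaCache = std::map<std::pair<std::string, std::string>, Columns>;

std::optional<Columns> probeCache(
    const SchemaCache & cache,
    const std::string & source,
    std::optional<std::string> & format,               // filled in on a hit if it was unknown
    const std::vector<std::string> & all_input_formats)
{
    if (format)
    {
        auto it = cache.find({source, *format});
        return it == cache.end() ? std::nullopt : std::optional<Columns>(it->second);
    }

    for (const auto & candidate : all_input_formats)
    {
        auto it = cache.find({source, candidate});
        if (it != cache.end())
        {
            format = candidate;                         // the format is the same for all files
            return it->second;
        }
    }
    return std::nullopt;
}

int main()
{
    SchemaCache cache{{{"bucket/key1", "Parquet"}, {"id UInt64", "name String"}}};
    std::optional<std::string> format;                  // unknown, i.e. "auto"
    auto columns = probeCache(cache, "bucket/key1", format, {"CSV", "Parquet", "JSONEachRow"});
    std::cout << (columns ? format.value() : std::string("miss")) << '\n';  // Parquet
}
```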
/// NOLINT /// Estimates how many streams we need to process all files. /// If keys count >= max_threads_count, the returned number may not represent the actual number of the keys. @@ -80,12 +80,12 @@ public: const S3::URI & globbed_uri_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context, + const ContextPtr & context, KeysWithInfo * read_keys_ = nullptr, const S3Settings::RequestSettings & request_settings_ = {}, std::function progress_callback_ = {}); - KeyWithInfoPtr next() override; + KeyWithInfoPtr next(size_t idx = 0) override; /// NOLINT size_t estimatedKeysCount() override; private: @@ -106,7 +106,7 @@ public: KeysWithInfo * read_keys = nullptr, std::function progress_callback_ = {}); - KeyWithInfoPtr next() override; + KeyWithInfoPtr next(size_t idx = 0) override; /// NOLINT size_t estimatedKeysCount() override; private: @@ -120,7 +120,7 @@ public: public: explicit ReadTaskIterator(const ReadTaskCallback & callback_, size_t max_threads_count); - KeyWithInfoPtr next() override; + KeyWithInfoPtr next(size_t idx = 0) override; /// NOLINT size_t estimatedKeysCount() override; private: @@ -134,7 +134,7 @@ public: const ReadFromFormatInfo & info, const String & format, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, const S3Settings::RequestSettings & request_settings_, @@ -253,11 +253,11 @@ private: /// Notice: we should initialize reader and future_reader lazily in generate to make sure key_condition /// is set before createReader is invoked for key_condition is read in createReader. - void lazyInitialize(); + void lazyInitialize(size_t idx = 0); /// Recreate ReadBuffer and Pipeline for each file. - ReaderHolder createReader(); - std::future createReaderAsync(); + ReaderHolder createReader(size_t idx = 0); + std::future createReaderAsync(size_t idx = 0); std::unique_ptr createS3ReadBuffer(const String & key, size_t object_size); std::unique_ptr createAsyncS3ReadBuffer(const String & key, const ReadSettings & read_settings, size_t object_size); @@ -280,9 +280,9 @@ public: String getPath() const { return url.key; } - bool update(ContextPtr context); + bool update(const ContextPtr & context); - void connect(ContextPtr context); + void connect(const ContextPtr & context); bool withGlobs() const { return url.key.find_first_of("*?{") != std::string::npos; } @@ -308,7 +308,7 @@ public: StorageS3( const Configuration & configuration_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -345,21 +345,26 @@ public: static SchemaCache & getSchemaCache(const ContextPtr & ctx); - static StorageS3::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); + static StorageS3::Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file = true); static ColumnsDescription getTableStructureFromData( const StorageS3::Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + const StorageS3::Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); using KeysWithInfo = StorageS3Source::KeysWithInfo; bool supportsTrivialCountOptimization() const override { return true; } protected: - virtual Configuration 
updateConfigurationAndGetCopy(ContextPtr local_context); + virtual Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context); - virtual void updateConfiguration(ContextPtr local_context); + virtual void updateConfiguration(const ContextPtr & local_context); void useConfiguration(const Configuration & new_configuration); @@ -380,10 +385,11 @@ private: std::optional format_settings; ASTPtr partition_by; - static ColumnsDescription getTableStructureFromDataImpl( + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx); + const ContextPtr & ctx); bool supportsSubcolumns() const override { return true; } diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp index 25c2b42b766..0ea224f6ee9 100644 --- a/src/Storages/StorageS3Cluster.cpp +++ b/src/Storages/StorageS3Cluster.cpp @@ -38,25 +38,34 @@ StorageS3Cluster::StorageS3Cluster( const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageS3Cluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ContextPtr & context) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageS3Cluster (" + table_id_.table_name + ")")) , s3_configuration{configuration_} { - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.url.uri); - context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration_.headers_from_ast); + context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.url.uri); + context->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration_.headers_from_ast); StorageInMemoryMetadata storage_metadata; - updateConfigurationIfChanged(context_); + updateConfigurationIfChanged(context); if (columns_.empty()) { + ColumnsDescription columns; /// `format_settings` is set to std::nullopt, because StorageS3Cluster is used only as table function - auto columns = StorageS3::getTableStructureFromDataImpl(s3_configuration, /*format_settings=*/std::nullopt, context_); + if (s3_configuration.format == "auto") + std::tie(columns, s3_configuration.format) = StorageS3::getTableStructureAndFormatFromData(s3_configuration, /*format_settings=*/std::nullopt, context); + else + columns = StorageS3::getTableStructureFromData(s3_configuration, /*format_settings=*/std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (s3_configuration.format == "auto") + s3_configuration.format = StorageS3::getTableStructureAndFormatFromData(s3_configuration, /*format_settings=*/std::nullopt, context).second; + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -64,13 +73,17 @@ StorageS3Cluster::StorageS3Cluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageS3Cluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageS3Cluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw 
Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function s3Cluster, got '{}'", queryToString(query)); - TableFunctionS3Cluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionS3Cluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, + storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), + s3_configuration.format, + context); } void StorageS3Cluster::updateConfigurationIfChanged(ContextPtr local_context) diff --git a/src/Storages/StorageS3Cluster.h b/src/Storages/StorageS3Cluster.h index c526f14834a..ac25c506337 100644 --- a/src/Storages/StorageS3Cluster.h +++ b/src/Storages/StorageS3Cluster.h @@ -27,8 +27,7 @@ public: const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); + const ContextPtr & context_); std::string getName() const override { return "S3Cluster"; } @@ -46,7 +45,7 @@ protected: private: void updateBeforeRead(const ContextPtr & context) override { updateConfigurationIfChanged(context); } - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; StorageS3::Configuration s3_configuration; NamesAndTypesList virtual_columns; diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index b0c1160429a..2a0d15a2bab 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -293,7 +293,7 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U } } -S3Settings StorageS3Settings::getSettings(const String & endpoint) const +S3Settings StorageS3Settings::getSettings(const String & endpoint, const String & user) const { std::lock_guard lock(mutex); auto next_prefix_setting = s3_settings.upper_bound(endpoint); @@ -302,7 +302,8 @@ S3Settings StorageS3Settings::getSettings(const String & endpoint) const for (auto possible_prefix_setting = next_prefix_setting; possible_prefix_setting != s3_settings.begin();) { std::advance(possible_prefix_setting, -1); - if (boost::algorithm::starts_with(endpoint, possible_prefix_setting->first)) + const auto & [endpoint_prefix, settings] = *possible_prefix_setting; + if (boost::algorithm::starts_with(endpoint, endpoint_prefix) && settings.auth_settings.canBeUsedByUser(user)) return possible_prefix_setting->second; } diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 0e152bb2d31..21b6264717e 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -112,7 +112,7 @@ class StorageS3Settings public: void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings); - S3Settings getSettings(const String & endpoint) const; + S3Settings getSettings(const String & endpoint, const String & user) const; private: mutable std::mutex mutex; diff --git a/src/Storages/StorageSQLite.cpp b/src/Storages/StorageSQLite.cpp index 85c5e16a1bf..30cca409dc8 100644 --- a/src/Storages/StorageSQLite.cpp +++ b/src/Storages/StorageSQLite.cpp @@ -19,6 +19,20 @@ #include #include +namespace +{ + +using namespace DB; + +ContextPtr makeSQLiteWriteContext(ContextPtr context) +{ + auto write_context = Context::createCopy(context); + 
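The getSettings() change makes the longest-prefix endpoint scan also respect per-user restrictions. The standalone version below mirrors the upper_bound-and-walk-back loop, with an invented auth structure in place of the real settings type:

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <set>
#include <string>

struct S3Settings
{
    std::set<std::string> allowed_users;   // empty means "everyone"
    bool canBeUsedByUser(const std::string & user) const
    {
        return allowed_users.empty() || allowed_users.count(user) > 0;
    }
};

std::optional<S3Settings> getSettings(
    const std::map<std::string, S3Settings> & by_endpoint_prefix,
    const std::string & endpoint,
    const std::string & user)
{
    auto next_prefix = by_endpoint_prefix.upper_bound(endpoint);
    for (auto it = next_prefix; it != by_endpoint_prefix.begin();)
    {
        --it;
        const auto & [prefix, settings] = *it;
        // starts_with check plus the new per-user filter.
        if (endpoint.rfind(prefix, 0) == 0 && settings.canBeUsedByUser(user))
            return settings;
    }
    return std::nullopt;
}

int main()
{
    std::map<std::string, S3Settings> config{
        {"https://bucket.s3.amazonaws.com/", S3Settings{{"alice"}}},
        {"https://bucket.s3.amazonaws.com/private/", S3Settings{{"bob"}}}};

    auto s = getSettings(config, "https://bucket.s3.amazonaws.com/private/key", "alice");
    std::cout << std::boolalpha << s.has_value() << '\n';  // true: falls back to the shorter prefix
}
```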
write_context->setSetting("output_format_values_escape_quote_with_quote", Field(true)); + return write_context; +} + +} + namespace DB { @@ -43,6 +57,7 @@ StorageSQLite::StorageSQLite( , database_path(database_path_) , sqlite_db(sqlite_db_) , log(getLogger("StorageSQLite (" + table_id_.table_name + ")")) + , write_context(makeSQLiteWriteContext(getContext())) { StorageInMemoryMetadata storage_metadata; @@ -144,7 +159,7 @@ public: sqlbuf << ") VALUES "; - auto writer = FormatFactory::instance().getOutputFormat("Values", sqlbuf, metadata_snapshot->getSampleBlock(), storage.getContext()); + auto writer = FormatFactory::instance().getOutputFormat("Values", sqlbuf, metadata_snapshot->getSampleBlock(), storage.write_context); writer->write(block); sqlbuf << ";"; diff --git a/src/Storages/StorageSQLite.h b/src/Storages/StorageSQLite.h index baacdfb4899..ed673123fe0 100644 --- a/src/Storages/StorageSQLite.h +++ b/src/Storages/StorageSQLite.h @@ -47,10 +47,13 @@ public: const String & table); private: + friend class SQLiteSink; /// for write_context + String remote_table_name; String database_path; SQLitePtr sqlite_db; LoggerPtr log; + ContextPtr write_context; }; } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 433f4ed7700..608e44c3cd0 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include #include @@ -101,7 +101,7 @@ static ConnectionTimeouts getHTTPTimeouts(ContextPtr context) IStorageURLBase::IStorageURLBase( const String & uri_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -123,16 +123,26 @@ IStorageURLBase::IStorageURLBase( , partition_by(partition_by_) , distributed_processing(distributed_processing_) { - FormatFactory::instance().checkFormatName(format_name); + if (format_name != "auto") + FormatFactory::instance().checkFormatName(format_name); + StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromData(uri, compression_method, headers, format_settings, context_); + else + columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + storage_metadata.setColumns(columns); } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromData(uri, compression_method, headers, format_settings, context_).second; + /// We don't allow special columns in URL storage. 
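The dedicated write context exists because SQLite's SQL dialect escapes a single quote inside a string literal by doubling it rather than with a backslash; the setting flipped above switches the Values writer to that convention. A minimal escaping helper in the same spirit:

```cpp
#include <iostream>
#include <string>

std::string quoteForSQLite(const std::string & value)
{
    std::string out = "'";
    for (char c : value)
    {
        out += c;
        if (c == '\'')
            out += '\'';     // '' instead of \'
    }
    out += '\'';
    return out;
}

int main()
{
    std::cout << quoteForSQLite("it's fine") << '\n';  // 'it''s fine'
}
```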
if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine URL doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -257,7 +267,7 @@ StorageURLSource::StorageURLSource( const String & format_, const std::optional & format_settings_, String name_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, @@ -525,7 +535,7 @@ StorageURLSink::StorageURLSink( const String & format, const std::optional & format_settings, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const ConnectionTimeouts & timeouts, const CompressionMethod compression_method, const HTTPHeaderEntries & headers, @@ -668,7 +678,7 @@ std::vector> IStorageURLBase::getReadURIPara const Names & /*column_names*/, const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { @@ -679,7 +689,7 @@ std::function IStorageURLBase::getReadPOSTDataCallback( const Names & /*column_names*/, const ColumnsDescription & /* columns_description */, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { @@ -693,28 +703,48 @@ namespace public: ReadBufferIterator( const std::vector & urls_to_check_, - const String & format_, + std::optional format_, const CompressionMethod & compression_method_, const HTTPHeaderEntries & headers_, const std::optional & format_settings_, const ContextPtr & context_) - : WithContext(context_), format(format_), compression_method(compression_method_), headers(headers_), format_settings(format_settings_) + : WithContext(context_), format(std::move(format_)), compression_method(compression_method_), headers(headers_), format_settings(format_settings_) { url_options_to_check.reserve(urls_to_check_.size()); for (const auto & url : urls_to_check_) url_options_to_check.push_back(getFailoverOptions(url, getContext()->getSettingsRef().glob_expansion_max_elements)); } - std::pair, std::optional> next() override + Data next() override { bool is_first = (current_index == 0); - /// For default mode check cached columns for all urls on first iteration. - if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (is_first) { - for (const auto & options : url_options_to_check) + /// If format is unknown we iterate through all url options on first iteration and + /// try to determine format by file name. + if (!format) { - if (auto cached_columns = tryGetColumnsFromCache(options)) - return {nullptr, cached_columns}; + for (const auto & options : url_options_to_check) + { + for (const auto & url : options) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(url)) + { + format = format_from_file_name; + break; + } + } + } + } + + /// For default mode check cached columns for all urls on first iteration. 
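+ /// If the format is still unknown at this point, tryGetColumnsFromCache() below also
+ /// probes the schema cache with a key for every registered input format and, on a hit,
+ /// remembers the format that produced the cached columns.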
+ if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + for (const auto & options : url_options_to_check) + { + if (auto cached_columns = tryGetColumnsFromCache(options)) + return {nullptr, cached_columns, format}; + } } } @@ -724,20 +754,30 @@ namespace if (current_index == url_options_to_check.size()) { if (is_first) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. " + "You can specify table structure manually", + *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", - format); - return {nullptr, std::nullopt}; + "The data format cannot be detected by the contents of the files, because there are no files with the provided path. " + "You can specify the format manually"); + + } + + return {nullptr, std::nullopt, format}; } if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { - if (auto cached_columns = tryGetColumnsFromCache(url_options_to_check[current_index])) + if (auto cached_schema = tryGetColumnsFromCache(url_options_to_check[current_index])) { ++current_index; - return {nullptr, cached_schema, format}; + return {nullptr, cached_schema, format}; } } @@ -762,7 +802,7 @@ namespace return {wrapReadBufferWithCompressionMethod( std::move(uri_and_buf.second), compression_method, - static_cast(getContext()->getSettingsRef().zstd_window_log_max)), std::nullopt}; + static_cast(getContext()->getSettingsRef().zstd_window_log_max)), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override { if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url) return; - auto key = getKeyForSchemaCache(current_url_option, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(current_url_option, *format, format_settings, getContext()); StorageURL::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -780,7 +820,7 @@ namespace || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::UNION) return; - auto key = getKeyForSchemaCache(current_url_option, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(current_url_option, *format, format_settings, getContext()); StorageURL::getSchemaCache(getContext()).addColumns(key, columns); } @@ -792,17 +832,45 @@ namespace for (const auto & options : url_options_to_check) { - auto keys = getKeysForSchemaCache(options, format, format_settings, getContext()); + auto keys = getKeysForSchemaCache(options, *format, format_settings, getContext()); StorageURL::getSchemaCache(getContext()).addManyColumns(keys, columns); } } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return current_url_option; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= url_options_to_check.size()); + auto first_option = url_options_to_check[current_index - 1].cbegin(); + auto uri_and_buf = StorageURLSource::getFirstAvailableURIAndReadBuffer( + first_option, + url_options_to_check[current_index - 1].cend(), + getContext(), + {}, + Poco::Net::HTTPRequest::HTTP_GET, + {}, + getHTTPTimeouts(getContext()),
credentials, + headers, + false, + false); + + return wrapReadBufferWithCompressionMethod(std::move(uri_and_buf.second), compression_method, static_cast(getContext()->getSettingsRef().zstd_window_log_max)); + } + private: std::optional tryGetColumnsFromCache(const Strings & urls) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_url) return std::nullopt; auto & schema_cache = StorageURL::getSchemaCache(getContext()); @@ -810,7 +878,7 @@ namespace { auto get_last_mod_time = [&]() -> std::optional { - auto last_mod_time = StorageURL::tryGetLastModificationTime(url, headers, credentials, getContext()); + auto last_mod_time = StorageURL::tryGetLastModificationTime(url, headers, credentials, context); /// Some URLs could not have Last-Modified header, in this case we cannot be sure that /// data wasn't changed after adding it's schema to cache. Use schema from cache only if /// special setting for this case is enabled. @@ -819,10 +887,27 @@ namespace return last_mod_time; }; - auto cache_key = getKeyForSchemaCache(url, format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(url, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(url, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -831,7 +916,7 @@ namespace std::vector> url_options_to_check; size_t current_index = 0; String current_url_option; - const String & format; + std::optional format; const CompressionMethod & compression_method; const HTTPHeaderEntries & headers; Poco::Net::HTTPBasicCredentials credentials; @@ -839,13 +924,13 @@ namespace }; } -ColumnsDescription IStorageURLBase::getTableStructureFromData( - const String & format, +std::pair IStorageURLBase::getTableStructureAndFormatFromDataImpl( + std::optional format, const String & uri, CompressionMethod compression_method, const HTTPHeaderEntries & headers, const std::optional & format_settings, - ContextPtr context) + const ContextPtr & context) { context->getRemoteHostFilter().checkURL(Poco::URI(uri)); @@ -858,7 +943,30 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( urls_to_check = {uri}; ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context); - return readSchemaFromFormat(format, format_settings, read_buffer_iterator, urls_to_check.size() > 1, context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); +} + +ColumnsDescription IStorageURLBase::getTableStructureFromData( + const String & format, + const String & uri, + CompressionMethod compression_method, + const 
HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context) +{ + return getTableStructureAndFormatFromDataImpl(format, uri, compression_method, headers, format_settings, context).first; +} + +std::pair IStorageURLBase::getTableStructureAndFormatFromData( + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, uri, compression_method, headers, format_settings, context); } bool IStorageURLBase::supportsSubsetOfColumns(const ContextPtr & context) const @@ -904,6 +1012,7 @@ public: , context(std::move(context_)) , max_block_size(max_block_size_) , num_streams(num_streams_) + , max_num_streams(num_streams_) { } @@ -920,6 +1029,7 @@ private: size_t max_block_size; size_t num_streams; + const size_t max_num_streams; std::shared_ptr iterator_wrapper; bool is_url_with_globs = false; @@ -1093,8 +1203,8 @@ void ReadFromURL::initializePipeline(QueryPipelineBuilder & pipeline, const Buil auto pipe = Pipe::unitePipes(std::move(pipes)); size_t output_ports = pipe.numOutputPorts(); const bool parallelize_output = context->getSettingsRef().parallelize_output_from_storages; - if (parallelize_output && storage->parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < num_streams) - pipe.resize(num_streams); + if (parallelize_output && storage->parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < max_num_streams) + pipe.resize(max_num_streams); if (pipe.empty()) pipe = Pipe(std::make_shared(info.source_header)); @@ -1243,7 +1353,7 @@ StorageURL::StorageURL( const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_, const HTTPHeaderEntries & headers_, const String & http_method_, @@ -1276,7 +1386,7 @@ StorageURLWithFailover::StorageURLWithFailover( const std::optional & format_settings_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_) : StorageURL("", table_id_, format_name_, format_settings_, columns_, constraints_, String{}, context_, compression_method_) { @@ -1325,7 +1435,7 @@ FormatSettings StorageURL::getFormatSettingsFromArgs(const StorageFactory::Argum } size_t StorageURL::evalArgsAndCollectHeaders( - ASTs & url_function_args, HTTPHeaderEntries & header_entries, ContextPtr context) + ASTs & url_function_args, HTTPHeaderEntries & header_entries, const ContextPtr & context) { ASTs::iterator headers_it = url_function_args.end(); @@ -1401,7 +1511,7 @@ void StorageURL::processNamedCollectionResult(Configuration & configuration, con && configuration.http_method != Poco::Net::HTTPRequest::HTTP_PUT) throw Exception( ErrorCodes::BAD_ARGUMENTS, - "Http method can be POST or PUT (current: {}). For insert default is POST, for select GET", + "HTTP method can be POST or PUT (current: {}). 
For insert default is POST, for select GET", configuration.http_method); configuration.format = collection.getOrDefault("format", "auto"); @@ -1409,7 +1519,7 @@ void StorageURL::processNamedCollectionResult(Configuration & configuration, con configuration.structure = collection.getOrDefault("structure", "auto"); } -StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, ContextPtr local_context) +StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, const ContextPtr & local_context) { StorageURL::Configuration configuration; @@ -1433,7 +1543,7 @@ StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, ContextPtr l } if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(Poco::URI(configuration.url).getPath(), true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(configuration.url).getPath()).value_or("auto"); for (const auto & [header, value] : configuration.headers) { diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index c8b8d0942f4..18a90c7bb82 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -57,7 +57,15 @@ public: CompressionMethod compression_method, const HTTPHeaderEntries & headers, const std::optional & format_settings, - ContextPtr context); + const ContextPtr & context); + + static std::pair getTableStructureAndFormatFromData( + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context); + static SchemaCache & getSchemaCache(const ContextPtr & context); @@ -72,7 +80,7 @@ protected: IStorageURLBase( const String & uri_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & id_, const String & format_name_, const std::optional & format_settings_, @@ -106,7 +114,7 @@ protected: const Names & column_names, const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const; @@ -114,7 +122,7 @@ protected: const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const; @@ -127,6 +135,14 @@ protected: bool supportsTrivialCountOptimization() const override { return true; } private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context); + virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0; }; @@ -160,7 +176,7 @@ public: const String & format, const std::optional & format_settings, String name_, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, @@ -231,7 +247,7 @@ public: const String & format, const std::optional & format_settings, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, const HTTPHeaderEntries & headers = {}, @@ -263,7 +279,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & 
constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_, const HTTPHeaderEntries & headers_ = {}, const String & method_ = "", @@ -292,12 +308,12 @@ public: std::string addresses_expr; }; - static Configuration getConfiguration(ASTs & args, ContextPtr context); + static Configuration getConfiguration(ASTs & args, const ContextPtr & context); /// Does evaluateConstantExpressionOrIdentifierAsLiteral() on all arguments. /// If `headers(...)` argument is present, parses it and moves it to the end of the array. /// Returns number of arguments excluding `headers(...)`. - static size_t evalArgsAndCollectHeaders(ASTs & url_function_args, HTTPHeaderEntries & header_entries, ContextPtr context); + static size_t evalArgsAndCollectHeaders(ASTs & url_function_args, HTTPHeaderEntries & header_entries, const ContextPtr & context); static void processNamedCollectionResult(Configuration & configuration, const NamedCollection & collection); }; @@ -314,7 +330,7 @@ public: const std::optional & format_settings_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_); void read( diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 2365887983d..d0df74d7521 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -35,36 +35,43 @@ namespace ErrorCodes } StorageURLCluster::StorageURLCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & uri_, const String & format_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const StorageURL::Configuration & configuration_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageURLCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) - , uri(uri_) + const StorageURL::Configuration & configuration_) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageURLCluster (" + table_id_.table_name + ")")) + , uri(uri_), format_name(format_) { - context_->getRemoteHostFilter().checkURL(Poco::URI(uri)); - context_->getHTTPHeaderFilter().checkHeaders(configuration_.headers); + context->getRemoteHostFilter().checkURL(Poco::URI(uri)); + context->getHTTPHeaderFilter().checkHeaders(configuration_.headers); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = StorageURL::getTableStructureFromData(format_, - uri, - chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method_), - configuration_.headers, - std::nullopt, - context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageURL::getTableStructureAndFormatFromData( + uri, chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, std::nullopt, context); + else + columns = StorageURL::getTableStructureFromData( + format_, uri, chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageURL::getTableStructureAndFormatFromData( + uri, chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, 
std::nullopt, context).second; + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -72,13 +79,14 @@ StorageURLCluster::StorageURLCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageURLCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function urlCluster, got '{}'", queryToString(query)); - TableFunctionURLCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionURLCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageURLCluster.h b/src/Storages/StorageURLCluster.h index 07978040029..f57d262f434 100644 --- a/src/Storages/StorageURLCluster.h +++ b/src/Storages/StorageURLCluster.h @@ -19,16 +19,15 @@ class StorageURLCluster : public IStorageCluster { public: StorageURLCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & uri_, const String & format_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const StorageURL::Configuration & configuration_, - bool structure_argument_was_provided_); + const StorageURL::Configuration & configuration_); std::string getName() const override { return "URLCluster"; } @@ -41,11 +40,10 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; String uri; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index f0f9b9540de..5679effbcb2 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -112,7 +112,15 @@ StorageView::StorageView( : IStorage(table_id_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (!is_parameterized_view_) + { + /// If the CREATE query is to create a parameterized view, then we don't want to set columns + if (!query.isParameterizedView()) + storage_metadata.setColumns(columns_); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setComment(comment); if (!query.select) @@ -199,12 +207,12 @@ void StorageView::read( static ASTTableExpression * getFirstTableExpression(ASTSelectQuery & select_query) { if (!select_query.tables() || select_query.tables()->children.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no table expression in view select AST"); + throw
Exception(ErrorCodes::LOGICAL_ERROR, "No table expression in view select AST"); auto * select_element = select_query.tables()->children[0]->as(); if (!select_element->table_expression) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: incorrect table expression"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect table expression"); return select_element->table_expression->as(); } @@ -235,7 +243,7 @@ void StorageView::replaceWithSubquery(ASTSelectQuery & outer_query, ASTPtr view_ } if (!table_expression->database_and_table_name) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: incorrect table expression"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect table expression"); } DatabaseAndTableWithAlias db_table(table_expression->database_and_table_name); @@ -243,8 +251,7 @@ void StorageView::replaceWithSubquery(ASTSelectQuery & outer_query, ASTPtr view_ view_name = table_expression->database_and_table_name; table_expression->database_and_table_name = {}; - table_expression->subquery = std::make_shared(); - table_expression->subquery->children.push_back(view_query); + table_expression->subquery = std::make_shared(view_query); table_expression->subquery->setAlias(alias); for (auto & child : table_expression->children) @@ -263,7 +270,7 @@ ASTPtr StorageView::restoreViewName(ASTSelectQuery & select_query, const ASTPtr ASTTableExpression * table_expression = getFirstTableExpression(select_query); if (!table_expression->subquery) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: incorrect table expression"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect table expression"); ASTPtr subquery = table_expression->subquery; table_expression->subquery = {}; diff --git a/src/Storages/StorageXDBC.cpp b/src/Storages/StorageXDBC.cpp index 259abefb00f..fb8fa2d6da4 100644 --- a/src/Storages/StorageXDBC.cpp +++ b/src/Storages/StorageXDBC.cpp @@ -59,7 +59,7 @@ std::vector> StorageXDBC::getReadURIParams( const Names & /* column_names */, const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t max_block_size) const { @@ -70,7 +70,7 @@ std::function StorageXDBC::getReadPOSTDataCallback( const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr local_context, + const ContextPtr & local_context, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { diff --git a/src/Storages/StorageXDBC.h b/src/Storages/StorageXDBC.h index cba15a83226..7cec7266760 100644 --- a/src/Storages/StorageXDBC.h +++ b/src/Storages/StorageXDBC.h @@ -55,7 +55,7 @@ private: const Names & column_names, const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const override; @@ -63,7 +63,7 @@ private: const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const override; diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index fbd5afd3274..b5a985fec9b 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ 
b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -109,6 +109,7 @@ const char * auto_contributors[] { "Ali Demirci", "Aliaksandr Pliutau", "Aliaksandr Shylau", + "Aliaksei Khatskevich", "Alina Terekhova", "Amesaru", "Amila Welihinda", @@ -179,6 +180,7 @@ const char * auto_contributors[] { "Arsen Hakobyan", "Arslan G", "ArtCorp", + "Artem Alperin", "Artem Andreenko", "Artem Gavrilov", "Artem Hnilov", @@ -223,7 +225,9 @@ const char * auto_contributors[] { "Bill", "Bin Xie", "BiteTheDDDDt", + "Blacksmith", "BlahGeek", + "Blargian", "Bo Lu", "Bogdan", "Bogdan Voronin", @@ -373,6 +377,7 @@ const char * auto_contributors[] { "Evgeny Kruglov", "Evgeny Markov", "Ewout", + "Eyal Halpern Shalev", "FArthur-cmd", "FFFFFFFHHHHHHH", "FFish", @@ -513,6 +518,7 @@ const char * auto_contributors[] { "Javi santana bot", "JaySon", "JaySon-Huang", + "Jayme Bird", "Jean Baptiste Favre", "Jeffrey Dang", "Jens Hoevenaars", @@ -613,6 +619,7 @@ const char * auto_contributors[] { "Lewinma", "Li Shuai", "Li Yin", + "Lino Uruñuela", "Lirikl", "Liu Cong", "LiuCong", @@ -636,6 +643,7 @@ const char * auto_contributors[] { "MagiaGroz", "Maks Skorokhod", "Maksim", + "Maksim Alekseev", "Maksim Buren", "Maksim Fedotov", "Maksim Kita", @@ -653,6 +661,7 @@ const char * auto_contributors[] { "Mariano Benítez Mulet", "Mark Andreev", "Mark Frost", + "Mark Needham", "Mark Papadakis", "Mark Polokhov", "Maroun Maroun", @@ -662,6 +671,7 @@ const char * auto_contributors[] { "Martijn Bakker", "Marvin Taschenberger", "Masha", + "Mathieu Rey", "Matthew Peveler", "Matwey V. Kornilov", "Max", @@ -733,6 +743,7 @@ const char * auto_contributors[] { "Mingliang Pan", "Misko Lee", "Misz606", + "MochiXu", "Mohamad Fadhil", "Mohammad Arab Anvari", "Mohammad Hossein Sekhavat", @@ -780,6 +791,7 @@ const char * auto_contributors[] { "Nikolai Sorokin", "Nikolay", "Nikolay Degterinsky", + "Nikolay Edigaryev", "Nikolay Kirsh", "Nikolay Semyachkin", "Nikolay Shcheglov", @@ -876,6 +888,7 @@ const char * auto_contributors[] { "Roman Bug", "Roman Chyrva", "Roman G", + "Roman Glinskikh", "Roman Heinrich", "Roman Lipovsky", "Roman Nikolaev", @@ -948,6 +961,7 @@ const char * auto_contributors[] { "Seyed Mehrshad Hosseini", "Shane Andrade", "Shani Elharrar", + "Shaun Struwig", "Sherry Wang", "Shoh Jahon", "Shri Bodas", @@ -1015,6 +1029,7 @@ const char * auto_contributors[] { "Tian Xinhui", "Tiaonmmn", "Tigran Khudaverdyan", + "Tim Liou", "Tim Windelschmidt", "Timur Magomedov", "Timur Solodovnikov", @@ -1109,6 +1124,7 @@ const char * auto_contributors[] { "Wang Fenjin", "WangZengrui", "Wangyang Guo", + "Waterkin", "Weiqing Xu", "William Shallum", "Winter Zhang", @@ -1152,6 +1168,7 @@ const char * auto_contributors[] { "Yury Stankevich", "Yusuke Tanaka", "Zach Naimon", + "Zheng Miao", "ZhiYong Wang", "Zhichang Yu", "Zhichun Wu", @@ -1213,6 +1230,7 @@ const char * auto_contributors[] { "attack204", "auxten", "avasiliev", + "avinzhang", "avogar", "avoiderboi", "avsharapov", @@ -1253,6 +1271,7 @@ const char * auto_contributors[] { "chengy8934", "chenjian", "chenqi", + "chenwei", "chenxing-xc", "chenxing.xc", "chertus", @@ -1301,6 +1320,7 @@ const char * auto_contributors[] { "ducle.canh", "eaxdev", "edef", + "edpyt", "eejoin", "egatov", "ekrasikov", @@ -1540,6 +1560,7 @@ const char * auto_contributors[] { "mlkui", "mnkonkova", "mo-avatar", + "mochi", "monchickey", "morty", "moscas", @@ -1671,6 +1692,7 @@ const char * auto_contributors[] { "sundy-li", "sundyli", "sunlisheng", + "sunny19930321", "svladykin", "tai", "taichong", diff --git 
a/src/Storages/System/StorageSystemDashboards.cpp b/src/Storages/System/StorageSystemDashboards.cpp index 7e545757129..7c9e8b73519 100644 --- a/src/Storages/System/StorageSystemDashboards.cpp +++ b/src/Storages/System/StorageSystemDashboards.cpp @@ -26,192 +26,329 @@ void StorageSystemDashboards::fillData(MutableColumns & res_columns, ContextPtr, { static const std::vector> dashboards { + /// Default dashboard for self-managed ClickHouse { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Queries/second" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_Query) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "CPU Usage (cores)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSCPUVirtualTimeMicroseconds) / 1000000 -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Queries Running" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(CurrentMetric_Query) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Merges Running" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(CurrentMetric_Merge) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Selected Bytes/second" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_SelectedBytes) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "IO Wait" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSIOWaitMicroseconds) / 1000000 -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "CPU Wait" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSCPUWaitMicroseconds) / 1000000 -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - 
{seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "OS CPU Usage (Userspace)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'OSUserTimeNormalized' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "OS CPU Usage (Kernel)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'OSSystemTimeNormalized' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Read From Disk" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSReadBytes) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Read From Filesystem" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSReadChars) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Memory (tracked)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(CurrentMetric_MemoryTracking) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Load Average (15 minutes)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'LoadAverage15' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Selected Rows/second" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_SelectedRows) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { 
"title", "Inserted Rows/second" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_InsertedRows) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Total MergeTree Parts" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'TotalPartsOfMergeTreeTables' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Max Parts For Partition" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'MaxPartCountForPartition' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } + }, + /// Default dashboard for ClickHouse Cloud + { + { "dashboard", "Cloud overview" }, + { "title", "Queries/second" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_Query) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "CPU Usage (cores)" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(metric) / 1000000\nFROM (\n SELECT event_time, sum(ProfileEvent_OSCPUVirtualTimeMicroseconds) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32} GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Queries Running" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(CurrentMetric_Query) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Merges Running" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(CurrentMetric_Merge) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n 
GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Selected Bytes/second" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_SelectedBytes) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "IO Wait (local fs)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_OSIOWaitMicroseconds) / 1000000 AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "S3 read wait" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_ReadBufferFromS3Microseconds) / 1000000 AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "S3 read errors/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_ReadBufferFromS3RequestsErrors) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "CPU Wait" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_OSCPUWaitMicroseconds) / 1000000 AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "OS CPU Usage (Userspace, normalized)" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\nWHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32}\nAND metric = 'OSUserTimeNormalized'\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "OS CPU Usage 
(Kernel, normalized)" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\nWHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32}\nAND metric = 'OSSystemTimeNormalized'\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Read From Disk (bytes/sec)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_OSReadBytes) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Read From Filesystem (bytes/sec)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_OSReadChars) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Memory (tracked, bytes)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(CurrentMetric_MemoryTracking) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Load Average (15 minutes)" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM (\n SELECT event_time, sum(value) AS value\n FROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n AND metric = 'LoadAverage15'\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Selected Rows/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_SelectedRows) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Inserted Rows/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_InsertedRows) AS metric \n FROM 
clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Total MergeTree Parts" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(value)\nFROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\nWHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32}\nAND metric = 'TotalPartsOfMergeTreeTables'\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Max Parts For Partition" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(value)\nFROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\nWHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32}\nAND metric = 'MaxPartCountForPartition'\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Read From S3 (bytes/sec)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_ReadBufferFromS3Bytes) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Filesystem Cache Size" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(CurrentMetric_FilesystemCacheSize) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Disk S3 write req/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_DiskS3PutObject + ProfileEvent_DiskS3UploadPart + ProfileEvent_DiskS3CreateMultipartUpload + ProfileEvent_DiskS3CompleteMultipartUpload) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Disk S3 read req/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_DiskS3GetObject + ProfileEvent_DiskS3HeadObject + ProfileEvent_DiskS3ListObjects) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= 
toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "FS cache hit rate" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_CachedReadBufferReadFromCacheBytes) / (sum(ProfileEvent_CachedReadBufferReadFromCacheBytes) + sum(ProfileEvent_CachedReadBufferReadFromSourceBytes)) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Page cache hit rate" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, greatest(0, (sum(ProfileEvent_OSReadChars) - sum(ProfileEvent_OSReadBytes)) / (sum(ProfileEvent_OSReadChars) + sum(ProfileEvent_ReadBufferFromS3Bytes))) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Network receive bytes/sec" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM (\n SELECT event_time, sum(value) AS value\n FROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n AND metric LIKE 'NetworkReceiveBytes%'\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Network send bytes/sec" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM (\n SELECT event_time, sum(value) AS value\n FROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n AND metric LIKE 'NetworkSendBytes%'\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } } }; diff --git a/src/Storages/System/StorageSystemDetachedParts.cpp b/src/Storages/System/StorageSystemDetachedParts.cpp index a9cd5f2610a..3dae43976f7 100644 --- a/src/Storages/System/StorageSystemDetachedParts.cpp +++ b/src/Storages/System/StorageSystemDetachedParts.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 016705f4e66..b1494f2ba98 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -83,7 +83,11 @@ StorageSystemProjectionParts::StorageSystemProjectionParts(const StorageID & tab {"rows_where_ttl_info.expression", std::make_shared(std::make_shared())}, 
{"rows_where_ttl_info.min", std::make_shared(std::make_shared())}, - {"rows_where_ttl_info.max", std::make_shared(std::make_shared())} + {"rows_where_ttl_info.max", std::make_shared(std::make_shared())}, + + {"is_broken", std::make_shared()}, + {"exception_code", std::make_shared()}, + {"exception", std::make_shared()}, } ) { @@ -272,12 +276,38 @@ void StorageSystemProjectionParts::processNextStorage( add_ttl_info_map(part->ttl_infos.moves_ttl); if (columns_mask[src_index++]) - columns[res_index++]->insert(queryToString(part->default_codec->getCodecDesc())); + { + if (part->default_codec) + columns[res_index++]->insert(queryToString(part->default_codec->getCodecDesc())); + else + columns[res_index++]->insertDefault(); + } add_ttl_info_map(part->ttl_infos.recompression_ttl); add_ttl_info_map(part->ttl_infos.group_by_ttl); add_ttl_info_map(part->ttl_infos.rows_where_ttl); + { + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->is_broken.load(std::memory_order_relaxed)); + + if (part->is_broken) + { + std::lock_guard lock(part->broken_reason_mutex); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->exception_code); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->exception); + } + else + { + if (columns_mask[src_index++]) + columns[res_index++]->insertDefault(); + if (columns_mask[src_index++]) + columns[res_index++]->insertDefault(); + } + } + /// _state column should be the latest. /// Do not use part->getState*, it can be changed from different thread if (has_state_column) diff --git a/src/Storages/System/StorageSystemStackTrace.cpp b/src/Storages/System/StorageSystemStackTrace.cpp index 82a5fd4e33f..90eb0ad89ec 100644 --- a/src/Storages/System/StorageSystemStackTrace.cpp +++ b/src/Storages/System/StorageSystemStackTrace.cpp @@ -168,7 +168,7 @@ bool wait(int timeout_ms) continue; /// Drain delayed notifications. 
} - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: read wrong number of bytes from pipe"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Read wrong number of bytes from pipe"); } } diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index a0f6b03cf89..47c4a03a595 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -43,6 +43,7 @@ StorageSystemTables::StorageSystemTables(const StorageID & table_id_) {"data_paths", std::make_shared(std::make_shared()), "Paths to the table data in the file systems."}, {"metadata_path", std::make_shared(), "Path to the table metadata in the file system."}, {"metadata_modification_time", std::make_shared(), "Time of latest modification of the table metadata."}, + {"metadata_version", std::make_shared(), "Metadata version for ReplicatedMergeTree table, 0 for non ReplicatedMergeTree table."}, {"dependencies_database", std::make_shared(std::make_shared()), "Database dependencies."}, {"dependencies_table", std::make_shared(std::make_shared()), "Table dependencies (materialized views the current table)."}, {"create_table_query", std::make_shared(), "The query that was used to create the table."}, @@ -287,6 +288,11 @@ protected: if (columns_mask[src_index++]) res_columns[res_index++]->insertDefault(); + // metadata_version + // Temporary tables does not support replication + if (columns_mask[src_index++]) + res_columns[res_index++]->insertDefault(); + // dependencies_database if (columns_mask[src_index++]) res_columns[res_index++]->insertDefault(); @@ -311,7 +317,7 @@ protected: while (src_index < columns_mask.size()) { // total_rows - if (src_index == 18 && columns_mask[src_index]) + if (src_index == 19 && columns_mask[src_index]) { if (auto total_rows = table.second->totalRows(settings)) res_columns[res_index++]->insert(*total_rows); @@ -319,7 +325,7 @@ protected: res_columns[res_index++]->insertDefault(); } // total_bytes - else if (src_index == 19 && columns_mask[src_index]) + else if (src_index == 20 && columns_mask[src_index]) { if (auto total_bytes = table.second->totalBytes(settings)) res_columns[res_index++]->insert(*total_bytes); @@ -418,6 +424,18 @@ protected: if (columns_mask[src_index++]) res_columns[res_index++]->insert(static_cast(database->getObjectMetadataModificationTime(table_name))); + StorageMetadataPtr metadata_snapshot; + if (table) + metadata_snapshot = table->getInMemoryMetadataPtr(); + + if (columns_mask[src_index++]) + { + if (metadata_snapshot && table->supportsReplication()) + res_columns[res_index++]->insert(metadata_snapshot->metadata_version); + else + res_columns[res_index++]->insertDefault(); + } + { Array views_table_name_array; Array views_database_name_array; @@ -482,10 +500,6 @@ protected: else src_index += 3; - StorageMetadataPtr metadata_snapshot; - if (table) - metadata_snapshot = table->getInMemoryMetadataPtr(); - ASTPtr expression_ptr; if (columns_mask[src_index++]) { @@ -693,10 +707,15 @@ public: { } + void applyFilters() override; + private: ContextPtr context; std::vector columns_mask; size_t max_block_size; + + ColumnPtr filtered_databases_column; + ColumnPtr filtered_tables_column; }; void StorageSystemTables::read( @@ -723,16 +742,19 @@ void StorageSystemTables::read( query_plan.addStep(std::move(reading)); } -void ReadFromSystemTables::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) +void ReadFromSystemTables::applyFilters() { auto filter_actions_dag = 
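The new metadata_version column of system.tables (0 for non-ReplicatedMergeTree tables, per the column comment above) can be checked with a simple query, for example:
-- Show the metadata version reported for each table in the current database.
SELECT database, name, engine, metadata_version
FROM system.tables
WHERE database = currentDatabase();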
ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes); const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); - ColumnPtr filtered_databases_column = getFilteredDatabases(predicate, context); - ColumnPtr filtered_tables_column = getFilteredTables(predicate, filtered_databases_column, context); + filtered_databases_column = getFilteredDatabases(predicate, context); + filtered_tables_column = getFilteredTables(predicate, filtered_databases_column, context); +} +void ReadFromSystemTables::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) +{ Pipe pipe(std::make_shared( std::move(columns_mask), getOutputStream().header, max_block_size, std::move(filtered_databases_column), std::move(filtered_tables_column), context)); pipeline.init(std::move(pipe)); diff --git a/src/Storages/System/StorageSystemZooKeeper.cpp b/src/Storages/System/StorageSystemZooKeeper.cpp index 37fe9074950..abf93bf1ac0 100644 --- a/src/Storages/System/StorageSystemZooKeeper.cpp +++ b/src/Storages/System/StorageSystemZooKeeper.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -178,7 +180,7 @@ using Paths = std::deque>; class ReadFromSystemZooKeeper final : public SourceStepWithFilter { public: - ReadFromSystemZooKeeper(const Block & header, SelectQueryInfo & query_info_, ContextPtr context_); + ReadFromSystemZooKeeper(const Block & header, SelectQueryInfo & query_info_, UInt64 max_block_size_, ContextPtr context_); String getName() const override { return "ReadFromSystemZooKeeper"; } @@ -187,13 +189,42 @@ public: void applyFilters() override; private: - void fillData(MutableColumns & res_columns); - std::shared_ptr storage_limits; + const UInt64 max_block_size; ContextPtr context; Paths paths; }; + +class SystemZooKeeperSource : public ISource +{ +public: + SystemZooKeeperSource( + Paths && paths_, + Block header_, + UInt64 max_block_size_, + ContextPtr context_) + : ISource(header_) + , max_block_size(max_block_size_) + , paths(std::move(paths_)) + , context(std::move(context_)) + { + } + + String getName() const override { return "SystemZooKeeper"; } + +protected: + Chunk generate() override; + +private: + const UInt64 max_block_size; + Paths paths; + ContextPtr context; + ZooKeeperWithFaultInjection::Ptr zookeeper; + bool started = false; +}; + + StorageSystemZooKeeper::StorageSystemZooKeeper(const StorageID & table_id_) : IStorage(table_id_) { @@ -209,11 +240,11 @@ void StorageSystemZooKeeper::read( SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, - size_t /*max_block_size*/, + size_t max_block_size, size_t /*num_streams*/) { auto header = storage_snapshot->metadata->getSampleBlockWithVirtuals(getVirtuals()); - auto read_step = std::make_unique(header, query_info, context); + auto read_step = std::make_unique(header, query_info, max_block_size, context); query_plan.addStep(std::move(read_step)); } @@ -412,7 +443,7 @@ static Paths extractPath(const ActionsDAG::NodeRawConstPtrs & filter_nodes, Cont for (const auto * node : filter_nodes) extractPathImpl(*node, res, context, allow_unrestricted); - if (filter_nodes.empty() && allow_unrestricted) + if (res.empty() && allow_unrestricted) res.emplace_back("/", ZkPathType::Recurse); return res; @@ -424,16 +455,50 @@ void ReadFromSystemZooKeeper::applyFilters() paths = extractPath(getFilterNodes().nodes, context, context->getSettingsRef().allow_unrestricted_reads_from_keeper); } -void 
ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) -{ - zkutil::ZooKeeperPtr zookeeper = context->getZooKeeper(); +Chunk SystemZooKeeperSource::generate() +{ if (paths.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, + { + if (!started) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "SELECT from system.zookeeper table must contain condition like path = 'path' " "or path IN ('path1','path2'...) or path IN (subquery) " "in WHERE clause unless `set allow_unrestricted_reads_from_keeper = 'true'`."); + /// No more work + return {}; + } + + started = true; + + MutableColumns res_columns = getPort().getHeader().cloneEmptyColumns(); + size_t row_count = 0; + + QueryStatusPtr query_status = context->getProcessListElement(); + + const auto & settings = context->getSettingsRef(); + /// Use insert settings for now in order not to introduce new settings. + /// Hopefully insert settings will also be unified and replaced with some generic retry settings. + ZooKeeperRetriesInfo retries_seetings( + settings.insert_keeper_max_retries, + settings.insert_keeper_retry_initial_backoff_ms, + settings.insert_keeper_retry_max_backoff_ms); + + /// Handles reconnects when needed + auto get_zookeeper = [&] () + { + if (!zookeeper || zookeeper->expired()) + { + zookeeper = ZooKeeperWithFaultInjection::createInstance( + settings.insert_keeper_fault_injection_probability, + settings.insert_keeper_fault_injection_seed, + context->getZooKeeper(), + "", nullptr); + } + return zookeeper; + }; + const Int64 max_inflight_requests = std::max(1, context->getSettingsRef().max_download_threads.value); struct ListTask @@ -448,6 +513,19 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) std::unordered_set added; while (!paths.empty()) { + if (query_status) + query_status->checkTimeLimit(); + + /// Check if the block is big enough already + if (max_block_size > 0 && row_count > 0) + { + size_t total_size = 0; + for (const auto & column : res_columns) + total_size += column->byteSize(); + if (total_size > max_block_size) + break; + } + list_tasks.clear(); std::vector paths_to_list; while (!paths.empty() && static_cast(list_tasks.size()) < max_inflight_requests) @@ -470,7 +548,10 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) paths_to_list.emplace_back(task.path_corrected); list_tasks.emplace_back(std::move(task)); } - auto list_responses = zookeeper->tryGetChildren(paths_to_list); + + zkutil::ZooKeeper::MultiTryGetChildrenResponse list_responses; + ZooKeeperRetriesControl("", nullptr, retries_seetings, query_status).retryLoop( + [&]() { list_responses = get_zookeeper()->tryGetChildren(paths_to_list); }); struct GetTask { @@ -488,8 +569,8 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) continue; auto & task = list_tasks[list_task_idx]; - if (auto elem = context->getProcessListElement()) - elem->checkTimeLimit(); + if (query_status) + query_status->checkTimeLimit(); Strings nodes = std::move(list_result.names); @@ -514,7 +595,9 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) } } - auto get_responses = zookeeper->tryGet(paths_to_get); + zkutil::ZooKeeper::MultiTryGetResponse get_responses; + ZooKeeperRetriesControl("", nullptr, retries_seetings, query_status).retryLoop( + [&]() { get_responses = get_zookeeper()->tryGet(paths_to_get); }); for (size_t i = 0, size = get_tasks.size(); i < size; ++i) { @@ -524,8 +607,8 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) auto & get_task = get_tasks[i]; auto & 
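As the BAD_ARGUMENTS message above states, reads from system.zookeeper still need a path predicate unless allow_unrestricted_reads_from_keeper is enabled (and, per the hunk above, the insert_keeper_* settings are reused for retrying Keeper requests). For example:
-- A path condition is mandatory by default.
SELECT name, value FROM system.zookeeper WHERE path = '/clickhouse';
-- Unrestricted reads must be enabled explicitly.
SELECT path, name FROM system.zookeeper SETTINGS allow_unrestricted_reads_from_keeper = 1;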
list_task = list_tasks[get_task.list_task_idx]; - if (auto elem = context->getProcessListElement()) - elem->checkTimeLimit(); + if (query_status) + query_status->checkTimeLimit(); // Deduplication String key = list_task.path_part + '/' + get_task.node; @@ -551,17 +634,22 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) res_columns[col_num++]->insert( list_task.path); /// This is the original path. In order to process the request, condition in WHERE should be triggered. + ++row_count; + if (list_task.path_type != ZkPathType::Exact && res.stat.numChildren > 0) { paths.emplace_back(key, ZkPathType::Recurse); } } } + + return Chunk(std::move(res_columns), row_count); } -ReadFromSystemZooKeeper::ReadFromSystemZooKeeper(const Block & header, SelectQueryInfo & query_info, ContextPtr context_) +ReadFromSystemZooKeeper::ReadFromSystemZooKeeper(const Block & header, SelectQueryInfo & query_info, UInt64 max_block_size_, ContextPtr context_) : SourceStepWithFilter({.header = header}) , storage_limits(query_info.storage_limits) + , max_block_size(max_block_size_) , context(std::move(context_)) { } @@ -569,13 +657,7 @@ ReadFromSystemZooKeeper::ReadFromSystemZooKeeper(const Block & header, SelectQue void ReadFromSystemZooKeeper::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { const auto & header = getOutputStream().header; - MutableColumns res_columns = header.cloneEmptyColumns(); - fillData(res_columns); - - UInt64 num_rows = res_columns.at(0)->size(); - Chunk chunk(std::move(res_columns), num_rows); - - auto source = std::make_shared(header, std::move(chunk)); + auto source = std::make_shared(std::move(paths), header, max_block_size, context); source->setStorageLimits(storage_limits); processors.emplace_back(source); pipeline.init(Pipe(std::move(source))); diff --git a/src/Storages/System/attachSystemTablesImpl.h b/src/Storages/System/attachSystemTablesImpl.h index 0b0a22baa13..9f2c4e8016d 100644 --- a/src/Storages/System/attachSystemTablesImpl.h +++ b/src/Storages/System/attachSystemTablesImpl.h @@ -7,14 +7,20 @@ namespace DB { -template -void attach(ContextPtr context, IDatabase & system_database, const String & table_name, const String & comment, StorageArgs && ... args) +template +using StringLiteral = const char(&)[Length]; + +template +void attach(ContextPtr context, IDatabase & system_database, const String & table_name, StringLiteral comment, StorageArgs && ... args) { + static_assert(CommentSize > 15, "The comment for a system table is too short or empty"); assert(system_database.getDatabaseName() == DatabaseCatalog::SYSTEM_DATABASE); + + auto table_id = StorageID::createEmpty(); if (system_database.getUUID() == UUIDHelpers::Nil) { /// Attach to Ordinary database. - auto table_id = StorageID(DatabaseCatalog::SYSTEM_DATABASE, table_name); + table_id = StorageID(DatabaseCatalog::SYSTEM_DATABASE, table_name); system_database.attachTable(context, table_name, std::make_shared(table_id, std::forward(args)...)); } else @@ -22,18 +28,18 @@ void attach(ContextPtr context, IDatabase & system_database, const String & tabl /// Attach to Atomic database. 
/// NOTE: UUIDs are not persistent, but it's ok since no data are stored on disk for these storages /// and path is actually not used - auto table_id = StorageID(DatabaseCatalog::SYSTEM_DATABASE, table_name, UUIDHelpers::generateV4()); + table_id = StorageID(DatabaseCatalog::SYSTEM_DATABASE, table_name, UUIDHelpers::generateV4()); DatabaseCatalog::instance().addUUIDMapping(table_id.uuid); String path = "store/" + DatabaseCatalog::getPathForUUID(table_id.uuid); system_database.attachTable(context, table_name, std::make_shared(table_id, std::forward(args)...), path); - - /// Set the comment - auto table = DatabaseCatalog::instance().getTable(table_id, context); - assert(table); - auto metadata = table->getInMemoryMetadata(); - metadata.comment = comment; - table->setInMemoryMetadata(metadata); } + + /// Set the comment + auto table = DatabaseCatalog::instance().getTable(table_id, context); + assert(table); + auto metadata = table->getInMemoryMetadata(); + metadata.comment = comment; + table->setInMemoryMetadata(metadata); } } diff --git a/src/Storages/System/getQueriedColumnsMaskAndHeader.cpp b/src/Storages/System/getQueriedColumnsMaskAndHeader.cpp index c29ccb590ed..b93fe7b8034 100644 --- a/src/Storages/System/getQueriedColumnsMaskAndHeader.cpp +++ b/src/Storages/System/getQueriedColumnsMaskAndHeader.cpp @@ -11,10 +11,11 @@ std::pair, Block> getQueriedColumnsMaskAndHeader(const Block NameSet names_set(column_names.begin(), column_names.end()); for (size_t i = 0; i < columns_mask.size(); ++i) { - if (names_set.contains(sample_block.getByPosition(i).name)) + const auto & column_with_type_and_name = sample_block.getByPosition(i); + if (names_set.contains(column_with_type_and_name.name)) { columns_mask[i] = 1; - header.insert(sample_block.getByPosition(i)); + header.insert(column_with_type_and_name); } } diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index b3f5d181d5d..a675afbdc26 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace DB @@ -110,7 +111,10 @@ using FindAggregateFunctionVisitor = InDepthNodeVisitorclone() : nullptr) + , expression_columns(other.expression_columns) , result_column(other.result_column) + , where_expression_ast(other.where_expression_ast ? 
other.where_expression_ast->clone() : nullptr) + , where_expression_columns(other.where_expression_columns) , where_result_column(other.where_result_column) , group_by_keys(other.group_by_keys) , set_parts(other.set_parts) @@ -120,11 +124,6 @@ TTLDescription::TTLDescription(const TTLDescription & other) , if_exists(other.if_exists) , recompression_codec(other.recompression_codec) { - if (other.expression) - expression = other.expression->clone(); - - if (other.where_expression) - where_expression = other.where_expression->clone(); } TTLDescription & TTLDescription::operator=(const TTLDescription & other) @@ -138,17 +137,15 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) else expression_ast.reset(); - if (other.expression) - expression = other.expression->clone(); - else - expression.reset(); - + expression_columns = other.expression_columns; result_column = other.result_column; - if (other.where_expression) - where_expression = other.where_expression->clone(); - else - where_expression.reset(); + if (other.where_expression_ast) + where_expression_ast = other.where_expression_ast->clone(); + else + where_expression_ast.reset(); + + where_expression_columns = other.where_expression_columns; where_result_column = other.where_result_column; group_by_keys = other.group_by_keys; set_parts = other.set_parts; @@ -165,6 +162,44 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) return * this; } +static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndTypesList & columns, const ContextPtr & context) +{ + ExpressionAndSets result; + auto ttl_string = queryToString(ast); + auto syntax_analyzer_result = TreeRewriter(context).analyze(ast, columns); + ExpressionAnalyzer analyzer(ast, syntax_analyzer_result, context); + auto dag = analyzer.getActionsDAG(false); + + const auto * col = &dag->findInOutputs(ast->getColumnName()); + if (col->result_name != ttl_string) + col = &dag->addAlias(*col, ttl_string); + + dag->getOutputs() = {col}; + dag->removeUnusedActions(); + + result.expression = std::make_shared(dag, ExpressionActionsSettings::fromContext(context)); + result.sets = analyzer.getPreparedSets(); + + return result; +} + +ExpressionAndSets TTLDescription::buildExpression(const ContextPtr & context) const +{ + auto ast = expression_ast->clone(); + return buildExpressionAndSets(ast, expression_columns, context); +} + +ExpressionAndSets TTLDescription::buildWhereExpression(const ContextPtr & context) const +{ + if (where_expression_ast) + { + auto ast = where_expression_ast->clone(); + return buildExpressionAndSets(ast, where_expression_columns, context); + } + + return {}; +} + TTLDescription TTLDescription::getTTLFromAST( const ASTPtr & definition_ast, const ColumnsDescription & columns, @@ -182,9 +217,12 @@ TTLDescription TTLDescription::getTTLFromAST( result.expression_ast = definition_ast->clone(); auto ttl_ast = result.expression_ast->clone(); - auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); - result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); - result.result_column = ttl_ast->getColumnName(); + auto expression = buildExpressionAndSets(ttl_ast, columns.getAllPhysical(), context).expression; + result.expression_columns = expression->getRequiredColumnsWithTypes(); + + result.result_column = expression->getSampleBlock().safeGetByPosition(0).name; + + ExpressionActionsPtr where_expression; if (ttl_element == nullptr) /// columns TTL { 
@@ -202,9 +240,10 @@ TTLDescription TTLDescription::getTTLFromAST( { if (ASTPtr where_expr_ast = ttl_element->where()) { - auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); - result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); - result.where_result_column = where_expr_ast->getColumnName(); + result.where_expression_ast = where_expr_ast->clone(); + where_expression = buildExpressionAndSets(where_expr_ast, columns.getAllPhysical(), context).expression; + result.where_expression_columns = where_expression->getRequiredColumnsWithTypes(); + result.where_result_column = where_expression->getSampleBlock().safeGetByPosition(0).name; } } else if (ttl_element->mode == TTLMode::GROUP_BY) @@ -229,17 +268,17 @@ TTLDescription TTLDescription::getTTLFromAST( for (const auto & ast : ttl_element->group_by_assignments) { const auto assignment = ast->as(); - auto expression = assignment.expression(); + auto ass_expression = assignment.expression(); FindAggregateFunctionVisitor::Data data{false}; - FindAggregateFunctionVisitor(data).visit(expression); + FindAggregateFunctionVisitor(data).visit(ass_expression); if (!data.has_aggregate_function) throw Exception(ErrorCodes::BAD_TTL_EXPRESSION, "Invalid expression for assignment of column {}. Should contain an aggregate function", assignment.column_name); - expression = addTypeConversionToAST(std::move(expression), columns.getPhysical(assignment.column_name).type->getName()); - aggregations.emplace_back(assignment.column_name, std::move(expression)); + ass_expression = addTypeConversionToAST(std::move(ass_expression), columns.getPhysical(assignment.column_name).type->getName()); + aggregations.emplace_back(assignment.column_name, std::move(ass_expression)); aggregation_columns_set.insert(assignment.column_name); } @@ -297,7 +336,7 @@ TTLDescription TTLDescription::getTTLFromAST( } } - checkTTLExpression(result.expression, result.result_column, is_attach || context->getSettingsRef().allow_suspicious_ttl_expressions); + checkTTLExpression(expression, result.result_column, is_attach || context->getSettingsRef().allow_suspicious_ttl_expressions); return result; } @@ -350,7 +389,7 @@ TTLTableDescription TTLTableDescription::getTTLForTableFromAST( auto ttl = TTLDescription::getTTLFromAST(ttl_element_ptr, columns, context, primary_key, is_attach); if (ttl.mode == TTLMode::DELETE) { - if (!ttl.where_expression) + if (!ttl.where_expression_ast) { if (have_unconditional_delete_ttl) throw Exception(ErrorCodes::BAD_TTL_EXPRESSION, "More than one DELETE TTL expression without WHERE expression is not allowed"); diff --git a/src/Storages/TTLDescription.h b/src/Storages/TTLDescription.h index aab5b43e53e..4ea73ac291f 100644 --- a/src/Storages/TTLDescription.h +++ b/src/Storages/TTLDescription.h @@ -35,6 +35,15 @@ struct TTLAggregateDescription using TTLAggregateDescriptions = std::vector; +class PreparedSets; +using PreparedSetsPtr = std::shared_ptr; + +struct ExpressionAndSets +{ + ExpressionActionsPtr expression; + PreparedSetsPtr sets; +}; + /// Common struct for TTL record in storage struct TTLDescription { @@ -44,9 +53,10 @@ struct TTLDescription /// TTL d + INTERVAL 1 DAY /// ^~~~~~~~~~~~~~~~~~~^ ASTPtr expression_ast; + NamesAndTypesList expression_columns; /// Expression actions evaluated from AST - ExpressionActionsPtr expression; + ExpressionAndSets buildExpression(const ContextPtr & context) const; /// Result column of this TTL expression String result_column; 
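The expression_ast / where_expression_ast fields introduced here correspond to TTL clauses like the one below; the hypothetical table only illustrates the kind of WHERE expression that buildWhereExpression() now rebuilds on demand from the stored AST:
-- Hypothetical table with a conditional DELETE TTL (pattern taken from the header comment).
CREATE TABLE ttl_where_example
(
    d Date,
    x UInt64
)
ENGINE = MergeTree
ORDER BY d
TTL d + INTERVAL 1 MONTH DELETE WHERE x % 10 = 0;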
@@ -54,7 +64,9 @@ struct TTLDescription /// WHERE part in TTL expression /// TTL ... WHERE x % 10 == 0 and y > 5 /// ^~~~~~~~~~~~~~~~~~~~~~^ - ExpressionActionsPtr where_expression; + ASTPtr where_expression_ast; + NamesAndTypesList where_expression_columns; + ExpressionAndSets buildWhereExpression(const ContextPtr & context) const; /// Name of result column from WHERE expression String where_result_column; diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 430ed012fa8..33ff6e7104f 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include diff --git a/src/Storages/buildQueryTreeForShard.cpp b/src/Storages/buildQueryTreeForShard.cpp index 5ea28d9e09c..c87a1b216ca 100644 --- a/src/Storages/buildQueryTreeForShard.cpp +++ b/src/Storages/buildQueryTreeForShard.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -283,7 +284,16 @@ TableNodePtr executeSubqueryNode(const QueryTreeNodePtr & subquery_node, auto optimization_settings = QueryPlanOptimizationSettings::fromContext(mutable_context); auto build_pipeline_settings = BuildQueryPipelineSettings::fromContext(mutable_context); - auto pipeline = QueryPipelineBuilder::getPipeline(std::move(*query_plan.buildQueryPipeline(optimization_settings, build_pipeline_settings))); + auto builder = query_plan.buildQueryPipeline(optimization_settings, build_pipeline_settings); + + size_t min_block_size_rows = mutable_context->getSettingsRef().min_external_table_block_size_rows; + size_t min_block_size_bytes = mutable_context->getSettingsRef().min_external_table_block_size_bytes; + auto squashing = std::make_shared(builder->getHeader(), min_block_size_rows, min_block_size_bytes); + + builder->resize(1); + builder->addTransform(std::move(squashing)); + + auto pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); pipeline.complete(std::move(table_out)); CompletedPipelineExecutor executor(pipeline); @@ -295,10 +305,8 @@ TableNodePtr executeSubqueryNode(const QueryTreeNodePtr & subquery_node, } -QueryTreeNodePtr buildQueryTreeForShard(SelectQueryInfo & query_info, QueryTreeNodePtr query_tree_to_modify) +QueryTreeNodePtr buildQueryTreeForShard(const PlannerContextPtr & planner_context, QueryTreeNodePtr query_tree_to_modify) { - auto & planner_context = query_info.planner_context; - CollectColumnSourceToColumnsVisitor collect_column_source_to_columns_visitor; collect_column_source_to_columns_visitor.visit(query_tree_to_modify); @@ -378,16 +386,47 @@ QueryTreeNodePtr buildQueryTreeForShard(SelectQueryInfo & query_info, QueryTreeN return query_tree_to_modify; } -class RewriteJoinToGlobalJoinVisitor : public InDepthQueryTreeVisitor +class CollectStoragesVisitor : public InDepthQueryTreeVisitor { public: - using Base = InDepthQueryTreeVisitor; + using Base = InDepthQueryTreeVisitor; using Base::Base; void visitImpl(QueryTreeNodePtr & node) + { + if (auto * table_node = node->as()) + storages.push_back(table_node->getStorage()); + } + + std::vector storages; +}; + +class RewriteJoinToGlobalJoinVisitor : public InDepthQueryTreeVisitorWithContext +{ +public: + using Base = InDepthQueryTreeVisitorWithContext; + using Base::Base; + + static bool allStoragesAreMergeTree(QueryTreeNodePtr & node) + { + CollectStoragesVisitor collect_storages; + collect_storages.visit(node); + for (const auto & storage : collect_storages.storages) + if (!storage->isMergeTree()) + return false; + + return true; + 
} + + void enterImpl(QueryTreeNodePtr & node) { if (auto * join_node = node->as()) - join_node->setLocality(JoinLocality::Global); + { + bool prefer_local_join = getContext()->getSettingsRef().parallel_replicas_prefer_local_join; + bool should_use_global_join = !prefer_local_join || !allStoragesAreMergeTree(join_node->getRightTableExpression()); + if (should_use_global_join) + join_node->setLocality(JoinLocality::Global); + } } static bool needChildVisit(QueryTreeNodePtr & parent, QueryTreeNodePtr & child) @@ -400,9 +439,9 @@ public: } }; -void rewriteJoinToGlobalJoin(QueryTreeNodePtr query_tree_to_modify) +void rewriteJoinToGlobalJoin(QueryTreeNodePtr query_tree_to_modify, ContextPtr context) { - RewriteJoinToGlobalJoinVisitor visitor; + RewriteJoinToGlobalJoinVisitor visitor(context); visitor.visit(query_tree_to_modify); } diff --git a/src/Storages/buildQueryTreeForShard.h b/src/Storages/buildQueryTreeForShard.h index eec5a0dc38a..5b00b89c729 100644 --- a/src/Storages/buildQueryTreeForShard.h +++ b/src/Storages/buildQueryTreeForShard.h @@ -10,8 +10,14 @@ struct SelectQueryInfo; class IQueryTreeNode; using QueryTreeNodePtr = std::shared_ptr; -QueryTreeNodePtr buildQueryTreeForShard(SelectQueryInfo & query_info, QueryTreeNodePtr query_tree_to_modify); +class PlannerContext; +using PlannerContextPtr = std::shared_ptr; -void rewriteJoinToGlobalJoin(QueryTreeNodePtr query_tree_to_modify); +class Context; +using ContextPtr = std::shared_ptr; + +QueryTreeNodePtr buildQueryTreeForShard(const PlannerContextPtr & planner_context, QueryTreeNodePtr query_tree_to_modify); + +void rewriteJoinToGlobalJoin(QueryTreeNodePtr query_tree_to_modify, ContextPtr context); } diff --git a/src/Storages/transformQueryForExternalDatabase.cpp b/src/Storages/transformQueryForExternalDatabase.cpp index 4526a38a1c3..afc458ea612 100644 --- a/src/Storages/transformQueryForExternalDatabase.cpp +++ b/src/Storages/transformQueryForExternalDatabase.cpp @@ -145,7 +145,7 @@ bool isCompatible(ASTPtr & node) return false; if (!function->arguments) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: function->arguments is not set"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "function->arguments is not set"); String name = function->name; diff --git a/src/TableFunctions/ITableFunctionCluster.h b/src/TableFunctions/ITableFunctionCluster.h index 7e81d6d21b7..9f56d781bc9 100644 --- a/src/TableFunctions/ITableFunctionCluster.h +++ b/src/TableFunctions/ITableFunctionCluster.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -29,14 +28,14 @@ public: String getName() const override = 0; String getSignature() const override = 0; - static void addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context) + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context) { if (args.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected empty list of arguments for {}Cluster table function", Base::name); ASTPtr cluster_name_arg = args.front(); args.erase(args.begin()); - Base::addColumnsStructureToArguments(args, desired_structure, context); + Base::updateStructureAndFormatArgumentsIfNeeded(args, structure_, format_, context); args.insert(args.begin(), cluster_name_arg); } diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index b88af855309..b697f3df925 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ 
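With the enterImpl() change earlier in this hunk, a JOIN is rewritten to GLOBAL only when parallel_replicas_prefer_local_join is disabled or some right-hand storage is not MergeTree; a hypothetical query exercising the local-join path (the table names and the parallel-replicas setting are assumptions, not taken from this diff):
-- t1 and t2 are assumed to be MergeTree tables; with the setting on, the JOIN stays local.
SELECT count()
FROM t1
INNER JOIN t2 USING (id)
SETTINGS allow_experimental_parallel_reading_from_replicas = 1,
         parallel_replicas_prefer_local_join = 1;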
b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -27,14 +27,14 @@ void ITableFunctionFileLike::parseFirstArguments(const ASTPtr & arg, const Conte filename = checkAndGetLiteralArgument(arg, "source"); } -String ITableFunctionFileLike::getFormatFromFirstArgument() +std::optional ITableFunctionFileLike::tryGetFormatFromFirstArgument() { - return FormatFactory::instance().getFormatFromFileName(filename, true); + return FormatFactory::instance().tryGetFormatFromFileName(filename); } bool ITableFunctionFileLike::supportsReadingSubsetOfColumns(const ContextPtr & context) { - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format, context); + return format != "auto" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format, context); } void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context) @@ -63,7 +63,10 @@ void ITableFunctionFileLike::parseArgumentsImpl(ASTs & args, const ContextPtr & format = checkAndGetLiteralArgument(args[1], "format"); if (format == "auto") - format = getFormatFromFirstArgument(); + { + if (auto format_from_first_argument = tryGetFormatFromFirstArgument()) + format = *format_from_first_argument; + } if (args.size() > 2) { @@ -79,34 +82,37 @@ void ITableFunctionFileLike::parseArgumentsImpl(ASTs & args, const ContextPtr & compression_method = checkAndGetLiteralArgument(args[3], "compression_method"); } -void ITableFunctionFileLike::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr &) +void ITableFunctionFileLike::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { if (args.empty() || args.size() > getMaxNumberOfArguments()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to {} arguments in table function, got {}", getMaxNumberOfArguments(), args.size()); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + /// f(filename) if (args.size() == 1) { - /// Add format=auto before structure argument. 
- args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } /// f(filename, format) else if (args.size() == 2) { + if (checkAndGetLiteralArgument(args[1], "format") == "auto") + args.back() = format_literal; args.push_back(structure_literal); } - /// f(filename, format, 'auto') - else if (args.size() == 3) + /// f(filename, format, structure) or f(filename, format, structure, compression) + else if (args.size() >= 3) { - args.back() = structure_literal; - } - /// f(filename, format, 'auto', compression) - else if (args.size() == 4) - { - args[args.size() - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[1], "format") == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } } diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index 5fe86587797..c8412905e44 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -31,7 +31,7 @@ public: static size_t getMaxNumberOfArguments() { return 4; } - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr &); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr &); protected: @@ -39,10 +39,9 @@ protected: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); virtual void parseFirstArguments(const ASTPtr & arg, const ContextPtr & context); - virtual String getFormatFromFirstArgument(); + virtual std::optional tryGetFormatFromFirstArgument(); String filename; - String path_to_archive; String format = "auto"; String structure = "auto"; String compression_method = "auto"; diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index d394c836369..066d6338b6a 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -58,7 +58,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); } else { @@ -155,7 +155,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); } } @@ -174,15 +174,24 @@ void TableFunctionAzureBlobStorage::parseArguments(const ASTPtr & ast_function, parseArgumentsImpl(args, context); } -void TableFunctionAzureBlobStorage::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) +void TableFunctionAzureBlobStorage::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In case of named 
collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. + if (collection->getOrDefault("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared("format"), std::make_shared(format)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -191,65 +200,126 @@ void TableFunctionAzureBlobStorage::addColumnsStructureToArguments(ASTs & args, "Storage Azure requires 3 to 7 arguments: " "AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + auto is_format_arg = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; - + /// (connection_string, container_name, blobpath) if (args.size() == 3) { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); + /// Add compression = "auto" before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + /// (connection_string, container_name, blobpath, structure) or + /// (connection_string, container_name, blobpath, format) + /// We can distinguish them by looking at the 4-th argument: check if it's format name or not. else if (args.size() == 4) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name/structure"); + /// (..., format) -> (..., format, compression, structure) if (is_format_arg(fourth_arg)) { + if (fourth_arg == "auto") + args[3] = format_literal; /// Add compression=auto before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + /// (..., structure) -> (..., format, compression, structure) else { - args.back() = structure_literal; + auto structure_arg = args.back(); + args[3] = format_literal; + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + if (fourth_arg == "auto") + args.push_back(structure_literal); + else + args.push_back(structure_arg); } } + /// (connection_string, container_name, blobpath, format, compression) or + /// (storage_account_url, container_name, blobpath, account_name, account_key) + /// We can distinguish them by looking at the 4-th argument: check if it's format name or not. 
else if (args.size() == 5) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) + /// (..., format, compression) -> (..., format, compression, structure) + if (is_format_arg(fourth_arg)) { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(std::make_shared("auto")); + if (fourth_arg == "auto") + args[3] = format_literal; + args.push_back(structure_literal); } - args.push_back(structure_literal); - } - else if (args.size() == 6) - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) + /// (..., account_name, account_key) -> (..., account_name, account_key, format, compression, structure) + else { + args.push_back(format_literal); /// Add compression=auto before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + } + /// (connection_string, container_name, blobpath, format, compression, structure) or + /// (storage_account_url, container_name, blobpath, account_name, account_key, structure) or + /// (storage_account_url, container_name, blobpath, account_name, account_key, format) + else if (args.size() == 6) + { + auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); + auto sixth_arg = checkAndGetLiteralArgument(args[5], "format/structure"); + + /// (..., format, compression, structure) + if (is_format_arg(fourth_arg)) + { + if (fourth_arg == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[5], "structure") == "auto") + args[5] = structure_literal; + } + /// (..., account_name, account_key, format) -> (..., account_name, account_key, format, compression, structure) + else if (is_format_arg(sixth_arg)) + { + if (sixth_arg == "auto") + args[5] = format_literal; + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + /// (..., account_name, account_key, structure) -> (..., account_name, account_key, format, compression, structure) else { - args.back() = structure_literal; + auto structure_arg = args.back(); + args[5] = format_literal; + /// Add compression=auto before structure argument. 
+ args.push_back(std::make_shared("auto")); + if (sixth_arg == "auto") + args.push_back(structure_literal); + else + args.push_back(structure_arg); } } + /// (storage_account_url, container_name, blobpath, account_name, account_key, format, compression) else if (args.size() == 7) { + /// (..., format, compression) -> (..., format, compression, structure) + if (checkAndGetLiteralArgument(args[5], "format") == "auto") + args[5] = format_literal; args.push_back(structure_literal); } + /// (storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure) else if (args.size() == 8) { - args.back() = structure_literal; + if (checkAndGetLiteralArgument(args[5], "format") == "auto") + args[5] = format_literal; + if (checkAndGetLiteralArgument(args[7], "structure") == "auto") + args[7] = structure_literal; } } } @@ -262,8 +332,10 @@ ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(Contex auto client = StorageAzureBlob::createClient(configuration, !is_insert_query); auto settings = StorageAzureBlob::createSettings(context); - auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings)); - return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context, false); + auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings), configuration.container); + if (configuration.format == "auto") + return StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, std::nullopt, context).first; + return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context); } return parseColumnsListFromString(configuration.structure, context); @@ -293,7 +365,7 @@ StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_funct StoragePtr storage = std::make_shared( configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), context, StorageID(getDatabaseName(), table_name), columns, diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.h b/src/TableFunctions/TableFunctionAzureBlobStorage.h index 1a221f60c55..9622881b417 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.h +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.h @@ -55,7 +55,7 @@ public: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context); protected: diff --git a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp index eee585967c2..04dddca7672 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp @@ -21,9 +21,8 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( { StoragePtr storage; ColumnsDescription columns; - bool structure_argument_was_provided = configuration.structure != "auto"; - if (structure_argument_was_provided) + if (configuration.structure != "auto") { columns = parseColumnsListFromString(configuration.structure, context); } @@ -40,7 +39,7 @@ StoragePtr 
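The argument layouts enumerated above correspond to azureBlobStorage() calls such as the following (endpoint, container and credentials are placeholders; the signature comes from the exception text earlier in this function):
-- Three arguments: format, compression and structure are appended as 'auto' and detected later.
SELECT * FROM azureBlobStorage('<connection_string>', 'my-container', 'data.csv');
-- Full eight-argument form with explicit format, compression and structure.
SELECT *
FROM azureBlobStorage('https://myaccount.blob.core.windows.net', 'my-container', 'data.csv',
                      'myaccount', '<account_key>', 'CSV', 'none', 'x UInt32, s String');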
TableFunctionAzureBlobStorageCluster::executeImpl( /// On worker node this filename won't contains globs storage = std::make_shared( configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), context, StorageID(getDatabaseName(), table_name), columns, @@ -55,12 +54,11 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( storage = std::make_shared( cluster_name, configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, - context, - structure_argument_was_provided); + context); } storage->startup(); diff --git a/src/TableFunctions/TableFunctionExplain.cpp b/src/TableFunctions/TableFunctionExplain.cpp index f993a9820cb..400fc81e6d4 100644 --- a/src/TableFunctions/TableFunctionExplain.cpp +++ b/src/TableFunctions/TableFunctionExplain.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -21,6 +22,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; + extern const int UNEXPECTED_AST_STRUCTURE; } namespace @@ -103,11 +105,25 @@ void TableFunctionExplain::parseArguments(const ASTPtr & ast_function, ContextPt if (function->arguments->children.size() > 2) { - const auto & query_arg = function->arguments->children[2]; + const auto & subquery_arg = function->arguments->children[2]; + const auto * subquery = subquery_arg->as(); + + if (!subquery) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Table function '{}' requires a subquery argument, got '{}'", + getName(), queryToString(subquery_arg)); + + if (subquery->children.empty()) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "A subquery AST element must have a child"); + + const auto & query_arg = subquery->children[0]; + if (!query_arg->as()) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Table function '{}' requires a EXPLAIN SELECT query argument, got EXPLAIN '{}'", + "Table function '{}' requires a EXPLAIN's SELECT query argument, got '{}'", getName(), queryToString(query_arg)); + explain_query->setExplainedQuery(query_arg); } else if (kind != ASTExplainQuery::ExplainKind::CurrentTransaction) diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index 8a9dde374ec..b481076e9b6 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -54,12 +54,12 @@ void TableFunctionFile::parseFirstArguments(const ASTPtr & arg, const ContextPtr throw Exception(ErrorCodes::BAD_ARGUMENTS, "The first argument of table function '{}' mush be path or file descriptor", getName()); } -String TableFunctionFile::getFormatFromFirstArgument() +std::optional TableFunctionFile::tryGetFormatFromFirstArgument() { if (fd >= 0) - return FormatFactory::instance().getFormatFromFileDescriptor(fd); + return FormatFactory::instance().tryGetFormatFromFileDescriptor(fd); else - return FormatFactory::instance().getFormatFromFileName(filename, true); + return FormatFactory::instance().tryGetFormatFromFileName(filename); } StoragePtr TableFunctionFile::getStorage(const String & source, @@ -104,10 +104,11 @@ ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context archive_info = StorageFile::getArchiveInfo(path_to_archive, filename, context->getUserFilesPath(), context, 
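The switch to tryGetFormatFromFileName / tryGetFormatFromFileDescriptor above means the format argument of file() can be omitted and detected from the file name; illustrative calls (paths are placeholders relative to user_files):
-- Format detected from the .parquet extension, structure inferred from the data.
SELECT * FROM file('data.parquet');
-- Explicit format and structure when the name carries no usable extension.
SELECT * FROM file('exported_data', 'CSV', 'x UInt32, s String');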
total_bytes_to_read); + if (format == "auto") + return StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context, archive_info).first; return StorageFile::getTableStructureFromFile(format, paths, compression_method, std::nullopt, context, archive_info); } - return parseColumnsListFromString(structure, context); } diff --git a/src/TableFunctions/TableFunctionFile.h b/src/TableFunctions/TableFunctionFile.h index 6eaab29db8a..c1924028b49 100644 --- a/src/TableFunctions/TableFunctionFile.h +++ b/src/TableFunctions/TableFunctionFile.h @@ -26,8 +26,9 @@ public: protected: int fd = -1; + String path_to_archive; void parseFirstArguments(const ASTPtr & arg, const ContextPtr & context) override; - String getFormatFromFirstArgument() override; + std::optional tryGetFormatFromFirstArgument() override; private: StoragePtr getStorage( diff --git a/src/TableFunctions/TableFunctionFileCluster.cpp b/src/TableFunctions/TableFunctionFileCluster.cpp index 843909e2a58..3e53349b022 100644 --- a/src/TableFunctions/TableFunctionFileCluster.cpp +++ b/src/TableFunctions/TableFunctionFileCluster.cpp @@ -43,8 +43,7 @@ StoragePtr TableFunctionFileCluster::getStorage( compression_method, StorageID(getDatabaseName(), table_name), columns, - ConstraintsDescription{}, - structure != "auto"); + ConstraintsDescription{}); } return storage; diff --git a/src/TableFunctions/TableFunctionFormat.cpp b/src/TableFunctions/TableFunctionFormat.cpp index 4b6d0f70c0a..ad2a142a140 100644 --- a/src/TableFunctions/TableFunctionFormat.cpp +++ b/src/TableFunctions/TableFunctionFormat.cpp @@ -33,7 +33,9 @@ namespace ErrorCodes namespace { -/* format(format_name, data) - ... +/* format(format_name, structure, data) - parses data according to the specified format and structure. + * format(format_name, data) - infers the schema from the data and parses it according to the specified format. + * format(data) - detects the format, infers the schema and parses data according to inferred format and structure. 
*/ class TableFunctionFormat : public ITableFunction { @@ -49,11 +51,11 @@ private: ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - Block parseData(ColumnsDescription columns, ContextPtr context) const; + Block parseData(const ColumnsDescription & columns, const String & format_name, const ContextPtr & context) const; - String format; - String data; + String format = "auto"; String structure = "auto"; + String data; }; void TableFunctionFormat::parseArguments(const ASTPtr & ast_function, ContextPtr context) @@ -65,14 +67,15 @@ void TableFunctionFormat::parseArguments(const ASTPtr & ast_function, ContextPtr ASTs & args = args_func.at(0)->children; - if (args.size() != 2 && args.size() != 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 2 or 3 arguments: format, [structure], data", getName()); + if (args.empty() || args.size() > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires from 1 to 3 arguments: [format, [structure]], data", getName()); for (auto & arg : args) arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); - format = checkAndGetLiteralArgument(args[0], "format"); data = checkAndGetLiteralArgument(args.back(), "data"); + if (args.size() > 1) + format = checkAndGetLiteralArgument(args[0], "format"); if (args.size() == 3) structure = checkAndGetLiteralArgument(args[1], "structure"); } @@ -82,19 +85,21 @@ ColumnsDescription TableFunctionFormat::getActualTableStructure(ContextPtr conte if (structure == "auto") { SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, false, context); + if (format == "auto") + return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context).first; + return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); } return parseColumnsListFromString(structure, context); } -Block TableFunctionFormat::parseData(ColumnsDescription columns, ContextPtr context) const +Block TableFunctionFormat::parseData(const ColumnsDescription & columns, const String & format_name, const ContextPtr & context) const { Block block; for (const auto & name_and_type : columns.getAllPhysical()) block.insert({name_and_type.type->createColumn(), name_and_type.type, name_and_type.name}); auto read_buf = std::make_unique(data); - auto input_format = context->getInputFormat(format, *read_buf, block, context->getSettingsRef().max_block_size); + auto input_format = context->getInputFormat(format_name, *read_buf, block, context->getSettingsRef().max_block_size); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); if (columns.hasDefaults()) @@ -120,10 +125,24 @@ Block TableFunctionFormat::parseData(ColumnsDescription columns, ContextPtr cont return concatenateBlocks(blocks); } -StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const +StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { - auto columns = getActualTableStructure(context, is_insert_query); - Block res_block = parseData(columns, context); + ColumnsDescription columns; + 
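The three call forms described in the comment above map to queries like these (data passed inline as a string literal):
-- format(format_name, structure, data)
SELECT * FROM format(JSONEachRow, 'x UInt32, s String', '{"x":1,"s":"a"}');
-- format(format_name, data): the structure is inferred from the data.
SELECT * FROM format(JSONEachRow, '{"x":1,"s":"a"}');
-- format(data): both format and structure are detected/inferred.
SELECT * FROM format('{"x":1,"s":"a"}');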
String format_name = format; + if (structure == "auto") + { + SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); + if (format_name == "auto") + std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context); + else + columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); + } + else + { + columns = parseColumnsListFromString(structure, context); + } + + Block res_block = parseData(columns, format_name, context); auto res = std::make_shared(StorageID(getDatabaseName(), table_name), columns, res_block); res->startup(); return res; diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp index 8d48a7ba30e..2dac4398144 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -33,6 +33,8 @@ ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context if (structure == "auto") { context->checkAccess(getSourceAccessType()); + if (format == "auto") + return StorageHDFS::getTableStructureAndFormatFromData(filename, compression_method, context).first; return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); } diff --git a/src/TableFunctions/TableFunctionHDFSCluster.cpp b/src/TableFunctions/TableFunctionHDFSCluster.cpp index 6fb7ed0fce5..57ce6d2b9ff 100644 --- a/src/TableFunctions/TableFunctionHDFSCluster.cpp +++ b/src/TableFunctions/TableFunctionHDFSCluster.cpp @@ -45,8 +45,7 @@ StoragePtr TableFunctionHDFSCluster::getStorage( format, columns, ConstraintsDescription{}, - compression_method, - structure != "auto"); + compression_method); } return storage; } diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index a9c5a5c99f0..3fedd38277c 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -61,12 +61,11 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context if (configuration.format == "auto") { String file_path = named_collection->getOrDefault("filename", Poco::URI(named_collection->get("url")).getPath()); - configuration.format = FormatFactory::instance().getFormatFromFileName(file_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(file_path).value_or("auto"); } } else { - size_t count = StorageURL::evalArgsAndCollectHeaders(args, configuration.headers_from_ast, context); if (count == 0 || count > 7) @@ -216,7 +215,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context configuration.auth_settings.no_sign_request = no_sign_request; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(Poco::URI(url).getPath(), true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(url).getPath()).value_or("auto"); } configuration.keys = {configuration.url.key}; @@ -238,15 +237,24 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con parseArgumentsImpl(args, context); } -void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) +void TableFunctionS3::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In 
case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. + if (collection->getOrDefault("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared("format"), std::make_shared(format)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -256,23 +264,25 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & if (count == 0 || count > getMaxNumberOfArguments()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to {} arguments in table function, got {}", getMaxNumberOfArguments(), count); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); - /// s3(s3_url) + /// s3(s3_url) -> s3(s3_url, format, structure) if (count == 1) { - /// Add format=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } - /// s3(s3_url, format) or s3(s3_url, NOSIGN) + /// s3(s3_url, format) -> s3(s3_url, format, structure) or + /// s3(s3_url, NOSIGN) -> s3(s3_url, NOSIGN, format, structure) /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not. else if (count == 2) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); - /// If there is NOSIGN, add format=auto before structure. if (boost::iequals(second_arg, "NOSIGN")) - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); + else if (second_arg == "auto") + args.back() = format_literal; args.push_back(structure_literal); } /// s3(source, format, structure) or @@ -282,18 +292,25 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 3) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format) -> s3(source, NOSIGN, format, structure) if (boost::iequals(second_arg, "NOSIGN")) { + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args.back() = format_literal; args.push_back(structure_literal); } + /// s3(source, format, structure) else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) { - args[count - 1] = structure_literal; + if (second_arg == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } + /// s3(source, access_key_id, access_key_id) -> s3(source, access_key_id, access_key_id, format, structure) else { - /// Add format=auto before structure argument. 
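For reference, the positional layouts that this case analysis has to cover are easier to see side by side. The sketch below is plain illustrative Python data, not part of the patch; it lists the accepted s3() layouts per argument count, and the substitution rule from the comments applies uniformly: a detected format or structure only replaces a literal that is currently 'auto', otherwise the missing literals are appended at the end.

```python
# Illustrative summary of the s3() signatures handled above, keyed by argument
# count. For each layout, format/structure literals are overwritten only when
# they are 'auto'; if they are absent, they are appended in this order.
S3_SIGNATURES = {
    1: [["url"]],
    2: [["url", "format"],
        ["url", "NOSIGN"]],
    3: [["url", "format", "structure"],
        ["url", "NOSIGN", "format"],
        ["url", "access_key_id", "secret_access_key"]],
    4: [["url", "format", "structure", "compression_method"],
        ["url", "NOSIGN", "format", "structure"],
        ["url", "access_key_id", "secret_access_key", "format"]],
    5: [["url", "NOSIGN", "format", "structure", "compression_method"],
        ["url", "access_key_id", "secret_access_key", "format", "structure"]],
    6: [["url", "access_key_id", "secret_access_key", "format",
         "structure", "compression_method"]],
}
```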
- args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } } @@ -304,16 +321,27 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 4) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format, structure) if (boost::iequals(second_arg, "NOSIGN")) { - args[count - 1] = structure_literal; + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args[2] = format_literal; + if (checkAndGetLiteralArgument(args[3], "structure") == "auto") + args[3] = structure_literal; } + /// s3(source, format, structure, compression_method) else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) { - args[count - 2] = structure_literal; + if (second_arg == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } + /// s3(source, access_key_id, access_key_id, format) -> s3(source, access_key_id, access_key_id, format, structure) else { + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; args.push_back(structure_literal); } } @@ -323,19 +351,30 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 5) { auto sedond_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format, structure, compression_method) if (boost::iequals(sedond_arg, "NOSIGN")) { - args[count - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args[2] = format_literal; + if (checkAndGetLiteralArgument(args[3], "structure") == "auto") + args[3] = structure_literal; } + /// s3(source, access_key_id, access_key_id, format, structure) else { - args[count - 1] = structure_literal; + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[4], "structure") == "auto") + args[4] = structure_literal; } } /// s3(source, access_key_id, secret_access_key, format, structure, compression) else if (count == 6) { - args[count - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[4], "structure") == "auto") + args[4] = structure_literal; } } } @@ -346,6 +385,9 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context, { context->checkAccess(getSourceAccessType()); configuration.update(context); + if (configuration.format == "auto") + return StorageS3::getTableStructureAndFormatFromData(configuration, std::nullopt, context).first; + return StorageS3::getTableStructureFromData(configuration, std::nullopt, context); } diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index fa73c1d313e..00ca36c6653 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -57,7 +57,7 @@ public: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context); protected: diff --git a/src/TableFunctions/TableFunctionS3Cluster.cpp b/src/TableFunctions/TableFunctionS3Cluster.cpp index ce96f7f580b..e727c4e4c89 100644 --- 
a/src/TableFunctions/TableFunctionS3Cluster.cpp +++ b/src/TableFunctions/TableFunctionS3Cluster.cpp @@ -21,9 +21,8 @@ StoragePtr TableFunctionS3Cluster::executeImpl( { StoragePtr storage; ColumnsDescription columns; - bool structure_argument_was_provided = configuration.structure != "auto"; - if (structure_argument_was_provided) + if (configuration.structure != "auto") { columns = parseColumnsListFromString(configuration.structure, context); } @@ -53,8 +52,7 @@ StoragePtr TableFunctionS3Cluster::executeImpl( StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, - context, - structure_argument_was_provided); + context); } storage->startup(); diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index aa535991d65..a78b2affa9a 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -55,7 +55,7 @@ void TableFunctionURL::parseArgumentsImpl(ASTs & args, const ContextPtr & contex format = configuration.format; if (format == "auto") - format = FormatFactory::instance().getFormatFromFileName(Poco::URI(filename).getPath(), true); + format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(filename).getPath()).value_or("auto"); StorageURL::evalArgsAndCollectHeaders(args, configuration.headers, context); } @@ -78,15 +78,24 @@ void TableFunctionURL::parseArgumentsImpl(ASTs & args, const ContextPtr & contex } } -void TableFunctionURL::addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context) +void TableFunctionURL::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(desired_structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. 
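The named-collection branch in TableFunctionS3 above, and the analogous one in TableFunctionURL just below, follow a single rule: the resolved format and structure are appended as `format=...`/`structure=...` key-value arguments only when the collection still has them set to `auto`, so values the user set explicitly are never overwritten. A minimal Python sketch of that rule, with illustrative names (here `collection` is assumed to behave like a plain dict):

```python
def overrides_for_named_collection(collection: dict, fmt: str, structure: str) -> dict:
    """Return the key-value overrides to append, mirroring the C++ branches above.

    Only values that are still 'auto' in the collection are overridden with the
    values detected by the initiator; anything set explicitly is kept as-is.
    """
    overrides = {}
    if collection.get("format", "auto") == "auto":
        overrides["format"] = fmt
    if collection.get("structure", "auto") == "auto":
        overrides["structure"] = structure
    return overrides


# Example: the user pinned the format but left the structure to be detected.
print(overrides_for_named_collection({"format": "CSV"}, "TSV", "x UInt64"))
# {'structure': 'x UInt64'}
```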
+ if (collection->getOrDefault("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared("format"), std::make_shared(format_)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared("structure"), std::make_shared(structure_)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -101,7 +110,7 @@ void TableFunctionURL::addColumnsStructureToArguments(ASTs & args, const String args.pop_back(); } - ITableFunctionFileLike::addColumnsStructureToArguments(args, desired_structure, context); + ITableFunctionFileLike::updateStructureAndFormatArgumentsIfNeeded(args, structure_, format_, context); if (headers_ast) args.push_back(headers_ast); @@ -131,6 +140,14 @@ ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context, if (structure == "auto") { context->checkAccess(getSourceAccessType()); + if (format == "auto") + return StorageURL::getTableStructureAndFormatFromData( + filename, + chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method), + configuration.headers, + std::nullopt, + context).first; + return StorageURL::getTableStructureFromData(format, filename, chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method), @@ -148,9 +165,9 @@ std::unordered_set TableFunctionURL::getVirtualsToCheckBeforeUsingStruct return {virtual_column_names.begin(), virtual_column_names.end()}; } -String TableFunctionURL::getFormatFromFirstArgument() +std::optional TableFunctionURL::tryGetFormatFromFirstArgument() { - return FormatFactory::instance().getFormatFromFileName(Poco::URI(filename).getPath(), true); + return FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(filename).getPath()); } void registerTableFunctionURL(TableFunctionFactory & factory) diff --git a/src/TableFunctions/TableFunctionURL.h b/src/TableFunctions/TableFunctionURL.h index bf417f950c0..54e223283ba 100644 --- a/src/TableFunctions/TableFunctionURL.h +++ b/src/TableFunctions/TableFunctionURL.h @@ -34,7 +34,7 @@ public: ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; - static void addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context); std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override; @@ -53,8 +53,7 @@ private: const char * getStorageTypeName() const override { return "URL"; } - String getFormatFromFirstArgument() override; - + std::optional tryGetFormatFromFirstArgument() override; }; } diff --git a/src/TableFunctions/TableFunctionURLCluster.cpp b/src/TableFunctions/TableFunctionURLCluster.cpp index a2949278155..5fd3c3342a5 100644 --- a/src/TableFunctions/TableFunctionURLCluster.cpp +++ b/src/TableFunctions/TableFunctionURLCluster.cpp @@ -40,8 +40,7 @@ StoragePtr TableFunctionURLCluster::getStorage( StorageID(getDatabaseName(), table_name), getActualTableStructure(context, /* is_insert_query */ true), ConstraintsDescription{}, - configuration, - structure != "auto"); + configuration); } return storage; } diff --git a/src/configure_config.cmake b/src/configure_config.cmake index 
7de2d5a9fdd..141e51badbb 100644 --- a/src/configure_config.cmake +++ b/src/configure_config.cmake @@ -164,6 +164,9 @@ endif () if (ENABLE_OPENSSL) set(USE_OPENSSL_INTREE 1) endif () +if (TARGET ch_contrib::ssh) + set(USE_SSH 1) +endif() if (TARGET ch_contrib::fiu) set(FIU_ENABLE 1) endif() diff --git a/tests/analyzer_integration_broken_tests.txt b/tests/analyzer_integration_broken_tests.txt index 23f22209451..6cf5d3b6008 100644 --- a/tests/analyzer_integration_broken_tests.txt +++ b/tests/analyzer_integration_broken_tests.txt @@ -1,26 +1,9 @@ -test_access_for_functions/test.py::test_access_rights_for_function test_build_sets_from_multiple_threads/test.py::test_set test_concurrent_backups_s3/test.py::test_concurrent_backups -test_dictionaries_update_and_reload/test.py::test_reload_after_fail_in_cache_dictionary -test_distributed_backward_compatability/test.py::test_distributed_in_tuple test_distributed_type_object/test.py::test_distributed_type_object test_executable_table_function/test.py::test_executable_function_input_python test_mask_sensitive_info/test.py::test_encryption_functions test_merge_table_over_distributed/test.py::test_global_in test_merge_table_over_distributed/test.py::test_select_table_name_from_merge_over_distributed -test_mutations_with_merge_tree/test.py::test_mutations_with_merge_background_task -test_mysql_database_engine/test.py::test_mysql_ddl_for_mysql_database test_passing_max_partitions_to_read_remotely/test.py::test_default_database_on_cluster -test_profile_events_s3/test.py::test_profile_events -test_replicating_constants/test.py::test_different_versions test_select_access_rights/test_main.py::test_alias_columns -test_select_access_rights/test_main.py::test_select_count -test_select_access_rights/test_main.py::test_select_join -test_settings_profile/test.py::test_show_profiles -test_shard_level_const_function/test.py::test_remote -test_sql_user_defined_functions_on_cluster/test.py::test_sql_user_defined_functions_on_cluster -test_storage_rabbitmq/test.py::test_rabbitmq_materialized_view -test_user_defined_object_persistence/test.py::test_persistence -test_wrong_db_or_table_name/test.py::test_wrong_table_name -test_zookeeper_config/test.py::test_chroot_with_same_root -test_zookeeper_config/test.py::test_chroot_with_different_root diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index a84f912f371..0566dca8f5c 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -4,30 +4,19 @@ 01062_pm_all_join_with_block_continuation 01083_expressions_in_engine_arguments 01155_rename_move_materialized_view -01214_test_storage_merge_aliases_with_where 01244_optimize_distributed_group_by_sharding_key -01268_shard_avgweighted -01495_subqueries_in_with_statement -01560_merge_distributed_join 01584_distributed_buffer_cannot_find_column 01624_soft_constraints 01656_test_query_log_factories_info -01739_index_hint -02880_indexHint__partition_id 01747_join_view_filter_dictionary 01761_cast_to_enum_nullable 01925_join_materialized_columns -01925_test_storage_merge_aliases 01952_optimize_distributed_group_by_sharding_key 02174_cte_scalar_cache_mv 02354_annoy -02428_parameterized_view 02493_inconsistent_hex_and_binary_number -02575_merge_prewhere_different_default_kind -00917_multiple_joins_denny_crane 02725_agg_projection_resprect_PK 02763_row_policy_storage_merge_alias -02818_parameterized_view_with_cte_multiple_usage # Check after constants refactoring 02901_parallel_replicas_rollup # Flaky. 
Please don't delete them without fixing them: diff --git a/tests/ci/artifacts_helper.py b/tests/ci/artifacts_helper.py index a9f3385585b..5feca927a96 100644 --- a/tests/ci/artifacts_helper.py +++ b/tests/ci/artifacts_helper.py @@ -10,14 +10,17 @@ from pathlib import Path from shutil import copy2 from typing import List, Optional, Union +# isort: off from github.Commit import Commit +# isort: on + from build_download_helper import download_build_with_progress from commit_status_helper import post_commit_status from compress_files import SUFFIX, compress_fast, decompress_fast from env_helper import CI, RUNNER_TEMP, S3_BUILDS_BUCKET from git_helper import SHA_REGEXP -from report import HEAD_HTML_TEMPLATE, FOOTER_HTML_TEMPLATE +from report import FOOTER_HTML_TEMPLATE, HEAD_HTML_TEMPLATE, SUCCESS from s3_helper import S3Helper ARTIFACTS_PATH = Path(RUNNER_TEMP) / "artifacts" @@ -128,9 +131,7 @@ class ArtifactsHelper: @staticmethod def post_commit_status(commit: Commit, url: str) -> None: - post_commit_status( - commit, "success", url, "Artifacts for workflow", "Artifacts" - ) + post_commit_status(commit, SUCCESS, url, "Artifacts for workflow", "Artifacts") def _regenerate_index(self) -> None: if CI: diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 41e4ef19361..0a69d8aab49 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -7,13 +7,11 @@ import sys from pathlib import Path from build_download_helper import get_build_name_for_check, read_build_urls -from clickhouse_helper import ( - CiLogsCredentials, -) +from clickhouse_helper import CiLogsCredentials from docker_images_helper import DockerImage, get_docker_image, pull_image from env_helper import REPORT_PATH, TEMP_PATH from pr_info import PRInfo -from report import JobReport +from report import FAIL, FAILURE, OK, SUCCESS, JobReport, TestResult from stopwatch import Stopwatch from tee_popen import TeePopen @@ -113,7 +111,6 @@ def main(): paths = { "run.log": run_log_path, "main.log": main_log_path, - "fuzzer.log": workspace_path / "fuzzer.log", "report.html": workspace_path / "report.html", "core.zst": workspace_path / "core.zst", "dmesg.log": workspace_path / "dmesg.log", @@ -122,12 +119,20 @@ def main(): compressed_server_log_path = workspace_path / "server.log.zst" if compressed_server_log_path.exists(): paths["server.log.zst"] = compressed_server_log_path + else: + # The script can fail before the invocation of `zstd`, but we are still interested in its log: + not_compressed_server_log_path = workspace_path / "server.log" + if not_compressed_server_log_path.exists(): + paths["server.log"] = not_compressed_server_log_path - # The script can fail before the invocation of `zstd`, but we are still interested in its log: - - not_compressed_server_log_path = workspace_path / "server.log" - if not_compressed_server_log_path.exists(): - paths["server.log"] = not_compressed_server_log_path + # Same idea but with the fuzzer log + compressed_fuzzer_log_path = workspace_path / "fuzzer.log.zst" + if compressed_fuzzer_log_path.exists(): + paths["fuzzer.log.zst"] = compressed_fuzzer_log_path + else: + not_compressed_fuzzer_log_path = workspace_path / "fuzzer.log" + if not_compressed_fuzzer_log_path.exists(): + paths["fuzzer.log"] = not_compressed_fuzzer_log_path # Try to get status message saved by the fuzzer try: @@ -137,12 +142,16 @@ def main(): with open(workspace_path / "description.txt", "r", encoding="utf-8") as desc_f: description = desc_f.readline().rstrip("\n") except: - status = "failure" 
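The server-log and fuzzer-log handling above repeats the same pattern twice: prefer the `zstd`-compressed artifact, and fall back to the plain file if the run died before compression. A hypothetical helper (not part of this patch) that factors that pattern out could look like the following sketch:

```python
from pathlib import Path
from typing import Dict


def attach_log(paths: Dict[str, Path], workspace: Path, name: str) -> None:
    """Prefer '<name>.zst' if compression finished, otherwise fall back to '<name>'."""
    compressed = workspace / f"{name}.zst"
    plain = workspace / name
    if compressed.exists():
        paths[f"{name}.zst"] = compressed
    elif plain.exists():
        # The run may have failed before `zstd` was invoked; keep the raw log.
        paths[name] = plain


# Usage mirroring the patch: both logs go through the same fallback.
# attach_log(paths, workspace_path, "server.log")
# attach_log(paths, workspace_path, "fuzzer.log")
```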
+ status = FAILURE description = "Task failed: $?=" + str(retcode) + test_result = TestResult(description, OK) + if "fail" in status: + test_result.status = FAIL + JobReport( description=description, - test_results=[], + test_results=[test_result], status=status, start_time=stopwatch.start_time_str, duration=stopwatch.duration_seconds, @@ -151,7 +160,7 @@ def main(): ).dump() logging.info("Result: '%s', '%s'", status, description) - if status == "failure": + if status != SUCCESS: sys.exit(1) diff --git a/tests/ci/autoscale_runners_lambda/app.py b/tests/ci/autoscale_runners_lambda/app.py index 120126b404a..26a05ab0af4 100644 --- a/tests/ci/autoscale_runners_lambda/app.py +++ b/tests/ci/autoscale_runners_lambda/app.py @@ -8,11 +8,10 @@ from pprint import pformat from typing import Any, List, Literal, Optional, Tuple import boto3 # type: ignore - from lambda_shared import ( + RUNNER_TYPE_LABELS, CHException, ClickHouseHelper, - RUNNER_TYPE_LABELS, get_parameter_from_ssm, ) @@ -115,6 +114,8 @@ def set_capacity( # Are we already at the capacity limits stop = stop or asg["MaxSize"] <= asg["DesiredCapacity"] # Let's calculate a new desired capacity + # (capacity_deficit + scale_up - 1) // scale_up : will increase min by 1 + # if there is any capacity_deficit desired_capacity = ( asg["DesiredCapacity"] + (capacity_deficit + scale_up - 1) // scale_up ) diff --git a/tests/ci/autoscale_runners_lambda/test_autoscale.py b/tests/ci/autoscale_runners_lambda/test_autoscale.py index 464e5695556..21a407276f9 100644 --- a/tests/ci/autoscale_runners_lambda/test_autoscale.py +++ b/tests/ci/autoscale_runners_lambda/test_autoscale.py @@ -4,7 +4,7 @@ import unittest from dataclasses import dataclass from typing import Any, List -from app import set_capacity, Queue +from app import Queue, set_capacity @dataclass @@ -68,10 +68,16 @@ class TestSetCapacity(unittest.TestCase): test_cases = ( # Do not change capacity TestCase("noqueue", 1, 13, 20, [Queue("in_progress", 155, "noqueue")], -1), - TestCase( - "w/reserve-1", 1, 13, 20, [Queue("queued", 15, "w/reserve-1")], 14 - ), + TestCase("reserve", 1, 13, 20, [Queue("queued", 13, "reserve")], -1), # Increase capacity + TestCase( + "increase-always", + 1, + 13, + 20, + [Queue("queued", 14, "increase-always")], + 14, + ), TestCase("increase-1", 1, 13, 20, [Queue("queued", 23, "increase-1")], 17), TestCase( "style-checker", 1, 13, 20, [Queue("queued", 33, "style-checker")], 20 diff --git a/tests/ci/bugfix_validate_check.py b/tests/ci/bugfix_validate_check.py index 107c02a0f56..ae7fce1f102 100644 --- a/tests/ci/bugfix_validate_check.py +++ b/tests/ci/bugfix_validate_check.py @@ -1,25 +1,28 @@ #!/usr/bin/env python3 from pathlib import Path -from typing import List, Tuple, Optional -import argparse +import subprocess +import sys +from typing import List, Sequence, Tuple import csv import logging -from github import Github - -from commit_status_helper import get_commit, post_commit_status -from get_robot_token import get_best_robot_token -from pr_info import PRInfo -from report import TestResults, TestResult -from s3_helper import S3Helper -from upload_result_helper import upload_results - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument("files", nargs="+", type=Path, help="Path to status files") - return parser.parse_args() +from report import ( + ERROR, + FAILURE, + SKIPPED, + SUCCESS, + FAIL, + OK, + TestResult, + TestResults, + JobReport, +) +from env_helper import TEMP_PATH +from stopwatch import Stopwatch +from 
ci_config import JobNames +from ci_utils import normalize_string +from functional_test_check import NO_CHANGES_MSG def post_commit_status_from_file(file_path: Path) -> List[str]: @@ -32,93 +35,123 @@ def post_commit_status_from_file(file_path: Path) -> List[str]: return res[0] -# Returns (is_ok, test_results, error_message) -def process_result(file_path: Path) -> Tuple[bool, TestResults, Optional[str]]: - test_results = [] # type: TestResults - state, report_url, description = post_commit_status_from_file(file_path) - prefix = file_path.parent.name - if description.strip() in [ - "Invalid check_status.tsv", - "Not found test_results.tsv", - "Empty test_results.tsv", - ]: - status = ( - f'Check failed (Report)' - if report_url != "null" - else "Check failed" - ) - return False, [TestResult(f"{prefix}: {description}", status)], "Check failed" - - is_ok = state == "success" - if is_ok and report_url == "null": - return is_ok, test_results, None - - status = ( - f'OK: Bug reproduced (Report)' - if is_ok - else f'Bug is not reproduced (Report)' - ) - test_results.append(TestResult(f"{prefix}: {description}", status)) - return is_ok, test_results, None +def get_failed_test_cases(file_path: Path) -> List[TestResult]: + job_report = JobReport.load(from_file=file_path) + test_results = [] # type: List[TestResult] + for tr in job_report.test_results: + if tr.status == FAIL: + if tr.name == NO_CHANGES_MSG: + tr.status = SKIPPED + else: + tr.name = "[with NOT_OK] " + tr.name + tr.status = OK + elif tr.status == OK: + tr.name = "[with NOT_OK] " + tr.name + tr.status = FAIL + else: + # do not invert error status + pass + test_results.append(tr) + return test_results def process_all_results( - file_paths: List[Path], -) -> Tuple[bool, TestResults, Optional[str]]: - any_ok = False - all_results = [] - error = None - for status_path in file_paths: - is_ok, test_results, error = process_result(status_path) - any_ok = any_ok or is_ok - if test_results is not None: - all_results.extend(test_results) + file_paths: Sequence[Path], +) -> Tuple[str, str, TestResults]: + all_results = [] # type: TestResults + has_fail = False + has_error = False + has_ok = False + for job_report_path in file_paths: + test_results = get_failed_test_cases(job_report_path) + for tr in test_results: + if tr.status == FAIL: + has_fail = True + elif tr.status == ERROR: + has_error = True + elif tr.status == OK: + has_ok = True + all_results.extend(test_results) + if has_error: + status = ERROR + description = "Some error(s) occured in tests" + elif has_ok: + status = SUCCESS + description = "New test(s) reproduced a bug" + elif has_fail: + status = FAILURE + description = "New test(s) failed to reproduce a bug" + else: + status = ERROR + description = "Invalid job results" - return any_ok and error is None, all_results, error + return status, description, all_results def main(): logging.basicConfig(level=logging.INFO) - args = parse_args() - status_files = args.files # type: List[Path] + # args = parse_args() + stopwatch = Stopwatch() + jobs_to_validate = [JobNames.STATELESS_TEST_RELEASE, JobNames.INTEGRATION_TEST] + functional_job_report_file = Path(TEMP_PATH) / "functional_test_job_report.json" + integration_job_report_file = Path(TEMP_PATH) / "integration_test_job_report.json" + jobs_report_files = { + JobNames.STATELESS_TEST_RELEASE: functional_job_report_file, + JobNames.INTEGRATION_TEST: integration_job_report_file, + } + jobs_scripts = { + JobNames.STATELESS_TEST_RELEASE: "functional_test_check.py", + JobNames.INTEGRATION_TEST: 
"integration_test_check.py", + } - check_name_with_group = "Bugfix validate check" - - is_ok, test_results, error = process_all_results(status_files) - - description = "" - if error: - description = error - elif not is_ok: - description = "Changed tests don't reproduce the bug" - - pr_info = PRInfo() - if not test_results: - description = "No results to upload" - report_url = "" - logging.info("No results to upload") - else: - report_url = upload_results( - S3Helper(), - pr_info.number, - pr_info.sha, - test_results, - status_files, - check_name_with_group, + for test_job in jobs_to_validate: + report_file = jobs_report_files[test_job] + test_script = jobs_scripts[test_job] + if report_file.exists(): + report_file.unlink() + extra_timeout_option = "" + if test_job == JobNames.STATELESS_TEST_RELEASE: + extra_timeout_option = str(3600) + # "bugfix" must be present in checkname, as integration test runner checks this + check_name = f"Validate bugfix: {test_job}" + command = f"python3 {test_script} '{check_name}' {extra_timeout_option} --validate-bugfix --report-to-file {report_file}" + print(f"Going to validate job [{test_job}], command [{command}]") + _ = subprocess.run( + command, + stdout=sys.stdout, + stderr=sys.stderr, + text=True, + check=False, + shell=True, ) + assert ( + report_file.is_file() + ), f"No job report [{report_file}] found after job execution" - gh = Github(get_best_robot_token(), per_page=100) - commit = get_commit(gh, pr_info.sha) - post_commit_status( - commit, - "success" if is_ok else "error", - report_url, - description, - check_name_with_group, - pr_info, - dump_to_file=True, + status, description, test_results = process_all_results( + list(jobs_report_files.values()) ) + additional_files = [] + for job_id, report_file in jobs_report_files.items(): + jr = JobReport.load(from_file=report_file) + additional_files.append(report_file) + for file in set(jr.additional_files): + file_ = Path(file) + file_name = file_.name + file_name = file_name.replace(".", "__" + normalize_string(job_id) + ".", 1) + file_ = file_.rename(file_.parent / file_name) + additional_files.append(file_) + + JobReport( + description=description, + test_results=test_results, + status=status, + start_time=stopwatch.start_time_str, + duration=stopwatch.duration_seconds, + additional_files=additional_files, + ).dump() + if __name__ == "__main__": main() diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index cec8c4c7b65..f2a2ffc667b 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -64,6 +64,8 @@ def get_packager_cmd( cmd += " --debug-build" if build_config.sanitizer: cmd += f" --sanitizer={build_config.sanitizer}" + if build_config.coverage: + cmd += " --coverage" if build_config.tidy: cmd += " --clang-tidy" diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index 21012f6337d..0d24cb80021 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -112,10 +112,12 @@ def get_build_name_for_check(check_name: str) -> str: def read_build_urls(build_name: str, reports_path: Union[Path, str]) -> List[str]: for root, _, files in os.walk(reports_path): - for f in files: - if build_name in f: - logging.info("Found build report json %s", f) - with open(os.path.join(root, f), "r", encoding="utf-8") as file_handler: + for file in files: + if file.endswith(f"_{build_name}.json"): + logging.info("Found build report json %s", file) + with open( + os.path.join(root, file), "r", encoding="utf-8" + ) as file_handler: 
build_report = json.load(file_handler) return build_report["build_urls"] # type: ignore return [] diff --git a/tests/ci/ci.py b/tests/ci/ci.py index f52f28c3a16..819152fadc3 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -1,16 +1,30 @@ import argparse import concurrent.futures +from copy import deepcopy +from dataclasses import asdict, dataclass +from enum import Enum import json import logging import os import re import subprocess import sys +import time from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, List, Optional, Sequence, Union import docker_images_helper -from ci_config import CI_CONFIG, Labels +import upload_result_helper +from build_check import get_release_or_pr +from ci_config import CI_CONFIG, Build, Labels, JobNames +from ci_utils import GHActions, is_hex, normalize_string +from clickhouse_helper import ( + CiLogsCredentials, + ClickHouseHelper, + get_instance_id, + get_instance_type, + prepare_tests_results_for_clickhouse, +) from commit_status_helper import ( CommitStatusData, RerunHelper, @@ -24,6 +38,7 @@ from digest_helper import DockerDigester, JobDigester from env_helper import ( CI, GITHUB_JOB_API_URL, + GITHUB_RUN_URL, REPO_COPY, REPORT_PATH, S3_BUILDS_BUCKET, @@ -34,20 +49,670 @@ from git_helper import GIT_PREFIX, Git from git_helper import Runner as GitRunner from github import Github from pr_info import PRInfo -from report import SUCCESS, BuildResult, JobReport +from report import ERROR, SUCCESS, BuildResult, JobReport from s3_helper import S3Helper -from clickhouse_helper import ( - CiLogsCredentials, - ClickHouseHelper, - get_instance_id, - get_instance_type, - prepare_tests_results_for_clickhouse, -) -from build_check import get_release_or_pr -import upload_result_helper from version_helper import get_version_from_repo +@dataclass +class PendingState: + updated_at: float + run_url: str + + +class CiCache: + """ + CI cache is a bunch of records. Record is a file stored under special location on s3. 
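Concretely, the naming scheme implemented by `_get_record_file_name`/`_parse_record_file_name` below boils down to `<record_type>[_release]--<job_name>_<digest>_<batch>_<num_batches>.ci`. A short illustrative sketch of assembling such a name, using the same `--` and `_` dividers as the class constants (the job name and digest here are made up):

```python
def record_file_name(record_type: str, job_name: str, digest: str,
                     batch: int, num_batches: int, release_branch: bool) -> str:
    # <record_type>[_release]--<job_name>_<digest>_<batch>_<num_batches>.ci
    prefix = f"{record_type}_release" if release_branch else record_type
    suffix = "_".join([job_name, digest, str(batch), str(num_batches)])
    return f"{prefix}--{suffix}.ci"


print(record_file_name("successful", "Stateless tests (release)",
                       "deadbeef1234", 0, 4, release_branch=True))
# successful_release--Stateless tests (release)_deadbeef1234_0_4.ci
```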
+ The file name has following format + + _[]--___.ci + + RECORD_TYPE: + SUCCESSFUL - for successfuly finished jobs + PENDING - for pending jobs + + ATTRIBUTES: + release - for jobs being executed on the release branch including master branch (not a PR branch) + """ + + _S3_CACHE_PREFIX = "CI_cache_v1" + _CACHE_BUILD_REPORT_PREFIX = "build_report" + _RECORD_FILE_EXTENSION = ".ci" + _LOCAL_CACHE_PATH = Path(TEMP_PATH) / "ci_cache" + _ATTRIBUTE_RELEASE = "release" + # divider symbol 1 + _DIV1 = "--" + # divider symbol 2 + _DIV2 = "_" + assert _DIV1 != _DIV2 + + class RecordType(Enum): + SUCCESSFUL = "successful" + PENDING = "pending" + FAILED = "failed" + + @dataclass + class Record: + record_type: "CiCache.RecordType" + job_name: str + job_digest: str + batch: int + num_batches: int + release_branch: bool + file: str = "" + + def to_str_key(self): + """other fields must not be included in the hash str""" + return "_".join( + [self.job_name, self.job_digest, str(self.batch), str(self.num_batches)] + ) + + class JobType(Enum): + DOCS = "DOCS" + SRCS = "SRCS" + + @classmethod + def is_docs_job(cls, job_name: str) -> bool: + return job_name == JobNames.DOCS_CHECK + + @classmethod + def is_srcs_job(cls, job_name: str) -> bool: + return not cls.is_docs_job(job_name) + + @classmethod + def get_type_by_name(cls, job_name: str) -> "CiCache.JobType": + res = cls.SRCS + if cls.is_docs_job(job_name): + res = cls.DOCS + elif cls.is_srcs_job(job_name): + res = cls.SRCS + else: + assert False + return res + + def __init__( + self, + s3: S3Helper, + job_digests: Dict[str, str], + ): + self.s3 = s3 + self.job_digests = job_digests + self.cache_s3_paths = { + job_type: f"{self._S3_CACHE_PREFIX}/{job_type.value}-{self.job_digests[self._get_reference_job_name(job_type)]}/" + for job_type in self.JobType + } + self.s3_record_prefixes = { + record_type: record_type.value for record_type in self.RecordType + } + self.records: Dict["CiCache.RecordType", Dict[str, "CiCache.Record"]] = { + record_type: {} for record_type in self.RecordType + } + + self.cache_updated = False + self.cache_data_fetched = True + if not self._LOCAL_CACHE_PATH.exists(): + self._LOCAL_CACHE_PATH.mkdir(parents=True, exist_ok=True) + + def _get_reference_job_name(self, job_type: JobType) -> str: + res = Build.PACKAGE_RELEASE + if job_type == self.JobType.DOCS: + res = JobNames.DOCS_CHECK + elif job_type == self.JobType.SRCS: + res = Build.PACKAGE_RELEASE + else: + assert False + return res + + def _get_record_file_name( + self, + record_type: RecordType, + job_name: str, + batch: int, + num_batches: int, + release_branch: bool, + ) -> str: + prefix = self.s3_record_prefixes[record_type] + prefix_extended = ( + self._DIV2.join([prefix, self._ATTRIBUTE_RELEASE]) + if release_branch + else prefix + ) + assert self._DIV1 not in job_name, f"Invalid job name {job_name}" + job_name = self._DIV2.join( + [job_name, self.job_digests[job_name], str(batch), str(num_batches)] + ) + file_name = self._DIV1.join([prefix_extended, job_name]) + file_name += self._RECORD_FILE_EXTENSION + return file_name + + def _get_record_s3_path(self, job_name: str) -> str: + return self.cache_s3_paths[self.JobType.get_type_by_name(job_name)] + + def _parse_record_file_name( + self, record_type: RecordType, file_name: str + ) -> Optional["CiCache.Record"]: + # validate filename + if ( + not file_name.endswith(self._RECORD_FILE_EXTENSION) + or not len(file_name.split(self._DIV1)) == 2 + ): + print("ERROR: wrong file name format") + return None + + file_name = 
file_name.removesuffix(self._RECORD_FILE_EXTENSION) + release_branch = False + + prefix_extended, job_suffix = file_name.split(self._DIV1) + record_type_and_attribute = prefix_extended.split(self._DIV2) + + # validate filename prefix + failure = False + if not 0 < len(record_type_and_attribute) <= 2: + print("ERROR: wrong file name prefix") + failure = True + if ( + len(record_type_and_attribute) > 1 + and record_type_and_attribute[1] != self._ATTRIBUTE_RELEASE + ): + print("ERROR: wrong record attribute") + failure = True + if record_type_and_attribute[0] != self.s3_record_prefixes[record_type]: + print("ERROR: wrong record type") + failure = True + if failure: + return None + + if ( + len(record_type_and_attribute) > 1 + and record_type_and_attribute[1] == self._ATTRIBUTE_RELEASE + ): + release_branch = True + + job_properties = job_suffix.split(self._DIV2) + job_name, job_digest, batch, num_batches = ( + self._DIV2.join(job_properties[:-3]), + job_properties[-3], + int(job_properties[-2]), + int(job_properties[-1]), + ) + + if not is_hex(job_digest): + print("ERROR: wrong record job digest") + return None + + record = self.Record( + record_type, + job_name, + job_digest, + batch, + num_batches, + release_branch, + file="", + ) + return record + + def print_status(self): + for record_type in self.RecordType: + GHActions.print_in_group( + f"Cache records: [{record_type}]", list(self.records[record_type]) + ) + return self + + def update(self): + """ + Pulls cache records from s3. Only records name w/o content. + """ + for record_type in self.RecordType: + prefix = self.s3_record_prefixes[record_type] + cache_list = self.records[record_type] + for job_type in self.JobType: + path = self.cache_s3_paths[job_type] + records = self.s3.list_prefix(f"{path}{prefix}", S3_BUILDS_BUCKET) + records = [record.split("/")[-1] for record in records] + for file in records: + record = self._parse_record_file_name( + record_type=record_type, file_name=file + ) + if not record: + print(f"ERROR: failed to parse cache record [{file}]") + continue + if ( + record.job_name not in self.job_digests + or self.job_digests[record.job_name] != record.job_digest + ): + # skip records we are not interested in + continue + + if record.to_str_key() not in cache_list: + cache_list[record.to_str_key()] = record + self.cache_data_fetched = False + elif ( + not cache_list[record.to_str_key()].release_branch + and record.release_branch + ): + # replace a non-release record with a release one + cache_list[record.to_str_key()] = record + self.cache_data_fetched = False + + self.cache_updated = True + return self + + def fetch_records_data(self): + """ + Pulls CommitStatusData for all cached jobs from s3 + """ + if not self.cache_updated: + self.update() + + if self.cache_data_fetched: + # there are no record w/o underling data - no need to fetch + return self + + # clean up + for file in self._LOCAL_CACHE_PATH.glob("*.ci"): + file.unlink() + + # download all record files + for job_type in self.JobType: + path = self.cache_s3_paths[job_type] + for record_type in self.RecordType: + prefix = self.s3_record_prefixes[record_type] + _ = self.s3.download_files( + bucket=S3_BUILDS_BUCKET, + s3_path=f"{path}{prefix}", + file_suffix=self._RECORD_FILE_EXTENSION, + local_directory=self._LOCAL_CACHE_PATH, + ) + + # validate we have files for all records and save file names meanwhile + for record_type in self.RecordType: + record_list = self.records[record_type] + for _, record in record_list.items(): + record_file_name = 
self._get_record_file_name( + record_type, + record.job_name, + record.batch, + record.num_batches, + record.release_branch, + ) + assert ( + self._LOCAL_CACHE_PATH / record_file_name + ).is_file(), f"BUG. Record file must be present: {self._LOCAL_CACHE_PATH / record_file_name}" + record.file = record_file_name + + self.cache_data_fetched = True + return self + + def exist( + self, + record_type: "CiCache.RecordType", + job: str, + batch: int, + num_batches: int, + release_branch: bool, + ) -> bool: + if not self.cache_updated: + self.update() + record_key = self.Record( + record_type, + job, + self.job_digests[job], + batch, + num_batches, + release_branch, + ).to_str_key() + res = record_key in self.records[record_type] + if release_branch: + return res and self.records[record_type][record_key].release_branch + else: + return res + + def push( + self, + record_type: "CiCache.RecordType", + job: str, + batches: Union[int, Sequence[int]], + num_batches: int, + status: Union[CommitStatusData, PendingState], + release_branch: bool = False, + ) -> None: + """ + Pushes a cache record (CommitStatusData) + @release_branch adds "release" attribute to a record + """ + if isinstance(batches, int): + batches = [batches] + for batch in batches: + record_file = self._LOCAL_CACHE_PATH / self._get_record_file_name( + record_type, job, batch, num_batches, release_branch + ) + record_s3_path = self._get_record_s3_path(job) + if record_type == self.RecordType.SUCCESSFUL: + assert isinstance(status, CommitStatusData) + status.dump_to_file(record_file) + elif record_type == self.RecordType.FAILED: + assert isinstance(status, CommitStatusData) + status.dump_to_file(record_file) + elif record_type == self.RecordType.PENDING: + assert isinstance(status, PendingState) + with open(record_file, "w") as json_file: + json.dump(asdict(status), json_file) + else: + assert False + + _ = self.s3.upload_file( + bucket=S3_BUILDS_BUCKET, + file_path=record_file, + s3_path=record_s3_path + record_file.name, + ) + record = self.Record( + record_type, + job, + self.job_digests[job], + batch, + num_batches, + release_branch, + file=record_file.name, + ) + if ( + record.release_branch + or record.to_str_key() not in self.records[record_type] + ): + self.records[record_type][record.to_str_key()] = record + + def get( + self, record_type: "CiCache.RecordType", job: str, batch: int, num_batches: int + ) -> Optional[Union[CommitStatusData, PendingState]]: + """ + Gets a cache record data for a job, or None if a cache miss + """ + + if not self.cache_data_fetched: + self.fetch_records_data() + + record_key = self.Record( + record_type, + job, + self.job_digests[job], + batch, + num_batches, + release_branch=False, + ).to_str_key() + + if record_key not in self.records[record_type]: + return None + + record_file_name = self.records[record_type][record_key].file + + res = CommitStatusData.load_from_file( + self._LOCAL_CACHE_PATH / record_file_name + ) # type: CommitStatusData + + return res + + def delete( + self, + record_type: "CiCache.RecordType", + job: str, + batch: int, + num_batches: int, + release_branch: bool, + ) -> None: + """ + deletes record from the cache + """ + raise NotImplementedError("Let's try make cache push-and-read-only") + # assert ( + # record_type == self.RecordType.PENDING + # ), "FIXME: delete is supported for pending records only" + # record_file_name = self._get_record_file_name( + # self.RecordType.PENDING, + # job, + # batch, + # num_batches, + # release_branch=release_branch, + # ) + # record_s3_path = 
self._get_record_s3_path(job) + # self.s3.delete_file_from_s3(S3_BUILDS_BUCKET, record_s3_path + record_file_name) + + # record_key = self.Record( + # record_type, + # job, + # self.job_digests[job], + # batch, + # num_batches, + # release_branch=False, + # ).to_str_key() + + # if record_key in self.records[record_type]: + # del self.records[record_type][record_key] + + def is_successful( + self, job: str, batch: int, num_batches: int, release_branch: bool + ) -> bool: + """ + checks if a given job have already been done successfuly + """ + return self.exist( + self.RecordType.SUCCESSFUL, job, batch, num_batches, release_branch + ) + + def is_failed( + self, job: str, batch: int, num_batches: int, release_branch: bool + ) -> bool: + """ + checks if a given job have already been done with failure + """ + return self.exist( + self.RecordType.FAILED, job, batch, num_batches, release_branch + ) + + def is_pending( + self, job: str, batch: int, num_batches: int, release_branch: bool + ) -> bool: + """ + check pending record in the cache for a given job + @release_branch - checks that "release" attribute is set for a record + """ + if self.is_successful( + job, batch, num_batches, release_branch + ) or self.is_failed(job, batch, num_batches, release_branch): + return False + + return self.exist( + self.RecordType.PENDING, job, batch, num_batches, release_branch + ) + + def push_successful( + self, + job: str, + batch: int, + num_batches: int, + job_status: CommitStatusData, + release_branch: bool = False, + ) -> None: + """ + Pushes a cache record (CommitStatusData) + @release_branch adds "release" attribute to a record + """ + self.push( + self.RecordType.SUCCESSFUL, + job, + [batch], + num_batches, + job_status, + release_branch, + ) + + def push_failed( + self, + job: str, + batch: int, + num_batches: int, + job_status: CommitStatusData, + release_branch: bool = False, + ) -> None: + """ + Pushes a cache record of type Failed (CommitStatusData) + @release_branch adds "release" attribute to a record + """ + self.push( + self.RecordType.FAILED, + job, + [batch], + num_batches, + job_status, + release_branch, + ) + + def push_pending( + self, job: str, batches: List[int], num_batches: int, release_branch: bool + ) -> None: + """ + pushes pending record for a job to the cache + """ + pending_state = PendingState(time.time(), run_url=GITHUB_RUN_URL) + self.push( + self.RecordType.PENDING, + job, + batches, + num_batches, + pending_state, + release_branch, + ) + + def get_successful( + self, job: str, batch: int, num_batches: int + ) -> Optional[CommitStatusData]: + """ + Gets a cache record (CommitStatusData) for a job, or None if a cache miss + """ + res = self.get(self.RecordType.SUCCESSFUL, job, batch, num_batches) + assert res is None or isinstance(res, CommitStatusData) + return res + + def delete_pending( + self, job: str, batch: int, num_batches: int, release_branch: bool + ) -> None: + """ + deletes pending record from the cache + """ + self.delete(self.RecordType.PENDING, job, batch, num_batches, release_branch) + + def download_build_reports(self, file_prefix: str = "") -> List[str]: + """ + not ideal class for this method, + but let it be as we store build reports in CI cache directory on s3 + and CiCache knows where exactly + + @file_prefix allows to filter out reports by git head_ref + """ + report_path = Path(REPORT_PATH) + report_path.mkdir(exist_ok=True, parents=True) + path = ( + self._get_record_s3_path(Build.PACKAGE_RELEASE) + + self._CACHE_BUILD_REPORT_PREFIX + ) + if 
file_prefix: + path += "_" + file_prefix + reports_files = self.s3.download_files( + bucket=S3_BUILDS_BUCKET, + s3_path=path, + file_suffix=".json", + local_directory=report_path, + ) + return reports_files + + def upload_build_report(self, build_result: BuildResult) -> str: + result_json_path = build_result.write_json(Path(TEMP_PATH)) + s3_path = ( + self._get_record_s3_path(Build.PACKAGE_RELEASE) + result_json_path.name + ) + return self.s3.upload_file( + bucket=S3_BUILDS_BUCKET, file_path=result_json_path, s3_path=s3_path + ) + + def await_jobs( + self, jobs_with_params: Dict[str, Dict[str, Any]], is_release_branch: bool + ) -> Dict[str, List[int]]: + """ + await pending jobs to be finished + @jobs_with_params - jobs to await. {JOB_NAME: {"batches": [BATCHES...], "num_batches": NUM_BATCHES}} + returns successfully finished jobs: {JOB_NAME: [BATCHES...]} + """ + if not jobs_with_params: + return {} + poll_interval_sec = 300 + TIMEOUT = 3600 + await_finished: Dict[str, List[int]] = {} + round_cnt = 0 + while len(jobs_with_params) > 4 and round_cnt < 5: + round_cnt += 1 + GHActions.print_in_group( + f"Wait pending jobs, round [{round_cnt}]:", list(jobs_with_params) + ) + # this is initial approach to wait pending jobs: + # start waiting for the next TIMEOUT seconds if there are more than X(=4) jobs to wait + # wait TIMEOUT seconds in rounds. Y(=5) is the max number of rounds + expired_sec = 0 + start_at = int(time.time()) + while expired_sec < TIMEOUT and jobs_with_params: + time.sleep(poll_interval_sec) + self.update() + jobs_with_params_copy = deepcopy(jobs_with_params) + for job_name in jobs_with_params: + num_batches = jobs_with_params[job_name]["num_batches"] + job_config = CI_CONFIG.get_job_config(job_name) + for batch in jobs_with_params[job_name]["batches"]: + if self.is_pending( + job_name, + batch, + num_batches, + release_branch=is_release_branch + and job_config.required_on_release_branch, + ): + continue + print( + f"Job [{job_name}_[{batch}/{num_batches}]] is not pending anymore" + ) + + # some_job_ready = True + jobs_with_params_copy[job_name]["batches"].remove(batch) + if not jobs_with_params_copy[job_name]["batches"]: + del jobs_with_params_copy[job_name] + + if not self.is_successful( + job_name, + batch, + num_batches, + release_branch=is_release_branch + and job_config.required_on_release_branch, + ): + print( + f"NOTE: Job [{job_name}:{batch}] finished but no success - remove from awaiting list, do not add to ready" + ) + continue + if job_name in await_finished: + await_finished[job_name].append(batch) + else: + await_finished[job_name] = [batch] + jobs_with_params = jobs_with_params_copy + expired_sec = int(time.time()) - start_at + print( + f"...awaiting continues... 
seconds left [{TIMEOUT - expired_sec}]" + ) + if await_finished: + GHActions.print_in_group( + f"Finished jobs, round [{round_cnt}]:", + [f"{job}:{batches}" for job, batches in await_finished.items()], + ) + GHActions.print_in_group( + "Remaining jobs:", + [f"{job}:{params['batches']}" for job, params in jobs_with_params.items()], + ) + return await_finished + + def get_check_name(check_name: str, batch: int, num_batches: int) -> str: res = check_name if num_batches > 1: @@ -150,16 +815,17 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: help="skip fetching data about job runs, used in --configure action (for debugging and nigthly ci)", ) parser.add_argument( - "--rebuild-all-docker", + "--force", action="store_true", default=False, - help="will create run config for rebuilding all dockers, used in --configure action (for nightly docker job)", + help="Used with --run, force the job to run, omitting the ci cache", ) + # FIXME: remove, not used parser.add_argument( "--rebuild-all-binaries", action="store_true", default=False, - help="will create run config without skipping build jobs in any case, used in --configure action (for release branches)", + help="[DEPRECATED. to be removed, once no wf use it] will create run config without skipping build jobs in any case, used in --configure action (for release branches)", ) parser.add_argument( "--commit-message", @@ -169,23 +835,8 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: return parser.parse_args() -def get_file_flag_name( - job_name: str, digest: str, batch: int = 0, num_batches: int = 1 -) -> str: - if num_batches < 2: - return f"job_{job_name}_{digest}.ci" - else: - return f"job_{job_name}_{digest}_{batch}_{num_batches}.ci" - - -def get_s3_path(build_digest: str) -> str: - return f"CI_data/BUILD-{build_digest}/" - - -def get_s3_path_docs(digest: str) -> str: - return f"CI_data/DOCS-{digest}/" - - +# FIXME: rewrite the docker job as regular reusable_test job and move interaction with docker hub inside job script +# that way run config will be more clean, workflow more generic and less api calls to dockerhub def check_missing_images_on_dockerhub( image_name_tag: Dict[str, str], arch: Optional[str] = None ) -> Dict[str, str]: @@ -260,34 +911,129 @@ def check_missing_images_on_dockerhub( return result -def _check_and_update_for_early_style_check(run_config: dict) -> None: +def _pre_action(s3, indata, pr_info): + CommitStatusData.cleanup() + JobReport.cleanup() + BuildResult.cleanup() + ci_cache = CiCache(s3, indata["jobs_data"]["digests"]) + + # for release/master branches reports must be from the same branches + report_prefix = normalize_string(pr_info.head_ref) if pr_info.number == 0 else "" + print( + f"Use report prefix [{report_prefix}], pr_num [{pr_info.number}], head_ref [{pr_info.head_ref}]" + ) + reports_files = ci_cache.download_build_reports(file_prefix=report_prefix) + print(f"Pre action done. Report files [{reports_files}] have been downloaded") + + +def _mark_success_action( + s3: S3Helper, + indata: Dict[str, Any], + pr_info: PRInfo, + job: str, + batch: int, +) -> None: + ci_cache = CiCache(s3, indata["jobs_data"]["digests"]) + job_config = CI_CONFIG.get_job_config(job) + num_batches = job_config.num_batches + # if batch is not provided - set to 0 + batch = 0 if batch == -1 else batch + assert ( + 0 <= batch < num_batches + ), f"--batch must be provided and in range [0, {num_batches}) for {job}" + + # FIXME: find generic design for propagating and handling job status (e.g. 
stop using statuses in GH api) + # now job ca be build job w/o status data, any other job that exit with 0 with or w/o status data + if CI_CONFIG.is_build_job(job): + # there is no status for build jobs + # create dummy success to mark it as done + # FIXME: consider creating commit status for build jobs too, to treat everything the same way + CommitStatusData(SUCCESS, "dummy description", "dummy_url").dump_status() + + job_status = None + if CommitStatusData.exist(): + # normal scenario + job_status = CommitStatusData.load_status() + else: + # apparently exit after rerun-helper check + # do nothing, exit without failure + print(f"ERROR: no status file for job [{job}]") + + if job_config.run_always or job_config.run_by_label: + print(f"Job [{job}] runs always or by label in CI - do not cache") + else: + if pr_info.is_master(): + pass + # delete method is disabled for ci_cache. need it? + # pending enabled for master branch jobs only + # ci_cache.delete_pending(job, batch, num_batches, release_branch=True) + if job_status and job_status.is_ok(): + ci_cache.push_successful( + job, batch, num_batches, job_status, pr_info.is_release_branch() + ) + print(f"Job [{job}] is ok") + elif job_status and not job_status.is_ok(): + ci_cache.push_failed( + job, batch, num_batches, job_status, pr_info.is_release_branch() + ) + print(f"Job [{job}] is failed with status [{job_status.status}]") + else: + job_status = CommitStatusData( + description="dummy description", status=ERROR, report_url="dummy url" + ) + ci_cache.push_failed( + job, batch, num_batches, job_status, pr_info.is_release_branch() + ) + print(f"No CommitStatusData for [{job}], push dummy failure to ci_cache") + + +def _print_results(result: Any, outfile: Optional[str], pretty: bool = False) -> None: + if outfile: + with open(outfile, "w") as f: + if isinstance(result, str): + print(result, file=f) + elif isinstance(result, dict): + print(json.dumps(result, indent=2 if pretty else None), file=f) + else: + raise AssertionError(f"Unexpected type for 'res': {type(result)}") + else: + if isinstance(result, str): + print(result) + elif isinstance(result, dict): + print(json.dumps(result, indent=2 if pretty else None)) + else: + raise AssertionError(f"Unexpected type for 'res': {type(result)}") + + +def _check_and_update_for_early_style_check(jobs_data: dict, docker_data: dict) -> None: """ This is temporary hack to start style check before docker build if possible FIXME: need better solution to do style check as soon as possible and as fast as possible w/o dependency on docker job """ - jobs_to_do = run_config.get("jobs_data", {}).get("jobs_to_do", []) - docker_to_build = run_config.get("docker_data", {}).get("missing_multi", []) + jobs_to_do = jobs_data.get("jobs_to_do", []) + docker_to_build = docker_data.get("missing_multi", []) if ( - "Style check" in jobs_to_do + JobNames.STYLE_CHECK in jobs_to_do and docker_to_build and "clickhouse/style-test" not in docker_to_build ): - index = jobs_to_do.index("Style check") + index = jobs_to_do.index(JobNames.STYLE_CHECK) jobs_to_do[index] = "Style check early" -def _update_config_for_docs_only(run_config: dict) -> None: - DOCS_CHECK_JOBS = ["Docs check", "Style check"] +def _update_config_for_docs_only(jobs_data: dict) -> None: + DOCS_CHECK_JOBS = [JobNames.DOCS_CHECK, JobNames.STYLE_CHECK] print(f"NOTE: Will keep only docs related jobs: [{DOCS_CHECK_JOBS}]") - jobs_to_do = run_config.get("jobs_data", {}).get("jobs_to_do", []) - run_config["jobs_data"]["jobs_to_do"] = [ - job for job in jobs_to_do if job 
in DOCS_CHECK_JOBS - ] + jobs_to_do = jobs_data.get("jobs_to_do", []) + jobs_data["jobs_to_do"] = [job for job in jobs_to_do if job in DOCS_CHECK_JOBS] + jobs_data["jobs_to_wait"] = { + job: params + for job, params in jobs_data["jobs_to_wait"].items() + if job in DOCS_CHECK_JOBS + } -def _configure_docker_jobs( - rebuild_all_dockers: bool, docker_digest_or_latest: bool = False -) -> Dict: +def _configure_docker_jobs(docker_digest_or_latest: bool) -> Dict: print("::group::Docker images check") # generate docker jobs data docker_digester = DockerDigester() @@ -296,50 +1042,33 @@ def _configure_docker_jobs( ) # 'image name - digest' mapping images_info = docker_images_helper.get_images_info() - # a. check missing images - if not rebuild_all_dockers: - # FIXME: we need login as docker manifest inspect goes directly to one of the *.docker.com hosts instead of "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"] - # find if it's possible to use the setting of /etc/docker/daemon.json - docker_images_helper.docker_login() - missing_multi_dict = check_missing_images_on_dockerhub(imagename_digest_dict) - missing_multi = list(missing_multi_dict) - missing_amd64 = [] - missing_aarch64 = [] - if not docker_digest_or_latest: - # look for missing arm and amd images only among missing multiarch manifests @missing_multi_dict - # to avoid extra dockerhub api calls - missing_amd64 = list( - check_missing_images_on_dockerhub(missing_multi_dict, "amd64") - ) - # FIXME: WA until full arm support: skip not supported arm images - missing_aarch64 = list( - check_missing_images_on_dockerhub( - { - im: digest - for im, digest in missing_multi_dict.items() - if not images_info[im]["only_amd64"] - }, - "aarch64", - ) - ) - # FIXME: temporary hack, remove after transition to docker digest as tag - else: - if missing_multi: - print( - f"WARNING: Missing images {list(missing_multi)} - fallback to latest tag" - ) - for image in missing_multi: - imagename_digest_dict[image] = "latest" - else: - # add all images to missing - missing_multi = list(imagename_digest_dict) - missing_amd64 = missing_multi + # FIXME: we need login as docker manifest inspect goes directly to one of the *.docker.com hosts instead of "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"] + # find if it's possible to use the setting of /etc/docker/daemon.json (https://github.com/docker/cli/issues/4484#issuecomment-1688095463) + docker_images_helper.docker_login() + missing_multi_dict = check_missing_images_on_dockerhub(imagename_digest_dict) + missing_multi = list(missing_multi_dict) + missing_amd64 = [] + missing_aarch64 = [] + if not docker_digest_or_latest: + # look for missing arm and amd images only among missing multiarch manifests @missing_multi_dict + # to avoid extra dockerhub api calls + missing_amd64 = list( + check_missing_images_on_dockerhub(missing_multi_dict, "amd64") + ) # FIXME: WA until full arm support: skip not supported arm images - missing_aarch64 = [ - name - for name in imagename_digest_dict - if not images_info[name]["only_amd64"] - ] + missing_aarch64 = list( + check_missing_images_on_dockerhub( + { + im: digest + for im, digest in missing_multi_dict.items() + if not images_info[im]["only_amd64"] + }, + "aarch64", + ) + ) + else: + if missing_multi: + assert False, f"Missing images [{missing_multi}], cannot proceed" print("::endgroup::") return { @@ -351,14 +1080,11 @@ def _configure_docker_jobs( def _configure_jobs( - build_digest: str, - docs_digest: str, job_digester: JobDigester, 
s3: S3Helper, - rebuild_all_binaries: bool, - pr_labels: Iterable[str], + pr_info: PRInfo, commit_tokens: List[str], - ci_cache_enabled: bool, + ci_cache_disabled: bool, ) -> Dict: ## a. digest each item from the config job_digester = JobDigester() @@ -366,65 +1092,86 @@ def _configure_jobs( jobs_to_do: List[str] = [] jobs_to_skip: List[str] = [] digests: Dict[str, str] = {} - print("::group::Job Digests") + print("::group::Job Digests") for job in CI_CONFIG.job_generator(): digest = job_digester.get_job_digest(CI_CONFIG.get_digest_config(job)) digests[job] = digest print(f" job [{job.rjust(50)}] has digest [{digest}]") print("::endgroup::") - ## b. check if we have something done - if ci_cache_enabled: - done_files = [] - else: - path = get_s3_path(build_digest) - done_files = s3.list_prefix(path) - done_files = [file.split("/")[-1] for file in done_files] - # print(f"S3 CI files for the build [{build_digest}]: {done_files}") - docs_path = get_s3_path_docs(docs_digest) - done_files_docs = s3.list_prefix(docs_path) - done_files_docs = [file.split("/")[-1] for file in done_files_docs] - # print(f"S3 CI files for the docs [{docs_digest}]: {done_files_docs}") - done_files += done_files_docs + ## b. check what we need to run + ci_cache = None + if not ci_cache_disabled: + ci_cache = CiCache(s3, digests).update() + ci_cache.print_status() + + jobs_to_wait: Dict[str, Dict[str, Any]] = {} for job in digests: digest = digests[job] job_config = CI_CONFIG.get_job_config(job) num_batches: int = job_config.num_batches batches_to_do: List[int] = [] + add_to_skip = False - if job_config.run_by_label: - # this job controlled by label, add to todo if it's labe is set in pr - if job_config.run_by_label in pr_labels: - for batch in range(num_batches): # type: ignore + for batch in range(num_batches): # type: ignore + if job_config.pr_only and pr_info.is_release_branch(): + continue + if job_config.release_only and not pr_info.is_release_branch(): + continue + if job_config.run_by_label: + # this job controlled by label, add to todo if its label is set in pr + if job_config.run_by_label in pr_info.labels: batches_to_do.append(batch) - elif job_config.run_always: - # always add to todo - batches_to_do.append(batch) - else: - # this job controlled by digest, add to todo if it's not successfully done before - for batch in range(num_batches): # type: ignore - success_flag_name = get_file_flag_name(job, digest, batch, num_batches) - if success_flag_name not in done_files or ( - rebuild_all_binaries and CI_CONFIG.is_build_job(job) + elif job_config.run_always: + # always add to todo + batches_to_do.append(batch) + elif not ci_cache: + batches_to_do.append(batch) + elif not ci_cache.is_successful( + job, + batch, + num_batches, + release_branch=pr_info.is_release_branch() + and job_config.required_on_release_branch, + ): + # ci cache is enabled and job is not in the cache - add + batches_to_do.append(batch) + + # check if it's pending in the cache + if ci_cache.is_pending( + job, + batch, + num_batches, + release_branch=pr_info.is_release_branch() + and job_config.required_on_release_branch, ): - batches_to_do.append(batch) + if job in jobs_to_wait: + jobs_to_wait[job]["batches"].append(batch) + else: + jobs_to_wait[job] = { + "batches": [batch], + "num_batches": num_batches, + } + else: + add_to_skip = True if batches_to_do: jobs_to_do.append(job) - jobs_params[job] = { - "batches": batches_to_do, - "num_batches": num_batches, - } - else: + elif add_to_skip: + # treat job as being skipped only if it's controlled 
by digest jobs_to_skip.append(job) + jobs_params[job] = { + "batches": batches_to_do, + "num_batches": num_batches, + } - ## c. check CI controlling labels commit messages - if pr_labels: + ## c. check CI controlling labels and commit messages + if pr_info.labels: jobs_requested_by_label = [] # type: List[str] ci_controlling_labels = [] # type: List[str] - for label in pr_labels: + for label in pr_info.labels: label_config = CI_CONFIG.get_label_config(label) if label_config: jobs_requested_by_label += label_config.run_jobs @@ -434,6 +1181,8 @@ def _configure_jobs( print( f" : following jobs will be executed: [{jobs_requested_by_label}]" ) + # so far there is only "do not test" label in the config that runs only Style check. + # check later if we need to filter out requested jobs using ci cache. right now we do it: jobs_to_do = [job for job in jobs_requested_by_label if job in jobs_to_do] if commit_tokens: @@ -475,87 +1224,73 @@ def _configure_jobs( f"NOTE: Only specific job(s) were requested by commit message tokens: [{jobs_to_do_requested}]" ) jobs_to_do = list( - set(job for job in jobs_to_do_requested if job in jobs_to_do) + set(job for job in jobs_to_do_requested if job not in jobs_to_skip) ) return { "digests": digests, "jobs_to_do": jobs_to_do, "jobs_to_skip": jobs_to_skip, + "jobs_to_wait": { + job: params for job, params in jobs_to_wait.items() if job in jobs_to_do + }, "jobs_params": { job: params for job, params in jobs_params.items() if job in jobs_to_do }, } -def _update_gh_statuses(indata: Dict, s3: S3Helper) -> None: +def _create_gh_status( + commit: Any, job: str, batch: int, num_batches: int, job_status: CommitStatusData +) -> None: + print(f"Going to re-create GH status for job [{job}]") + assert job_status.status == SUCCESS, "BUG!" 
+ commit.create_status( + state=job_status.status, + target_url=job_status.report_url, + description=format_description( + f"Reused from [{job_status.pr_num}-{job_status.sha[0:8]}]: " + f"{job_status.description}" + ), + context=get_check_name(job, batch=batch, num_batches=num_batches), + ) + + +def _update_gh_statuses_action(indata: Dict, s3: S3Helper) -> None: if indata["ci_flags"][Labels.NO_CI_CACHE]: print("CI cache is disabled - skip restoring commit statuses from CI cache") return - - temp_path = Path(TEMP_PATH) - if not temp_path.exists(): - temp_path.mkdir(parents=True, exist_ok=True) - - # clean up before start - for file in temp_path.glob("*.ci"): - file.unlink() - - # download all metadata files - path = get_s3_path(indata["build"]) - files = s3.download_files( # type: ignore - bucket=S3_BUILDS_BUCKET, - s3_path=path, - file_suffix=".ci", - local_directory=temp_path, - ) - print(f"CI metadata files [{files}]") - path = get_s3_path_docs(indata["docs"]) - files_docs = s3.download_files( # type: ignore - bucket=S3_BUILDS_BUCKET, - s3_path=path, - file_suffix=".ci", - local_directory=temp_path, - ) - print(f"CI docs metadata files [{files_docs}]") - files += files_docs - - # parse CI metadata job_digests = indata["jobs_data"]["digests"] + jobs_to_skip = indata["jobs_data"]["jobs_to_skip"] + jobs_to_do = indata["jobs_data"]["jobs_to_do"] + ci_cache = CiCache(s3, job_digests).update().fetch_records_data().print_status() + # create GH status pr_info = PRInfo() commit = get_commit(Github(get_best_robot_token(), per_page=100), pr_info.sha) - def run_create_status(job, digest, batch, num_batches): - success_flag_name = get_file_flag_name(job, digest, batch, num_batches) - if success_flag_name in files: - print(f"Going to re-create GH status for job [{job}] sha [{pr_info.sha}]") - job_status = CommitStatusData.load_from_file( - f"{TEMP_PATH}/{success_flag_name}" - ) # type: CommitStatusData - assert job_status.status == SUCCESS, "BUG!" - commit.create_status( - state=job_status.status, - target_url=job_status.report_url, - description=format_description( - f"Reused from [{job_status.pr_num}-{job_status.sha[0:8]}]: " - f"{job_status.description}" - ), - context=get_check_name(job, batch=batch, num_batches=num_batches), - ) - print(f"GH status re-created from file [{success_flag_name}]") + def _concurrent_create_status(job: str, batch: int, num_batches: int) -> None: + job_status = ci_cache.get_successful(job, batch, num_batches) + if not job_status: + return + _create_gh_status(commit, job, batch, num_batches, job_status) with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] for job in job_digests: + if job not in jobs_to_skip and job not in jobs_to_do: + # no need to create status for jobs that are not supposed to be executed + continue if CI_CONFIG.is_build_job(job): # no GH status for build jobs continue - digest = job_digests[job] - num_batches = CI_CONFIG.get_job_config(job).num_batches - for batch in range(num_batches): + job_config = CI_CONFIG.get_job_config(job) + if not job_config: + # there might be a new job that does not exist on this branch - skip it + continue + for batch in range(job_config.num_batches): future = executor.submit( - run_create_status, job, digest, batch, num_batches + _concurrent_create_status, job, batch, job_config.num_batches ) futures.append(future) done, _ = concurrent.futures.wait(futures) @@ -568,11 +1303,6 @@ def _update_gh_statuses(indata: Dict, s3: S3Helper) -> None: set_status_comment(commit, pr_info) print("... 
CI report update - done") - # clean up - ci_files = list(temp_path.glob("*.ci")) - for file in ci_files: - file.unlink() - def _fetch_commit_tokens(message: str) -> List[str]: pattern = r"#[\w-]+" @@ -584,7 +1314,7 @@ def _fetch_commit_tokens(message: str) -> List[str]: def _upload_build_artifacts( pr_info: PRInfo, build_name: str, - build_digest: str, + ci_cache: CiCache, job_report: JobReport, s3: S3Helper, s3_destination: str, @@ -594,7 +1324,7 @@ def _upload_build_artifacts( ( get_release_or_pr(pr_info, get_version_from_repo())[1], pr_info.sha, - CI_CONFIG.normalize_string(build_name), + normalize_string(build_name), "performance.tar.zst", ) ) @@ -640,12 +1370,8 @@ def _upload_build_artifacts( head_ref=pr_info.head_ref, pr_number=pr_info.number, ) - result_json_path = build_result.write_json() - s3_path = get_s3_path(build_digest) + result_json_path.name - build_report_url = s3.upload_file( - bucket=S3_BUILDS_BUCKET, file_path=result_json_path, s3_path=s3_path - ) - print(f"Report file [{result_json_path}] has been uploaded to [{build_report_url}]") + report_url = ci_cache.upload_build_report(build_result) + print(f"Report file has been uploaded to [{report_url}]") # Upload head master binaries static_bin_name = CI_CONFIG.build_config[build_name].static_binary_name @@ -852,9 +1578,6 @@ def main() -> int: ### CONFIGURE action: start if args.configure: - docker_data = {} - git_ref = git_runner.run(f"{GIT_PREFIX} rev-parse HEAD") - # if '#no_merge_commit' is set in commit message - set git ref to PR branch head to avoid merge-commit tokens = [] ci_flags = { @@ -876,14 +1599,15 @@ def main() -> int: ci_flags[Labels.NO_CI_CACHE] = True print("NOTE: Disable CI Cache") + docker_data = {} + git_ref = git_runner.run(f"{GIT_PREFIX} rev-parse HEAD") + # let's get CH version version = get_version_from_repo(git=Git(True)).string print(f"Got CH version for this commit: [{version}]") docker_data = ( - _configure_docker_jobs( - args.rebuild_all_docker, args.docker_digest_or_latest - ) + _configure_docker_jobs(args.docker_digest_or_latest) if not args.skip_docker else {} ) @@ -893,17 +1617,13 @@ def main() -> int: CI_CONFIG.get_digest_config("package_release") ) docs_digest = job_digester.get_job_digest( - CI_CONFIG.get_digest_config("Docs check") + CI_CONFIG.get_digest_config(JobNames.DOCS_CHECK) ) jobs_data = ( _configure_jobs( - build_digest, - docs_digest, job_digester, s3, - # FIXME: add suport for master wf w/o rebuilds - args.rebuild_all_binaries or pr_info.is_master(), - pr_info.labels, + pr_info, tokens, ci_flags[Labels.NO_CI_CACHE], ) @@ -911,6 +1631,60 @@ def main() -> int: else {} ) + # # FIXME: Early style check manipulates with job names might be not robust with await feature + # if pr_info.number != 0: + # # FIXME: it runs style check before docker build if possible (style-check images is not changed) + # # find a way to do style check always before docker build and others + # _check_and_update_for_early_style_check(jobs_data, docker_data) + if not args.skip_jobs and pr_info.has_changes_in_documentation_only(): + _update_config_for_docs_only(jobs_data) + + if not args.skip_jobs: + ci_cache = CiCache(s3, jobs_data["digests"]) + + if ( + pr_info.is_release_branch() + or pr_info.event.get("pull_request", {}) + .get("user", {}) + .get("login", "not_maxknv") + == "maxknv" + ): + # wait for pending jobs to be finished, await_jobs is a long blocking call + # wait pending jobs (for now only on release/master branches) + ready_jobs_batches_dict = ci_cache.await_jobs( + 
jobs_data.get("jobs_to_wait", {}), pr_info.is_release_branch() + ) + jobs_to_do = jobs_data["jobs_to_do"] + jobs_to_skip = jobs_data["jobs_to_skip"] + jobs_params = jobs_data["jobs_params"] + for job, batches in ready_jobs_batches_dict.items(): + if job not in jobs_params: + print(f"WARNING: Job [{job}] is not in the params list") + continue + for batch in batches: + jobs_params[job]["batches"].remove(batch) + if not jobs_params[job]["batches"]: + jobs_to_do.remove(job) + jobs_to_skip.append(job) + del jobs_params[job] + + # set planned jobs as pending in the CI cache if on the master + if pr_info.is_master(): + for job in jobs_data["jobs_to_do"]: + config = CI_CONFIG.get_job_config(job) + if config.run_always or config.run_by_label: + continue + job_params = jobs_data["jobs_params"][job] + ci_cache.push_pending( + job, + job_params["batches"], + config.num_batches, + release_branch=pr_info.is_release_branch(), + ) + + if "jobs_to_wait" in jobs_data: + del jobs_data["jobs_to_wait"] + # conclude results result["git_ref"] = git_ref result["version"] = version @@ -919,49 +1693,12 @@ def main() -> int: result["ci_flags"] = ci_flags result["jobs_data"] = jobs_data result["docker_data"] = docker_data - if ( - not args.skip_jobs - and pr_info.number != 0 - and not args.docker_digest_or_latest - ): - # FIXME: it runs style check before docker build if possible (style-check images is not changed) - # find a way to do style check always before docker build and others - _check_and_update_for_early_style_check(result) - if not args.skip_jobs and pr_info.has_changes_in_documentation_only(): - _update_config_for_docs_only(result) ### CONFIGURE action: end ### PRE action: start elif args.pre: - CommitStatusData.cleanup() - JobReport.cleanup() - BuildResult.cleanup() - assert indata, "Run config must be provided via --infile" - report_path = Path(REPORT_PATH) - report_path.mkdir(exist_ok=True, parents=True) - path = get_s3_path(indata["build"]) - reports_files = s3.download_files( # type: ignore - bucket=S3_BUILDS_BUCKET, - s3_path=path, - file_suffix=".json", - local_directory=report_path, - ) - # for release/master branches reports must be created on the same branches - files = [] - if pr_info.number == 0: - for file in reports_files: - if pr_info.head_ref not in file: - # keep reports from the same branch only, if not in a PR - (report_path / file).unlink() - print(f"drop report: [{report_path / file}]") - else: - files.append(file) - reports_files = files - print( - f"Pre action done. 
Report files [{reports_files}] have been downloaded from [{path}] to [{report_path}]" - ) - ### PRE action: end + _pre_action(s3, indata, pr_info) ### RUN action: start elif args.run: @@ -993,6 +1730,9 @@ def main() -> int: print("::endgroup::") else: # this is a test job - check if GH commit status is present + + # rerun helper check + # FIXME: remove rerun_helper check and rely on ci cache only commit = get_commit( Github(get_best_robot_token(), per_page=100), pr_info.sha ) @@ -1005,7 +1745,31 @@ def main() -> int: print(status) print("::endgroup::") - if previous_status: + # ci cache check + elif not indata["ci_flags"][Labels.NO_CI_CACHE]: + ci_cache = CiCache(s3, indata["jobs_data"]["digests"]).update() + job_config = CI_CONFIG.get_job_config(check_name) + if ci_cache.is_successful( + check_name, + args.batch, + job_config.num_batches, + job_config.required_on_release_branch, + ): + job_status = ci_cache.get_successful( + check_name, args.batch, job_config.num_batches + ) + assert job_status, "BUG" + _create_gh_status( + commit, + check_name, + args.batch, + job_config.num_batches, + job_status, + ) + previous_status = job_status.status + GHActions.print_in_group("Commit Status Data", job_status) + + if previous_status and not args.force: print( f"Commit status or Build Report is already present - job will be skipped with status: [{previous_status}]" ) @@ -1019,15 +1783,15 @@ def main() -> int: ### POST action: start elif args.post: - assert ( - not CI_CONFIG.is_build_job(args.job_name) or indata - ), "--infile with config must be provided for POST action of a build type job [{args.job_name}]" job_report = JobReport.load() if JobReport.exist() else None if job_report: ch_helper = ClickHouseHelper() check_url = "" if CI_CONFIG.is_build_job(args.job_name): + assert ( + indata + ), f"--infile with config must be provided for POST action of a build type job [{args.job_name}]" build_name = args.job_name s3_path_prefix = "/".join( ( @@ -1039,7 +1803,7 @@ def main() -> int: log_url = _upload_build_artifacts( pr_info, build_name, - build_digest=indata["build"], # type: ignore + ci_cache=CiCache(s3, indata["jobs_data"]["digests"]), job_report=job_report, s3=s3, s3_destination=s3_path_prefix, @@ -1055,7 +1819,7 @@ def main() -> int: ( get_release_or_pr(pr_info, get_version_from_repo())[0], pr_info.sha, - CI_CONFIG.normalize_string( + normalize_string( job_report.check_name or _get_ext_check_name(args.job_name) ), ) @@ -1116,80 +1880,16 @@ def main() -> int: ### MARK SUCCESS action: start elif args.mark_success: assert indata, "Run config must be provided via --infile" - job = args.job_name - job_config = CI_CONFIG.get_job_config(job) - num_batches = job_config.num_batches - assert ( - num_batches <= 1 or 0 <= args.batch < num_batches - ), f"--batch must be provided and in range [0, {num_batches}) for {job}" - - # FIXME: find generic design for propagating and handling job status (e.g. 
stop using statuses in GH api) - # now job ca be build job w/o status data, any other job that exit with 0 with or w/o status data - if CI_CONFIG.is_build_job(job): - # there is no status for build jobs - # create dummy success to mark it as done - job_status = CommitStatusData( - status="success", description="dummy status", report_url="dummy_url" - ) - else: - if not CommitStatusData.is_present(): - # apparently exit after rerun-helper check - # do nothing, exit without failure - print(f"ERROR: no status file for job [{job}]") - job_status = CommitStatusData( - status="dummy failure", - description="dummy status", - report_url="dummy_url", - ) - else: - # normal case - job_status = CommitStatusData.load_status() - - # Storing job data (report_url) to restore OK GH status on job results reuse - if job_config.run_always: - print(f"Job [{job}] runs always in CI - do not mark as done") - elif job_status.is_ok(): - success_flag_name = get_file_flag_name( - job, indata["jobs_data"]["digests"][job], args.batch, num_batches - ) - if not CI_CONFIG.is_docs_job(job): - path = get_s3_path(indata["build"]) + success_flag_name - else: - path = get_s3_path_docs(indata["docs"]) + success_flag_name - job_status.dump_to_file(success_flag_name) - _ = s3.upload_file( - bucket=S3_BUILDS_BUCKET, file_path=success_flag_name, s3_path=path - ) - os.remove(success_flag_name) - print( - f"Job [{job}] with digest [{indata['jobs_data']['digests'][job]}] {f'and batch {args.batch}/{num_batches}' if num_batches > 1 else ''} marked as successful. path: [{path}]" - ) - else: - print(f"Job [{job}] is not ok, status [{job_status.status}]") - ### MARK SUCCESS action: end + _mark_success_action(s3, indata, pr_info, args.job_name, args.batch) ### UPDATE GH STATUSES action: start elif args.update_gh_statuses: assert indata, "Run config must be provided via --infile" - _update_gh_statuses(indata=indata, s3=s3) - ### UPDATE GH STATUSES action: end + _update_gh_statuses_action(indata=indata, s3=s3) ### print results - if args.outfile: - with open(args.outfile, "w") as f: - if isinstance(result, str): - print(result, file=f) - elif isinstance(result, dict): - print(json.dumps(result, indent=2 if args.pretty else None), file=f) - else: - raise AssertionError(f"Unexpected type for 'res': {type(result)}") - else: - if isinstance(result, str): - print(result) - elif isinstance(result, dict): - print(json.dumps(result, indent=2 if args.pretty else None)) - else: - raise AssertionError(f"Unexpected type for 'res': {type(result)}") + _print_results(result, args.outfile, args.pretty) + return exit_code diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index ece7f2f7bae..3ebcbb7ed59 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -6,15 +6,23 @@ from dataclasses import dataclass, field from pathlib import Path from typing import Callable, Dict, Iterable, List, Literal, Optional, Union -from integration_test_images import IMAGES from ci_utils import WithIter +from integration_test_images import IMAGES class Labels(metaclass=WithIter): + """ + Label names or commit tokens in normalized form + """ + DO_NOT_TEST_LABEL = "do_not_test" NO_MERGE_COMMIT = "no_merge_commit" NO_CI_CACHE = "no_ci_cache" CI_SET_REDUCED = "ci_set_reduced" + CI_SET_ARM = "ci_set_arm" + CI_SET_INTEGRATION = "ci_set_integration" + + libFuzzer = "libFuzzer" class Build(metaclass=WithIter): @@ -25,6 +33,7 @@ class Build(metaclass=WithIter): PACKAGE_TSAN = "package_tsan" PACKAGE_MSAN = "package_msan" PACKAGE_DEBUG = "package_debug" + 
PACKAGE_RELEASE_COVERAGE = "package_release_coverage" BINARY_RELEASE = "binary_release" BINARY_TIDY = "binary_tidy" BINARY_DARWIN = "binary_darwin" @@ -42,13 +51,15 @@ class Build(metaclass=WithIter): class JobNames(metaclass=WithIter): STYLE_CHECK = "Style check" - FAST_TEST = "Fast tests" - DOCKER_SERVER = "Docker server and keeper images" + FAST_TEST = "Fast test" + DOCKER_SERVER = "Docker server image" + DOCKER_KEEPER = "Docker keeper image" INSTALL_TEST_AMD = "Install packages (amd64)" INSTALL_TEST_ARM = "Install packages (arm64)" STATELESS_TEST_DEBUG = "Stateless tests (debug)" STATELESS_TEST_RELEASE = "Stateless tests (release)" + STATELESS_TEST_RELEASE_COVERAGE = "Stateless tests (coverage)" STATELESS_TEST_AARCH64 = "Stateless tests (aarch64)" STATELESS_TEST_ASAN = "Stateless tests (asan)" STATELESS_TEST_TSAN = "Stateless tests (tsan)" @@ -63,6 +74,7 @@ class JobNames(metaclass=WithIter): STATEFUL_TEST_DEBUG = "Stateful tests (debug)" STATEFUL_TEST_RELEASE = "Stateful tests (release)" + STATEFUL_TEST_RELEASE_COVERAGE = "Stateful tests (coverage)" STATEFUL_TEST_AARCH64 = "Stateful tests (aarch64)" STATEFUL_TEST_ASAN = "Stateful tests (asan)" STATEFUL_TEST_TSAN = "Stateful tests (tsan)" @@ -85,6 +97,7 @@ class JobNames(metaclass=WithIter): INTEGRATION_TEST_ASAN = "Integration tests (asan)" INTEGRATION_TEST_ASAN_ANALYZER = "Integration tests (asan, analyzer)" INTEGRATION_TEST_TSAN = "Integration tests (tsan)" + INTEGRATION_TEST_ARM = "Integration tests (aarch64)" INTEGRATION_TEST_FLAKY = "Integration tests flaky check (asan)" UPGRADE_TEST_DEBUG = "Upgrade check (debug)" @@ -110,7 +123,6 @@ class JobNames(metaclass=WithIter): PERFORMANCE_TEST_AMD64 = "Performance Comparison" PERFORMANCE_TEST_ARM64 = "Performance Comparison Aarch64" - SQL_LANCER_TEST = "SQLancer (release)" SQL_LOGIC_TEST = "Sqllogic test (release)" SQLANCER = "SQLancer (release)" @@ -129,7 +141,7 @@ class JobNames(metaclass=WithIter): BUILD_CHECK_SPECIAL = "ClickHouse special build check" DOCS_CHECK = "Docs check" - BUGFIX_VALIDATE = "tests bugfix validate check" + BUGFIX_VALIDATE = "Bugfix validation" # dynamically update JobName with Build jobs @@ -155,7 +167,7 @@ class DigestConfig: @dataclass class LabelConfig: """ - class to configure different CI scenarious per GH label or commit message token + configures different CI scenarios per GH label """ run_jobs: Iterable[str] = frozenset() @@ -164,19 +176,28 @@ class LabelConfig: @dataclass class JobConfig: """ - contains config parameter relevant for job execution in CI workflow - @digest - configures digest calculation for the job - @run_command - will be triggered for the job if omited in CI workflow yml - @timeout - @num_batches - sets number of batches for multi-batch job + contains config parameters for job execution in CI workflow """ + # configures digest calculation for the job digest: DigestConfig = field(default_factory=DigestConfig) + # will be triggered for the job if omitted in CI workflow yml run_command: str = "" + # job timeout timeout: Optional[int] = None + # sets number of batches for multi-batch job num_batches: int = 1 + # label that enables job in CI; if set, digest won't be used run_by_label: str = "" + # to run always regardless of the job digest and/or label run_always: bool = False + # if the job needs to be run on the release branch, including master (e.g. building packages, docker server). + # NOTE: Subsequent runs on the same branch with a similar digest are still considered skippable. 
+ required_on_release_branch: bool = False + # job is for pr workflow only + pr_only: bool = False + # job is for release/master branches only + release_only: bool = False @dataclass @@ -186,13 +207,17 @@ class BuildConfig: package_type: Literal["deb", "binary", "fuzzers"] additional_pkgs: bool = False debug_build: bool = False + coverage: bool = False sanitizer: str = "" tidy: bool = False + # sparse_checkout is needed only to test the option itself. + # No particular sense to use it in every build, since it slows down the job. sparse_checkout: bool = False comment: str = "" static_binary_name: str = "" job_config: JobConfig = field( default_factory=lambda: JobConfig( + required_on_release_branch=True, digest=DigestConfig( include_paths=[ "./src", @@ -213,6 +238,12 @@ class BuildConfig: "./programs", "./packages", "./docker/packager/packager", + "./rust", + # FIXME: This is a WA to rebuild the CH and recreate the Performance.tar.zst artifact + # when there are changes in performance test scripts. + # Due to the current design of the perf test we need to rebuild CH when the performance test changes, + # otherwise the changes will not be visible in the PerformanceTest job in CI + "./tests/performance", ], exclude_files=[".md"], docker=["clickhouse/binary-builder"], @@ -251,7 +282,6 @@ class BuildReportConfig: @dataclass class TestConfig: required_build: str - force_tests: bool = False job_config: JobConfig = field(default_factory=JobConfig) @@ -271,8 +301,10 @@ install_check_digest = DigestConfig( ) stateless_check_digest = DigestConfig( include_paths=[ + "./tests/ci/functional_test_check.py", "./tests/queries/0_stateless/", "./tests/clickhouse-test", + "./tests/config", "./tests/*.txt", ], exclude_files=[".md"], @@ -280,8 +312,10 @@ stateless_check_digest = DigestConfig( ) stateful_check_digest = DigestConfig( include_paths=[ + "./tests/ci/functional_test_check.py", "./tests/queries/1_stateful/", "./tests/clickhouse-test", + "./tests/config", "./tests/*.txt", ], exclude_files=[".md"], @@ -293,6 +327,7 @@ stress_check_digest = DigestConfig( "./tests/queries/0_stateless/", "./tests/queries/1_stateful/", "./tests/clickhouse-test", + "./tests/config", "./tests/*.txt", ], exclude_files=[".md"], @@ -410,9 +445,9 @@ sql_test_params = { @dataclass -class CiConfig: +class CIConfig: """ - Contains configs for ALL jobs in CI pipeline + Contains configs for all jobs in the CI pipeline each config item in the below dicts should be an instance of JobConfig class or inherited from it """ @@ -439,9 +474,6 @@ class CiConfig: if check_name in config: # type: ignore res = config[check_name].job_config # type: ignore break - assert ( - res is not None - ), f"Invalid check_name or CI_CONFIG outdated, config not found for [{check_name}]" return res # type: ignore @staticmethod @@ -592,9 +624,31 @@ class CiConfig: raise KeyError("config contains errors", errors) -CI_CONFIG = CiConfig( +CI_CONFIG = CIConfig( label_configs={ Labels.DO_NOT_TEST_LABEL: LabelConfig(run_jobs=[JobNames.STYLE_CHECK]), + Labels.CI_SET_ARM: LabelConfig( + run_jobs=[ + # JobNames.STYLE_CHECK, + Build.PACKAGE_AARCH64, + JobNames.INTEGRATION_TEST_ARM, + ] + ), + Labels.CI_SET_INTEGRATION: LabelConfig( + run_jobs=[ + JobNames.STYLE_CHECK, + Build.PACKAGE_ASAN, + Build.PACKAGE_RELEASE, + Build.PACKAGE_TSAN, + Build.PACKAGE_AARCH64, + JobNames.INTEGRATION_TEST_ASAN, + JobNames.INTEGRATION_TEST_ARM, + JobNames.INTEGRATION_TEST, + JobNames.INTEGRATION_TEST_ASAN_ANALYZER, + JobNames.INTEGRATION_TEST_TSAN, + JobNames.INTEGRATION_TEST_FLAKY, + ] + 
), Labels.CI_SET_REDUCED: LabelConfig( run_jobs=[ job @@ -607,6 +661,8 @@ CI_CONFIG = CiConfig( "tsan", "msan", "ubsan", + # skip build report jobs as not all builds will be done + "build check", ) ] ) @@ -659,6 +715,12 @@ CI_CONFIG = CiConfig( package_type="deb", sparse_checkout=True, # Check that it works with at least one build, see also update-submodules.sh ), + Build.PACKAGE_RELEASE_COVERAGE: BuildConfig( + name=Build.PACKAGE_RELEASE_COVERAGE, + compiler="clang-17", + coverage=True, + package_type="deb", + ), Build.BINARY_RELEASE: BuildConfig( name=Build.BINARY_RELEASE, compiler="clang-17", @@ -678,7 +740,6 @@ CI_CONFIG = CiConfig( compiler="clang-17-darwin", package_type="binary", static_binary_name="macos", - sparse_checkout=True, # Check that it works with at least one build, see also update-submodules.sh ), Build.BINARY_AARCH64: BuildConfig( name=Build.BINARY_AARCH64, @@ -740,6 +801,7 @@ CI_CONFIG = CiConfig( name=Build.FUZZERS, compiler="clang-17", package_type="fuzzers", + job_config=JobConfig(run_by_label=Labels.libFuzzer), ), }, builds_report_config={ @@ -752,6 +814,7 @@ CI_CONFIG = CiConfig( Build.PACKAGE_TSAN, Build.PACKAGE_MSAN, Build.PACKAGE_DEBUG, + Build.PACKAGE_RELEASE_COVERAGE, Build.BINARY_RELEASE, Build.FUZZERS, ] @@ -776,13 +839,24 @@ CI_CONFIG = CiConfig( JobNames.DOCKER_SERVER: TestConfig( "", job_config=JobConfig( + required_on_release_branch=True, digest=DigestConfig( include_paths=[ "tests/ci/docker_server.py", "./docker/server", + ] + ), + ), + ), + JobNames.DOCKER_KEEPER: TestConfig( + "", + job_config=JobConfig( + digest=DigestConfig( + include_paths=[ + "tests/ci/docker_server.py", "./docker/keeper", ] - ) + ), ), ), JobNames.DOCS_CHECK: TestConfig( @@ -797,11 +871,12 @@ CI_CONFIG = CiConfig( JobNames.FAST_TEST: TestConfig( "", job_config=JobConfig( + pr_only=True, digest=DigestConfig( include_paths=["./tests/queries/0_stateless/"], exclude_files=[".md"], docker=["clickhouse/fasttest"], - ) + ), ), ), JobNames.STYLE_CHECK: TestConfig( @@ -813,7 +888,9 @@ CI_CONFIG = CiConfig( JobNames.BUGFIX_VALIDATE: TestConfig( "", # we run this check by label - no digest required - job_config=JobConfig(run_by_label="pr-bugfix"), + job_config=JobConfig( + run_by_label="pr-bugfix", run_command="bugfix_validate_check.py" + ), ), }, test_configs={ @@ -841,16 +918,12 @@ CI_CONFIG = CiConfig( JobNames.STATEFUL_TEST_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**stateful_test_common_params) # type: ignore ), + JobNames.STATEFUL_TEST_RELEASE_COVERAGE: TestConfig( + Build.PACKAGE_RELEASE_COVERAGE, job_config=JobConfig(**stateful_test_common_params) # type: ignore + ), JobNames.STATEFUL_TEST_AARCH64: TestConfig( Build.PACKAGE_AARCH64, job_config=JobConfig(**stateful_test_common_params) # type: ignore ), - # FIXME: delete? 
- # "Stateful tests (release, DatabaseOrdinary)": TestConfig( - # Build.PACKAGE_RELEASE, job_config=JobConfig(**stateful_test_common_params) # type: ignore - # ), - # "Stateful tests (release, DatabaseReplicated)": TestConfig( - # Build.PACKAGE_RELEASE, job_config=JobConfig(**stateful_test_common_params) # type: ignore - # ), # Stateful tests for parallel replicas JobNames.STATEFUL_TEST_PARALLEL_REPL_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**stateful_test_common_params) # type: ignore @@ -894,16 +967,16 @@ CI_CONFIG = CiConfig( JobNames.STATELESS_TEST_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**statless_test_common_params) # type: ignore ), + JobNames.STATELESS_TEST_RELEASE_COVERAGE: TestConfig( + Build.PACKAGE_RELEASE_COVERAGE, + job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore + ), JobNames.STATELESS_TEST_AARCH64: TestConfig( Build.PACKAGE_AARCH64, job_config=JobConfig(**statless_test_common_params) # type: ignore ), JobNames.STATELESS_TEST_ANALYZER_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**statless_test_common_params) # type: ignore ), - # delete? - # "Stateless tests (release, DatabaseOrdinary)": TestConfig( - # Build.PACKAGE_RELEASE, job_config=JobConfig(**statless_test_common_params) # type: ignore - # ), JobNames.STATELESS_TEST_DB_REPL_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(num_batches=4, **statless_test_common_params), # type: ignore @@ -916,7 +989,7 @@ CI_CONFIG = CiConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore ), - JobNames.STATELESS_TEST_S3_DEBUG: TestConfig( + JobNames.STATELESS_TEST_S3_TSAN: TestConfig( Build.PACKAGE_TSAN, job_config=JobConfig(num_batches=5, **statless_test_common_params), # type: ignore ), @@ -936,16 +1009,16 @@ CI_CONFIG = CiConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(**stress_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_ASAN: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(**upgrade_test_common_params) # type: ignore + Build.PACKAGE_ASAN, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_TSAN: TestConfig( - Build.PACKAGE_TSAN, job_config=JobConfig(**upgrade_test_common_params) # type: ignore + Build.PACKAGE_TSAN, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(**upgrade_test_common_params) # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_DEBUG: TestConfig( - Build.PACKAGE_DEBUG, job_config=JobConfig(**upgrade_test_common_params) # type: ignore + Build.PACKAGE_DEBUG, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore ), JobNames.INTEGRATION_TEST_ASAN: TestConfig( Build.PACKAGE_ASAN, @@ -959,6 +1032,11 @@ CI_CONFIG = CiConfig( Build.PACKAGE_TSAN, job_config=JobConfig(num_batches=6, **integration_test_common_params), # type: ignore ), + JobNames.INTEGRATION_TEST_ARM: TestConfig( + Build.PACKAGE_AARCH64, + # add [run_by_label="test arm"] to not run in regular pr workflow by default + job_config=JobConfig(num_batches=6, **integration_test_common_params, run_by_label="test arm"), # type: ignore + ), # FIXME: currently no wf has this job. 
Try to enable # "Integration tests (msan)": TestConfig(Build.PACKAGE_MSAN, job_config=JobConfig(num_batches=6, **integration_test_common_params) # type: ignore # ), @@ -967,15 +1045,19 @@ CI_CONFIG = CiConfig( job_config=JobConfig(num_batches=4, **integration_test_common_params), # type: ignore ), JobNames.INTEGRATION_TEST_FLAKY: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(**integration_test_common_params) # type: ignore + Build.PACKAGE_ASAN, job_config=JobConfig(pr_only=True, **integration_test_common_params) # type: ignore ), JobNames.COMPATIBILITY_TEST: TestConfig( Build.PACKAGE_RELEASE, - job_config=JobConfig(digest=compatibility_check_digest), + job_config=JobConfig( + required_on_release_branch=True, digest=compatibility_check_digest + ), ), JobNames.COMPATIBILITY_TEST_ARM: TestConfig( Build.PACKAGE_AARCH64, - job_config=JobConfig(digest=compatibility_check_digest), + job_config=JobConfig( + required_on_release_branch=True, digest=compatibility_check_digest + ), ), JobNames.UNIT_TEST: TestConfig( Build.BINARY_RELEASE, job_config=JobConfig(**unit_test_common_params) # type: ignore @@ -1010,7 +1092,7 @@ CI_CONFIG = CiConfig( JobNames.STATELESS_TEST_FLAKY_ASAN: TestConfig( # replace to non-default Build.PACKAGE_ASAN, - job_config=JobConfig(**{**statless_test_common_params, "timeout": 3600}), # type: ignore + job_config=JobConfig(pr_only=True, **{**statless_test_common_params, "timeout": 3600}), # type: ignore ), JobNames.JEPSEN_KEEPER: TestConfig( Build.BINARY_RELEASE, @@ -1041,12 +1123,12 @@ CI_CONFIG = CiConfig( JobNames.SQL_LOGIC_TEST: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**sqllogic_test_params) # type: ignore ), - JobNames.SQL_LOGIC_TEST: TestConfig( + JobNames.SQLTEST: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**sql_test_params) # type: ignore ), JobNames.CLCIKBENCH_TEST: TestConfig(Build.PACKAGE_RELEASE), JobNames.CLCIKBENCH_TEST_ARM: TestConfig(Build.PACKAGE_AARCH64), - JobNames.LIBFUZZER_TEST: TestConfig(Build.FUZZERS), # type: ignore + JobNames.LIBFUZZER_TEST: TestConfig(Build.FUZZERS, job_config=JobConfig(run_by_label=Labels.libFuzzer)), # type: ignore }, ) CI_CONFIG.validate() @@ -1067,6 +1149,8 @@ REQUIRED_CHECKS = [ JobNames.UNIT_TEST, JobNames.UNIT_TEST_TSAN, JobNames.UNIT_TEST_UBSAN, + JobNames.INTEGRATION_TEST_ASAN_ANALYZER, + JobNames.STATELESS_TEST_ANALYZER_RELEASE, ] @@ -1089,10 +1173,10 @@ CHECK_DESCRIPTIONS = [ lambda x: x.startswith("AST fuzzer"), ), CheckDescription( - "Bugfix validate check", + JobNames.BUGFIX_VALIDATE, "Checks that either a new test (functional or integration) or there " "some changed tests that fail with the binary built on master branch", - lambda x: x == "Bugfix validate check", + lambda x: x == JobNames.BUGFIX_VALIDATE, ), CheckDescription( "CI running", @@ -1117,16 +1201,22 @@ CHECK_DESCRIPTIONS = [ lambda x: x.startswith("Compatibility check"), ), CheckDescription( - "Docker image for servers", + JobNames.DOCKER_SERVER, "The check to build and optionally push the mentioned image to docker hub", - lambda x: x.startswith("Docker image") - and (x.endswith("building check") or x.endswith("build and push")), + lambda x: x.startswith("Docker server"), ), CheckDescription( - "Docs Check", "Builds and tests the documentation", lambda x: x == "Docs Check" + JobNames.DOCKER_KEEPER, + "The check to build and optionally push the mentioned image to docker hub", + lambda x: x.startswith("Docker keeper"), ), CheckDescription( - "Fast test", + JobNames.DOCS_CHECK, + "Builds and tests the documentation", + lambda 
x: x == JobNames.DOCS_CHECK, + ), + CheckDescription( + JobNames.FAST_TEST, "Normally this is the first check that is ran for a PR. It builds ClickHouse " 'and runs most of stateless functional tests, ' @@ -1134,7 +1224,7 @@ CHECK_DESCRIPTIONS = [ "Look at the report to see which tests fail, then reproduce the failure " 'locally as described here', - lambda x: x == "Fast test", + lambda x: x == JobNames.FAST_TEST, ), CheckDescription( "Flaky tests", @@ -1208,10 +1298,10 @@ CHECK_DESCRIPTIONS = [ lambda x: x.startswith("Stress test ("), ), CheckDescription( - "Style Check", + JobNames.STYLE_CHECK, "Runs a set of checks to keep the code style clean. If some of tests failed, " "see the related log from the report", - lambda x: x == "Style Check", + lambda x: x == JobNames.STYLE_CHECK, ), CheckDescription( "Unit tests", diff --git a/tests/ci/ci_utils.py b/tests/ci/ci_utils.py index 3c267cff79d..2967ec2f309 100644 --- a/tests/ci/ci_utils.py +++ b/tests/ci/ci_utils.py @@ -1,6 +1,6 @@ from contextlib import contextmanager import os -from typing import Union, Iterator +from typing import Any, List, Union, Iterator from pathlib import Path @@ -17,3 +17,34 @@ def cd(path: Union[Path, str]) -> Iterator[None]: yield finally: os.chdir(oldpwd) + + +def is_hex(s): + try: + int(s, 16) + return True + except ValueError: + return False + + +def normalize_string(string: str) -> str: + lowercase_string = string.lower() + normalized_string = ( + lowercase_string.replace(" ", "_") + .replace("-", "_") + .replace("/", "_") + .replace("(", "") + .replace(")", "") + .replace(",", "") + ) + return normalized_string + + +class GHActions: + @staticmethod + def print_in_group(group_name: str, lines: Union[Any, List[Any]]) -> None: + lines = list(lines) + print(f"::group::{group_name}") + for line in lines: + print(line) + print("::endgroup::") diff --git a/tests/ci/clickbench.py b/tests/ci/clickbench.py index 72827929ff9..50c7bb85d28 100644 --- a/tests/ci/clickbench.py +++ b/tests/ci/clickbench.py @@ -13,15 +13,12 @@ from build_download_helper import download_all_deb_packages from clickhouse_helper import ( CiLogsCredentials, ) -from commit_status_helper import ( - override_status, -) from docker_images_helper import get_docker_image, pull_image, DockerImage from env_helper import TEMP_PATH, REPORT_PATH -from pr_info import FORCE_TESTS_LABEL, PRInfo +from pr_info import PRInfo from stopwatch import Stopwatch from tee_popen import TeePopen -from report import JobReport, TestResults +from report import ERROR, SUCCESS, JobReport, StatusType, TestResults def get_image_name() -> str: @@ -52,7 +49,7 @@ def get_run_command( def process_results( result_directory: Path, server_log_path: Path, -) -> Tuple[str, str, TestResults, List[Path]]: +) -> Tuple[StatusType, str, TestResults, List[Path]]: test_results = [] # type: TestResults additional_files = [] # type: List[Path] # Just upload all files from result_directory. 
@@ -74,7 +71,7 @@ def process_results( if len(status) != 1 or len(status[0]) != 2: logging.info("Files in result folder %s", os.listdir(result_directory)) - return "error", "Invalid check_status.tsv", test_results, additional_files + return ERROR, "Invalid check_status.tsv", test_results, additional_files state, description = status[0][0], status[0][1] try: @@ -84,17 +81,17 @@ def process_results( logging.info("Found %s", results_path.name) else: logging.info("Files in result folder %s", os.listdir(result_directory)) - return "error", "Not found test_results.tsv", test_results, additional_files + return ERROR, "Not found test_results.tsv", test_results, additional_files except Exception as e: return ( - "error", + ERROR, f"Cannot parse test_results.tsv ({e})", test_results, additional_files, ) - return state, description, test_results, additional_files + return state, description, test_results, additional_files # type: ignore def parse_args(): @@ -168,7 +165,6 @@ def main(): state, description, test_results, additional_logs = process_results( result_path, server_log_path ) - state = override_status(state, check_name) JobReport( description=description, @@ -179,11 +175,8 @@ def main(): additional_files=[run_log_path] + additional_logs, ).dump() - if state != "success": - if FORCE_TESTS_LABEL in pr_info.labels: - print(f"'{FORCE_TESTS_LABEL}' enabled, will report success") - else: - sys.exit(1) + if state != SUCCESS: + sys.exit(1) if __name__ == "__main__": diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 598eef9922e..b7128e36434 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -1,14 +1,15 @@ #!/usr/bin/env python3 -from collections import defaultdict -import json -from pathlib import Path -from typing import Dict, List, Optional, Union import csv +import json import logging import time +from collections import defaultdict from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Dict, List, Optional, Union +# isort: off from github import Github from github.Commit import Commit from github.CommitStatus import CommitStatus @@ -17,15 +18,15 @@ from github.GithubObject import NotSet from github.IssueComment import IssueComment from github.Repository import Repository -from ci_config import CI_CONFIG, REQUIRED_CHECKS, CHECK_DESCRIPTIONS, CheckDescription +from ci_config import REQUIRED_CHECKS, CHECK_DESCRIPTIONS, CheckDescription from env_helper import GITHUB_JOB_URL, GITHUB_REPOSITORY, TEMP_PATH -from pr_info import PRInfo, SKIP_MERGEABLE_CHECK_LABEL +from pr_info import SKIP_MERGEABLE_CHECK_LABEL, PRInfo from report import ( ERROR, FAILURE, PENDING, - StatusType, SUCCESS, + StatusType, TestResult, TestResults, get_worst_status, @@ -64,19 +65,6 @@ class RerunHelper: return None -def override_status(status: str, check_name: str, invert: bool = False) -> str: - test_config = CI_CONFIG.test_configs.get(check_name) - if test_config and test_config.force_tests: - return SUCCESS - - if invert: - if status == SUCCESS: - return ERROR - return SUCCESS - - return status - - def get_commit(gh: Github, commit_sha: str, retry_count: int = RETRY) -> Commit: for i in range(retry_count): try: @@ -93,7 +81,7 @@ def get_commit(gh: Github, commit_sha: str, retry_count: int = RETRY) -> Commit: def post_commit_status( commit: Commit, - state: str, + state: StatusType, report_url: Optional[str] = None, description: Optional[str] = None, check_name: Optional[str] = None, @@ -288,7 +276,7 @@ def 
generate_status_comment(pr_info: PRInfo, statuses: CommitStatuses) -> str: return "".join(result) -def get_worst_state(statuses: CommitStatuses) -> str: +def get_worst_state(statuses: CommitStatuses) -> StatusType: return get_worst_status(status.state for status in statuses) @@ -350,7 +338,7 @@ class CommitStatusData: return cls.load_from_file(STATUS_FILE_PATH) @classmethod - def is_present(cls) -> bool: + def exist(cls) -> bool: return STATUS_FILE_PATH.is_file() def dump_status(self) -> None: @@ -365,6 +353,9 @@ class CommitStatusData: def is_ok(self): return self.status == SUCCESS + def is_failure(self): + return self.status == FAILURE + @staticmethod def cleanup(): STATUS_FILE_PATH.unlink(missing_ok=True) @@ -373,12 +364,12 @@ class CommitStatusData: def get_commit_filtered_statuses(commit: Commit) -> CommitStatuses: """ Squash statuses to latest state - 1. context="first", state="success", update_time=1 - 2. context="second", state="success", update_time=2 - 3. context="first", stat="failure", update_time=3 + 1. context="first", state=SUCCESS, update_time=1 + 2. context="second", state=SUCCESS, update_time=2 + 3. context="first", stat=FAILURE, update_time=3 =========> - 1. context="second", state="success" - 2. context="first", stat="failure" + 1. context="second", state=SUCCESS + 2. context="first", stat=FAILURE """ filtered = {} for status in sorted(commit.get_statuses(), key=lambda x: x.updated_at): @@ -430,7 +421,7 @@ def format_description(description: str) -> str: def set_mergeable_check( commit: Commit, description: str = "", - state: StatusType = "success", + state: StatusType = SUCCESS, ) -> None: commit.create_status( context=MERGEABLE_NAME, diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index a0c6294d8fd..a2e6c94cf48 100644 --- a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 -from distutils.version import StrictVersion -from pathlib import Path -from typing import List, Tuple import argparse import logging import subprocess import sys +from distutils.version import StrictVersion +from pathlib import Path +from typing import List, Tuple from build_download_helper import download_builds_filter from docker_images_helper import DockerImage, get_docker_image, pull_image -from env_helper import TEMP_PATH, REPORT_PATH -from report import JobReport, TestResults, TestResult +from env_helper import REPORT_PATH, TEMP_PATH +from report import FAILURE, SUCCESS, JobReport, TestResult, TestResults from stopwatch import Stopwatch IMAGE_UBUNTU = "clickhouse/test-old-ubuntu" @@ -55,19 +55,19 @@ def process_result( glibc_log_path = result_directory / "glibc.log" test_results = process_glibc_check(glibc_log_path, max_glibc_version) - status = "success" + status = SUCCESS description = "Compatibility check passed" if check_glibc: if len(test_results) > 1 or test_results[0].status != "OK": - status = "failure" + status = FAILURE description = "glibc check failed" - if status == "success" and check_distributions: + if status == SUCCESS and check_distributions: for operating_system in ("ubuntu:12.04", "centos:5"): test_result = process_os_check(result_directory / operating_system) if test_result.status != "OK": - status = "failure" + status = FAILURE description = f"Old {operating_system} failed" test_results += [test_result] break @@ -178,14 +178,14 @@ def main(): ) run_commands.extend(check_distributions_commands) - state = "success" + state = SUCCESS for run_command in run_commands: try: 
logging.info("Running command %s", run_command) subprocess.check_call(run_command, shell=True) except subprocess.CalledProcessError as ex: logging.info("Exception calling command %s", ex) - state = "failure" + state = FAILURE subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) @@ -215,7 +215,7 @@ def main(): additional_files=additional_logs, ).dump() - if state == "failure": + if state == FAILURE: sys.exit(1) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index a25669d85d0..af0416d83dc 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -3,24 +3,27 @@ import argparse import json import logging import os -import time import sys +import time from pathlib import Path from typing import List, Optional, Tuple +# isort: off from github import Github +# isort: on + from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse from commit_status_helper import format_description, get_commit, post_commit_status -from env_helper import RUNNER_TEMP, GITHUB_RUN_URL +from docker_images_helper import DockerImageData, docker_login, get_images_oredered_list +from env_helper import GITHUB_RUN_URL, RUNNER_TEMP from get_robot_token import get_best_robot_token from pr_info import PRInfo -from report import TestResults, TestResult +from report import FAILURE, SUCCESS, StatusType, TestResult, TestResults from s3_helper import S3Helper from stopwatch import Stopwatch from tee_popen import TeePopen from upload_result_helper import upload_results -from docker_images_helper import DockerImageData, docker_login, get_images_oredered_list NAME = "Push to Dockerhub" TEMP_PATH = Path(RUNNER_TEMP) / "docker_images_check" @@ -189,7 +192,7 @@ def main(): # additional_cache.append(str(pr_info.merged_pr)) ok_cnt = 0 - status = "success" + status = SUCCESS # type: StatusType image_tags = ( json.loads(args.image_tags) if not os.path.isfile(args.image_tags) @@ -233,7 +236,7 @@ def main(): if all(x.status == "OK" for x in res): ok_cnt += 1 else: - status = "failure" + status = FAILURE break # No need to continue with next images description = format_description( @@ -268,7 +271,7 @@ def main(): ch_helper = ClickHouseHelper() ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) - if status == "failure": + if status == FAILURE: sys.exit(1) diff --git a/tests/ci/docker_manifests_merge.py b/tests/ci/docker_manifests_merge.py index f87246be24b..fc00969d5d6 100644 --- a/tests/ci/docker_manifests_merge.py +++ b/tests/ci/docker_manifests_merge.py @@ -5,24 +5,23 @@ import json import logging import os import subprocess - import sys from typing import List, Tuple +# isort: off from github import Github -from clickhouse_helper import ( - ClickHouseHelper, - prepare_tests_results_for_clickhouse, -) +# isort: on + +from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse from commit_status_helper import format_description, get_commit, post_commit_status +from docker_images_helper import docker_login, get_images_oredered_list from get_robot_token import get_best_robot_token from pr_info import PRInfo -from report import TestResult +from report import FAILURE, SUCCESS, StatusType, TestResult from s3_helper import S3Helper from stopwatch import Stopwatch from upload_result_helper import upload_results -from docker_images_helper import docker_login, get_images_oredered_list NAME = "Push multi-arch images to Dockerhub" @@ -149,29 +148,35 @@ def main(): else 
json.load(open(args.missing_images)) ) test_results = [] - status = "success" + status = SUCCESS # type: StatusType ok_cnt, fail_cnt = 0, 0 images = get_images_oredered_list() for image_obj in images: - if image_obj.repo not in missing_images: - continue tag = image_tags[image_obj.repo] if image_obj.only_amd64: # FIXME: WA until full arm support tags = [f"{tag}-{arch}" for arch in archs if arch != "aarch64"] else: tags = [f"{tag}-{arch}" for arch in archs] - manifest, test_result = create_manifest(image_obj.repo, tag, tags, args.push) - test_results.append(TestResult(manifest, test_result)) + + # 1. update multiarch latest manifest for every image if args.set_latest: manifest, test_result = create_manifest( image_obj.repo, "latest", tags, args.push ) test_results.append(TestResult(manifest, test_result)) + # 2. skip manifest create if not missing + if image_obj.repo not in missing_images: + continue + + # 3. created image:digest multiarch manifest for changed images only + manifest, test_result = create_manifest(image_obj.repo, tag, tags, args.push) + test_results.append(TestResult(manifest, test_result)) + if test_result != "OK": - status = "failure" + status = FAILURE fail_cnt += 1 else: ok_cnt += 1 @@ -207,7 +212,7 @@ def main(): ) ch_helper = ClickHouseHelper() ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) - if status == "failure": + if status == FAILURE: sys.exit(1) diff --git a/tests/ci/docker_server.py b/tests/ci/docker_server.py index b9e5c13ec42..7f53034fd0f 100644 --- a/tests/ci/docker_server.py +++ b/tests/ci/docker_server.py @@ -6,25 +6,26 @@ import json import logging import sys import time +from os import makedirs +from os import path as p from pathlib import Path -from os import path as p, makedirs from typing import Dict, List from build_check import get_release_or_pr +from build_download_helper import read_build_urls from docker_images_helper import DockerImageData, docker_login from env_helper import ( GITHUB_RUN_URL, REPORT_PATH, - TEMP_PATH, S3_BUILDS_BUCKET, S3_DOWNLOAD, + TEMP_PATH, ) from git_helper import Git from pr_info import PRInfo -from report import JobReport, TestResults, TestResult +from report import FAILURE, SUCCESS, JobReport, TestResult, TestResults from stopwatch import Stopwatch from tee_popen import TeePopen -from build_download_helper import read_build_urls from version_helper import ( ClickHouseVersion, get_tagged_versions, @@ -378,7 +379,7 @@ def main(): docker_login() logging.info("Following tags will be created: %s", ", ".join(tags)) - status = "success" + status = SUCCESS test_results = [] # type: TestResults for os in args.os: for tag in tags: @@ -388,7 +389,7 @@ def main(): ) ) if test_results[-1].status != "OK": - status = "failure" + status = FAILURE pr_info = pr_info or PRInfo() description = f"Processed tags: {', '.join(tags)}" @@ -401,7 +402,7 @@ def main(): additional_files=[], ).dump() - if status != "success": + if status != SUCCESS: sys.exit(1) diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index a982cbc2a32..6bd4ef49675 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -6,16 +6,13 @@ import sys from pathlib import Path from docker_images_helper import get_docker_image, pull_image -from env_helper import TEMP_PATH, REPO_COPY +from env_helper import REPO_COPY, TEMP_PATH from pr_info import PRInfo -from report import JobReport, TestResults, TestResult +from report import FAILURE, SUCCESS, JobReport, TestResult, TestResults from stopwatch import Stopwatch from tee_popen 
import TeePopen -NAME = "Docs Check" - - def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, @@ -52,7 +49,7 @@ def main(): JobReport( description="No changes in docs", test_results=[], - status="success", + status=SUCCESS, start_time=stopwatch.start_time_str, duration=stopwatch.duration_seconds, additional_files=[], @@ -82,11 +79,11 @@ def main(): retcode = process.wait() if retcode == 0: logging.info("Run successfully") - status = "success" + status = SUCCESS description = "Docs check passed" else: description = "Docs check failed (non zero exit code)" - status = "failure" + status = FAILURE logging.info("Run failed") subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) @@ -95,7 +92,7 @@ def main(): if not any(test_output.iterdir()): logging.error("No output files after docs check") description = "No output files after docs check" - status = "failure" + status = FAILURE else: for p in test_output.iterdir(): additional_files.append(p) @@ -104,9 +101,9 @@ def main(): if "ERROR" in line: test_results.append(TestResult(line.split(":")[-1], "FAIL")) if test_results: - status = "failure" + status = FAILURE description = "Found errors in docs" - elif status != "failure": + elif status != FAILURE: test_results.append(TestResult("No errors found", "OK")) else: test_results.append(TestResult("Non zero exit code", "FAIL")) @@ -120,7 +117,7 @@ def main(): additional_files=additional_files, ).dump() - if status == "failure": + if status == FAILURE: sys.exit(1) diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index c8ddcf25057..5d528bb4c48 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -1,22 +1,28 @@ #!/usr/bin/env python3 import argparse -import logging -import subprocess -import os import csv +import logging +import os +import subprocess import sys from pathlib import Path from typing import Tuple from docker_images_helper import DockerImage, get_docker_image, pull_image -from env_helper import S3_BUILDS_BUCKET, TEMP_PATH, REPO_COPY -from pr_info import FORCE_TESTS_LABEL, PRInfo -from report import JobReport, TestResult, TestResults, read_test_results +from env_helper import REPO_COPY, S3_BUILDS_BUCKET, TEMP_PATH +from pr_info import PRInfo +from report import ( + ERROR, + FAILURE, + SUCCESS, + JobReport, + TestResult, + TestResults, + read_test_results, +) from stopwatch import Stopwatch from tee_popen import TeePopen -NAME = "Fast test" - # Will help to avoid errors like _csv.Error: field larger than field limit (131072) csv.field_size_limit(sys.maxsize) @@ -58,16 +64,16 @@ def process_results(result_directory: Path) -> Tuple[str, str, TestResults]: status = list(csv.reader(status_file, delimiter="\t")) if len(status) != 1 or len(status[0]) != 2: logging.info("Files in result folder %s", os.listdir(result_directory)) - return "error", "Invalid check_status.tsv", test_results + return ERROR, "Invalid check_status.tsv", test_results state, description = status[0][0], status[0][1] try: results_path = result_directory / "test_results.tsv" test_results = read_test_results(results_path) if len(test_results) == 0: - return "error", "Empty test_results.tsv", test_results + return ERROR, "Empty test_results.tsv", test_results except Exception as e: - return ("error", f"Cannot parse test_results.tsv ({e})", test_results) + return (ERROR, f"Cannot parse test_results.tsv ({e})", test_results) return state, description, test_results @@ -151,25 +157,25 @@ def 
main(): test_results = [] # type: TestResults if "submodule_log.txt" not in test_output_files: description = "Cannot clone repository" - state = "failure" + state = FAILURE elif "cmake_log.txt" not in test_output_files: description = "Cannot fetch submodules" - state = "failure" + state = FAILURE elif "build_log.txt" not in test_output_files: description = "Cannot finish cmake" - state = "failure" + state = FAILURE elif "install_log.txt" not in test_output_files: description = "Cannot build ClickHouse" - state = "failure" + state = FAILURE elif not test_log_exists and not test_result_exists: description = "Cannot install or start ClickHouse" - state = "failure" + state = FAILURE else: state, description, test_results = process_results(output_path) if timeout_expired: test_results.append(TestResult.create_check_timeout_expired(args.timeout)) - state = "failure" + state = FAILURE description = test_results[-1].name JobReport( @@ -183,14 +189,8 @@ def main(): ).dump() # Refuse other checks to run if fast test failed - if state != "success": - if state == "error": - print("The status is 'error', report failure disregard the labels") - sys.exit(1) - elif FORCE_TESTS_LABEL in pr_info.labels: - print(f"'{FORCE_TESTS_LABEL}' enabled, reporting success") - else: - sys.exit(1) + if state != SUCCESS: + sys.exit(1) if __name__ == "__main__": diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 6c615817164..e5268947304 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -1,7 +1,11 @@ #!/usr/bin/env python3 import logging + +# isort: off from github import Github +# isort: on + from commit_status_helper import ( CI_STATUS_NAME, get_commit, @@ -11,6 +15,7 @@ from commit_status_helper import ( ) from get_robot_token import get_best_robot_token from pr_info import PRInfo +from report import PENDING, SUCCESS def main(): @@ -31,10 +36,10 @@ def main(): return # Take the latest status status = statuses[-1] - if status.state == "pending": + if status.state == PENDING: post_commit_status( commit, - "success", + SUCCESS, status.target_url, "All checks finished", CI_STATUS_NAME, diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index b7e6c656b1f..da2dea60fc1 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -7,35 +7,19 @@ import os import re import subprocess import sys -import atexit from pathlib import Path from typing import List, Tuple -from github import Github - from build_download_helper import download_all_deb_packages -from clickhouse_helper import ( - CiLogsCredentials, - ClickHouseHelper, - prepare_tests_results_for_clickhouse, -) -from commit_status_helper import ( - get_commit, - override_status, - post_commit_status, - post_commit_status_to_file, - update_mergeable_check, -) +from clickhouse_helper import CiLogsCredentials + from docker_images_helper import DockerImage, pull_image, get_docker_image from download_release_packages import download_last_release from env_helper import REPORT_PATH, TEMP_PATH, REPO_COPY -from get_robot_token import get_best_robot_token -from pr_info import FORCE_TESTS_LABEL, PRInfo -from report import TestResults, read_test_results -from s3_helper import S3Helper +from pr_info import PRInfo +from report import ERROR, SUCCESS, JobReport, StatusType, TestResults, read_test_results from stopwatch import Stopwatch from tee_popen import TeePopen -from upload_result_helper import upload_results NO_CHANGES_MSG = "Nothing to run" @@ -127,7 +111,7 @@ def get_run_command( ) 
-def get_tests_to_run(pr_info: PRInfo) -> List[str]: +def _get_statless_tests_to_run(pr_info: PRInfo) -> List[str]: result = set() if pr_info.changed_files is None: @@ -152,7 +136,7 @@ def get_tests_to_run(pr_info: PRInfo) -> List[str]: def process_results( result_directory: Path, server_log_path: Path, -) -> Tuple[str, str, TestResults, List[Path]]: +) -> Tuple[StatusType, str, TestResults, List[Path]]: test_results = [] # type: TestResults additional_files = [] # Just upload all files from result_directory. @@ -174,7 +158,7 @@ def process_results( if len(status) != 1 or len(status[0]) != 2: logging.info("Files in result folder %s", os.listdir(result_directory)) - return "error", "Invalid check_status.tsv", test_results, additional_files + return ERROR, "Invalid check_status.tsv", test_results, additional_files state, description = status[0][0], status[0][1] try: @@ -184,20 +168,20 @@ def process_results( logging.info("Found test_results.tsv") else: logging.info("Files in result folder %s", os.listdir(result_directory)) - return "error", "Not found test_results.tsv", test_results, additional_files + return ERROR, "Not found test_results.tsv", test_results, additional_files test_results = read_test_results(results_path) if len(test_results) == 0: - return "error", "Empty test_results.tsv", test_results, additional_files + return ERROR, "Empty test_results.tsv", test_results, additional_files except Exception as e: return ( - "error", + ERROR, f"Cannot parse test_results.tsv ({e})", test_results, additional_files, ) - return state, description, test_results, additional_files + return state, description, test_results, additional_files # type: ignore def parse_args(): @@ -210,10 +194,10 @@ def parse_args(): help="Check that added tests failed on latest stable", ) parser.add_argument( - "--post-commit-status", - default="commit_status", - choices=["commit_status", "file"], - help="Where to public post commit status", + "--report-to-file", + type=str, + default="", + help="Path to write script report to (for --validate-bugfix)", ) return parser.parse_args() @@ -229,7 +213,6 @@ def main(): reports_path.mkdir(parents=True, exist_ok=True) repo_path = Path(REPO_COPY) - post_commit_path = temp_path / "functional_commit_status.tsv" args = parse_args() check_name = args.check_name or os.getenv("CHECK_NAME") @@ -246,62 +229,20 @@ def main(): flaky_check = "flaky" in check_name.lower() run_changed_tests = flaky_check or validate_bugfix_check - - # For validate_bugfix_check we need up to date information about labels, so pr_event_from_api is used - pr_info = PRInfo( - need_changed_files=run_changed_tests, pr_event_from_api=validate_bugfix_check - ) - - # FIXME: move to job report and remove - gh = Github(get_best_robot_token(), per_page=100) - commit = get_commit(gh, pr_info.sha) - atexit.register(update_mergeable_check, commit, pr_info, check_name) - - if validate_bugfix_check and "pr-bugfix" not in pr_info.labels: - if args.post_commit_status == "file": - post_commit_status_to_file( - post_commit_path, - f"Skipped (no pr-bugfix in {pr_info.labels})", - "success", - "null", - ) - logging.info("Skipping '%s' (no pr-bugfix in %s)", check_name, pr_info.labels) - sys.exit(0) + pr_info = PRInfo(need_changed_files=run_changed_tests) + tests_to_run = [] + if run_changed_tests: + assert ( + args.report_to_file + ), "JobReport file path must be provided with --validate-bugfix" + tests_to_run = _get_statless_tests_to_run(pr_info) if "RUN_BY_HASH_NUM" in os.environ: run_by_hash_num = 
int(os.getenv("RUN_BY_HASH_NUM", "0")) run_by_hash_total = int(os.getenv("RUN_BY_HASH_TOTAL", "0")) - check_name_with_group = ( - check_name + f" [{run_by_hash_num + 1}/{run_by_hash_total}]" - ) else: run_by_hash_num = 0 run_by_hash_total = 0 - check_name_with_group = check_name - - tests_to_run = [] - if run_changed_tests: - tests_to_run = get_tests_to_run(pr_info) - if not tests_to_run: - state = override_status("success", check_name, validate_bugfix_check) - if args.post_commit_status == "commit_status": - post_commit_status( - commit, - state, - "", - NO_CHANGES_MSG, - check_name_with_group, - pr_info, - dump_to_file=True, - ) - elif args.post_commit_status == "file": - post_commit_status_to_file( - post_commit_path, - description=NO_CHANGES_MSG, - state=state, - report_url="null", - ) - sys.exit(0) image_name = get_image_name(check_name) @@ -335,91 +276,65 @@ def main(): pr_info, stopwatch.start_time_str, check_name ) - run_command = get_run_command( - check_name, - packages_path, - repo_path, - result_path, - server_log_path, - kill_timeout, - additional_envs, - ci_logs_args, - docker_image, - flaky_check, - tests_to_run, - ) - logging.info("Going to run func tests: %s", run_command) - - with TeePopen(run_command, run_log_path) as process: - retcode = process.wait() - if retcode == 0: - logging.info("Run successfully") - else: - logging.info("Run failed") - - try: - subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) - except subprocess.CalledProcessError: - logging.warning("Failed to change files owner in %s, ignoring it", temp_path) - - ci_logs_credentials.clean_ci_logs_from_credentials(run_log_path) - s3_helper = S3Helper() - - state, description, test_results, additional_logs = process_results( - result_path, server_log_path - ) - state = override_status(state, check_name, invert=validate_bugfix_check) - - ch_helper = ClickHouseHelper() - - report_url = upload_results( - s3_helper, - pr_info.number, - pr_info.sha, - test_results, - [run_log_path] + additional_logs, - check_name_with_group, - ) - - print(f"::notice:: {check_name} Report url: {report_url}") - if args.post_commit_status == "commit_status": - post_commit_status( - commit, - state, - report_url, - description, - check_name_with_group, - pr_info, - dump_to_file=True, + if (not validate_bugfix_check and not flaky_check) or tests_to_run: + run_command = get_run_command( + check_name, + packages_path, + repo_path, + result_path, + server_log_path, + kill_timeout, + additional_envs, + ci_logs_args, + docker_image, + flaky_check, + tests_to_run, ) - elif args.post_commit_status == "file": - post_commit_status_to_file( - post_commit_path, - description, - state, - report_url, + logging.info("Going to run func tests: %s", run_command) + + with TeePopen(run_command, run_log_path) as process: + retcode = process.wait() + if retcode == 0: + logging.info("Run successfully") + else: + logging.info("Run failed") + + try: + subprocess.check_call( + f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True + ) + except subprocess.CalledProcessError: + logging.warning( + "Failed to change files owner in %s, ignoring it", temp_path + ) + + ci_logs_credentials.clean_ci_logs_from_credentials(run_log_path) + + state, description, test_results, additional_logs = process_results( + result_path, server_log_path ) else: - raise Exception( - f'Unknown post_commit_status option "{args.post_commit_status}"' + print( + "This is validate bugfix or flaky check run, but no changes test to run - skip with success" + ) + state, 
description, test_results, additional_logs = ( + SUCCESS, + "No tests to run", + [], + [], ) - prepared_events = prepare_tests_results_for_clickhouse( - pr_info, - test_results, - state, - stopwatch.duration_seconds, - stopwatch.start_time_str, - report_url, - check_name_with_group, - ) - ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) + JobReport( + description=description, + test_results=test_results, + status=state, + start_time=stopwatch.start_time_str, + duration=stopwatch.duration_seconds, + additional_files=additional_logs, + ).dump(to_file=args.report_to_file if args.report_to_file else None) - if state != "success": - if FORCE_TESTS_LABEL in pr_info.labels: - print(f"'{FORCE_TESTS_LABEL}' enabled, will report success") - else: - sys.exit(1) + if state != SUCCESS: + sys.exit(1) if __name__ == "__main__": diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 18b3d2c2898..751abf617fa 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -5,31 +5,27 @@ import csv import json import logging import os -import subprocess import sys from pathlib import Path from typing import Dict, List, Tuple from build_download_helper import download_all_deb_packages -from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse -from commit_status_helper import ( - get_commit, - override_status, - post_commit_status, - post_commit_status_to_file, -) -from docker_images_helper import DockerImage, get_docker_image, pull_image +from docker_images_helper import DockerImage, get_docker_image from download_release_packages import download_last_release from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH -from get_robot_token import get_best_robot_token -from github_helper import GitHub from integration_test_images import IMAGES from pr_info import PRInfo -from report import ERROR, TestResult, TestResults, read_test_results -from s3_helper import S3Helper +from report import ( + ERROR, + SUCCESS, + StatusType, + JobReport, + TestResult, + TestResults, + read_test_results, +) from stopwatch import Stopwatch from tee_popen import TeePopen -from upload_result_helper import upload_results def get_json_params_dict( @@ -84,7 +80,7 @@ def get_env_for_runner( def process_results( result_directory: Path, -) -> Tuple[str, str, TestResults, List[Path]]: +) -> Tuple[StatusType, str, TestResults, List[Path]]: test_results = [] # type: TestResults additional_files = [] # Just upload all files from result_directory. 
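
> Editorial aside (not part of the patch): the checks touched above stop posting commit statuses directly and instead dump a `JobReport`, optionally to a caller-supplied path via the new `--report-to-file` flag used together with `--validate-bugfix`. The sketch below shows that round trip using only the helpers visible in this diff (`JobReport.dump(to_file=...)`, `JobReport.load(from_file=...)`, `TestResult`, `Stopwatch`). It assumes it is run from `tests/ci` so the patched `report.py` and `stopwatch.py` are importable; the report path and test values are hypothetical placeholders.

```python
# Illustrative sketch only; assumes tests/ci is on the import path.
from pathlib import Path

from report import SUCCESS, JobReport, TestResult
from stopwatch import Stopwatch

stopwatch = Stopwatch()

# ... the check would run here and collect its results ...
test_results = [TestResult("example test", "OK")]

report_path = Path("/tmp/bugfix_validate_report.json")  # hypothetical path

# With --report-to-file the report goes to the given path instead of the
# default JOB_REPORT_FILE; dump(to_file=None) keeps the old behaviour.
JobReport(
    description="Example description",
    test_results=test_results,
    status=SUCCESS,
    start_time=stopwatch.start_time_str,
    duration=stopwatch.duration_seconds,
    additional_files=[],
).dump(to_file=report_path)

# A coordinating job can later pick the report up from the same path
# (assuming load() reconstructs the JobReport that dump() wrote).
loaded = JobReport.load(from_file=report_path)
print(loaded.status, loaded.description)
```
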
@@ -102,38 +98,41 @@ def process_results( if len(status) != 1 or len(status[0]) != 2: logging.info("Files in result folder %s", os.listdir(result_directory)) - return "error", "Invalid check_status.tsv", test_results, additional_files + return ERROR, "Invalid check_status.tsv", test_results, additional_files state, description = status[0][0], status[0][1] try: results_path = result_directory / "test_results.tsv" test_results = read_test_results(results_path, False) if len(test_results) == 0: - return "error", "Empty test_results.tsv", test_results, additional_files + return ERROR, "Empty test_results.tsv", test_results, additional_files except Exception as e: return ( - "error", + ERROR, f"Cannot parse test_results.tsv ({e})", test_results, additional_files, ) - return state, description, test_results, additional_files + return state, description, test_results, additional_files # type: ignore def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("check_name") + parser.add_argument( + "--run-tests", nargs="*", help="List of tests to run", default=None + ) parser.add_argument( "--validate-bugfix", action="store_true", help="Check that added tests failed on latest stable", ) parser.add_argument( - "--post-commit-status", - default="commit_status", - choices=["commit_status", "file"], - help="Where to public post commit status", + "--report-to-file", + type=str, + default="", + help="Path to write script report to (for --validate-bugfix)", ) return parser.parse_args() @@ -147,7 +146,6 @@ def main(): reports_path = Path(REPORT_PATH) temp_path.mkdir(parents=True, exist_ok=True) - post_commit_path = temp_path / "integration_commit_status.tsv" repo_path = Path(REPO_COPY) args = parse_args() @@ -160,39 +158,22 @@ def main(): if "RUN_BY_HASH_NUM" in os.environ: run_by_hash_num = int(os.getenv("RUN_BY_HASH_NUM", "0")) run_by_hash_total = int(os.getenv("RUN_BY_HASH_TOTAL", "0")) - check_name_with_group = ( - check_name + f" [{run_by_hash_num + 1}/{run_by_hash_total}]" - ) else: run_by_hash_num = 0 run_by_hash_total = 0 - check_name_with_group = check_name is_flaky_check = "flaky" in check_name + assert ( + not validate_bugfix_check or args.report_to_file + ), "--report-to-file must be provided for --validate-bugfix" + # For validate_bugfix_check we need up to date information about labels, so # pr_event_from_api is used - pr_info = PRInfo( - need_changed_files=is_flaky_check or validate_bugfix_check, - pr_event_from_api=validate_bugfix_check, - ) + pr_info = PRInfo(need_changed_files=is_flaky_check or validate_bugfix_check) - if validate_bugfix_check and "pr-bugfix" not in pr_info.labels: - if args.post_commit_status == "file": - post_commit_status_to_file( - post_commit_path, - f"Skipped (no pr-bugfix in {pr_info.labels})", - "success", - "null", - ) - logging.info("Skipping '%s' (no pr-bugfix in '%s')", check_name, pr_info.labels) - sys.exit(0) + images = [get_docker_image(image_) for image_ in IMAGES] - # FIXME: switch to JobReport and remove: - gh = GitHub(get_best_robot_token()) - commit = get_commit(gh, pr_info.sha) - - images = [pull_image(get_docker_image(i)) for i in IMAGES] result_path = temp_path / "output_dir" result_path.mkdir(parents=True, exist_ok=True) @@ -237,7 +218,7 @@ def main(): ), ) - ch_helper = ClickHouseHelper() + integration_infrastructure_fail = False with TeePopen(run_command, output_path_log, my_env) as process: retcode = process.wait() if retcode == 0: @@ -246,75 +227,33 @@ def main(): logging.warning( "There were issues with infrastructure. 
Not writing status report to restart job." ) - prepared_events = prepare_tests_results_for_clickhouse( - pr_info, - [ - TestResult( - "integration_infrastructure_fail", - "ERROR", - stopwatch.duration_seconds, - ) - ], - ERROR, - stopwatch.duration_seconds, - stopwatch.start_time_str, - "", - check_name_with_group, - ) - - ch_helper.insert_events_into( - db="default", table="checks", events=prepared_events - ) + integration_infrastructure_fail = True sys.exit(1) else: logging.info("Some tests failed") - subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) + # subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) - state, description, test_results, additional_logs = process_results(result_path) - state = override_status(state, check_name, invert=validate_bugfix_check) - - s3_helper = S3Helper() - report_url = upload_results( - s3_helper, - pr_info.number, - pr_info.sha, - test_results, - [output_path_log] + additional_logs, - check_name_with_group, - ) - - print(f"::notice:: {check_name} Report url: {report_url}") - if args.post_commit_status == "commit_status": - post_commit_status( - commit, - state, - report_url, - description, - check_name_with_group, - pr_info, - dump_to_file=True, - ) - elif args.post_commit_status == "file": - post_commit_status_to_file(post_commit_path, description, state, report_url) + if not integration_infrastructure_fail: + state, description, test_results, additional_logs = process_results(result_path) else: - raise Exception( - f'Unknown post_commit_status option "{args.post_commit_status}"' + state, description, test_results, additional_logs = ( + ERROR, + "no description", + [TestResult("infrastructure error", ERROR, stopwatch.duration_seconds)], + [], ) - prepared_events = prepare_tests_results_for_clickhouse( - pr_info, - test_results, - state, - stopwatch.duration_seconds, - stopwatch.start_time_str, - report_url, - check_name_with_group, - ) + JobReport( + description=description, + test_results=test_results, + status=state, + start_time=stopwatch.start_time_str, + duration=stopwatch.duration_seconds, + additional_files=[output_path_log] + additional_logs, + ).dump(to_file=args.report_to_file if args.report_to_file else None) - ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) - - if state == "failure": + if state != SUCCESS: sys.exit(1) diff --git a/tests/ci/jepsen_check.py b/tests/ci/jepsen_check.py index 93e33d62293..fb7540abda3 100644 --- a/tests/ci/jepsen_check.py +++ b/tests/ci/jepsen_check.py @@ -5,29 +5,26 @@ import logging import os import sys import time - from pathlib import Path from typing import Any, List import boto3 # type: ignore import requests # type: ignore - from build_download_helper import ( download_build_with_progress, get_build_name_for_check, read_build_urls, ) from compress_files import compress_fast -from env_helper import REPO_COPY, REPORT_PATH, S3_URL, TEMP_PATH, S3_BUILDS_BUCKET +from env_helper import REPO_COPY, REPORT_PATH, S3_BUILDS_BUCKET, S3_URL, TEMP_PATH from get_robot_token import get_parameter_from_ssm from git_helper import git_runner from pr_info import PRInfo -from report import JobReport, TestResults, TestResult +from report import FAILURE, SUCCESS, JobReport, TestResult, TestResults from ssh import SSHKey from stopwatch import Stopwatch from tee_popen import TeePopen - JEPSEN_GROUP_NAME = "jepsen_group" KEEPER_DESIRED_INSTANCE_COUNT = 3 @@ -263,21 +260,21 @@ def main(): else: logging.info("Run failed") - status = "success" + 
status = SUCCESS description = "No invalid analysis found ヽ(‘ー`)ノ" jepsen_log_path = result_path / "jepsen_run_all_tests.log" additional_data = [] try: test_result = _parse_jepsen_output(jepsen_log_path) if any(r.status == "FAIL" for r in test_result): - status = "failure" + status = FAILURE description = "Found invalid analysis (ノಥ益ಥ)ノ ┻━┻" compress_fast(result_path / "store", result_path / "jepsen_store.tar.zst") additional_data.append(result_path / "jepsen_store.tar.zst") except Exception as ex: print("Exception", ex) - status = "failure" + status = FAILURE description = "No Jepsen output log" test_result = [TestResult("No Jepsen output log", "FAIL")] diff --git a/tests/ci/lambda_shared_package/lambda_shared/pr.py b/tests/ci/lambda_shared_package/lambda_shared/pr.py index 4872ecb4d59..1b4f827cc0a 100644 --- a/tests/ci/lambda_shared_package/lambda_shared/pr.py +++ b/tests/ci/lambda_shared_package/lambda_shared/pr.py @@ -43,6 +43,8 @@ TRUSTED_CONTRIBUTORS = { "tsolodov", # ClickHouse, Inc "kitaisreal", "k-morozov", # Konstantin Morozov, Yandex Cloud + "justindeguzman", # ClickHouse, Inc + "jrdi", # ClickHouse contributor, TinyBird ] } diff --git a/tests/ci/mark_release_ready.py b/tests/ci/mark_release_ready.py index 0ad4b2bd2ed..31415fef9c0 100755 --- a/tests/ci/mark_release_ready.py +++ b/tests/ci/mark_release_ready.py @@ -7,10 +7,11 @@ import os from commit_status_helper import get_commit, post_commit_status from env_helper import GITHUB_JOB_URL from get_robot_token import get_best_robot_token +from git_helper import commit as commit_arg from github_helper import GitHub from pr_info import PRInfo from release import RELEASE_READY_STATUS -from git_helper import commit as commit_arg +from report import SUCCESS def main(): @@ -50,12 +51,11 @@ def main(): gh.get_rate_limit() post_commit_status( commit, - "success", + SUCCESS, url, description, RELEASE_READY_STATUS, pr_info, - dump_to_file=True, ) diff --git a/tests/ci/merge_pr.py b/tests/ci/merge_pr.py index 772821f4960..cc92fe4f42c 100644 --- a/tests/ci/merge_pr.py +++ b/tests/ci/merge_pr.py @@ -4,21 +4,23 @@ import argparse import logging - from datetime import datetime from os import getenv from pprint import pformat from typing import Dict, List +# isort: off from github.PaginatedList import PaginatedList from github.PullRequestReview import PullRequestReview from github.WorkflowRun import WorkflowRun +# isort: on + from commit_status_helper import get_commit_filtered_statuses from get_robot_token import get_best_robot_token from github_helper import GitHub, NamedUser, PullRequest, Repository from pr_info import PRInfo - +from report import SUCCESS # The team name for accepted approvals TEAM_NAME = getenv("GITHUB_TEAM_NAME", "core") @@ -269,7 +271,7 @@ def main(): failed_statuses = [ status.context for status in get_commit_filtered_statuses(commit) - if status.state != "success" + if status.state != SUCCESS ] if failed_statuses: logging.warning( diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py index 524da916a5e..f0af15397c7 100644 --- a/tests/ci/performance_comparison_check.py +++ b/tests/ci/performance_comparison_check.py @@ -1,35 +1,38 @@ #!/usr/bin/env python3 -import os -import logging -import sys import json -import subprocess -import traceback +import logging +import os import re +import subprocess +import sys +import traceback from pathlib import Path +# isort: off from github import Github -from commit_status_helper import get_commit +# isort: on + +from build_download_helper 
import download_builds_filter from ci_config import CI_CONFIG -from docker_images_helper import pull_image, get_docker_image +from clickhouse_helper import get_instance_id, get_instance_type +from commit_status_helper import get_commit +from docker_images_helper import get_docker_image, pull_image from env_helper import ( GITHUB_EVENT_PATH, GITHUB_RUN_URL, REPO_COPY, + REPORT_PATH, S3_BUILDS_BUCKET, S3_DOWNLOAD, TEMP_PATH, - REPORT_PATH, ) from get_robot_token import get_best_robot_token, get_parameter_from_ssm from pr_info import PRInfo -from tee_popen import TeePopen -from clickhouse_helper import get_instance_type, get_instance_id +from report import FAILURE, SUCCESS, JobReport from stopwatch import Stopwatch -from build_download_helper import download_builds_filter -from report import JobReport +from tee_popen import TeePopen IMAGE_NAME = "clickhouse/performance-comparison" @@ -223,20 +226,20 @@ def main(): message = message_match.group(1).strip() # TODO: Remove me, always green mode for the first time, unless errors - status = "success" + status = SUCCESS if "errors" in message.lower() or too_many_slow(message.lower()): - status = "failure" + status = FAILURE # TODO: Remove until here except Exception: traceback.print_exc() - status = "failure" + status = FAILURE message = "Failed to parse the report." if not status: - status = "failure" + status = FAILURE message = "No status in report." elif not message: - status = "failure" + status = FAILURE message = "No message in report." JobReport( @@ -249,7 +252,7 @@ def main(): check_name=check_name_with_group, ).dump() - if status == "error": + if status != SUCCESS: sys.exit(1) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index d1be459666f..70f358e8070 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -19,7 +19,6 @@ from env_helper import ( GITHUB_SERVER_URL, ) -FORCE_TESTS_LABEL = "force tests" SKIP_MERGEABLE_CHECK_LABEL = "skip mergeable check" NeedsDataType = Dict[str, Dict[str, Union[str, Dict[str, str]]]] @@ -287,7 +286,10 @@ class PRInfo: self.fetch_changed_files() def is_master(self) -> bool: - return self.number == 0 and self.base_ref == "master" + return self.number == 0 and self.head_ref == "master" + + def is_release_branch(self) -> bool: + return self.number == 0 def is_scheduled(self): return self.event_type == EventType.SCHEDULE diff --git a/tests/ci/release.py b/tests/ci/release.py index f96845dad95..2b3331938e7 100755 --- a/tests/ci/release.py +++ b/tests/ci/release.py @@ -18,6 +18,7 @@ from contextlib import contextmanager from typing import Any, Final, Iterator, List, Optional, Tuple from git_helper import Git, commit, release_branch +from report import SUCCESS from version_helper import ( FILE_WITH_VERSION_PATH, GENERATED_CONTRIBUTORS, @@ -142,7 +143,7 @@ class Release: for status in statuses: if status["context"] == RELEASE_READY_STATUS: - if not status["state"] == "success": + if not status["state"] == SUCCESS: raise Exception( f"the status {RELEASE_READY_STATUS} is {status['state']}" ", not success" diff --git a/tests/ci/report.py b/tests/ci/report.py index b478f737963..282c343eec3 100644 --- a/tests/ci/report.py +++ b/tests/ci/report.py @@ -1,6 +1,12 @@ # -*- coding: utf-8 -*- +import csv +import datetime +import json +import logging +import os from ast import literal_eval from dataclasses import asdict, dataclass +from html import escape from pathlib import Path from typing import ( Dict, @@ -13,17 +19,11 @@ from typing import ( Tuple, Union, ) -from html import escape -import csv -import 
datetime -import json -import logging -import os from build_download_helper import get_gh_api -from ci_config import BuildConfig, CI_CONFIG +from ci_config import CI_CONFIG, BuildConfig from env_helper import REPORT_PATH, TEMP_PATH - +from ci_utils import normalize_string logger = logging.getLogger(__name__) @@ -34,28 +34,31 @@ SUCCESS: Final = "success" OK: Final = "OK" FAIL: Final = "FAIL" +SKIPPED: Final = "SKIPPED" StatusType = Literal["error", "failure", "pending", "success"] +STATUSES = [ERROR, FAILURE, PENDING, SUCCESS] # type: List[StatusType] + + # The order of statuses from the worst to the best -_STATES = {ERROR: 0, FAILURE: 1, PENDING: 2, SUCCESS: 3} +def _state_rank(status: str) -> int: + "return the index of status or index of SUCCESS in case of wrong status" + try: + return STATUSES.index(status) # type: ignore + except ValueError: + return 3 -def get_worst_status(statuses: Iterable[str]) -> str: - worst_status = None +def get_worst_status(statuses: Iterable[str]) -> StatusType: + worst_status = SUCCESS # type: StatusType for status in statuses: - if _STATES.get(status) is None: - continue - if worst_status is None: - worst_status = status - continue - if _STATES.get(status) < _STATES.get(worst_status): - worst_status = status + ind = _state_rank(status) + if ind < _state_rank(worst_status): + worst_status = STATUSES[ind] if worst_status == ERROR: break - if worst_status is None: - return "" return worst_status @@ -290,9 +293,10 @@ class JobReport: return JOB_REPORT_FILE.is_file() @classmethod - def load(cls): # type: ignore + def load(cls, from_file=None): # type: ignore res = {} - with open(JOB_REPORT_FILE, "r") as json_file: + from_file = from_file or JOB_REPORT_FILE + with open(from_file, "r") as json_file: res = json.load(json_file) # Deserialize the nested lists of TestResult test_results_data = res.get("test_results", []) @@ -305,13 +309,14 @@ class JobReport: if JOB_REPORT_FILE.exists(): JOB_REPORT_FILE.unlink() - def dump(self): + def dump(self, to_file=None): def path_converter(obj): if isinstance(obj, Path): return str(obj) raise TypeError("Type not serializable") - with open(JOB_REPORT_FILE, "w") as json_file: + to_file = to_file or JOB_REPORT_FILE + with open(to_file, "w") as json_file: json.dump(asdict(self), json_file, default=path_converter, indent=2) @@ -448,6 +453,12 @@ class BuildResult: return self._wrong_config_message return self.build_config.sanitizer + @property + def coverage(self) -> str: + if self.build_config is None: + return self._wrong_config_message + return str(self.build_config.coverage) + @property def grouped_urls(self) -> List[List[str]]: "Combine and preserve build_urls by artifact types" @@ -549,7 +560,7 @@ class BuildResult: def write_json(self, directory: Union[Path, str] = REPORT_PATH) -> Path: path = Path(directory) / self.get_report_name( - self.build_name, self.pr_number or self.head_ref + self.build_name, self.pr_number or normalize_string(self.head_ref) ) path.write_text( json.dumps( @@ -586,7 +597,6 @@ class ReportColorTheme: blue = "#00B4FF" default = (ReportColor.green, ReportColor.red, ReportColor.yellow) - bugfixcheck = (ReportColor.yellow, ReportColor.blue, ReportColor.blue) ColorTheme = Tuple[str, str, str] @@ -774,6 +784,7 @@ HTML_BASE_BUILD_TEMPLATE = ( + @@ -815,6 +826,8 @@ def create_build_html_report( else: row.append("") + row.append(f"") + if build_result.status: style = _get_status_style(build_result.status) row.append(f'') diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 108aa7d1946..09d50c902d8 
100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 import atexit -import sys import logging +import sys from typing import Tuple +# isort: off from github import Github +# isort: on + from commit_status_helper import ( CI_STATUS_NAME, create_ci_report, @@ -18,13 +21,13 @@ from commit_status_helper import ( ) from env_helper import GITHUB_REPOSITORY, GITHUB_SERVER_URL from get_robot_token import get_best_robot_token -from pr_info import FORCE_TESTS_LABEL, PRInfo from lambda_shared_package.lambda_shared.pr import ( CATEGORY_TO_LABEL, TRUSTED_CONTRIBUTORS, check_pr_description, ) -from report import FAILURE +from pr_info import PRInfo +from report import FAILURE, PENDING TRUSTED_ORG_IDS = { 54801242, # clickhouse @@ -63,9 +66,6 @@ def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): def should_run_ci_for_pr(pr_info: PRInfo) -> Tuple[bool, str]: # Consider the labels and whether the user is trusted. print("Got labels", pr_info.labels) - if FORCE_TESTS_LABEL in pr_info.labels: - print(f"Label '{FORCE_TESTS_LABEL}' set, forcing remaining checks") - return True, f"Labeled '{FORCE_TESTS_LABEL}'" if OK_SKIP_LABELS.intersection(pr_info.labels): return True, "Don't try new checks for release/backports/cherry-picks" @@ -146,7 +146,7 @@ def main(): ) post_commit_status( commit, - "failure", + FAILURE, url, format_description(description_error), PR_CHECK, @@ -170,6 +170,14 @@ def main(): # allow the workflow to continue if not can_run: + post_commit_status( + commit, + FAILURE, + "", + description, + PR_CHECK, + pr_info, + ) print("::notice ::Cannot run") sys.exit(1) @@ -177,7 +185,7 @@ def main(): print("::notice ::Can run") post_commit_status( commit, - "pending", + PENDING, ci_report_url, description, CI_STATUS_NAME, diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 616d645b5a6..bff53f00ad3 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -107,6 +107,9 @@ class S3Helper: logging.info("Upload %s to %s. 
Meta: %s", file_path, url, metadata) return url + def delete_file_from_s3(self, bucket_name: str, s3_path: str) -> None: + self.client.delete_object(Bucket=bucket_name, Key=s3_path) + def upload_test_report_to_s3(self, file_path: Path, s3_path: str) -> str: if CI: return self._upload_file_to_s3(S3_TEST_REPORTS_BUCKET, file_path, s3_path) diff --git a/tests/ci/sqlancer_check.py b/tests/ci/sqlancer_check.py index f85ab2be9a3..59d2a3d6275 100644 --- a/tests/ci/sqlancer_check.py +++ b/tests/ci/sqlancer_check.py @@ -7,12 +7,9 @@ import sys from pathlib import Path from build_download_helper import get_build_name_for_check, read_build_urls -from docker_images_helper import DockerImage, pull_image, get_docker_image -from env_helper import ( - REPORT_PATH, - TEMP_PATH, -) -from report import JobReport, TestResults, TestResult +from docker_images_helper import DockerImage, get_docker_image, pull_image +from env_helper import REPORT_PATH, TEMP_PATH +from report import FAILURE, SUCCESS, JobReport, TestResult, TestResults from stopwatch import Stopwatch from tee_popen import TeePopen @@ -94,7 +91,7 @@ def main(): paths += [workspace_path / f"{t}.err" for t in tests] paths += [workspace_path / f"{t}.out" for t in tests] - status = "success" + status = SUCCESS test_results = [] # type: TestResults # Try to get status message saved by the SQLancer try: @@ -109,7 +106,7 @@ def main(): with open(workspace_path / "description.txt", "r", encoding="utf-8") as desc_f: description = desc_f.readline().rstrip("\n") except: - status = "failure" + status = FAILURE description = "Task failed: $?=" + str(retcode) if not test_results: diff --git a/tests/ci/sqllogic_test.py b/tests/ci/sqllogic_test.py index a7b3e3cf69e..e9a109e425e 100755 --- a/tests/ci/sqllogic_test.py +++ b/tests/ci/sqllogic_test.py @@ -9,23 +9,22 @@ from pathlib import Path from typing import Tuple from build_download_helper import download_all_deb_packages -from commit_status_helper import override_status from docker_images_helper import DockerImage, pull_image, get_docker_image from env_helper import REPORT_PATH, TEMP_PATH, REPO_COPY from report import ( - OK, - FAIL, ERROR, + FAIL, + OK, SUCCESS, JobReport, - TestResults, + StatusType, TestResult, + TestResults, read_test_results, ) from stopwatch import Stopwatch from tee_popen import TeePopen - NO_CHANGES_MSG = "Nothing to run" IMAGE_NAME = "clickhouse/sqllogic-test" @@ -47,7 +46,7 @@ def get_run_command( ) -def read_check_status(result_folder: Path) -> Tuple[str, str]: +def read_check_status(result_folder: Path) -> Tuple[StatusType, str]: status_path = result_folder / "check_status.tsv" if not status_path.exists(): return ERROR, "Not found check_status.tsv" @@ -60,9 +59,9 @@ def read_check_status(result_folder: Path) -> Tuple[str, str]: if len(row) != 2: return ERROR, "Invalid check_status.tsv" if row[0] != SUCCESS: - return row[0], row[1] + return row[0], row[1] # type: ignore - return status_rows[-1][0], status_rows[-1][1] + return status_rows[-1][0], status_rows[-1][1] # type: ignore def parse_args() -> argparse.Namespace: @@ -163,7 +162,7 @@ def main(): status, description = ERROR, "Empty test_results.tsv" assert status is not None - status = override_status(status, check_name) + test_results.append( TestResult( "All tests", @@ -172,7 +171,7 @@ def main(): ) ) - # Until it pass all tests, do not block CI, report "success" + # Until it pass all tests, do not block CI, report SUCCESS assert description is not None # FIXME: force SUCCESS until all cases are fixed status = SUCCESS diff --git 
a/tests/ci/sqltest.py b/tests/ci/sqltest.py index b2105d4f5c0..2fe6aabd69c 100644 --- a/tests/ci/sqltest.py +++ b/tests/ci/sqltest.py @@ -1,21 +1,16 @@ #!/usr/bin/env python3 import logging -import subprocess import os +import subprocess import sys from pathlib import Path -from typing import Dict - from build_download_helper import get_build_name_for_check, read_build_urls -from docker_images_helper import pull_image, get_docker_image -from env_helper import ( - REPORT_PATH, - TEMP_PATH, -) +from docker_images_helper import get_docker_image, pull_image +from env_helper import REPORT_PATH, TEMP_PATH from pr_info import PRInfo -from report import JobReport, TestResult +from report import SUCCESS, JobReport, TestResult from stopwatch import Stopwatch IMAGE_NAME = "clickhouse/sqltest" @@ -98,7 +93,7 @@ def main(): "report.html": workspace_path / "report.html", "test.log": workspace_path / "test.log", } - status = "success" + status = SUCCESS description = "See the report" test_results = [TestResult(description, "OK")] diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index 46bb2261aba..49c1515c69f 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -10,11 +10,10 @@ from typing import List, Tuple from build_download_helper import download_all_deb_packages from clickhouse_helper import CiLogsCredentials - -from docker_images_helper import DockerImage, pull_image, get_docker_image -from env_helper import REPORT_PATH, TEMP_PATH, REPO_COPY +from docker_images_helper import DockerImage, get_docker_image, pull_image +from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH from pr_info import PRInfo -from report import JobReport, TestResult, TestResults, read_test_results +from report import ERROR, JobReport, TestResult, TestResults, read_test_results from stopwatch import Stopwatch from tee_popen import TeePopen @@ -89,7 +88,7 @@ def process_results( status = list(csv.reader(status_file, delimiter="\t")) if len(status) != 1 or len(status[0]) != 2: - return "error", "Invalid check_status.tsv", test_results, additional_files + return ERROR, "Invalid check_status.tsv", test_results, additional_files state, description = status[0][0], status[0][1] try: @@ -99,7 +98,7 @@ def process_results( raise Exception("Empty results") except Exception as e: return ( - "error", + ERROR, f"Cannot parse test_results.tsv ({e})", test_results, additional_files, diff --git a/tests/ci/style_check.py b/tests/ci/style_check.py index 4f791a5ee01..0c7160aeea4 100644 --- a/tests/ci/style_check.py +++ b/tests/ci/style_check.py @@ -8,17 +8,14 @@ import sys from pathlib import Path from typing import List, Tuple - from docker_images_helper import get_docker_image, pull_image from env_helper import REPO_COPY, TEMP_PATH from git_helper import GIT_PREFIX, git_runner from pr_info import PRInfo -from report import JobReport, TestResults, read_test_results +from report import ERROR, FAILURE, SUCCESS, JobReport, TestResults, read_test_results from ssh import SSHKey from stopwatch import Stopwatch -NAME = "Style Check" - def process_result( result_directory: Path, @@ -39,7 +36,7 @@ def process_result( status = list(csv.reader(status_file, delimiter="\t")) if len(status) != 1 or len(status[0]) != 2: logging.info("Files in result folder %s", os.listdir(result_directory)) - return "error", "Invalid check_status.tsv", test_results, additional_files + return ERROR, "Invalid check_status.tsv", test_results, additional_files state, description = status[0][0], status[0][1] try: @@ -50,8 +47,8 @@ def 
process_result( return state, description, test_results, additional_files except Exception: - if state == "success": - state, description = "error", "Failed to read test_results.tsv" + if state == SUCCESS: + state, description = ERROR, "Failed to read test_results.tsv" return state, description, test_results, additional_files @@ -164,7 +161,7 @@ def main(): additional_files=additional_files, ).dump() - if state in ["error", "failure"]: + if state in [ERROR, FAILURE]: print(f"Style check failed: [{description}]") sys.exit(1) diff --git a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh index a55c1bb2b3b..6ba0987010a 100644 --- a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh +++ b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh @@ -17,24 +17,49 @@ DOCKER_IMAGE="public.ecr.aws/lambda/python:${PY_VERSION}" LAMBDA_NAME=${DIR_NAME//_/-} # The name of directory with lambda code PACKAGE=lambda-package + +# Do not rebuild and deploy the archive if it's newer than sources +if [ -e "$PACKAGE.zip" ] && [ -z "$FORCE" ]; then + REBUILD="" + for src in app.py build_and_deploy_archive.sh requirements.txt lambda_shared/*; do + if [ "$src" -nt "$PACKAGE.zip" ]; then + REBUILD=1 + fi + done + [ -n "$REBUILD" ] || exit 0 +fi + rm -rf "$PACKAGE" "$PACKAGE".zip mkdir "$PACKAGE" cp app.py "$PACKAGE" if [ -f requirements.txt ]; then VENV=lambda-venv - rm -rf "$VENV" lambda-package.zip + rm -rf "$VENV" docker run --net=host --rm --user="${UID}" -e HOME=/tmp --entrypoint=/bin/bash \ --volume="${WORKDIR}/..:/ci" --workdir="/ci/${DIR_NAME}" "${DOCKER_IMAGE}" \ -exc " '$PY_EXEC' -m venv '$VENV' && source '$VENV/bin/activate' && - pip install -r requirements.txt + pip install -r requirements.txt && + # To have consistent pyc files + find '$VENV/lib' -name '*.pyc' -delete + find '$VENV/lib' ! -type d -exec touch -t 201212121212 {} + + python -m compileall " cp -rT "$VENV/lib/$PY_EXEC/site-packages/" "$PACKAGE" rm -r "$PACKAGE"/{pip,pip-*,setuptools,setuptools-*} + # zip stores metadata about timestamps + find "$PACKAGE" ! -type d -exec touch -t 201212121212 {} + fi -( cd "$PACKAGE" && zip -9 -r ../"$PACKAGE".zip . ) +( + export LC_ALL=c + cd "$PACKAGE" + # zip uses random files order by default, so we sort the files alphabetically + find . ! 
-type d -print0 | sort -z | tr '\0' '\n' | zip -XD -0 ../"$PACKAGE".zip --names-stdin +) -if [ -z "$DRY_RUN" ]; then - aws lambda update-function-code --function-name "$LAMBDA_NAME" --zip-file fileb://"$WORKDIR/$PACKAGE".zip +ECHO=() +if [ -n "$DRY_RUN" ]; then + ECHO=(echo Run the following command to push the changes:) fi +"${ECHO[@]}" aws lambda update-function-code --function-name "$LAMBDA_NAME" --zip-file fileb://"$WORKDIR/$PACKAGE".zip diff --git a/tests/ci/test_ci_cache.py b/tests/ci/test_ci_cache.py new file mode 100644 index 00000000000..3cdd6c78390 --- /dev/null +++ b/tests/ci/test_ci_cache.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python + +from hashlib import md5 +from pathlib import Path +import shutil +from typing import Dict, Set +import unittest +from ci_config import Build, JobNames +from s3_helper import S3Helper +from ci import CiCache +from digest_helper import JOB_DIGEST_LEN +from commit_status_helper import CommitStatusData +from env_helper import S3_BUILDS_BUCKET, TEMP_PATH + + +def _create_mock_digest_1(string): + return md5((string).encode("utf-8")).hexdigest()[:JOB_DIGEST_LEN] + + +def _create_mock_digest_2(string): + return md5((string + "+nonce").encode("utf-8")).hexdigest()[:JOB_DIGEST_LEN] + + +DIGESTS = {job: _create_mock_digest_1(job) for job in JobNames} +DIGESTS2 = {job: _create_mock_digest_2(job) for job in JobNames} + + +# pylint:disable=protected-access +class S3HelperTestMock(S3Helper): + def __init__(self) -> None: + super().__init__() + self.files_on_s3_paths = {} # type: Dict[str, Set[str]] + + # local path which is mocking remote s3 path with ci_cache + self.mock_remote_s3_path = Path(TEMP_PATH) / "mock_s3_path" + if not self.mock_remote_s3_path.exists(): + self.mock_remote_s3_path.mkdir(parents=True, exist_ok=True) + for file in self.mock_remote_s3_path.iterdir(): + file.unlink() + + def list_prefix(self, s3_prefix_path, bucket=S3_BUILDS_BUCKET): + assert bucket == S3_BUILDS_BUCKET + file_prefix = Path(s3_prefix_path).name + path = str(Path(s3_prefix_path).parent) + return [ + path + "/" + file + for file in self.files_on_s3_paths[path] + if file.startswith(file_prefix) + ] + + def upload_file(self, bucket, file_path, s3_path): + assert bucket == S3_BUILDS_BUCKET + file_name = Path(file_path).name + assert ( + file_name in s3_path + ), f"Record file name [{file_name}] must be part of a path on s3 [{s3_path}]" + s3_path = str(Path(s3_path).parent) + if s3_path in self.files_on_s3_paths: + self.files_on_s3_paths[s3_path].add(file_name) + else: + self.files_on_s3_paths[s3_path] = set([file_name]) + shutil.copy(file_path, self.mock_remote_s3_path) + + def download_files(self, bucket, s3_path, file_suffix, local_directory): + assert bucket == S3_BUILDS_BUCKET + assert file_suffix == CiCache._RECORD_FILE_EXTENSION + assert local_directory == CiCache._LOCAL_CACHE_PATH + assert CiCache._S3_CACHE_PREFIX in s3_path + assert [job_type.value in s3_path for job_type in CiCache.JobType] + + # copying from mock remote path to local cache + for remote_record in self.mock_remote_s3_path.glob(f"*{file_suffix}"): + destination_file = CiCache._LOCAL_CACHE_PATH / remote_record.name + shutil.copy(remote_record, destination_file) + + +# pylint:disable=protected-access +class TestCiCache(unittest.TestCase): + def test_cache(self): + s3_mock = S3HelperTestMock() + ci_cache = CiCache(s3_mock, DIGESTS) + # immitate another CI run is using cache + ci_cache_2 = CiCache(s3_mock, DIGESTS2) + NUM_BATCHES = 10 + + DOCS_JOBS_NUM = 1 + assert len(set(job for job in JobNames)) == 
len(list(job for job in JobNames)) + NONDOCS_JOBS_NUM = len(set(job for job in JobNames)) - DOCS_JOBS_NUM + + PR_NUM = 123456 + status = CommitStatusData( + status="success", + report_url="dummy url", + description="OK OK OK", + sha="deadbeaf2", + pr_num=PR_NUM, + ) + + ### add some pending statuses for two batches, non-release branch + for job in JobNames: + ci_cache.push_pending(job, [0, 1, 2], NUM_BATCHES, release_branch=False) + ci_cache_2.push_pending(job, [0, 1, 2], NUM_BATCHES, release_branch=False) + + ### add success status for 0 batch, non-release branch + batch = 0 + for job in JobNames: + ci_cache.push_successful( + job, batch, NUM_BATCHES, status, release_branch=False + ) + ci_cache_2.push_successful( + job, batch, NUM_BATCHES, status, release_branch=False + ) + + ### add failed status for 2 batch, non-release branch + batch = 2 + for job in JobNames: + ci_cache.push_failed(job, batch, NUM_BATCHES, status, release_branch=False) + ci_cache_2.push_failed( + job, batch, NUM_BATCHES, status, release_branch=False + ) + + ### check all expected directories were created on s3 mock + expected_build_path_1 = f"{CiCache.JobType.SRCS.value}-{_create_mock_digest_1(Build.PACKAGE_RELEASE)}" + expected_docs_path_1 = ( + f"{CiCache.JobType.DOCS.value}-{_create_mock_digest_1(JobNames.DOCS_CHECK)}" + ) + expected_build_path_2 = f"{CiCache.JobType.SRCS.value}-{_create_mock_digest_2(Build.PACKAGE_RELEASE)}" + expected_docs_path_2 = ( + f"{CiCache.JobType.DOCS.value}-{_create_mock_digest_2(JobNames.DOCS_CHECK)}" + ) + self.assertCountEqual( + list(s3_mock.files_on_s3_paths.keys()), + [ + f"{CiCache._S3_CACHE_PREFIX}/{expected_build_path_1}", + f"{CiCache._S3_CACHE_PREFIX}/{expected_docs_path_1}", + f"{CiCache._S3_CACHE_PREFIX}/{expected_build_path_2}", + f"{CiCache._S3_CACHE_PREFIX}/{expected_docs_path_2}", + ], + ) + + ### check number of cache files is as expected + FILES_PER_JOB = 5 # 1 successful + 1 failed + 3 pending batches = 5 + self.assertEqual( + len( + s3_mock.files_on_s3_paths[ + f"{CiCache._S3_CACHE_PREFIX}/{expected_build_path_1}" + ] + ), + NONDOCS_JOBS_NUM * FILES_PER_JOB, + ) + self.assertEqual( + len( + s3_mock.files_on_s3_paths[ + f"{CiCache._S3_CACHE_PREFIX}/{expected_docs_path_1}" + ] + ), + DOCS_JOBS_NUM * FILES_PER_JOB, + ) + self.assertEqual( + len( + s3_mock.files_on_s3_paths[ + f"{CiCache._S3_CACHE_PREFIX}/{expected_build_path_2}" + ] + ), + NONDOCS_JOBS_NUM * FILES_PER_JOB, + ) + self.assertEqual( + len( + s3_mock.files_on_s3_paths[ + f"{CiCache._S3_CACHE_PREFIX}/{expected_docs_path_2}" + ] + ), + DOCS_JOBS_NUM * FILES_PER_JOB, + ) + + ### check statuses for all jobs in cache + for job in JobNames: + self.assertEqual( + ci_cache.is_successful(job, 0, NUM_BATCHES, release_branch=False), True + ) + self.assertEqual( + ci_cache.is_successful(job, 0, NUM_BATCHES, release_branch=True), False + ) + self.assertEqual( + ci_cache.is_successful( + job, batch=1, num_batches=NUM_BATCHES, release_branch=False + ), + False, + ) # false - it's pending + self.assertEqual( + ci_cache.is_successful( + job, + batch=NUM_BATCHES, + num_batches=NUM_BATCHES, + release_branch=False, + ), + False, + ) # false - no such record + self.assertEqual( + ci_cache.is_pending(job, 0, NUM_BATCHES, release_branch=False), False + ) # false, it's successful, success has more priority than pending + self.assertEqual( + ci_cache.is_pending(job, 1, NUM_BATCHES, release_branch=False), True + ) # true + self.assertEqual( + ci_cache.is_pending(job, 1, NUM_BATCHES, release_branch=True), False + ) # false, not 
pending job on release_branch + + status2 = ci_cache.get_successful(job, 0, NUM_BATCHES) + assert status2 and status2.pr_num == PR_NUM + status2 = ci_cache.get_successful(job, 1, NUM_BATCHES) + assert status2 is None + + ### add some more pending statuses for two batches and for a release branch + for job in JobNames: + ci_cache.push_pending( + job, batches=[0, 1], num_batches=NUM_BATCHES, release_branch=True + ) + + ### add success statuses for 0 batch and release branch + PR_NUM = 234 + status = CommitStatusData( + status="success", + report_url="dummy url", + description="OK OK OK", + sha="deadbeaf2", + pr_num=PR_NUM, + ) + for job in JobNames: + ci_cache.push_successful(job, 0, NUM_BATCHES, status, release_branch=True) + + ### check number of cache files is as expected + FILES_PER_JOB = 8 # 1 successful + 1 failed + 1 successful_release + 3 pending batches + 2 pending batches release = 8 + self.assertEqual( + len( + s3_mock.files_on_s3_paths[ + f"{CiCache._S3_CACHE_PREFIX}/{expected_build_path_1}" + ] + ), + NONDOCS_JOBS_NUM * FILES_PER_JOB, + ) + self.assertEqual( + len( + s3_mock.files_on_s3_paths[ + f"{CiCache._S3_CACHE_PREFIX}/{expected_docs_path_1}" + ] + ), + DOCS_JOBS_NUM * FILES_PER_JOB, + ) + + ### check statuses + for job in JobNames: + self.assertEqual(ci_cache.is_successful(job, 0, NUM_BATCHES, False), True) + self.assertEqual(ci_cache.is_successful(job, 0, NUM_BATCHES, True), True) + self.assertEqual(ci_cache.is_successful(job, 1, NUM_BATCHES, False), False) + self.assertEqual(ci_cache.is_successful(job, 1, NUM_BATCHES, True), False) + self.assertEqual( + ci_cache.is_pending(job, 0, NUM_BATCHES, False), False + ) # it's success, not pending + self.assertEqual( + ci_cache.is_pending(job, 0, NUM_BATCHES, True), False + ) # it's success, not pending + self.assertEqual(ci_cache.is_pending(job, 1, NUM_BATCHES, False), True) + self.assertEqual(ci_cache.is_pending(job, 1, NUM_BATCHES, True), True) + + self.assertEqual(ci_cache.is_failed(job, 2, NUM_BATCHES, False), True) + self.assertEqual(ci_cache.is_failed(job, 2, NUM_BATCHES, True), False) + + status2 = ci_cache.get_successful(job, 0, NUM_BATCHES) + assert status2 and status2.pr_num == PR_NUM + status2 = ci_cache.get_successful(job, 1, NUM_BATCHES) + assert status2 is None + + ### create new cache object and verify the same checks + ci_cache = CiCache(s3_mock, DIGESTS) + for job in JobNames: + self.assertEqual(ci_cache.is_successful(job, 0, NUM_BATCHES, False), True) + self.assertEqual(ci_cache.is_successful(job, 0, NUM_BATCHES, True), True) + self.assertEqual(ci_cache.is_successful(job, 1, NUM_BATCHES, False), False) + self.assertEqual(ci_cache.is_successful(job, 1, NUM_BATCHES, True), False) + self.assertEqual( + ci_cache.is_pending(job, 0, NUM_BATCHES, False), False + ) # it's success, not pending + self.assertEqual( + ci_cache.is_pending(job, 0, NUM_BATCHES, True), False + ) # it's success, not pending + self.assertEqual(ci_cache.is_pending(job, 1, NUM_BATCHES, False), True) + self.assertEqual(ci_cache.is_pending(job, 1, NUM_BATCHES, True), True) + + self.assertEqual(ci_cache.is_failed(job, 2, NUM_BATCHES, False), True) + self.assertEqual(ci_cache.is_failed(job, 2, NUM_BATCHES, True), False) + + # is_pending() is false for failed jobs batches + self.assertEqual(ci_cache.is_pending(job, 2, NUM_BATCHES, False), False) + self.assertEqual(ci_cache.is_pending(job, 2, NUM_BATCHES, True), False) + + status2 = ci_cache.get_successful(job, 0, NUM_BATCHES) + assert status2 and status2.pr_num == PR_NUM + status2 = 
ci_cache.get_successful(job, 1, NUM_BATCHES) + assert status2 is None + + ### check some job values which are not in the cache + self.assertEqual(ci_cache.is_successful(job, 0, NUM_BATCHES + 1, False), False) + self.assertEqual( + ci_cache.is_successful(job, NUM_BATCHES - 1, NUM_BATCHES, False), False + ) + self.assertEqual(ci_cache.is_pending(job, 0, NUM_BATCHES + 1, False), False) + self.assertEqual( + ci_cache.is_pending(job, NUM_BATCHES - 1, NUM_BATCHES, False), False + ) + + +if __name__ == "__main__": + TestCiCache().test_cache() diff --git a/tests/ci/test_ci_config.py b/tests/ci/test_ci_config.py index d22ed16748e..49d49d9c328 100644 --- a/tests/ci/test_ci_config.py +++ b/tests/ci/test_ci_config.py @@ -3,7 +3,7 @@ import unittest -class TestCiConfig(unittest.TestCase): +class TestCIConfig(unittest.TestCase): def test_no_errors_in_ci_config(self): raised = None try: diff --git a/tests/ci/unit_tests_check.py b/tests/ci/unit_tests_check.py index 495547e1dfc..41c52d53020 100644 --- a/tests/ci/unit_tests_check.py +++ b/tests/ci/unit_tests_check.py @@ -3,15 +3,15 @@ import json import logging import os -import sys import subprocess +import sys from pathlib import Path from typing import Tuple from build_download_helper import download_unit_tests -from docker_images_helper import pull_image, get_docker_image +from docker_images_helper import get_docker_image, pull_image from env_helper import REPORT_PATH, TEMP_PATH -from report import ERROR, FAILURE, FAIL, OK, SUCCESS, JobReport, TestResults, TestResult +from report import ERROR, FAIL, FAILURE, OK, SUCCESS, JobReport, TestResult, TestResults from stopwatch import Stopwatch from tee_popen import TeePopen @@ -104,7 +104,7 @@ def process_results( if "failures" in test_case: raw_logs = "" for failure in test_case["failures"]: - raw_logs += failure["failure"] + raw_logs += failure[FAILURE] if ( "Segmentation fault" in raw_logs # type: ignore and SEGFAULT not in description @@ -205,7 +205,7 @@ def main(): additional_files=additional_files, ).dump() - if state == "failure": + if state == FAILURE: sys.exit(1) diff --git a/tests/ci/upload_result_helper.py b/tests/ci/upload_result_helper.py index 6fa9c1dd873..9dca3fae1dc 100644 --- a/tests/ci/upload_result_helper.py +++ b/tests/ci/upload_result_helper.py @@ -9,7 +9,7 @@ from env_helper import ( GITHUB_RUN_URL, GITHUB_SERVER_URL, ) -from report import ReportColorTheme, TestResults, create_test_html_report +from report import TestResults, create_test_html_report from s3_helper import S3Helper @@ -92,10 +92,6 @@ def upload_results( else: raw_log_url = GITHUB_JOB_URL() - statuscolors = ( - ReportColorTheme.bugfixcheck if "bugfix validate check" in check_name else None - ) - if test_results or not ready_report_url: html_report = create_test_html_report( check_name, @@ -107,7 +103,6 @@ def upload_results( branch_name, commit_url, additional_urls, - statuscolors=statuscolors, ) report_path = Path("report.html") report_path.write_text(html_report, encoding="utf-8") diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 6ead5bd2873..9c21f1fd2a2 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -12,6 +12,7 @@ import itertools import sys import os import os.path +import glob import platform import signal import re @@ -75,6 +76,12 @@ def stringhash(s): return zlib.crc32(s.encode("utf-8")) +def read_file_as_binary_string(file_path): + with open(file_path, "rb") as file: + binary_data = file.read() + return binary_data + + # First and last lines of the log def trim_for_log(s): if not s: 
@@ -101,6 +108,7 @@ class HTTPError(Exception): def clickhouse_execute_http( base_args, query, + body=None, timeout=30, settings=None, default_format=None, @@ -140,6 +148,7 @@ def clickhouse_execute_http( client.request( "POST", f"/?{base_args.client_options_query_str}{urllib.parse.urlencode(params)}", + body=body, ) res = client.getresponse() data = res.read() @@ -160,6 +169,7 @@ def clickhouse_execute_http( def clickhouse_execute( base_args, query, + body=None, timeout=30, settings=None, max_http_retries=5, @@ -168,6 +178,7 @@ def clickhouse_execute( return clickhouse_execute_http( base_args, query, + body, timeout, settings, max_http_retries=max_http_retries, @@ -181,6 +192,7 @@ def clickhouse_execute_json( data = clickhouse_execute_http( base_args, query, + None, timeout, settings, "JSONEachRow", @@ -400,7 +412,6 @@ def get_stacktraces_from_gdb(server_pid): # collect server stacktraces from system.stack_trace table -# it does not work in Sandbox def get_stacktraces_from_clickhouse(args): settings_str = " ".join( [ @@ -1262,17 +1273,34 @@ class TestCase: ): clickhouse_execute( args, - f"INSERT INTO system.coverage SELECT now(), '{self.case}', coverage()", + f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', coverageCurrent()", retry_error_codes=True, ) + # Check for dumped coverage files + file_pattern = "coverage.*" + matching_files = glob.glob(file_pattern) + for file_path in matching_files: + try: + body = read_file_as_binary_string(file_path) + clickhouse_execute( + args, + f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', groupArray(data) FROM input('data UInt64') FORMAT RowBinary", + body=body, + retry_error_codes=True, + ) + except Exception as e: + print("Cannot insert coverage data: ", str(e)) + # Remove the file even in case of exception to avoid accumulation and quadratic complexity. + os.remove(file_path) + coverage = clickhouse_execute( args, - "SELECT length(coverage())", + "SELECT length(coverageCurrent())", retry_error_codes=True, ).decode() - description_full += f" Coverage: {coverage}" + description_full += f" (coverage: {coverage})" description_full += "\n" @@ -1335,6 +1363,7 @@ class TestCase: # We want to calculate per-test code coverage. That's why we reset it before each test. 
if ( args.collect_per_test_coverage + and args.reset_coverage_before_every_test and BuildFlags.SANITIZE_COVERAGE in args.build_flags ): clickhouse_execute( @@ -2460,19 +2489,20 @@ def main(args): clickhouse_execute( args, """ - CREATE TABLE IF NOT EXISTS system.coverage + CREATE TABLE IF NOT EXISTS system.coverage_log ( time DateTime, test_name String, coverage Array(UInt64) - ) ENGINE = MergeTree ORDER BY test_name; + ) ENGINE = MergeTree ORDER BY test_name + COMMENT 'Contains information about per-test coverage from the CI, but used only for exporting to the CI cluster'; """, ) # Coverage collected at the system startup before running any tests: clickhouse_execute( args, - "INSERT INTO system.coverage SELECT now(), '', coverage()", + "INSERT INTO system.coverage_log SELECT now(), '', coverageCurrent()", ) total_tests_run = 0 @@ -2859,8 +2889,14 @@ def parse_args(): parser.add_argument( "--collect-per-test-coverage", action="store_true", - default=False, - help="Create `system.coverage` table on the server and collect information about low-level code coverage on a per test basis there", + default=True, + help="Create `system.coverage_log` table on the server and collect information about low-level code coverage on a per test basis there", + ) + parser.add_argument( + "--reset-coverage-before-every-test", + action="store_true", + default=True, + help="Collect isolated test coverage for every test instead of a cumulative. Useful only when tests are run sequentially.", ) parser.add_argument( "--report-logs-stats", diff --git a/tests/config/config.d/block_number.xml b/tests/config/config.d/block_number.xml new file mode 100644 index 00000000000..b56f1f1afc2 --- /dev/null +++ b/tests/config/config.d/block_number.xml @@ -0,0 +1,6 @@ + + + + 0 + + diff --git a/tests/config/config.d/database_replicated.xml b/tests/config/config.d/database_replicated.xml index 2504a7ca526..d8bed2f08fe 100644 --- a/tests/config/config.d/database_replicated.xml +++ b/tests/config/config.d/database_replicated.xml @@ -97,4 +97,5 @@ <_functional_tests_helper_database_replicated_replace_args_macros>1 + 50 diff --git a/tests/config/config.d/keeper_port.xml b/tests/config/config.d/keeper_port.xml index b87014d2485..b724d5dd87e 100644 --- a/tests/config/config.d/keeper_port.xml +++ b/tests/config/config.d/keeper_port.xml @@ -4,6 +4,7 @@ 1 1 + 1 10000 diff --git a/tests/config/config.d/max_num_to_warn.xml b/tests/config/config.d/max_num_to_warn.xml index 77d68998f8e..776c270823d 100644 --- a/tests/config/config.d/max_num_to_warn.xml +++ b/tests/config/config.d/max_num_to_warn.xml @@ -1,5 +1,5 @@ - 10 - 10 + 5 + 2 10 diff --git a/tests/config/config.d/storage_conf.xml b/tests/config/config.d/storage_conf.xml index 18652826d83..1429dfff724 100644 --- a/tests/config/config.d/storage_conf.xml +++ b/tests/config/config.d/storage_conf.xml @@ -4,11 +4,17 @@ s3 s3_disk/ - http://localhost:11111/test/common/ + http://localhost:11111/test/s3/ clickhouse clickhouse 20000 + + s3_plain + http://localhost:11111/test/s3_plain/ + clickhouse + clickhouse + cache s3_disk diff --git a/tests/config/install.sh b/tests/config/install.sh index a68a4c19501..cfe810cda84 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -64,6 +64,7 @@ ln -sf $SRC_PATH/config.d/backups.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/filesystem_caches_path.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/validate_tcp_client_information.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/zero_copy_destructive_operations.xml 
$DEST_SERVER_PATH/config.d/ +ln -sf $SRC_PATH/config.d/block_number.xml $DEST_SERVER_PATH/config.d/ # Not supported with fasttest. if [ "${DEST_SERVER_PATH}" = "/etc/clickhouse-server" ] diff --git a/tests/config/users.d/readonly.xml b/tests/config/users.d/readonly.xml index 0fe1e3fe6d9..799de11decf 100644 --- a/tests/config/users.d/readonly.xml +++ b/tests/config/users.d/readonly.xml @@ -9,7 +9,8 @@ - + + ::1 127.0.0.1 diff --git a/tests/config/users.d/session_log_test.xml b/tests/config/users.d/session_log_test.xml index cc2c2c5fcde..f93b0efd828 100644 --- a/tests/config/users.d/session_log_test.xml +++ b/tests/config/users.d/session_log_test.xml @@ -18,7 +18,8 @@ - + + ::1 127.0.0.1 diff --git a/tests/integration/README.md b/tests/integration/README.md index e7ba37bfb56..1b5a0ee8994 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -19,7 +19,6 @@ Don't use Docker from your system repository. ``` sudo -H pip install \ PyMySQL \ - aerospike \ avro \ cassandra-driver \ confluent-kafka \ diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 7c922e339fe..08dd9ba276b 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -252,9 +252,7 @@ class ClickhouseIntegrationTestsRunner: self.image_versions = self.params["docker_images_with_versions"] self.shuffle_groups = self.params["shuffle_test_groups"] self.flaky_check = "flaky check" in self.params["context_name"] - self.bugfix_validate_check = ( - "bugfix validate check" in self.params["context_name"] - ) + self.bugfix_validate_check = "bugfix" in self.params["context_name"].lower() # if use_tmpfs is not set we assume it to be true, otherwise check self.use_tmpfs = "use_tmpfs" not in self.params or self.params["use_tmpfs"] self.disable_net_host = ( diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 1d96563251b..95722dd0db9 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -3484,6 +3484,10 @@ class ClickHouseInstance: if check_callback(result): return result time.sleep(sleep_time) + except QueryRuntimeException as ex: + # Container is down, this is likely due to server crash. 
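+                # Re-raise immediately instead of retrying: no further attempt can succeed while the host is unreachable.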
+ if "No route to host" in str(ex): + raise except Exception as ex: # logging.debug("Retry {} got exception {}".format(i + 1, ex)) time.sleep(sleep_time) diff --git a/tests/integration/helpers/external_sources.py b/tests/integration/helpers/external_sources.py index afb91083d57..cccf151e73e 100644 --- a/tests/integration/helpers/external_sources.py +++ b/tests/integration/helpers/external_sources.py @@ -4,7 +4,6 @@ import os import uuid import warnings -import aerospike import cassandra.cluster import pymongo import pymysql.cursors @@ -696,91 +695,3 @@ class SourceRedis(ExternalSource): or layout.is_complex and self.storage_type == "hash_map" ) - - -class SourceAerospike(ExternalSource): - def __init__( - self, - name, - internal_hostname, - internal_port, - docker_hostname, - docker_port, - user, - password, - ): - ExternalSource.__init__( - self, - name, - internal_hostname, - internal_port, - docker_hostname, - docker_port, - user, - password, - ) - self.namespace = "test" - self.set = "test_set" - - def get_source_str(self, table_name): - print("AEROSPIKE get source str") - return """ - - {host} - {port} - - """.format( - host=self.docker_hostname, - port=self.docker_port, - ) - - def prepare(self, structure, table_name, cluster): - config = {"hosts": [(self.internal_hostname, self.internal_port)]} - self.client = aerospike.client(config).connect() - self.prepared = True - print("PREPARED AEROSPIKE") - print(config) - - def compatible_with_layout(self, layout): - print("compatible AEROSPIKE") - return layout.is_simple - - def _flush_aerospike_db(self): - keys = [] - - def handle_record(xxx_todo_changeme): - (key, metadata, record) = xxx_todo_changeme - print(("Handle record {} {}".format(key, record))) - keys.append(key) - - def print_record(xxx_todo_changeme1): - (key, metadata, record) = xxx_todo_changeme1 - print(("Print record {} {}".format(key, record))) - - scan = self.client.scan(self.namespace, self.set) - scan.foreach(handle_record) - - [self.client.remove(key) for key in keys] - - def load_kv_data(self, values): - self._flush_aerospike_db() - - print("Load KV Data Aerospike") - if len(values[0]) == 2: - for value in values: - key = (self.namespace, self.set, value[0]) - print(key) - self.client.put( - key, - {"bin_value": value[1]}, - policy={"key": aerospike.POLICY_KEY_SEND}, - ) - assert self.client.exists(key) - else: - assert "VALUES SIZE != 2" - - # print(values) - - def load_data(self, data, table_name): - print("Load Data Aerospike") - # print(data) diff --git a/tests/integration/helpers/keeper_config1.xml b/tests/integration/helpers/keeper_config1.xml index 12c6c0b78b6..a4a1059ffe9 100644 --- a/tests/integration/helpers/keeper_config1.xml +++ b/tests/integration/helpers/keeper_config1.xml @@ -9,11 +9,13 @@ /var/log/clickhouse-keeper/clickhouse-keeper.err.log + + 0 + az-zoo1 + + 2181 - - az-zoo1 - 1 diff --git a/tests/integration/helpers/keeper_config2.xml b/tests/integration/helpers/keeper_config2.xml index 2afff2f5e59..88a0d1f0b4b 100644 --- a/tests/integration/helpers/keeper_config2.xml +++ b/tests/integration/helpers/keeper_config2.xml @@ -9,13 +9,14 @@ /var/log/clickhouse-keeper/clickhouse-keeper.err.log + + 0 + az-zoo2 + + 2181 2 - - az-zoo2 - 1 - 10000 diff --git a/tests/integration/test_access_for_functions/test.py b/tests/integration/test_access_for_functions/test.py index 5069468110c..3e58c961421 100644 --- a/tests/integration/test_access_for_functions/test.py +++ b/tests/integration/test_access_for_functions/test.py @@ -38,8 +38,11 @@ def 
test_access_rights_for_function(): instance.query("GRANT DROP FUNCTION ON *.* TO B") instance.query("DROP FUNCTION MySum", user="B") - assert "Unknown function MySum" in instance.query_and_get_error( - "SELECT MySum(1, 2)" + + function_resolution_error = instance.query_and_get_error("SELECT MySum(1, 2)") + assert ( + "Unknown function MySum" in function_resolution_error + or "Function with name 'MySum' does not exists." in function_resolution_error ) instance.query("REVOKE CREATE FUNCTION ON *.* FROM A") diff --git a/tests/queries/0_stateless/02696_inverted_idx_checksums.reference b/tests/integration/test_async_insert_adaptive_busy_timeout/__init__.py similarity index 100% rename from tests/queries/0_stateless/02696_inverted_idx_checksums.reference rename to tests/integration/test_async_insert_adaptive_busy_timeout/__init__.py diff --git a/tests/integration/test_async_insert_adaptive_busy_timeout/configs/users.xml b/tests/integration/test_async_insert_adaptive_busy_timeout/configs/users.xml new file mode 100644 index 00000000000..755dc4ac269 --- /dev/null +++ b/tests/integration/test_async_insert_adaptive_busy_timeout/configs/users.xml @@ -0,0 +1,14 @@ + + + + 1 + + + + + + + default + + + diff --git a/tests/integration/test_async_insert_adaptive_busy_timeout/configs/zookeeper_config.xml b/tests/integration/test_async_insert_adaptive_busy_timeout/configs/zookeeper_config.xml new file mode 100644 index 00000000000..18412349228 --- /dev/null +++ b/tests/integration/test_async_insert_adaptive_busy_timeout/configs/zookeeper_config.xml @@ -0,0 +1,8 @@ + + + + zoo1 + 2181 + + + diff --git a/tests/integration/test_async_insert_adaptive_busy_timeout/test.py b/tests/integration/test_async_insert_adaptive_busy_timeout/test.py new file mode 100644 index 00000000000..93319a56d0f --- /dev/null +++ b/tests/integration/test_async_insert_adaptive_busy_timeout/test.py @@ -0,0 +1,372 @@ +import copy +import logging +import pytest +import random +import timeit + +from math import floor +from multiprocessing import Pool +from itertools import repeat + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + + +node = cluster.add_instance( + "node", + main_configs=["configs/zookeeper_config.xml"], + user_configs=[ + "configs/users.xml", + ], + with_zookeeper=True, +) + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +_query_settings = {"async_insert": 1, "wait_for_async_insert": 1} + + +def _generate_values(size, min_int, max_int, array_size_range): + gen_tuple = lambda _min_int, _max_int, _array_size_range: ( + random.randint(_min_int, _max_int), + [ + random.randint(_min_int, _max_int) + for _ in range(random.randint(*_array_size_range)) + ], + ) + + return map(lambda _: gen_tuple(min_int, max_int, array_size_range), range(size)) + + +def _insert_query(table_name, settings, *args, **kwargs): + settings_s = ", ".join("{}={}".format(k, settings[k]) for k in settings) + INSERT_QUERY = "INSERT INTO {} SETTINGS {} VALUES {}" + node.query( + INSERT_QUERY.format( + table_name, + settings_s, + ", ".join(map(str, _generate_values(*args, **kwargs))), + ) + ) + + +def _insert_queries_sequentially( + table_name, settings, iterations, max_values_size, array_size_range +): + for iter in range(iterations): + _insert_query( + table_name, + settings, + random.randint(1, max_values_size), + iter * max_values_size, + (iter + 1) * max_values_size - 1, + array_size_range, + ) + + +def 
_insert_queries_in_parallel( + table_name, settings, thread_num, tasks, max_values_size, array_size_range +): + sizes = [random.randint(1, max_values_size) for _ in range(tasks)] + min_ints = [iter * max_values_size for iter in range(tasks)] + max_ints = [(iter + 1) * max_values_size - 1 for iter in range(tasks)] + with Pool(thread_num) as p: + p.starmap( + _insert_query, + zip( + repeat(table_name), + repeat(settings), + sizes, + min_ints, + max_ints, + repeat(array_size_range), + ), + ) + + +def test_with_merge_tree(): + table_name = "async_insert_mt_table" + node.query( + "CREATE TABLE {} (a UInt64, b Array(UInt64)) ENGINE=MergeTree() ORDER BY a".format( + table_name + ) + ) + + _insert_queries_sequentially( + table_name, + _query_settings, + iterations=100, + max_values_size=1000, + array_size_range=[10, 50], + ) + + node.query("DROP TABLE IF EXISTS {}".format(table_name)) + + +def test_with_merge_tree_multithread(): + thread_num = 15 + table_name = "async_insert_mt_multithread_table" + node.query( + "CREATE TABLE {} (a UInt64, b Array(UInt64)) ENGINE=MergeTree() ORDER BY a".format( + table_name + ) + ) + + _insert_queries_in_parallel( + table_name, + _query_settings, + thread_num=15, + tasks=1000, + max_values_size=1000, + array_size_range=[10, 15], + ) + + node.query("DROP TABLE IF EXISTS {}".format(table_name)) + + +def test_with_replicated_merge_tree(): + table_name = "async_insert_replicated_mt_table" + + create_query = " ".join( + ( + "CREATE TABLE {} (a UInt64, b Array(UInt64))".format(table_name), + "ENGINE=ReplicatedMergeTree('/clickhouse/tables/test/{}', 'node')".format( + table_name + ), + "ORDER BY a", + ) + ) + + node.query(create_query) + + settings = _query_settings + _insert_queries_sequentially( + table_name, + settings, + iterations=100, + max_values_size=1000, + array_size_range=[10, 50], + ) + + node.query("DROP TABLE IF EXISTS {}".format(table_name)) + + +def test_with_replicated_merge_tree_multithread(): + thread_num = 15 + table_name = "async_insert_replicated_mt_multithread_table" + + create_query = " ".join( + ( + "CREATE TABLE {} (a UInt64, b Array(UInt64))".format(table_name), + "ENGINE=ReplicatedMergeTree('/clickhouse/tables/test/{}', 'node')".format( + table_name + ), + "ORDER BY a", + ) + ) + + node.query(create_query) + + _insert_queries_in_parallel( + table_name, + _query_settings, + thread_num=15, + tasks=1000, + max_values_size=1000, + array_size_range=[10, 15], + ) + + node.query("DROP TABLE IF EXISTS {}".format(table_name)) + + +# Ensure that the combined duration of inserts with adaptive timeouts is less than +# the combined duration for fixed timeouts. 
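+# Both tables receive an identical sequential workload (timeit: 3 runs of 100 inserts each); only the async busy-timeout settings differ between them.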
+def test_compare_sequential_inserts_durations_for_adaptive_and_fixed_async_timeouts(): + fixed_tm_table_name = "async_insert_mt_fixed_async_timeout" + node.query( + "CREATE TABLE {} (a UInt64, b Array(UInt64)) ENGINE=MergeTree() ORDER BY a".format( + fixed_tm_table_name + ) + ) + + fixed_tm_settings = copy.copy(_query_settings) + fixed_tm_settings["async_insert_use_adaptive_busy_timeout"] = 0 + fixed_tm_settings["async_insert_busy_timeout_ms"] = 200 + + fixed_tm_run_duration = timeit.timeit( + lambda: _insert_queries_sequentially( + fixed_tm_table_name, + fixed_tm_settings, + iterations=100, + max_values_size=1000, + array_size_range=[10, 50], + ), + setup="pass", + number=3, + ) + + node.query("DROP TABLE IF EXISTS {}".format(fixed_tm_table_name)) + + logging.debug( + "Run duration with fixed asynchronous timeout is {} seconds".format( + fixed_tm_run_duration + ) + ) + + adaptive_tm_table_name = "async_insert_mt_adaptive_async_timeout" + node.query( + "CREATE TABLE {} (a UInt64, b Array(UInt64)) ENGINE=MergeTree() ORDER BY a".format( + adaptive_tm_table_name + ) + ) + + adaptive_tm_settings = copy.copy(_query_settings) + adaptive_tm_settings["async_insert_busy_timeout_min_ms"] = 10 + adaptive_tm_settings["async_insert_busy_timeout_max_ms"] = 1000 + + adaptive_tm_run_duration = timeit.timeit( + lambda: _insert_queries_sequentially( + adaptive_tm_table_name, + adaptive_tm_settings, + iterations=100, + max_values_size=1000, + array_size_range=[10, 50], + ), + setup="pass", + number=3, + ) + + logging.debug( + "Run duration with adaptive asynchronous timeout is {} seconds.".format( + adaptive_tm_run_duration + ) + ) + + node.query("DROP TABLE IF EXISTS {}".format(adaptive_tm_table_name)) + + assert adaptive_tm_run_duration <= fixed_tm_run_duration + + +# Ensure that the combined duration of inserts with adaptive timeouts is less than +# the combined duration for fixed timeouts. 
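+# Same comparison as above, but the inserts are issued concurrently from a pool of 15 threads.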
+def test_compare_parallel_inserts_durations_for_adaptive_and_fixed_async_timeouts(): + fixed_tm_table_name = "async_insert_mt_fixed_async_timeout" + node.query( + "CREATE TABLE {} (a UInt64, b Array(UInt64)) ENGINE=MergeTree() ORDER BY a".format( + fixed_tm_table_name + ) + ) + + fixed_tm_settings = copy.copy(_query_settings) + fixed_tm_settings["async_insert_use_adaptive_busy_timeout"] = 0 + fixed_tm_settings["async_insert_busy_timeout_ms"] = 200 + + fixed_tm_run_duration = timeit.timeit( + lambda: _insert_queries_in_parallel( + fixed_tm_table_name, + fixed_tm_settings, + thread_num=15, + tasks=1000, + max_values_size=1000, + array_size_range=[10, 50], + ), + setup="pass", + number=3, + ) + + node.query("DROP TABLE IF EXISTS {}".format(fixed_tm_table_name)) + + logging.debug( + "Run duration with fixed asynchronous timeout is {} seconds".format( + fixed_tm_run_duration + ) + ) + + adaptive_tm_table_name = "async_insert_mt_adaptive_async_timeout" + node.query( + "CREATE TABLE {} (a UInt64, b Array(UInt64)) ENGINE=MergeTree() ORDER BY a".format( + adaptive_tm_table_name + ) + ) + + adaptive_tm_settings = copy.copy(_query_settings) + adaptive_tm_settings["async_insert_busy_timeout_min_ms"] = 10 + adaptive_tm_settings["async_insert_busy_timeout_max_ms"] = 200 + + adaptive_tm_run_duration = timeit.timeit( + lambda: _insert_queries_in_parallel( + adaptive_tm_table_name, + adaptive_tm_settings, + thread_num=15, + tasks=100, + max_values_size=1000, + array_size_range=[10, 50], + ), + setup="pass", + number=3, + ) + + logging.debug( + "Run duration with adaptive asynchronous timeout is {} seconds.".format( + adaptive_tm_run_duration + ) + ) + + node.query("DROP TABLE IF EXISTS {}".format(adaptive_tm_table_name)) + + assert adaptive_tm_run_duration <= fixed_tm_run_duration + + +# Ensure that the delay converges to a minimum for sequential inserts and wait_for_async_insert=1. 
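+# A burst of parallel inserts first raises the adaptive timeout; the following sequential inserts should let it decay back to async_insert_busy_timeout_min_ms, which is checked against system.asynchronous_insert_log.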
+def test_change_queries_frequency(): + table_name = "async_insert_mt_change_queries_frequencies" + + create_query = " ".join( + ( + "CREATE TABLE {} (a UInt64, b Array(UInt64))".format(table_name), + "ENGINE=ReplicatedMergeTree('/clickhouse/tables/test_frequencies/{}', 'node')".format( + table_name + ), + "ORDER BY a", + ) + ) + + node.query(create_query) + + settings = copy.copy(_query_settings) + min_ms = 50 + settings["async_insert_busy_timeout_min_ms"] = min_ms + settings["async_insert_busy_timeout_max_ms"] = 2000 + + _insert_queries_in_parallel( + table_name, + settings, + thread_num=15, + tasks=2000, + max_values_size=1000, + array_size_range=[10, 15], + ) + + _insert_queries_sequentially( + table_name, + settings, + iterations=200, + max_values_size=1000, + array_size_range=[10, 50], + ) + + select_log_query = "SELECT timeout_milliseconds FROM system.asynchronous_insert_log ORDER BY event_time DESC LIMIT 50" + res = node.query(select_log_query) + for line in res.splitlines(): + assert int(line) == min_ms + + node.query("DROP TABLE IF EXISTS {}".format(table_name)) diff --git a/tests/integration/test_backup_restore_azure_blob_storage/__init__.py b/tests/integration/test_backup_restore_azure_blob_storage/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_backup_restore_azure_blob_storage/test.py b/tests/integration/test_backup_restore_azure_blob_storage/test.py new file mode 100644 index 00000000000..a7c7b439560 --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/test.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 + +import gzip +import json +import logging +import os +import io +import random +import threading +import time + +from azure.storage.blob import BlobServiceClient +import helpers.client +import pytest +from helpers.cluster import ClickHouseCluster, ClickHouseInstance +from helpers.network import PartitionManager +from helpers.mock_servers import start_mock_servers +from helpers.test_tools import exec_query_with_retry + + +def generate_cluster_def(port): + path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "./_gen/named_collections.xml", + ) + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w") as f: + f.write( + f""" + + + DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:{port}/devstoreaccount1; + cont + CSV + + + http://azurite1:{port}/devstoreaccount1 + cont + CSV + devstoreaccount1 + Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== + + + +""" + ) + return path + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + port = cluster.azurite_port + path = generate_cluster_def(port) + cluster.add_instance( + "node", + main_configs=[path], + with_azurite=True, + ) + cluster.start() + + yield cluster + finally: + cluster.shutdown() + + +def azure_query( + node, query, expect_error="false", try_num=10, settings={}, query_on_retry=None +): + for i in range(try_num): + try: + if expect_error == "true": + return node.query_and_get_error(query, settings=settings) + else: + return node.query(query, settings=settings) + except Exception as ex: + retriable_errors = [ + "DB::Exception: Azure::Core::Http::TransportException: 
Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Connection closed before getting full response or response is less than expected", + "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Error while polling for socket ready read", + "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", + "Azure::Core::Http::TransportException, e.what() = Connection closed before getting full response or response is less than expected", + "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", + "Azure::Core::Http::TransportException, e.what() = Error while polling for socket ready read", + ] + retry = False + for error in retriable_errors: + if error in str(ex): + retry = True + print(f"Try num: {i}. Having retriable error: {ex}") + time.sleep(i) + break + if not retry or i == try_num - 1: + raise Exception(ex) + if query_on_retry is not None: + node.query(query_on_retry) + continue + + +def get_azure_file_content(filename, port): + container_name = "cont" + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string( + str(connection_string) + ) + container_client = blob_service_client.get_container_client(container_name) + blob_client = container_client.get_blob_client(filename) + download_stream = blob_client.download_blob() + return download_stream.readall().decode("utf-8") + + +def put_azure_file_content(filename, port, data): + container_name = "cont" + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + try: + container_client = blob_service_client.create_container(container_name) + except: + container_client = blob_service_client.get_container_client(container_name) + + blob_client = container_client.get_blob_client(filename) + buf = io.BytesIO(data) + blob_client.upload_blob(buf) + + +@pytest.fixture(autouse=True, scope="function") +def delete_all_files(cluster): + port = cluster.env_variables["AZURITE_PORT"] + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + containers = blob_service_client.list_containers() + for container in containers: + container_client = blob_service_client.get_container_client(container) + blob_list = container_client.list_blobs() + for blob in blob_list: + print(blob) + blob_client = container_client.get_blob_client(blob) + blob_client.delete_blob() + + assert len(list(container_client.list_blobs())) == 0 + + yield + + +def test_backup_restore(cluster): + node = cluster.instances["node"] + port = 
cluster.env_variables["AZURITE_PORT"] + azure_query( + node, + f"CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c.csv', 'CSV')", + ) + azure_query( + node, f"INSERT INTO test_simple_write_connection_string VALUES (1, 'a')" + ) + print(get_azure_file_content("test_simple_write_c.csv", port)) + assert get_azure_file_content("test_simple_write_c.csv", port) == '1,"a"\n' + + backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c_backup.csv')" + azure_query( + node, + f"BACKUP TABLE test_simple_write_connection_string TO {backup_destination}", + ) + print(get_azure_file_content("test_simple_write_c_backup.csv.backup", port)) + azure_query( + node, + f"RESTORE TABLE test_simple_write_connection_string AS test_simple_write_connection_string_restored FROM {backup_destination};", + ) + assert ( + azure_query(node, f"SELECT * from test_simple_write_connection_string_restored") + == "1\ta\n" + ) + + +def test_backup_restore_diff_container(cluster): + node = cluster.instances["node"] + port = cluster.env_variables["AZURITE_PORT"] + azure_query( + node, + f"CREATE TABLE test_simple_write_connection_string_cont1 (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c_cont1.csv', 'CSV')", + ) + azure_query( + node, f"INSERT INTO test_simple_write_connection_string_cont1 VALUES (1, 'a')" + ) + backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont1', 'test_simple_write_c_backup_cont1.csv')" + azure_query( + node, + f"BACKUP TABLE test_simple_write_connection_string_cont1 TO {backup_destination}", + ) + azure_query( + node, + f"RESTORE TABLE test_simple_write_connection_string_cont1 AS test_simple_write_connection_string_restored_cont1 FROM {backup_destination};", + ) + assert ( + azure_query( + node, f"SELECT * from test_simple_write_connection_string_restored_cont1" + ) + == "1\ta\n" + ) + + +def test_backup_restore_with_named_collection_azure_conf1(cluster): + node = cluster.instances["node"] + port = cluster.env_variables["AZURITE_PORT"] + azure_query( + node, + f"CREATE TABLE test_write_connection_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write.csv', 'CSV')", + ) + azure_query(node, f"INSERT INTO test_write_connection_string VALUES (1, 'a')") + print(get_azure_file_content("test_simple_write.csv", port)) + assert get_azure_file_content("test_simple_write.csv", port) == '1,"a"\n' + + backup_destination = ( + f"AzureBlobStorage(azure_conf1, 'test_simple_write_nc_backup.csv')" + ) + azure_query( + node, + f"BACKUP TABLE test_write_connection_string TO {backup_destination}", + ) + print(get_azure_file_content("test_simple_write_nc_backup.csv.backup", port)) + azure_query( + node, + f"RESTORE TABLE test_write_connection_string AS test_write_connection_string_restored FROM {backup_destination};", + ) + assert ( + azure_query(node, f"SELECT * from test_write_connection_string_restored") + == "1\ta\n" + ) + + +def test_backup_restore_with_named_collection_azure_conf2(cluster): + node = cluster.instances["node"] + port = cluster.env_variables["AZURITE_PORT"] + azure_query( + node, + f"CREATE TABLE test_write_connection_string_2 (key UInt64, data String) Engine = 
AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_2.csv', 'CSV')", + ) + azure_query(node, f"INSERT INTO test_write_connection_string_2 VALUES (1, 'a')") + print(get_azure_file_content("test_simple_write_2.csv", port)) + assert get_azure_file_content("test_simple_write_2.csv", port) == '1,"a"\n' + + backup_destination = ( + f"AzureBlobStorage(azure_conf2, 'test_simple_write_nc_backup_2.csv')" + ) + azure_query( + node, + f"BACKUP TABLE test_write_connection_string_2 TO {backup_destination}", + ) + print(get_azure_file_content("test_simple_write_nc_backup_2.csv.backup", port)) + azure_query( + node, + f"RESTORE TABLE test_write_connection_string_2 AS test_write_connection_string_restored_2 FROM {backup_destination};", + ) + assert ( + azure_query(node, f"SELECT * from test_write_connection_string_restored_2") + == "1\ta\n" + ) diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 20f538cca58..027c9736c32 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -1087,9 +1087,11 @@ def test_stop_other_host_during_backup(kill): status = node1.query(f"SELECT status FROM system.backups WHERE id='{id}'").strip() if kill: - assert status in ["BACKUP_CREATED", "BACKUP_FAILED"] + expected_statuses = ["BACKUP_CREATED", "BACKUP_FAILED"] else: - assert status == "BACKUP_CREATED" + expected_statuses = ["BACKUP_CREATED", "BACKUP_CANCELLED"] + + assert status in expected_statuses node2.start_clickhouse() diff --git a/tests/integration/test_backup_restore_s3/configs/s3_settings.xml b/tests/integration/test_backup_restore_s3/configs/s3_settings.xml index 981cf67bbe9..adeb61cbe07 100644 --- a/tests/integration/test_backup_restore_s3/configs/s3_settings.xml +++ b/tests/integration/test_backup_restore_s3/configs/s3_settings.xml @@ -10,6 +10,13 @@ 3 2 + + http://minio1:9001/root/data/backups/limited/ + minio + minio123 + superuser1 + superuser2 + 1 1 diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index cd8f70b3239..783cf1feade 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -452,3 +452,57 @@ def test_backup_to_zip(): backup_name = new_backup_name() backup_destination = f"S3('http://minio1:9001/root/data/backups/{backup_name}.zip', 'minio', 'minio123')" check_backup_and_restore(storage_policy, backup_destination) + + +def test_user_specific_auth(start_cluster): + def create_user(user): + node.query(f"CREATE USER {user}") + node.query(f"GRANT CURRENT GRANTS ON *.* TO {user}") + + create_user("superuser1") + create_user("superuser2") + create_user("regularuser") + + node.query("CREATE TABLE specific_auth (col UInt64) ENGINE=Memory") + + assert "Access Denied" in node.query_and_get_error( + "BACKUP TABLE specific_auth TO S3('http://minio1:9001/root/data/backups/limited/backup1.zip')" + ) + assert "Access Denied" in node.query_and_get_error( + "BACKUP TABLE specific_auth TO S3('http://minio1:9001/root/data/backups/limited/backup1.zip')", + user="regularuser", + ) + + node.query( + "BACKUP TABLE specific_auth TO S3('http://minio1:9001/root/data/backups/limited/backup1.zip')", + user="superuser1", + ) + node.query( + "RESTORE TABLE specific_auth FROM S3('http://minio1:9001/root/data/backups/limited/backup1.zip')", + user="superuser1", + ) + + node.query( + "BACKUP TABLE 
specific_auth TO S3('http://minio1:9001/root/data/backups/limited/backup2.zip')", + user="superuser2", + ) + node.query( + "RESTORE TABLE specific_auth FROM S3('http://minio1:9001/root/data/backups/limited/backup2.zip')", + user="superuser2", + ) + + assert "Access Denied" in node.query_and_get_error( + "RESTORE TABLE specific_auth FROM S3('http://minio1:9001/root/data/backups/limited/backup1.zip')", + user="regularuser", + ) + + assert "HTTP response code: 403" in node.query_and_get_error( + "SELECT * FROM s3('http://minio1:9001/root/data/backups/limited/backup1.zip', 'RawBLOB')", + user="regularuser", + ) + node.query( + "SELECT * FROM s3('http://minio1:9001/root/data/backups/limited/backup1.zip', 'RawBLOB')", + user="superuser1", + ) + + node.query("DROP TABLE IF EXISTS test.specific_auth") diff --git a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.reference b/tests/integration/test_broken_projections/__init__.py similarity index 100% rename from tests/queries/0_stateless/02862_index_inverted_incorrect_args.reference rename to tests/integration/test_broken_projections/__init__.py diff --git a/tests/integration/test_broken_projections/config.d/backups.xml b/tests/integration/test_broken_projections/config.d/backups.xml new file mode 100644 index 00000000000..4da8edffd67 --- /dev/null +++ b/tests/integration/test_broken_projections/config.d/backups.xml @@ -0,0 +1,13 @@ + + + + + local + /var/lib/clickhouse/disks/backups/ + + + + + backups + + diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py new file mode 100644 index 00000000000..4a4690a5d0a --- /dev/null +++ b/tests/integration/test_broken_projections/test.py @@ -0,0 +1,576 @@ +import time +import pytest +import logging +import string +import random +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=["config.d/backups.xml"], + stay_alive=True, + with_zookeeper=True, + ) + + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def create_table(node, table, replica, data_prefix="", aggressive_merge=True): + if data_prefix == "": + data_prefix = table + + if aggressive_merge: + vertical_merge_algorithm_min_rows_to_activate = 1 + vertical_merge_algorithm_min_columns_to_activate = 1 + max_parts_to_merge_at_once = 3 + else: + vertical_merge_algorithm_min_rows_to_activate = 100000 + vertical_merge_algorithm_min_columns_to_activate = 100 + max_parts_to_merge_at_once = 3 + + node.query( + f""" + DROP TABLE IF EXISTS {table} SYNC; + CREATE TABLE {table} + ( + a String, + b String, + c Int64, + d Int64, + e Int64, + PROJECTION proj1 + ( + SELECT c ORDER BY d + ), + PROJECTION proj2 + ( + SELECT d ORDER BY c + ) + ) + ENGINE = ReplicatedMergeTree('/test_broken_projection_{data_prefix}/data/', '{replica}') ORDER BY a + SETTINGS min_bytes_for_wide_part = 0, + max_parts_to_merge_at_once={max_parts_to_merge_at_once}, + enable_vertical_merge_algorithm=0, + vertical_merge_algorithm_min_rows_to_activate = {vertical_merge_algorithm_min_rows_to_activate}, + vertical_merge_algorithm_min_columns_to_activate = {vertical_merge_algorithm_min_columns_to_activate}, + compress_primary_key=0; + """ + ) + + +def insert(node, table, offset, size): + node.query( + f""" + INSERT INTO {table} + SELECT number, number, 
number, number, number%2 FROM numbers({offset}, {size}) + SETTINGS insert_keeper_fault_injection_probability=0.0; + """ + ) + + +def get_parts(node, table): + return ( + node.query( + f""" + SELECT name + FROM system.parts + WHERE table='{table}' AND database=currentDatabase() AND active = 1 + ORDER BY name;" + """ + ) + .strip() + .split("\n") + ) + + +def bash(node, command): + node.exec_in_container(["bash", "-c", command], privileged=True, user="root") + + +def break_projection(node, table, part, parent_part, break_type): + part_path = node.query( + f""" + SELECT path + FROM system.projection_parts + WHERE table='{table}' + AND database=currentDatabase() + AND active=1 + AND part_name='{part}' + AND parent_name='{parent_part}' + ORDER BY modification_time DESC + LIMIT 1; + """ + ).strip() + + node.query( + f"select throwIf(substring('{part_path}', 1, 1) != '/', 'Path is relative: {part_path}')" + ) + + if break_type == "data": + bash(node, f"rm '{part_path}/d.bin'") + bash(node, f"rm '{part_path}/c.bin'") + elif break_type == "metadata": + bash(node, f"rm '{part_path}/columns.txt'") + elif break_type == "part": + bash(node, f"rm -r '{part_path}'") + + +def break_part(node, table, part): + part_path = node.query( + f""" + SELECT path + FROM system.parts + WHERE table='{table}' + AND database=currentDatabase() + AND active=1 + AND part_name='{part}' + ORDER BY modification_time DESC + LIMIT 1; + """ + ).strip() + + node.query( + f"select throwIf(substring('{part_path}', 1, 1) != '/', 'Path is relative: {part_path}')" + ) + bash(node, f"rm '{part_path}/columns.txt'") + + +def get_broken_projections_info(node, table): + return node.query( + f""" + SELECT parent_name, name, errors.name FROM + ( + SELECT parent_name, name, exception_code + FROM system.projection_parts + WHERE table='{table}' + AND database=currentDatabase() + AND is_broken = 1 + ) AS parts_info + INNER JOIN system.errors AS errors + ON parts_info.exception_code = errors.code + ORDER BY parent_name, name + """ + ).strip() + + +def get_projections_info(node, table): + return node.query( + f""" + SELECT parent_name, name, is_broken + FROM system.projection_parts + WHERE table='{table}' + AND active = 1 + AND database=currentDatabase() + ORDER BY parent_name, name + """ + ).strip() + + +def optimize(node, table, final, no_wait): + query = f"OPTIMIZE TABLE {table}" + if final: + query += " FINAL" + if no_wait: + query += " SETTINGS alter_sync=0" + node.query(query) + + +def reattach(node, table): + node.query( + f""" + DETACH TABLE {table}; + ATTACH TABLE {table}; + """ + ) + + +def materialize_projection(node, table, proj): + node.query( + f"ALTER TABLE {table} MATERIALIZE PROJECTION {proj} SETTINGS mutations_sync=2" + ) + + +def check_table_full(node, table): + return node.query( + f"CHECK TABLE {table} SETTINGS check_query_single_value_result = 0;" + ).strip() + + +def random_str(length=6): + alphabet = string.ascii_lowercase + string.digits + return "".join(random.SystemRandom().choice(alphabet) for _ in range(length)) + + +def check(node, table, check_result, expect_broken_part="", expected_error=""): + if expect_broken_part == "proj1": + assert expected_error in node.query_and_get_error( + f"SELECT c FROM '{table}' WHERE d == 12 ORDER BY c" + ) + else: + query_id = node.query( + f"SELECT queryID() FROM (SELECT c FROM '{table}' WHERE d == 12 ORDER BY c)" + ).strip() + node.query("SYSTEM FLUSH LOGS") + res = node.query( + f""" + SELECT query, splitByChar('.', arrayJoin(projections))[-1] + FROM system.query_log + WHERE 
query_id='{query_id}' AND type='QueryFinish' + """ + ) + if res == "": + res = node.query( + """ + SELECT query_id, query, splitByChar('.', arrayJoin(projections))[-1] + FROM system.query_log ORDER BY query_start_time_microseconds DESC + """ + ) + print(f"LOG: {res}") + assert False + assert "proj1" in res + + if expect_broken_part == "proj2": + assert expected_error in node.query_and_get_error( + f"SELECT d FROM '{table}' WHERE c == 12 ORDER BY d" + ) + else: + query_id = node.query( + f"SELECT queryID() FROM (SELECT d FROM '{table}' WHERE c == 12 ORDER BY d)" + ).strip() + node.query("SYSTEM FLUSH LOGS") + res = node.query( + f""" + SELECT query, splitByChar('.', arrayJoin(projections))[-1] + FROM system.query_log + WHERE query_id='{query_id}' AND type='QueryFinish' + """ + ) + if res == "": + res = node.query( + """ + SELECT query_id, query, splitByChar('.', arrayJoin(projections))[-1] + FROM system.query_log ORDER BY query_start_time_microseconds DESC + """ + ) + print(f"LOG: {res}") + assert False + assert "proj2" in res + + assert check_result == int(node.query(f"CHECK TABLE {table}")) + + +def test_broken_ignored(cluster): + node = cluster.instances["node"] + + table_name = "test1" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + # Break metadata (columns.txt) file of projection 'proj1' + break_projection(node, table_name, "proj1", "all_2_2_0", "metadata") + + # Do select and after "check table" query. + # Select works because it does not read columns.txt. + # But expect check table result as 0. + check(node, table_name, 0) + + # Projection 'proj1' from part all_2_2_0 will now appear in broken parts info + # because it was marked broken during "check table" query. + assert "all_2_2_0\tproj1\tFILE_DOESNT_EXIST" in get_broken_projections_info( + node, table_name + ) + + # Check table query will also show a list of parts which have broken projections. + assert "all_2_2_0" in check_table_full(node, table_name) + + # Break data file of projection 'proj2' for part all_2_2_0 + break_projection(node, table_name, "proj2", "all_2_2_0", "data") + + # It will not yet appear in broken projections info. + assert "proj2" not in get_broken_projections_info(node, table_name) + + # Select now fails with error "File doesn't exist" + check(node, table_name, 0, "proj2", "FILE_DOESNT_EXIST") + + # Projection 'proj2' from part all_2_2_0 will now appear in broken parts info. + assert "all_2_2_0\tproj2\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + + # Second select works, because projection is now marked as broken. + check(node, table_name, 0) + + # Break data file of projection 'proj2' for part all_3_3_0 + break_projection(node, table_name, "proj2", "all_3_3_0", "data") + + # It will not yet appear in broken projections info. + assert "all_3_3_0" not in get_broken_projections_info(node, table_name) + + insert(node, table_name, 20, 5) + insert(node, table_name, 25, 5) + + # Part all_3_3_0 has 'proj' and 'proj2' projections, but 'proj2' is broken and server does NOT know it yet. + # Parts all_4_4_0 and all_5_5_0 have both non-broken projections. + # So a merge will be create for future part all_3_5_1. + # During merge it will fail to read from 'proj2' of part all_3_3_0 and proj2 will be marked broken. 
+ # Merge will be retried and on second attempt it will succeed. + # The result part all_3_5_1 will have only 1 projection - 'proj', because + # it will skip 'proj2' as it will see that one part does not have it anymore in the set of valid projections. + optimize(node, table_name, 0, 1) + time.sleep(5) + + # table_uuid=node.query(f"SELECT uuid FROM system.tables WHERE table='{table_name}' and database=currentDatabase()").strip() + # assert 0 < int( + # node.query( + # f""" + # SYSTEM FLUSH LOGS; + # SELECT count() FROM system.text_log + # WHERE level='Error' + # AND logger_name='MergeTreeBackgroundExecutor' + # AND message like 'Exception while executing background task %{table_uuid}:all_3_5_1%%Cannot open file%proj2.proj/c.bin%' + # """) + # ) + + assert "all_3_3_0" in get_broken_projections_info(node, table_name) + check(node, table_name, 0) + + +def test_materialize_broken_projection(cluster): + node = cluster.instances["node"] + + table_name = "test2" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + break_projection(node, table_name, "proj1", "all_1_1_0", "metadata") + reattach(node, table_name) + + assert "all_1_1_0\tproj1\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + assert "Part all_1_1_0 has a broken projection proj1" in check_table_full( + node, table_name + ) + + break_projection(node, table_name, "proj2", "all_1_1_0", "data") + reattach(node, table_name) + + assert "all_1_1_0\tproj2\tFILE_DOESNT_EXIST" in get_broken_projections_info( + node, table_name + ) + assert "Part all_1_1_0 has a broken projection proj2" in check_table_full( + node, table_name + ) + + materialize_projection(node, table_name, "proj1") + + assert "has a broken projection" not in check_table_full(node, table_name) + + +def test_broken_ignored_replicated(cluster): + node = cluster.instances["node"] + + table_name = "test3" + table_name2 = "test3_replica" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + check(node, table_name, 1) + + create_table(node, table_name2, 2, table_name) + check(node, table_name2, 1) + + break_projection(node, table_name, "proj1", "all_0_0_0", "data") + assert "Part all_0_0_0 has a broken projection proj1" in check_table_full( + node, table_name + ) + + break_part(node, table_name, "all_0_0_0") + node.query(f"SYSTEM SYNC REPLICA {table_name}") + assert "has a broken projection" not in check_table_full(node, table_name) + + +def get_random_string(string_length=8): + alphabet = string.ascii_letters + string.digits + return "".join((random.choice(alphabet) for _ in range(string_length))) + + +def test_broken_projections_in_backups_1(cluster): + node = cluster.instances["node"] + + table_name = "test4" + create_table(node, table_name, 1, aggressive_merge=False, data_prefix=table_name) + + node.query("SYSTEM STOP MERGES") + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + check(node, table_name, 1) + + break_projection(node, table_name, "proj1", "all_2_2_0", "data") + check(node, table_name, 0, "proj1", 
"FILE_DOESNT_EXIST") + + assert "all_2_2_0\tproj1\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + + backup_name = f"b1-{get_random_string()}" + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', '{backup_name}'); + """ + ) + + node.query("SYSTEM STOP MERGES") + + check(node, table_name, 1) + assert "" == get_broken_projections_info(node, table_name) + + +def test_broken_projections_in_backups_2(cluster): + node = cluster.instances["node"] + + table_name = "test5" + create_table(node, table_name, 1, aggressive_merge=False, data_prefix=table_name) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + check(node, table_name, 1) + break_projection(node, table_name, "proj2", "all_2_2_0", "part") + check(node, table_name, 0, "proj2", "ErrnoException") + + assert "all_2_2_0\tproj2\tFILE_DOESNT_EXIST" == get_broken_projections_info( + node, table_name + ) + + assert "FILE_DOESNT_EXIST" in node.query_and_get_error( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b2') + """ + ) + + materialize_projection(node, table_name, "proj2") + check(node, table_name, 1) + + backup_name = f"b3-{get_random_string()}" + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', '{backup_name}'); + """ + ) + check(node, table_name, 1) + + +def test_broken_projections_in_backups_3(cluster): + node = cluster.instances["node"] + + table_name = "test6" + create_table(node, table_name, 1, aggressive_merge=False, data_prefix=table_name) + + node.query("SYSTEM STOP MERGES") + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + check(node, table_name, 1) + + break_projection(node, table_name, "proj1", "all_1_1_0", "part") + assert "Part all_1_1_0 has a broken projection proj1" in check_table_full( + node, table_name + ) + assert "all_1_1_0\tproj1\tFILE_DOESNT_EXIST" == get_broken_projections_info( + node, table_name + ) + + backup_name = f"b4-{get_random_string()}" + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false, allow_backup_broken_projections=true; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', '{backup_name}'); + """ + ) + + check(node, table_name, 0) + 
assert "all_1_1_0\tproj1\tNO_FILE_IN_DATA_PART" == get_broken_projections_info( + node, table_name + ) diff --git a/tests/integration/test_dictionaries_update_and_reload/test.py b/tests/integration/test_dictionaries_update_and_reload/test.py index 3d96d0b8dd4..648ea847afb 100644 --- a/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/tests/integration/test_dictionaries_update_and_reload/test.py @@ -281,7 +281,7 @@ def test_reload_after_fail_in_cache_dictionary(started_cluster): query_and_get_error = instance.query_and_get_error # Can't get a value from the cache dictionary because the source (table `test.xypairs`) doesn't respond. - expected_error = "Table test.xypairs does not exist" + expected_error = "UNKNOWN_TABLE" update_error = "Could not update cache dictionary cache_xypairs now" assert expected_error in query_and_get_error( "SELECT dictGetUInt64('cache_xypairs', 'y', toUInt64(1))" diff --git a/tests/integration/test_disk_over_web_server/test.py b/tests/integration/test_disk_over_web_server/test.py index a71fdeff302..4b175d188ef 100644 --- a/tests/integration/test_disk_over_web_server/test.py +++ b/tests/integration/test_disk_over_web_server/test.py @@ -172,7 +172,7 @@ def test_incorrect_usage(cluster): assert "Table is read-only" in result result = node2.query_and_get_error("OPTIMIZE TABLE test0 FINAL") - assert "Only read-only operations are supported" in result + assert "Table is in readonly mode due to static storage" in result node2.query("DROP TABLE test0 SYNC") diff --git a/tests/integration/test_distributed_backward_compatability/configs/legacy.xml b/tests/integration/test_distributed_backward_compatability/configs/legacy.xml deleted file mode 100644 index 5c1985a17a4..00000000000 --- a/tests/integration/test_distributed_backward_compatability/configs/legacy.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - 1 - - - diff --git a/tests/integration/test_distributed_backward_compatability/configs/remote_servers.xml b/tests/integration/test_distributed_backward_compatability/configs/remote_servers.xml deleted file mode 100644 index 68b420f36b4..00000000000 --- a/tests/integration/test_distributed_backward_compatability/configs/remote_servers.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - node1 - 9000 - - - - - node2 - 9000 - - - - - diff --git a/tests/integration/test_distributed_backward_compatability/test.py b/tests/integration/test_distributed_backward_compatability/test.py deleted file mode 100644 index 319a4c08e60..00000000000 --- a/tests/integration/test_distributed_backward_compatability/test.py +++ /dev/null @@ -1,65 +0,0 @@ -import pytest - -from helpers.cluster import ClickHouseCluster - -cluster = ClickHouseCluster(__file__) - -node_old = cluster.add_instance( - "node1", - main_configs=["configs/remote_servers.xml"], - image="yandex/clickhouse-server", - tag="20.8.11.17", - stay_alive=True, - with_installed_binary=True, - allow_analyzer=False, -) -node_new = cluster.add_instance( - "node2", - main_configs=["configs/remote_servers.xml"], - user_configs=["configs/legacy.xml"], -) - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - - for node in (node_old, node_new): - node.query( - "CREATE TABLE local_table(id UInt32, val String) ENGINE = MergeTree ORDER BY id" - ) - - node_old.query("INSERT INTO local_table VALUES (1, 'node1')") - node_new.query("INSERT INTO local_table VALUES (2, 'node2')") - - node_old.query( - "CREATE TABLE distributed(id UInt32, val String) ENGINE = Distributed(test_cluster, default, local_table)" - ) - 
node_new.query( - "CREATE TABLE distributed(id UInt32, val String) ENGINE = Distributed(test_cluster, default, local_table)" - ) - - yield cluster - - finally: - cluster.shutdown() - - -def test_distributed_in_tuple(started_cluster): - query1 = "SELECT count() FROM distributed WHERE (id, val) IN ((1, 'node1'), (2, 'a'), (3, 'b'))" - query2 = ( - "SELECT sum((id, val) IN ((1, 'node1'), (2, 'a'), (3, 'b'))) FROM distributed" - ) - assert node_old.query(query1) == "1\n" - assert node_old.query(query2) == "1\n" - assert node_new.query(query1) == "1\n" - assert node_new.query(query2) == "1\n" - - large_set = "(" + ",".join([str(i) for i in range(1000)]) + ")" - query3 = "SELECT count() FROM distributed WHERE id IN " + large_set - query4 = "SELECT sum(id IN {}) FROM distributed".format(large_set) - assert node_old.query(query3) == "2\n" - assert node_old.query(query4) == "2\n" - assert node_new.query(query3) == "2\n" - assert node_new.query(query4) == "2\n" diff --git a/tests/integration/test_file_cluster/test.py b/tests/integration/test_file_cluster/test.py index d75cd6c7d23..5d12407e3f2 100644 --- a/tests/integration/test_file_cluster/test.py +++ b/tests/integration/test_file_cluster/test.py @@ -123,3 +123,91 @@ def test_no_such_files(started_cluster): distributed = node.query(get_query("*", True, "3,4")) assert TSV(local) == TSV(distributed) + + +def test_schema_inference(started_cluster): + node = started_cluster.instances["s0_0_0"] + + expected_result = node.query( + "select * from file('file*.csv', 'CSV', 's String, i UInt32') ORDER BY (i, s)" + ) + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv') ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', CSV) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', CSV, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', auto, auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', CSV, auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + +def test_format_detection(started_cluster): + for node_name in ("s0_0_0", "s0_0_1", "s0_1_0"): + for i in range(1, 3): + started_cluster.instances[node_name].query( + f""" + INSERT INTO TABLE FUNCTION file( + 'file_for_format_detection_{i}', 'CSV', 's String, i UInt32') VALUES ('file{i}',{i}) + """ + ) + + node = started_cluster.instances["s0_0_0"] + expected_result = node.query( + "select * from file('file_for_format_detection*', 'CSV', 's String, i UInt32') ORDER BY (i, s)" + ) + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*') ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, auto) ORDER BY (c1, c2)" + ) 
+ assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32') ORDER BY (i, s)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32', auto) ORDER BY (i, s)" + ) + assert result == expected_result diff --git a/tests/queries/0_stateless/02895_forbid_create_inverted_index.reference b/tests/integration/test_insert_exception_over_http/__init__.py similarity index 100% rename from tests/queries/0_stateless/02895_forbid_create_inverted_index.reference rename to tests/integration/test_insert_exception_over_http/__init__.py diff --git a/tests/integration/test_insert_exception_over_http/test.py b/tests/integration/test_insert_exception_over_http/test.py new file mode 100644 index 00000000000..a03d68e0b03 --- /dev/null +++ b/tests/integration/test_insert_exception_over_http/test.py @@ -0,0 +1,44 @@ +import pytest +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +instance = cluster.add_instance("instance", with_zookeeper=True) + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +def test_insert_exception_over_http(start_cluster): + instance.query("DROP TABLE IF EXISTS tt SYNC") + instance.query( + "CREATE TABLE tt (KeyID UInt32) Engine = ReplicatedMergeTree('/test_insert_exception_over_http/tt', 'r1') ORDER BY (KeyID)" + ) + instance.query( + "SYSTEM ENABLE FAILPOINT execute_query_calling_empty_set_result_func_on_exception" + ) + + assert True == instance.http_query_and_get_error( + "insert into tt settings insert_keeper_max_retries=0, insert_keeper_fault_injection_probability=1.0, log_comment='02988_66a57d6f-d1cc-4693-8bf4-206848edab87' values (1), (2), (3), (4), (5)", + method="POST", + ).startswith("500 Internal Server Error") + + assert "0\n" == instance.query("select count() from tt") + + instance.query("SYSTEM FLUSH LOGS") + + assert "1\n" == instance.query( + "select count() from system.query_log where log_comment ='02988_66a57d6f-d1cc-4693-8bf4-206848edab87' and current_database = currentDatabase() and event_date >= yesterday() and type = 'QueryStart'" + ) + + assert "1\n" == instance.query( + "select count() from system.query_log where log_comment ='02988_66a57d6f-d1cc-4693-8bf4-206848edab87' and current_database = currentDatabase() and event_date >= yesterday() and type = 'ExceptionWhileProcessing'" + ) + + instance.query("DROP TABLE tt SYNC") diff --git a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py index 97c8b65f15d..57f2ccd720d 100644 --- a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py @@ -3379,7 +3379,7 @@ def gtid_after_attach_test(clickhouse_node, mysql_node, replication): f"CREATE TABLE {db}.t(id INT PRIMARY KEY AUTO_INCREMENT, score int, create_time DATETIME DEFAULT NOW())" ) - db_count = 6 + db_count = 4 for i in range(db_count): replication.create_db_ch( f"{db}{i}", @@ -3392,7 +3392,11 @@ def gtid_after_attach_test(clickhouse_node, mysql_node, 
replication): "t\n", ) for i in range(int(db_count / 2)): - clickhouse_node.query(f"DETACH DATABASE {db}{i}") + check_query( + clickhouse_node, + f"DETACH DATABASE {db}{i}", + "", + ) mysql_node.query(f"USE {db}") rows = 10000 diff --git a/tests/integration/test_mutations_with_merge_tree/configs/users.xml b/tests/integration/test_mutations_with_merge_tree/configs/users.xml index d1a3ae1e859..c767d6361fd 100644 --- a/tests/integration/test_mutations_with_merge_tree/configs/users.xml +++ b/tests/integration/test_mutations_with_merge_tree/configs/users.xml @@ -1,7 +1,7 @@ - 500 + 1800 1 1 diff --git a/tests/queries/0_stateless/02966_s3_access_key_id_restriction.reference b/tests/integration/test_placement_info/__init__.py similarity index 100% rename from tests/queries/0_stateless/02966_s3_access_key_id_restriction.reference rename to tests/integration/test_placement_info/__init__.py diff --git a/tests/integration/test_placement_info/configs/config_value.xml b/tests/integration/test_placement_info/configs/config_value.xml new file mode 100644 index 00000000000..d3f1a241962 --- /dev/null +++ b/tests/integration/test_placement_info/configs/config_value.xml @@ -0,0 +1,6 @@ + + + 0 + ci-test-1b + + diff --git a/tests/integration/test_placement_info/configs/file_value.xml b/tests/integration/test_placement_info/configs/file_value.xml new file mode 100644 index 00000000000..636ccacb467 --- /dev/null +++ b/tests/integration/test_placement_info/configs/file_value.xml @@ -0,0 +1,6 @@ + + + 0 + /tmp/node-zone + + diff --git a/tests/integration/test_placement_info/configs/imds.xml b/tests/integration/test_placement_info/configs/imds.xml new file mode 100644 index 00000000000..5fc8c1ab3dc --- /dev/null +++ b/tests/integration/test_placement_info/configs/imds.xml @@ -0,0 +1,8 @@ + + + 1 + + + 1 + + diff --git a/tests/integration/test_placement_info/configs/missing_value.xml b/tests/integration/test_placement_info/configs/missing_value.xml new file mode 100644 index 00000000000..d37218428b3 --- /dev/null +++ b/tests/integration/test_placement_info/configs/missing_value.xml @@ -0,0 +1,5 @@ + + + 0 + + diff --git a/tests/integration/test_placement_info/metadata_servers/simple_server.py b/tests/integration/test_placement_info/metadata_servers/simple_server.py new file mode 100644 index 00000000000..73140a7d776 --- /dev/null +++ b/tests/integration/test_placement_info/metadata_servers/simple_server.py @@ -0,0 +1,30 @@ +import http.server +import sys + + +class RequestHandler(http.server.BaseHTTPRequestHandler): + def get_response(self): + if self.path == "/": + return "OK", 200 + + if self.path == "/latest/meta-data/placement/availability-zone": + return "ci-test-1a", 200 + + # Resource not found. 
+ return 404 + + def do_HEAD(self): + response, code = self.get_response() + self.send_response(code) + self.send_header("Content-Type", "text/plain") + self.send_header("Content-Length", len(response.encode())) + self.end_headers() + return response, code + + def do_GET(self): + response, _ = self.do_HEAD() + self.wfile.write(response.encode()) + + +httpd = http.server.HTTPServer(("0.0.0.0", int(sys.argv[1])), RequestHandler) +httpd.serve_forever() diff --git a/tests/integration/test_placement_info/test.py b/tests/integration/test_placement_info/test.py new file mode 100644 index 00000000000..1b93a3eae0b --- /dev/null +++ b/tests/integration/test_placement_info/test.py @@ -0,0 +1,95 @@ +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.mock_servers import start_mock_servers +import os +import time + +METADATA_SERVER_HOSTNAME = "resolver" +METADATA_SERVER_PORT = 8080 + +cluster = ClickHouseCluster(__file__) +node_imds = cluster.add_instance( + "node_imds", + with_minio=True, + main_configs=["configs/imds.xml"], + env_variables={ + "AWS_EC2_METADATA_SERVICE_ENDPOINT": f"http://{METADATA_SERVER_HOSTNAME}:{METADATA_SERVER_PORT}", + }, + stay_alive=True, +) +node_config_value = cluster.add_instance( + "node_config_value", + main_configs=["configs/config_value.xml"], +) +node_file_value = cluster.add_instance( + "node_file_value", + main_configs=["configs/file_value.xml"], + stay_alive=True, +) +node_missing_value = cluster.add_instance( + "node_missing_value", + main_configs=["configs/missing_value.xml"], +) + + +def start_metadata_server(): + script_dir = os.path.join(os.path.dirname(__file__), "metadata_servers") + start_mock_servers( + cluster, + script_dir, + [ + ( + "simple_server.py", + METADATA_SERVER_HOSTNAME, + METADATA_SERVER_PORT, + ) + ], + ) + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + start_metadata_server() + yield + finally: + cluster.shutdown() + + +def test_placement_info_from_imds(): + node_imds.stop_clickhouse(kill=True) + node_imds.start_clickhouse() + + node_imds.query("SYSTEM FLUSH LOGS") + assert node_imds.contains_in_log( + "CloudPlacementInfo: Loaded info: availability_zone: ci-test-1a" + ) + + +def test_placement_info_from_config(): + node_config_value.query("SYSTEM FLUSH LOGS") + assert node_config_value.contains_in_log( + "CloudPlacementInfo: Loaded info: availability_zone: ci-test-1b" + ) + + +def test_placement_info_from_file(): + node_file_value.exec_in_container( + ["bash", "-c", "echo ci-test-1c > /tmp/node-zone"] + ) + + node_file_value.stop_clickhouse(kill=True) + node_file_value.start_clickhouse() + + node_file_value.query("SYSTEM FLUSH LOGS") + assert node_file_value.contains_in_log( + "CloudPlacementInfo: Loaded info: availability_zone: ci-test-1c" + ) + + +def test_placement_info_missing_data(): + node_missing_value.query("SYSTEM FLUSH LOGS") + assert node_missing_value.contains_in_log( + "CloudPlacementInfo: Availability zone info not found" + ) diff --git a/tests/integration/test_quota/test.py b/tests/integration/test_quota/test.py index cec14b0af73..bf64b57a7bf 100644 --- a/tests/integration/test_quota/test.py +++ b/tests/integration/test_quota/test.py @@ -40,7 +40,7 @@ def system_quota_usage(canonical): canonical_tsv = TSV(canonical) query = ( "SELECT quota_name, quota_key, duration, queries, max_queries, query_selects, max_query_selects, query_inserts, max_query_inserts, errors, max_errors, result_rows, max_result_rows," - "result_bytes, max_result_bytes, read_rows, 
max_read_rows, read_bytes, max_read_bytes, max_execution_time " + "result_bytes, max_result_bytes, read_rows, max_read_rows, read_bytes, max_read_bytes, max_execution_time, max_failed_sequential_authentications " "FROM system.quota_usage ORDER BY duration" ) r = TSV(instance.query(query)) @@ -52,7 +52,7 @@ def system_quotas_usage(canonical): canonical_tsv = TSV(canonical) query = ( "SELECT quota_name, quota_key, is_current, duration, queries, max_queries, query_selects, max_query_selects, query_inserts, max_query_inserts, errors, max_errors, result_rows, max_result_rows, " - "result_bytes, max_result_bytes, read_rows, max_read_rows, read_bytes, max_read_bytes, max_execution_time " + "result_bytes, max_result_bytes, read_rows, max_read_rows, read_bytes, max_read_bytes, max_execution_time, max_failed_sequential_authentications " "FROM system.quotas_usage ORDER BY quota_name, quota_key, duration" ) r = TSV(instance.query(query)) @@ -130,6 +130,7 @@ def test_quota_from_users_xml(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -156,6 +157,7 @@ def test_quota_from_users_xml(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -183,6 +185,7 @@ def test_quota_from_users_xml(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -211,6 +214,7 @@ def test_quota_from_users_xml(): 200, "\\N", "\\N", + "\\N", ] ] ) @@ -239,6 +243,7 @@ def test_quota_from_users_xml(): 400, "\\N", "\\N", + "\\N", ] ] ) @@ -285,6 +290,7 @@ def test_simpliest_quota(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -313,6 +319,7 @@ def test_simpliest_quota(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -351,6 +358,7 @@ def test_tracking_quota(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -377,6 +385,7 @@ def test_tracking_quota(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -405,6 +414,7 @@ def test_tracking_quota(): 200, "\\N", "\\N", + "\\N", ] ] ) @@ -433,6 +443,7 @@ def test_tracking_quota(): 400, "\\N", "\\N", + "\\N", ] ] ) @@ -456,7 +467,7 @@ def test_exceed_quota(): ] ) system_quota_limits( - [["myQuota", 31556952, 0, 1, 1, 1, 1, 1, "\\N", 1, "\\N", "\\N", "\\N"]] + [["myQuota", 31556952, 0, 1, 1, 1, 1, 1, "\\N", 1, "\\N", "\\N", "\\N", "1"]] ) system_quota_usage( [ @@ -481,6 +492,7 @@ def test_exceed_quota(): 0, "\\N", "\\N", + "1", ] ] ) @@ -512,6 +524,7 @@ def test_exceed_quota(): 0, "\\N", "\\N", + "1", ] ] ) @@ -548,6 +561,7 @@ def test_exceed_quota(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -574,6 +588,7 @@ def test_exceed_quota(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -602,6 +617,7 @@ def test_exceed_quota(): 200, "\\N", "\\N", + "\\N", ] ] ) @@ -638,6 +654,7 @@ def test_add_remove_interval(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -664,6 +681,7 @@ def test_add_remove_interval(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -700,6 +718,7 @@ def test_add_remove_interval(): "\\N", "\\N", "\\N", + "\\N", ], [ "myQuota", @@ -715,6 +734,7 @@ def test_add_remove_interval(): 20000, 120, "\\N", + "\\N", ], ] ) @@ -741,6 +761,7 @@ def test_add_remove_interval(): 0, "\\N", "\\N", + "\\N", ], [ "myQuota", @@ -763,6 +784,7 @@ def test_add_remove_interval(): 0, 20000, 120, + "\\N", ], ] ) @@ -791,6 +813,7 @@ def test_add_remove_interval(): 200, "\\N", "\\N", + "\\N", ], [ "myQuota", @@ -813,6 +836,7 @@ def test_add_remove_interval(): 200, 20000, 120, + "\\N", ], ] ) @@ -849,6 +873,7 @@ def test_add_remove_interval(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -875,6 +900,7 @@ def test_add_remove_interval(): 200, "\\N", "\\N", + "\\N", ] ] ) @@ -903,6 +929,7 @@ def test_add_remove_interval(): 400, "\\N", "\\N", + "\\N", ] ] ) @@ -947,6 +974,7 @@ def test_add_remove_interval(): "\\N", "\\N", "\\N", + 
"\\N", ] ] ) @@ -975,6 +1003,7 @@ def test_add_remove_interval(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -1011,6 +1040,7 @@ def test_add_remove_interval(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -1037,6 +1067,7 @@ def test_add_remove_interval(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -1073,6 +1104,7 @@ def test_add_remove_quota(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -1100,6 +1132,7 @@ def test_add_remove_quota(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -1146,6 +1179,7 @@ def test_add_remove_quota(): "\\N", "\\N", "\\N", + "\\N", ], [ "myQuota2", @@ -1161,6 +1195,7 @@ def test_add_remove_quota(): 400000, 60, "\\N", + "3", ], [ "myQuota2", @@ -1176,6 +1211,7 @@ def test_add_remove_quota(): "\\N", 1800, "\\N", + "\\N", ], ] ) @@ -1203,6 +1239,7 @@ def test_add_remove_quota(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -1239,6 +1276,7 @@ def test_add_remove_quota(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -1266,6 +1304,7 @@ def test_add_remove_quota(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -1308,6 +1347,7 @@ def test_add_remove_quota(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -1335,6 +1375,7 @@ def test_add_remove_quota(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -1371,6 +1412,7 @@ def test_reload_users_xml_by_timer(): "\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -1397,7 +1439,7 @@ def test_reload_users_xml_by_timer(): assert_eq_with_retry( instance, "SELECT * FROM system.quota_limits", - [["myQuota", 31556952, 0, 1, 1, 1, 1, 1, "\\N", 1, "\\N", "\\N", "\\N"]], + [["myQuota", 31556952, 0, 1, 1, 1, 1, 1, "\\N", 1, "\\N", "\\N", "\\N", "1"]], ) @@ -1447,15 +1489,15 @@ def test_dcl_introspection(): ) assert ( instance.query("SHOW CREATE QUOTA myQuota2") - == "CREATE QUOTA myQuota2 KEYED BY client_key, user_name FOR RANDOMIZED INTERVAL 1 hour MAX result_rows = 4000, result_bytes = 400000, read_rows = 4000, read_bytes = 400000, execution_time = 60, FOR INTERVAL 1 month MAX execution_time = 1800\n" + == "CREATE QUOTA myQuota2 KEYED BY client_key, user_name FOR RANDOMIZED INTERVAL 1 hour MAX result_rows = 4000, result_bytes = 400000, read_rows = 4000, read_bytes = 400000, execution_time = 60, failed_sequential_authentications = 3, FOR INTERVAL 1 month MAX execution_time = 1800\n" ) assert ( instance.query("SHOW CREATE QUOTAS") == "CREATE QUOTA myQuota KEYED BY user_name FOR INTERVAL 1 year MAX queries = 1000, read_rows = 1000 TO default\n" - "CREATE QUOTA myQuota2 KEYED BY client_key, user_name FOR RANDOMIZED INTERVAL 1 hour MAX result_rows = 4000, result_bytes = 400000, read_rows = 4000, read_bytes = 400000, execution_time = 60, FOR INTERVAL 1 month MAX execution_time = 1800\n" + "CREATE QUOTA myQuota2 KEYED BY client_key, user_name FOR RANDOMIZED INTERVAL 1 hour MAX result_rows = 4000, result_bytes = 400000, read_rows = 4000, read_bytes = 400000, execution_time = 60, failed_sequential_authentications = 3, FOR INTERVAL 1 month MAX execution_time = 1800\n" ) assert re.match( - "myQuota\\tdefault\\t.*\\t31556952\\t1\\t1000\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t1000\\t200\\t\\\\N\\t.*\\t\\\\N\n", + "myQuota\\tdefault\\t.*\\t31556952\\t1\\t1000\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t1000\\t200\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\n", instance.query("SHOW QUOTA"), ) @@ -1478,13 +1520,13 @@ def test_dcl_management(): == "CREATE QUOTA qA FOR INTERVAL 5 quarter MAX queries = 123 TO default\n" ) assert re.match( - "qA\\t\\t.*\\t39446190\\t0\\t123\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t.*\\t\\\\N\n", + 
"qA\\t\\t.*\\t39446190\\t0\\t123\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\n", instance.query("SHOW QUOTA"), ) instance.query("SELECT * from test_table") assert re.match( - "qA\\t\\t.*\\t39446190\\t1\\t123\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t\\\\N\n", + "qA\\t\\t.*\\t39446190\\t1\\t123\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\n", instance.query("SHOW QUOTA"), ) @@ -1496,15 +1538,15 @@ def test_dcl_management(): == "CREATE QUOTA qA FOR INTERVAL 30 minute MAX execution_time = 0.5, FOR INTERVAL 5 quarter MAX queries = 321, errors = 10 TO default\n" ) assert re.match( - "qA\\t\\t.*\\t1800\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t.*\\t0.5\\t0\\t\\\\N\n" - "qA\\t\\t.*\\t39446190\\t1\\t321\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t10\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\n", + "qA\\t\\t.*\\t1800\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t.*\\t0.5\\t0\\t\\\\N\\t0\\t\\\\N\n" + "qA\\t\\t.*\\t39446190\\t1\\t321\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t10\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\n", instance.query("SHOW QUOTA"), ) instance.query("SELECT * from test_table") assert re.match( - "qA\\t\\t.*\\t1800\\t1\\t\\\\N\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t0.5\\t0\\t\\\\N\n" - "qA\\t\\t.*\\t39446190\\t2\\t321\\t2\\t\\\\N\\t0\\t\\\\N\\t0\\t10\\t100\\t\\\\N\\t400\\t\\\\N\\t100\\t\\\\N\\t400\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\n", + "qA\\t\\t.*\\t1800\\t1\\t\\\\N\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t0.5\\t0\\t\\\\N\\t0\\t\\\\N\n" + "qA\\t\\t.*\\t39446190\\t2\\t321\\t2\\t\\\\N\\t0\\t\\\\N\\t0\\t10\\t100\\t\\\\N\\t400\\t\\\\N\\t100\\t\\\\N\\t400\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\n", instance.query("SHOW QUOTA"), ) @@ -1518,7 +1560,7 @@ def test_dcl_management(): instance.query("SELECT * from test_table") assert re.match( - "qA\\t\\t.*\\t42075936\\t1\\t\\\\N\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\n", + "qA\\t\\t.*\\t42075936\\t1\\t\\\\N\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\n", instance.query("SHOW QUOTA"), ) @@ -1528,13 +1570,13 @@ def test_dcl_management(): == "CREATE QUOTA qB FOR RANDOMIZED INTERVAL 16 month TRACKING ONLY TO default\n" ) assert re.match( - "qB\\t\\t.*\\t42075936\\t1\\t\\\\N\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t\\\\N\n", + "qB\\t\\t.*\\t42075936\\t1\\t\\\\N\\t1\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t50\\t\\\\N\\t200\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\n", instance.query("SHOW QUOTA"), ) instance.query("SELECT * from test_table") assert re.match( - "qB\\t\\t.*\\t42075936\\t2\\t\\\\N\\t2\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t100\\t\\\\N\\t400\\t\\\\N\\t100\\t\\\\N\\t400\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\n", + "qB\\t\\t.*\\t42075936\\t2\\t\\\\N\\t2\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\\t100\\t\\\\N\\t400\\t\\\\N\\t100\\t\\\\N\\t400\\t\\\\N\\t.*\\t\\\\N\\t0\\t\\\\N\\t0\\t\\\\N\n", instance.query("SHOW QUOTA"), ) @@ -1579,6 +1621,7 @@ def test_query_inserts(): 
"\\N", "\\N", "\\N", + "\\N", ] ] ) @@ -1605,6 +1648,7 @@ def test_query_inserts(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -1632,6 +1676,7 @@ def test_query_inserts(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -1663,6 +1708,7 @@ def test_query_inserts(): 0, "\\N", "\\N", + "\\N", ] ] ) @@ -1691,6 +1737,7 @@ def test_query_inserts(): 0, "\\N", "\\N", + "\\N", ] ] ) diff --git a/tests/integration/test_quota/tiny_limits.xml b/tests/integration/test_quota/tiny_limits.xml index 5821935bb6d..b5014674f98 100644 --- a/tests/integration/test_quota/tiny_limits.xml +++ b/tests/integration/test_quota/tiny_limits.xml @@ -12,6 +12,7 @@ 1 1 1 + 1 diff --git a/tests/integration/test_quota/two_quotas.xml b/tests/integration/test_quota/two_quotas.xml index 13872286dc6..e3b91b1bf43 100644 --- a/tests/integration/test_quota/two_quotas.xml +++ b/tests/integration/test_quota/two_quotas.xml @@ -18,6 +18,7 @@ 400000 400000 60 + 3 2629746 diff --git a/tests/integration/test_recompression_ttl/test.py b/tests/integration/test_recompression_ttl/test.py index 851e3bb4eb8..9d7b09eacdf 100644 --- a/tests/integration/test_recompression_ttl/test.py +++ b/tests/integration/test_recompression_ttl/test.py @@ -155,7 +155,7 @@ def test_recompression_multiple_ttls(started_cluster): node2.query( "SELECT recompression_ttl_info.expression FROM system.parts where name = 'all_1_1_4'" ) - == "['plus(d, toIntervalSecond(10))','plus(d, toIntervalSecond(15))','plus(d, toIntervalSecond(5))']\n" + == "['d + toIntervalSecond(10)','d + toIntervalSecond(15)','d + toIntervalSecond(5)']\n" ) diff --git a/tests/integration/test_replicated_database/configs/config.xml b/tests/integration/test_replicated_database/configs/config.xml index e598cc28d5d..7d779cb0d2e 100644 --- a/tests/integration/test_replicated_database/configs/config.xml +++ b/tests/integration/test_replicated_database/configs/config.xml @@ -4,4 +4,5 @@ 10 + 50 diff --git a/tests/integration/test_replicating_constants/test.py b/tests/integration/test_replicating_constants/test.py index 9669e890cd3..af8916dd625 100644 --- a/tests/integration/test_replicating_constants/test.py +++ b/tests/integration/test_replicating_constants/test.py @@ -8,10 +8,9 @@ node1 = cluster.add_instance("node1", with_zookeeper=True) node2 = cluster.add_instance( "node2", with_zookeeper=True, - image="yandex/clickhouse-server", - tag="19.16.9.37", + image="clickhouse/clickhouse-server", + tag="23.3", with_installed_binary=True, - allow_analyzer=False, ) diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 673ca318c92..03919ee6a4d 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -35,7 +35,9 @@ def create_buckets_s3(cluster): # Make all files a bit different for number in range(100 + file_number): - data.append([str(number + file_number) * 10, number + file_number]) + data.append( + ["str_" + str(number + file_number) * 10, number + file_number] + ) writer = csv.writer(f) writer.writerows(data) @@ -427,3 +429,33 @@ def test_cluster_with_named_collection(started_cluster): ) assert TSV(pure_s3) == TSV(s3_cluster) + + +def test_cluster_format_detection(started_cluster): + node = started_cluster.instances["s0_0_0"] + + expected_desc_result = node.query( + "desc s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123', 'CSV')" + ) + + desc_result = node.query( + "desc s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123')" + ) + + assert expected_desc_result == desc_result + + expected_result = 
node.query( + "SELECT * FROM s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123', 'CSV', 'a String, b UInt64') order by a, b" + ) + + result = node.query( + "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/generated/*', 'minio', 'minio123') order by c1, c2" + ) + + assert result == expected_result + + result = node.query( + "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/generated/*', 'minio', 'minio123', auto, 'a String, b UInt64') order by a, b" + ) + + assert result == expected_result diff --git a/tests/integration/test_select_access_rights/test_main.py b/tests/integration/test_select_access_rights/test_main.py index eedecc2d30c..bca3c698911 100644 --- a/tests/integration/test_select_access_rights/test_main.py +++ b/tests/integration/test_select_access_rights/test_main.py @@ -1,6 +1,7 @@ import pytest + +import re from helpers.cluster import ClickHouseCluster -from helpers.test_tools import TSV cluster = ClickHouseCluster(__file__) instance = cluster.add_instance("instance") @@ -185,25 +186,39 @@ def test_select_join(): ) select_query = "SELECT * FROM table1 JOIN table2 USING(d)" - assert ( - "it's necessary to have the grant SELECT(d, x, y) ON default.table2" - in instance.query_and_get_error(select_query, user="A") - ) + + def match_error(err, columns, table): + """Check if the error message contains the expected table and columns""" + + match = re.search( + r"it's necessary to have the grant SELECT\((.*)\) ON default\.(\w+)", err + ) + if not match: + return False + if match.group(2) != table: + return False + assert set(match.group(1).split(", ")) == set( + columns.split(", ") + ), f"expected {columns} in {err}" + return True + + response = instance.query_and_get_error(select_query, user="A") + table1_match = match_error(response, "d, a, b", "table1") + table2_match = match_error(response, "d, x, y", "table2") + assert table1_match or table2_match, response instance.query("GRANT SELECT(d, x, y) ON default.table2 TO A") - assert ( - "it's necessary to have the grant SELECT(d, a, b) ON default.table1" - in instance.query_and_get_error(select_query, user="A") - ) + response = instance.query_and_get_error(select_query, user="A") + assert match_error(response, "d, a, b", "table1") + response = instance.query_and_get_error(select_query, user="A") instance.query("GRANT SELECT(d, a, b) ON default.table1 TO A") + assert instance.query(select_query, user="A") == "" instance.query("REVOKE SELECT ON default.table2 FROM A") - assert ( - "it's necessary to have the grant SELECT(d, x, y) ON default.table2" - in instance.query_and_get_error(select_query, user="A") - ) + response = instance.query_and_get_error(select_query, user="A") + assert match_error(response, "d, x, y", "table2") def test_select_union(): diff --git a/tests/integration/test_settings_profile/test.py b/tests/integration/test_settings_profile/test.py index 5e40b534cee..70740104d63 100644 --- a/tests/integration/test_settings_profile/test.py +++ b/tests/integration/test_settings_profile/test.py @@ -454,22 +454,41 @@ def test_show_profiles(): assert instance.query("SHOW PROFILES") == "default\nreadonly\nxyz\n" assert instance.query("SHOW CREATE PROFILE xyz") == "CREATE SETTINGS PROFILE xyz\n" + + query_possible_response = [ + "CREATE SETTINGS PROFILE default\n", + "CREATE SETTINGS PROFILE default SETTINGS allow_experimental_analyzer = true\n", + ] assert ( instance.query("SHOW CREATE SETTINGS PROFILE default") - == "CREATE SETTINGS PROFILE default\n" + in 
query_possible_response ) - assert ( - instance.query("SHOW CREATE PROFILES") == "CREATE SETTINGS PROFILE default\n" + + query_possible_response = [ + "CREATE SETTINGS PROFILE default\n" "CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1\n" - "CREATE SETTINGS PROFILE xyz\n" - ) + "CREATE SETTINGS PROFILE xyz\n", + "CREATE SETTINGS PROFILE default SETTINGS allow_experimental_analyzer = true\n" + "CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1\n" + "CREATE SETTINGS PROFILE xyz\n", + ] + assert instance.query("SHOW CREATE PROFILES") in query_possible_response expected_access = ( "CREATE SETTINGS PROFILE default\n" "CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1\n" "CREATE SETTINGS PROFILE xyz\n" ) - assert expected_access in instance.query("SHOW ACCESS") + expected_access_analyzer = ( + "CREATE SETTINGS PROFILE default SETTINGS allow_experimental_analyzer = true\n" + "CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1\n" + "CREATE SETTINGS PROFILE xyz\n" + ) + + query_response = instance.query("SHOW ACCESS") + assert ( + expected_access in query_response or expected_access_analyzer in query_response + ) def test_set_profile(): diff --git a/tests/integration/test_sql_user_defined_functions_on_cluster/test.py b/tests/integration/test_sql_user_defined_functions_on_cluster/test.py index c940998ec42..0bf03f545be 100644 --- a/tests/integration/test_sql_user_defined_functions_on_cluster/test.py +++ b/tests/integration/test_sql_user_defined_functions_on_cluster/test.py @@ -1,5 +1,5 @@ import pytest -from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseCluster, ClickHouseInstance cluster = ClickHouseCluster(__file__) ch1 = cluster.add_instance( @@ -24,15 +24,17 @@ def started_cluster(): def test_sql_user_defined_functions_on_cluster(): - assert "Unknown function test_function" in ch1.query_and_get_error( - "SELECT test_function(1);" - ) - assert "Unknown function test_function" in ch2.query_and_get_error( - "SELECT test_function(1);" - ) - assert "Unknown function test_function" in ch3.query_and_get_error( - "SELECT test_function(1);" - ) + def check_function_does_not_exist(node: ClickHouseInstance): + error_message = node.query_and_get_error("SELECT test_function(1);") + assert ( + "Unknown function test_function" in error_message + or "Function with name 'test_function' does not exists. 
In scope SELECT test_function(1)" + in error_message + ) + + check_function_does_not_exist(ch1) + check_function_does_not_exist(ch2) + check_function_does_not_exist(ch3) ch1.query_with_retry( "CREATE FUNCTION test_function ON CLUSTER 'cluster' AS x -> x + 1;" @@ -43,12 +45,7 @@ def test_sql_user_defined_functions_on_cluster(): assert ch3.query("SELECT test_function(1);") == "2\n" ch2.query_with_retry("DROP FUNCTION test_function ON CLUSTER 'cluster'") - assert "Unknown function test_function" in ch1.query_and_get_error( - "SELECT test_function(1);" - ) - assert "Unknown function test_function" in ch2.query_and_get_error( - "SELECT test_function(1);" - ) - assert "Unknown function test_function" in ch3.query_and_get_error( - "SELECT test_function(1);" - ) + + check_function_does_not_exist(ch1) + check_function_does_not_exist(ch2) + check_function_does_not_exist(ch3) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 3cccd07c134..e1d636f3831 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -967,7 +967,7 @@ def test_union_schema_inference_mode(cluster): f"desc azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference*.jsonl', '{account_name}', '{account_key}', 'auto', 'auto', 'auto') settings schema_inference_mode='union', describe_compact_output=1 format TSV", expect_error="true", ) - assert "Cannot extract table structure" in error + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error def test_schema_inference_cache(cluster): @@ -1250,3 +1250,73 @@ def test_size_virtual_column(cluster): result == "test_size_virtual_column1.tsv\t2\ntest_size_virtual_column2.tsv\t3\ntest_size_virtual_column3.tsv\t4\n" ) + + +def test_format_detection(cluster): + node = cluster.instances["node"] + storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + account_name = "devstoreaccount1" + account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection0', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt64, y String') select number as x, 'str_' || toString(number) from numbers(0)", + ) + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt64, y String') select number as x, 'str_' || toString(number) from numbers(10)", + ) + + expected_desc_result = azure_query( + node, + f"desc azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'auto')", + ) + + desc_result = azure_query( + node, + f"desc azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}')", + ) + + assert expected_desc_result == desc_result + + expected_result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt64, y String')", + ) + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}')", + ) + + assert result == expected_result + + result = azure_query( + 
node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', auto, auto, 'x UInt64, y String')", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection{{0,1}}', '{account_name}', '{account_key}')", + ) + + assert result == expected_result + + node.query(f"system drop schema cache for hdfs") + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection{{0,1}}', '{account_name}', '{account_key}')", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection{{0,1}}', '{account_name}', '{account_key}')", + ) + + assert result == expected_result diff --git a/tests/integration/test_storage_azure_blob_storage/test_cluster.py b/tests/integration/test_storage_azure_blob_storage/test_cluster.py index 2bd3f24d25f..6c5e2d20ca5 100644 --- a/tests/integration/test_storage_azure_blob_storage/test_cluster.py +++ b/tests/integration/test_storage_azure_blob_storage/test_cluster.py @@ -262,3 +262,72 @@ def test_partition_parallel_reading_with_cluster(cluster): ) assert azure_cluster == "3\n" + + +def test_format_detection(cluster): + node = cluster.instances["node_0"] + storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + account_name = "devstoreaccount1" + account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection0', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') select number as x, 'str_' || toString(number) from numbers(10)", + ) + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') select number as x, 'str_' || toString(number) from numbers(10, 10)", + ) + + expected_desc_result = azure_query( + node, + f"desc azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'auto')", + ) + + desc_result = azure_query( + node, + f"desc azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}')", + ) + + assert expected_desc_result == desc_result + + expected_result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') order by x", + ) + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}') order by x", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', auto) order by x", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', 
'{account_key}', auto, auto) order by x", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', 'x UInt32, y String') order by x", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', auto, auto, 'x UInt32, y String') order by x", + ) + + assert result == expected_result diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 8ed1e4b6c0e..121263fb622 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -599,9 +599,7 @@ def test_schema_inference_with_globs(started_cluster): f"desc hdfs('hdfs://hdfs1:9000/data*.jsoncompacteachrow') settings schema_inference_use_cache_for_hdfs=0, input_format_json_infer_incomplete_types_as_strings=0" ) - assert ( - "Cannot extract table structure from JSONCompactEachRow format file" in result - ) + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result def test_insert_select_schema_inference(started_cluster): @@ -1044,7 +1042,75 @@ def test_union_schema_inference_mode(started_cluster): error = node.query_and_get_error( "desc hdfs('hdfs://hdfs1:9000/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV" ) - assert "Cannot extract table structure" in error + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error + + +def test_format_detection(started_cluster): + node = started_cluster.instances["node1"] + + node.query( + "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection0', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(0)" + ) + + node.query( + "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(10)" + ) + + expected_desc_result = node.query( + "desc hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow)" + ) + + desc_result = node.query("desc hdfs('hdfs://hdfs1:9000/test_format_detection1')") + + assert expected_desc_result == desc_result + + expected_result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow, 'x UInt64, y String') order by x, y" + ) + + result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1') order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1', auto, 'x UInt64, y String') order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y" + ) + + assert expected_result == result + + node.query("system drop schema cache for hdfs") + + result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}', auto, auto) order by x, y" + ) + + assert 
expected_result == result + + result = node.query( + "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}', auto, 'x UInt64, y String') order by x, y" + ) + + assert expected_result == result if __name__ == "__main__": diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 6924f2e1508..b778e9fb556 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -3538,3 +3538,14 @@ def test_rabbitmq_handle_error_mode_stream(rabbitmq_cluster): expected = "".join(sorted(expected)) assert broken_messages == expected + + +def test_attach_broken_table(rabbitmq_cluster): + instance.query( + "ATTACH TABLE rabbit_queue UUID '2d1cdf1a-f060-4a61-a7c9-5b59e59992c6' (`payload` String) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'nonexisting:5671', rabbitmq_format = 'JSONEachRow', rabbitmq_username = 'test', rabbitmq_password = 'test'" + ) + + error = instance.query_and_get_error("SELECT * FROM rabbit_queue") + assert "CANNOT_CONNECT_RABBITMQ" in error + error = instance.query_and_get_error("INSERT INTO rabbit_queue VALUES ('test')") + assert "CANNOT_CONNECT_RABBITMQ" in error diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 2549cb0d473..dbbe670e8ca 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1379,9 +1379,7 @@ def test_schema_inference_from_globs(started_cluster): f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0" ) - assert ( - "Cannot extract table structure from JSONCompactEachRow format file" in result - ) + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result url_filename = "test{0,1,2,3}.jsoncompacteachrow" @@ -1389,9 +1387,7 @@ def test_schema_inference_from_globs(started_cluster): f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0" ) - assert ( - "Cannot extract table structure from JSONCompactEachRow format file" in result - ) + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result def test_signatures(started_cluster): @@ -2193,4 +2189,58 @@ def test_union_schema_inference_mode(started_cluster): error = instance.query_and_get_error( f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV" ) - assert "Cannot extract table structure" in error + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error + + +def test_s3_format_detection(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query( + f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection0', 'JSONEachRow', 'x UInt64, y String') select number, 'str_' || toString(number) from numbers(0) settings s3_truncate_on_insert=1" + ) + + instance.query( + f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow', 'x UInt64, y String') select number, 'str_' || toString(number) from numbers(5) 
settings s3_truncate_on_insert=1" + ) + + expected_result = instance.query( + f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow', 'x UInt64, y String')" + ) + + expected_desc_result = instance.query( + f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow')" + ) + + for engine in ["s3", "url"]: + desc_result = instance.query( + f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1')" + ) + + assert desc_result == expected_desc_result + + result = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1')" + ) + + assert result == expected_result + + result = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', auto, 'x UInt64, y String')" + ) + + assert result == expected_result + + result = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection{{0,1}}', auto, 'x UInt64, y String')" + ) + + assert result == expected_result + + instance.query(f"system drop schema cache for {engine}") + + result = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection{{0,1}}', auto, 'x UInt64, y String')" + ) + + assert result == expected_result diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index 7d40060fec6..a7abd840834 100644 --- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -89,6 +89,7 @@ def started_cluster(): "configs/zookeeper.xml", "configs/s3queue_log.xml", ], + stay_alive=True, ) cluster.add_instance( "instance2", @@ -98,6 +99,16 @@ def started_cluster(): main_configs=[ "configs/s3queue_log.xml", ], + stay_alive=True, + ) + cluster.add_instance( + "old_instance", + with_zookeeper=True, + image="clickhouse/clickhouse-server", + tag="23.12", + stay_alive=True, + with_installed_binary=True, + allow_analyzer=False, ) logging.info("Starting cluster...") @@ -165,6 +176,7 @@ def create_table( file_format="CSV", auth=DEFAULT_AUTH, bucket=None, + expect_error=False, ): auth_params = ",".join(auth) bucket = started_cluster.minio_bucket if bucket is None else bucket @@ -184,6 +196,10 @@ def create_table( ENGINE = S3Queue('{url}', {auth_params}, {file_format}) SETTINGS {",".join((k+"="+repr(v) for k, v in settings.items()))} """ + + if expect_error: + return node.query_and_get_error(create_query) + node.query(create_query) @@ -533,10 +549,7 @@ def test_multiple_tables_meta_mismatch(started_cluster): }, ) except QueryRuntimeException as e: - assert ( - "Metadata with the same `s3queue_zookeeper_path` was already created but with different settings" - in str(e) - ) + assert "Existing table metadata in ZooKeeper differs in engine mode" in str(e) failed = True assert failed is True @@ -960,3 +973,466 @@ def test_s3_client_reused(started_cluster): s3_clients_after = get_created_s3_clients_count() assert s3_clients_before == s3_clients_after + + +@pytest.mark.parametrize("mode", ["unordered", "ordered"]) +def test_processing_threads(started_cluster, mode): + node = started_cluster.instances["instance"] + table_name = f"processing_threads_{mode}" + 
dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 300 + processing_threads = 32 + + create_table( + started_cluster, + node, + table_name, + mode, + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": processing_threads, + }, + ) + create_mv(node, table_name, dst_table_name) + + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, row_num=1 + ) + + def get_count(table_name): + return int(run_query(node, f"SELECT count() FROM {table_name}")) + + for _ in range(100): + if (get_count(f"{dst_table_name}")) == files_to_generate: + break + time.sleep(1) + + assert get_count(dst_table_name) == files_to_generate + + res = [ + list(map(int, l.split())) + for l in node.query( + f"SELECT column1, column2, column3 FROM {dst_table_name}" + ).splitlines() + ] + assert {tuple(v) for v in res} == set([tuple(i) for i in total_values]) + + if mode == "ordered": + zk = started_cluster.get_kazoo_client("zoo1") + processed_nodes = zk.get_children(f"{keeper_path}/processed/") + assert len(processed_nodes) == processing_threads + + +@pytest.mark.parametrize( + "mode, processing_threads", + [ + pytest.param("unordered", 1), + pytest.param("unordered", 8), + pytest.param("ordered", 1), + pytest.param("ordered", 8), + ], +) +def test_shards(started_cluster, mode, processing_threads): + node = started_cluster.instances["instance"] + table_name = f"test_shards_{mode}_{processing_threads}" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 300 + shards_num = 3 + + for i in range(shards_num): + table = f"{table_name}_{i + 1}" + dst_table = f"{dst_table_name}_{i + 1}" + create_table( + started_cluster, + node, + table, + mode, + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": processing_threads, + "s3queue_total_shards_num": shards_num, + }, + ) + create_mv(node, table, dst_table) + + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, row_num=1 + ) + + def get_count(table_name): + return int(run_query(node, f"SELECT count() FROM {table_name}")) + + for _ in range(100): + if ( + get_count(f"{dst_table_name}_1") + + get_count(f"{dst_table_name}_2") + + get_count(f"{dst_table_name}_3") + ) == files_to_generate: + break + time.sleep(1) + + if ( + get_count(f"{dst_table_name}_1") + + get_count(f"{dst_table_name}_2") + + get_count(f"{dst_table_name}_3") + ) != files_to_generate: + info = node.query( + f"SELECT * FROM system.s3queue WHERE zookeeper_path like '%{table_name}' ORDER BY file_name FORMAT Vertical" + ) + logging.debug(info) + assert False + + res1 = [ + list(map(int, l.split())) + for l in node.query( + f"SELECT column1, column2, column3 FROM {dst_table_name}_1" + ).splitlines() + ] + res2 = [ + list(map(int, l.split())) + for l in node.query( + f"SELECT column1, column2, column3 FROM {dst_table_name}_2" + ).splitlines() + ] + res3 = [ + list(map(int, l.split())) + for l in node.query( + f"SELECT column1, column2, column3 FROM {dst_table_name}_3" + ).splitlines() + ] + assert {tuple(v) for v in res1 + res2 + res3} == set( + [tuple(i) for i in total_values] + ) + + # Checking that all files were processed only once + time.sleep(10) + assert ( + get_count(f"{dst_table_name}_1") + + get_count(f"{dst_table_name}_2") + + get_count(f"{dst_table_name}_3") + ) == 
files_to_generate + + if mode == "ordered": + zk = started_cluster.get_kazoo_client("zoo1") + processed_nodes = zk.get_children(f"{keeper_path}/processed/") + assert len(processed_nodes) == shards_num * processing_threads + shard_nodes = zk.get_children(f"{keeper_path}/shards/") + assert len(shard_nodes) == shards_num + + +@pytest.mark.parametrize( + "mode, processing_threads", + [ + pytest.param("unordered", 1), + pytest.param("unordered", 8), + pytest.param("ordered", 1), + pytest.param("ordered", 8), + ], +) +def test_shards_distributed(started_cluster, mode, processing_threads): + node = started_cluster.instances["instance"] + node_2 = started_cluster.instances["instance2"] + table_name = f"test_shards_distributed_{mode}_{processing_threads}" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 300 + row_num = 50 + total_rows = row_num * files_to_generate + shards_num = 2 + + i = 0 + for instance in [node, node_2]: + create_table( + started_cluster, + instance, + table_name, + mode, + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": processing_threads, + "s3queue_total_shards_num": shards_num, + }, + ) + i += 1 + + for instance in [node, node_2]: + create_mv(instance, table_name, dst_table_name) + + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, row_num=row_num + ) + + def get_count(node, table_name): + return int(run_query(node, f"SELECT count() FROM {table_name}")) + + for _ in range(150): + if ( + get_count(node, dst_table_name) + get_count(node_2, dst_table_name) + ) == total_rows: + break + time.sleep(1) + + if ( + get_count(node, dst_table_name) + get_count(node_2, dst_table_name) + ) != total_rows: + info = node.query( + f"SELECT * FROM system.s3queue WHERE zookeeper_path like '%{table_name}' ORDER BY file_name FORMAT Vertical" + ) + logging.debug(info) + assert False + + get_query = f"SELECT column1, column2, column3 FROM {dst_table_name}" + res1 = [list(map(int, l.split())) for l in run_query(node, get_query).splitlines()] + res2 = [ + list(map(int, l.split())) for l in run_query(node_2, get_query).splitlines() + ] + + assert len(res1) + len(res2) == total_rows + + # Checking that all engines have made progress + assert len(res1) > 0 + assert len(res2) > 0 + + assert {tuple(v) for v in res1 + res2} == set([tuple(i) for i in total_values]) + + # Checking that all files were processed only once + time.sleep(10) + assert ( + get_count(node, dst_table_name) + get_count(node_2, dst_table_name) + ) == total_rows + + if mode == "ordered": + zk = started_cluster.get_kazoo_client("zoo1") + processed_nodes = zk.get_children(f"{keeper_path}/processed/") + assert len(processed_nodes) == shards_num * processing_threads + shard_nodes = zk.get_children(f"{keeper_path}/shards/") + assert len(shard_nodes) == shards_num + + node.restart_clickhouse() + time.sleep(10) + assert ( + get_count(node, dst_table_name) + get_count(node_2, dst_table_name) + ) == total_rows + + +def test_settings_check(started_cluster): + node = started_cluster.instances["instance"] + node_2 = started_cluster.instances["instance2"] + table_name = f"test_settings_check" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + mode = "ordered" + + create_table( + started_cluster, + node, + table_name, + mode, + files_path, + additional_settings={ + "keeper_path": 
keeper_path, + "s3queue_processing_threads_num": 5, + "s3queue_total_shards_num": 2, + }, + ) + + assert ( + "Existing table metadata in ZooKeeper differs in s3queue_total_shards_num setting. Stored in ZooKeeper: 2, local: 3" + in create_table( + started_cluster, + node_2, + table_name, + mode, + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": 5, + "s3queue_total_shards_num": 3, + }, + expect_error=True, + ) + ) + + assert ( + "Existing table metadata in ZooKeeper differs in s3queue_processing_threads_num setting. Stored in ZooKeeper: 5, local: 2" + in create_table( + started_cluster, + node_2, + table_name, + mode, + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": 2, + "s3queue_total_shards_num": 2, + }, + expect_error=True, + ) + ) + + assert "s3queue_current_shard_num = 0" in node.query( + f"SHOW CREATE TABLE {table_name}" + ) + + node.restart_clickhouse() + + assert "s3queue_current_shard_num = 0" in node.query( + f"SHOW CREATE TABLE {table_name}" + ) + + node.query(f"DROP TABLE {table_name} SYNC") + + +@pytest.mark.parametrize("processing_threads", [1, 5]) +def test_processed_file_setting(started_cluster, processing_threads): + node = started_cluster.instances["instance"] + table_name = f"test_processed_file_setting_{processing_threads}" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 10 + + create_table( + started_cluster, + node, + table_name, + "ordered", + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": processing_threads, + "s3queue_last_processed_path": f"{files_path}/test_5.csv", + }, + ) + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, start_ind=0, row_num=1 + ) + + create_mv(node, table_name, dst_table_name) + + def get_count(): + return int(node.query(f"SELECT count() FROM {dst_table_name}")) + + expected_rows = 4 + for _ in range(20): + if expected_rows == get_count(): + break + time.sleep(1) + + assert expected_rows == get_count() + + node.restart_clickhouse() + time.sleep(10) + + expected_rows = 4 + for _ in range(20): + if expected_rows == get_count(): + break + time.sleep(1) + + assert expected_rows == get_count() + + +@pytest.mark.parametrize("processing_threads", [1, 5]) +def test_processed_file_setting_distributed(started_cluster, processing_threads): + node = started_cluster.instances["instance"] + node_2 = started_cluster.instances["instance2"] + table_name = f"test_processed_file_setting_distributed_{processing_threads}" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 10 + + for instance in [node, node_2]: + create_table( + started_cluster, + instance, + table_name, + "ordered", + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": processing_threads, + "s3queue_last_processed_path": f"{files_path}/test_5.csv", + "s3queue_total_shards_num": 2, + }, + ) + + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, start_ind=0, row_num=1 + ) + + for instance in [node, node_2]: + create_mv(instance, table_name, dst_table_name) + + def get_count(): + query = f"SELECT count() FROM {dst_table_name}" + return int(node.query(query)) + int(node_2.query(query)) + + expected_rows = 4 + for _ in range(20): 
+ if expected_rows == get_count(): + break + time.sleep(1) + assert expected_rows == get_count() + + for instance in [node, node_2]: + instance.restart_clickhouse() + + time.sleep(10) + expected_rows = 4 + for _ in range(20): + if expected_rows == get_count(): + break + time.sleep(1) + assert expected_rows == get_count() + + +def test_upgrade(started_cluster): + node = started_cluster.instances["old_instance"] + + table_name = f"test_upgrade" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 10 + + create_table( + started_cluster, + node, + table_name, + "ordered", + files_path, + additional_settings={ + "keeper_path": keeper_path, + }, + ) + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, start_ind=0, row_num=1 + ) + + create_mv(node, table_name, dst_table_name) + + def get_count(): + return int(node.query(f"SELECT count() FROM {dst_table_name}")) + + expected_rows = 10 + for _ in range(20): + if expected_rows == get_count(): + break + time.sleep(1) + + assert expected_rows == get_count() + + node.restart_with_latest_version() + + assert expected_rows == get_count() diff --git a/tests/integration/test_user_defined_object_persistence/test.py b/tests/integration/test_user_defined_object_persistence/test.py index 8d775411b61..1919da0726e 100644 --- a/tests/integration/test_user_defined_object_persistence/test.py +++ b/tests/integration/test_user_defined_object_persistence/test.py @@ -35,9 +35,16 @@ def test_persistence(): instance.restart_clickhouse() - assert "Unknown function MySum1" in instance.query_and_get_error( - "SELECT MySum1(1, 2)" + error_message = instance.query_and_get_error("SELECT MySum1(1, 2)") + assert ( + "Unknown function MySum1" in error_message + or "Function with name 'MySum1' does not exists. In scope SELECT MySum1(1, 2)" + in error_message ) - assert "Unknown function MySum2" in instance.query_and_get_error( - "SELECT MySum2(1, 2)" + + error_message = instance.query_and_get_error("SELECT MySum2(1, 2)") + assert ( + "Unknown function MySum2" in error_message + or "Function with name 'MySum2' does not exists. In scope SELECT MySum2(1, 2)" + in error_message ) diff --git a/tests/integration/test_wrong_db_or_table_name/test.py b/tests/integration/test_wrong_db_or_table_name/test.py index a5096d80ca9..4a6dcf5aa41 100644 --- a/tests/integration/test_wrong_db_or_table_name/test.py +++ b/tests/integration/test_wrong_db_or_table_name/test.py @@ -92,26 +92,31 @@ def test_wrong_table_name(start): INSERT INTO test.table_test SELECT 1; """ ) - with pytest.raises( - QueryRuntimeException, - match="DB::Exception: Table test.table_test1 does not exist. Maybe you meant test.table_test?.", - ): - node.query( - """ + + error_message = node.query_and_get_error( + """ SELECT * FROM test.table_test1 LIMIT 1; """ - ) + ) + assert ( + "DB::Exception: Table test.table_test1 does not exist. Maybe you meant test.table_test?" + in error_message + or "DB::Exception: Unknown table expression identifier 'test.table_test1' in scope SELECT * FROM test.table_test1 LIMIT 1." + in error_message + ) assert int(node.query("SELECT count() FROM test.table_test;")) == 1 - with pytest.raises( - QueryRuntimeException, - match="DB::Exception: Table test2.table_test1 does not exist. 
Maybe you meant test.table_test?.", - ): - node.query( - """ + error_message = node.query_and_get_error( + """ SELECT * FROM test2.table_test1 LIMIT 1; """ - ) + ) + assert ( + "DB::Exception: Table test2.table_test1 does not exist. Maybe you meant test.table_test?." + in error_message + or "DB::Exception: Unknown table expression identifier 'test2.table_test1' in scope SELECT * FROM test2.table_test1 LIMIT 1." + in error_message + ) node.query( """ diff --git a/tests/jepsen.clickhouse/project.clj b/tests/jepsen.clickhouse/project.clj index 6c714604b56..bb41be1ba10 100644 --- a/tests/jepsen.clickhouse/project.clj +++ b/tests/jepsen.clickhouse/project.clj @@ -13,4 +13,7 @@ [com.hierynomus/sshj "0.34.0"] [com.clickhouse/clickhouse-jdbc "0.3.2-patch11"] [org.apache.zookeeper/zookeeper "3.6.1" :exclusions [org.slf4j/slf4j-log4j12]]] - :repl-options {:init-ns jepsen.clickhouse-keeper.main}) + :repl-options {:init-ns jepsen.clickhouse-keeper.main} + ;; otherwise, target artifacts will be created under the repo root, so that checkout with clear might fail in ci + :target-path "/tmp/jepsen_clickhouse" +) diff --git a/tests/performance/asof.xml b/tests/performance/asof.xml index d00afaa26b5..61e61be13bb 100644 --- a/tests/performance/asof.xml +++ b/tests/performance/asof.xml @@ -43,6 +43,13 @@ + + num_unique_sessions + + 1000 + 1000000 + + num_rows @@ -56,15 +63,15 @@ FROM ( SELECT - number AS id, - number AS visitor_id + (number % {num_unique_sessions}) AS visitor_id, + number AS id FROM system.numbers LIMIT {num_rows} ) AS sessions ASOF LEFT JOIN ( SELECT - number AS visitor_id, + (number % {num_unique_sessions}) AS visitor_id, number AS starting_session_id FROM system.numbers LIMIT {num_rows} diff --git a/tests/performance/coalesce.xml b/tests/performance/coalesce.xml new file mode 100644 index 00000000000..08b9a6aab1e --- /dev/null +++ b/tests/performance/coalesce.xml @@ -0,0 +1,3 @@ + + select coalesce(materialize(null), -1) from numbers(1000000000) format Null settings max_block_size = 8192 + diff --git a/tests/performance/group_array_sorted.xml b/tests/performance/group_array_sorted.xml new file mode 100644 index 00000000000..d5887998341 --- /dev/null +++ b/tests/performance/group_array_sorted.xml @@ -0,0 +1,31 @@ + + + 30000000000 + + + + + millions + + 50 + 100 + + + + window + + 10 + 1000 + 10000 + + + + + create table sorted_{millions}m engine MergeTree order by k as select number % 100 k, rand() v from numbers_mt(1000000 * {millions}) + optimize table sorted_{millions}m final + + select k, groupArraySorted({window})(v) from sorted_{millions}m group by k format Null + select k % 10 kk, groupArraySorted({window})(v) from sorted_{millions}m group by kk format Null + + drop table if exists sorted_{millions}m + diff --git a/tests/performance/if.xml b/tests/performance/if.xml index f4d0e8f9773..f7b4f3336a7 100644 --- a/tests/performance/if.xml +++ b/tests/performance/if.xml @@ -1,12 +1,28 @@ + 42949673, zero + 1, zero + 2)) ]]> + + + + + + + + + with rand32() % 2 as x select if(x, materialize(1.234), materialize(2.456)) from numbers(100000000) format Null + with rand32() % 2 as x, 1.234::Decimal64(3) as a, 2.456::Decimal64(3) as b select if(x, materialize(a), materialize(b)) from numbers(100000000) format Null - 42949673, zero + 1, zero + 2)) ]]> - - - - - - - + + with rand32() % 2 as x, 1::Int8 as a, -1::Int8 as b select if(x, a, b) from numbers(100000000) format Null + with rand32() % 2 as x, 1::Int64 as a, -1::Int64 as b select if(x, a, b) from numbers(100000000) format Null + with 
rand32() % 2 as x, 1::Int32 as a, -1::Int32 as b select if(x, a, b) from numbers(100000000) format Null + with rand32() % 2 as x, 1::Decimal32(3) as a, -1::Decimal32(3) as b select if(x, a, b) from numbers(100000000) format Null + with rand32() % 2 as x, 1::Decimal64(3) as a, -1::Decimal64(3) as b select if(x, a, b) from numbers(100000000) format Null + with rand32() % 2 as x, 1::Decimal128(3) as a, -1::Decimal128(3) as b select if(x, a, b) from numbers(100000000) format Null + with rand32() % 2 as x, 1::Decimal256(3) as a, -1::Decimal256(3) as b select if(x, a, b) from numbers(100000000) format Null + with rand32() % 2 as x, 1::Int128 as a, -1::Int128 as b select if(x, a, b) from numbers(100000000) format Null + with rand32() % 2 as x, 1::Int256 as a, -1::Int256 as b select if(x, a, b) from numbers(100000000) format Null + + with rand32() % 2 as x select if(x, map(1,2,3,4), map(3,4,5,6)) from numbers(1000000) format Null + with rand32() % 2 as x select if(x, materialize(map(1,2,3,4)), materialize(map(3,4,5,6))) from numbers(1000000) format Null diff --git a/tests/performance/scripts/compare.sh b/tests/performance/scripts/compare.sh index 92ba383f965..39c6854fbf9 100755 --- a/tests/performance/scripts/compare.sh +++ b/tests/performance/scripts/compare.sh @@ -444,10 +444,10 @@ create view query_logs as create table query_run_metric_arrays engine File(TSV, 'analyze/query-run-metric-arrays.tsv') as with ( - -- sumMapState with the list of all keys with 'nan' values. 'nan' is because - -- sumMap removes keys with positive/negative zeros. + -- sumMapState with the list of all keys with nullable '0' values because sumMap removes keys with default values + -- and 0::Nullable != NULL with (select groupUniqArrayArray(mapKeys(ProfileEvents)) from query_logs) as all_names - select arrayReduce('sumMapState', [(all_names, arrayMap(x->nan, all_names))]) + select arrayReduce('sumMapState', [(all_names, arrayMap(x->0::Nullable(Float64), all_names))]) ) as all_metrics select test, query_index, version, query_id, (finalizeAggregation( @@ -456,14 +456,12 @@ create table query_run_metric_arrays engine File(TSV, 'analyze/query-run-metric- all_metrics, arrayReduce('sumMapState', [(mapKeys(ProfileEvents), - arrayMap(x->toFloat64(x), mapValues(ProfileEvents)))] + arrayMap(x->toNullable(toFloat64(x)), mapValues(ProfileEvents)))] ), arrayReduce('sumMapState', [( ['client_time', 'server_time', 'memory_usage'], - arrayMap(x->if(x != 0., x, nan), [ - toFloat64(query_runs.time), - toFloat64(query_duration_ms / 1000.), - toFloat64(memory_usage)]))]) + [toNullable(toFloat64(query_runs.time)), toNullable(toFloat64(query_duration_ms / 1000.)), toNullable(toFloat64(memory_usage))] + )]) ] )) as metrics_tuple).1 metric_names, arrayMap(x->if(isNaN(x),0,x), metrics_tuple.2) metric_values diff --git a/tests/performance/scripts/entrypoint.sh b/tests/performance/scripts/entrypoint.sh index ec7e4d96dde..0c3bfa550f4 100755 --- a/tests/performance/scripts/entrypoint.sh +++ b/tests/performance/scripts/entrypoint.sh @@ -118,8 +118,8 @@ then # far in the future and have unrelated test changes. base=$(git -C right/ch merge-base pr origin/master) git -C right/ch diff --name-only "$base" pr -- . 
| tee all-changed-files.txt - git -C right/ch diff --name-only "$base" pr -- tests/performance | tee changed-test-definitions.txt - git -C right/ch diff --name-only "$base" pr -- :!tests/performance :!docker/test/performance-comparison | tee other-changed-files.txt + git -C right/ch diff --name-only "$base" pr -- tests/performance/*.xml | tee changed-test-definitions.txt + git -C right/ch diff --name-only "$base" pr -- :!tests/performance/*.xml :!docker/test/performance-comparison | tee other-changed-files.txt fi # Set python output encoding so that we can print queries with non-ASCII letters. diff --git a/tests/performance/sum.xml b/tests/performance/sum.xml index 57b879a360d..36b898436bf 100644 --- a/tests/performance/sum.xml +++ b/tests/performance/sum.xml @@ -17,6 +17,13 @@ SELECT sumKahan(toNullable(toFloat32(number))) FROM numbers(100000000) SELECT sumKahan(toNullable(toFloat64(number))) FROM numbers(100000000) + select sumIf(number::Decimal128(3), rand32() % 2 = 0) from numbers(100000000) + select sumIf(number::Decimal256(3), rand32() % 2 = 0) from numbers(100000000) + select sumIf(number::Int128, rand32() % 2 = 0) from numbers(100000000) + select sumIf(number::UInt128, rand32() % 2 = 0) from numbers(100000000) + select sumIf(number::Int256, rand32() % 2 = 0) from numbers(100000000) + select sumIf(number::UInt256, rand32() % 2 = 0) from numbers(100000000) + CREATE TABLE nullfloat32 (x Nullable(Float32)) ENGINE = Memory INSERT INTO nullfloat32 diff --git a/tests/queries/0_stateless/00191_aggregating_merge_tree_and_final.sql b/tests/queries/0_stateless/00191_aggregating_merge_tree_and_final.sql index 8160d4dee9e..4f73a9e9a54 100644 --- a/tests/queries/0_stateless/00191_aggregating_merge_tree_and_final.sql +++ b/tests/queries/0_stateless/00191_aggregating_merge_tree_and_final.sql @@ -7,9 +7,9 @@ INSERT INTO aggregating_00191 (k, u) SELECT intDiv(number, 100) AS k, uniqState( SELECT k, finalizeAggregation(u) FROM aggregating_00191 FINAL order by k; -OPTIMIZE TABLE aggregating_00191; +OPTIMIZE TABLE aggregating_00191 FINAL; -SELECT k, finalizeAggregation(u) FROM aggregating_00191; +SELECT k, finalizeAggregation(u) FROM aggregating_00191 order by k; SELECT k, finalizeAggregation(u) FROM aggregating_00191 FINAL order by k; DROP TABLE aggregating_00191; diff --git a/tests/queries/0_stateless/00273_quantiles.sql b/tests/queries/0_stateless/00273_quantiles.sql index eba5e772997..791ced6bc5d 100644 --- a/tests/queries/0_stateless/00273_quantiles.sql +++ b/tests/queries/0_stateless/00273_quantiles.sql @@ -2,13 +2,13 @@ SELECT quantiles(0.5)(x) FROM (SELECT number AS x FROM system.numbers LIMIT 1001 SELECT quantilesExact(0.5)(x) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); SELECT quantilesTDigest(0.5)(x) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); SELECT quantilesDeterministic(0.5)(x, x) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); -SELECT arrayMap(a -> round(a, 2), quantilesDDSketch(0.01, 0.5)(x)) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); +SELECT arrayMap(a -> round(a, 2), quantilesDD(0.01, 0.5)(x)) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); SELECT quantiles(0, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999, 1)(x) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); SELECT quantilesExact(0, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999, 1)(x) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); SELECT quantilesTDigest(0, 0.001, 0.01, 0.05, 0.1, 
0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999, 1)(x) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); SELECT quantilesDeterministic(0, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999, 1)(x, x) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); -SELECT arrayMap(a -> round(a, 2), quantilesDDSketch(0.01, 0, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999, 1)(x)) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); +SELECT arrayMap(a -> round(a, 2), quantilesDD(0.01, 0, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999, 1)(x)) FROM (SELECT number AS x FROM system.numbers LIMIT 1001); -- The result slightly differs but it's ok since `quantilesDeterministic` is an approximate function. SET max_bytes_before_external_group_by = 0; diff --git a/tests/queries/0_stateless/00700_decimal_arithm.reference b/tests/queries/0_stateless/00700_decimal_arithm.reference index 811946c87e0..109c0632fb1 100644 --- a/tests/queries/0_stateless/00700_decimal_arithm.reference +++ b/tests/queries/0_stateless/00700_decimal_arithm.reference @@ -10,18 +10,18 @@ 63 21 -42 882 -882 2 0 2 0 63 21 -42 882 -882 2 0 2 0 1.00305798474369219219752355409390731264 -0.16305798474369219219752355409390731264 1.490591730234615865843651857942052864 -1.38847100762815390390123822295304634368 1.38847100762815390390123822295304634368 0.02 0.005 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.5 2.02 0.5 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.5 2 0 63 -21 42 882 -882 0 2 0 2 63 -21 42 882 -882 0 2 0 2 63 -21 42 882 -882 0 2 0 2 1.00305798474369219219752355409390731264 0.16305798474369219219752355409390731264 -1.490591730234615865843651857942052864 -1.38847100762815390390123822295304634368 1.38847100762815390390123822295304634368 -0.00000000000000000000000000000000000001 0.00000000000000000000000000000000000001 -63.42 -21.42 41.58 890.82 -890.82 0.495 1.98 0.495 1.98 +63.42 -21.42 41.58 890.82 -890.82 0.495 1.98 0 1 63.42 -21.42 41.58 890.82 -890.82 -63.42 -21.42 41.58 890.82 -890.82 0.495049504950495049 1.980198019801980198 0.495049504950495049 1.980198019801980198 -63.42 -21.42 41.58 890.82 -890.82 0.49 1.98 0.49 1.98 +63.42 -21.42 41.58 890.82 -890.82 0.495049504950495049 1.980198019801980198 0 1 +63.42 -21.42 41.58 890.82 -890.82 0.49 1.98 0 1 -42 42 42 42 0.42 0.42 0.42 42.42 42.42 42.42 0 0 0 0 0 0 0 0 0 0 42 -42 -42 -42 -0.42 -0.42 -0.42 -42.42 -42.42 -42.42 diff --git a/tests/queries/0_stateless/00873_t64_codec_date.reference b/tests/queries/0_stateless/00873_t64_codec_date.reference new file mode 100644 index 00000000000..9353696610c --- /dev/null +++ b/tests/queries/0_stateless/00873_t64_codec_date.reference @@ -0,0 +1,4 @@ +1970-01-01 1970-01-01 1950-01-01 1950-01-01 +1970-01-01 1970-01-01 1970-01-01 1970-01-01 +2149-06-06 2149-06-06 2149-06-06 2149-06-06 +2149-06-06 2149-06-06 2149-06-08 2149-06-08 diff --git a/tests/queries/0_stateless/00873_t64_codec_date.sql b/tests/queries/0_stateless/00873_t64_codec_date.sql new file mode 100644 index 00000000000..c6e21baba12 --- /dev/null +++ b/tests/queries/0_stateless/00873_t64_codec_date.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS t64; + +CREATE 
TABLE t64 +( + date16 Date, + t_date16 Date Codec(T64, ZSTD), + date_32 Date32, + t_date32 Date32 Codec(T64, ZSTD) +) ENGINE MergeTree() ORDER BY tuple(); + +INSERT INTO t64 values ('1970-01-01', '1970-01-01', '1970-01-01', '1970-01-01'); +INSERT INTO t64 values ('2149-06-06', '2149-06-06', '2149-06-06', '2149-06-06'); +INSERT INTO t64 values ('2149-06-08', '2149-06-08', '2149-06-08', '2149-06-08'); +INSERT INTO t64 values ('1950-01-01', '1950-01-01', '1950-01-01', '1950-01-01'); + +SELECT * FROM t64 ORDER BY date_32; + +SELECT * FROM t64 WHERE date16 != t_date16; +SELECT * FROM t64 WHERE date_32 != t_date32; + +OPTIMIZE TABLE t64 FINAL; + +SELECT * FROM t64 WHERE date16 != t_date16; +SELECT * FROM t64 WHERE date_32 != t_date32; + +DROP TABLE t64; diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.reference b/tests/queries/0_stateless/00937_format_schema_rows_template.reference new file mode 100644 index 00000000000..85bab456512 --- /dev/null +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.reference @@ -0,0 +1,9 @@ +Question: 'How awesome is clickhouse?', Answer: 'unbelievably awesome!', Number of Likes: 456, Date: 2016-01-02; +Question: 'How fast is clickhouse?', Answer: 'Lightning fast!', Number of Likes: 9876543210, Date: 2016-01-03; +Question: 'Is it opensource?', Answer: 'of course it is!', Number of Likes: 789, Date: 2016-01-04 + +===== Results ===== +Question: 'How awesome is clickhouse?', Answer: 'unbelievably awesome!', Number of Likes: 456, Date: 2016-01-02; +Question: 'How fast is clickhouse?', Answer: 'Lightning fast!', Number of Likes: 9876543210, Date: 2016-01-03; +Question: 'Is it opensource?', Answer: 'of course it is!', Number of Likes: 789, Date: 2016-01-04 +=================== diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh new file mode 100755 index 00000000000..0221527f9c9 --- /dev/null +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2016 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +# Test format_template_row_format setting + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE template (question String, answer String, likes UInt64, date Date) ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="INSERT INTO template VALUES +('How awesome is clickhouse?', 'unbelievably awesome!', 456, '2016-01-02'),\ +('How fast is clickhouse?', 'Lightning fast!', 9876543210, '2016-01-03'),\ +('Is it opensource?', 'of course it is!', 789, '2016-01-04')"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_rows_between_delimiter = ';\n'"; + +echo -e "\n" + +# Test that if both format_template_row_format setting and format_template_row are provided, error is thrown +row_format_file="$CURDIR"/"${CLICKHOUSE_TEST_UNIQUE_NAME}"_template_output_format_row.tmp +echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Likes: ${likes:Raw}, Date: ${date:Raw}' > $row_format_file +$CLICKHOUSE_CLIENT --multiline --multiquery --query "SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_template_row = '$row_format_file', \ +format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_rows_between_delimiter = ';\n'; --{clientError 474}" + +# Test format_template_resultset_format setting + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_resultset_format = '===== Results ===== \n\${data}\n===================\n', \ +format_template_rows_between_delimiter = ';\n'"; + +# Test that if both format_template_result_format setting and format_template_resultset are provided, error is thrown +resultset_output_file="$CURDIR"/"$CLICKHOUSE_TEST_UNIQUE_NAME"_template_output_format_resultset.tmp +echo -ne '===== Resultset ===== \n \${data} \n ===============' > $resultset_output_file +$CLICKHOUSE_CLIENT --multiline --multiquery --query "SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_template_resultset = '$resultset_output_file', \ +format_template_resultset_format = '===== Resultset ===== \n \${data} \n ===============', \ +format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_rows_between_delimiter = ';\n'; --{clientError 474}" + +$CLICKHOUSE_CLIENT --query="DROP TABLE template"; +rm $row_format_file +rm $resultset_output_file diff --git a/tests/queries/0_stateless/01030_storage_url_syntax.sql b/tests/queries/0_stateless/01030_storage_url_syntax.sql index 9b31558eece..eda108aca2f 100644 --- a/tests/queries/0_stateless/01030_storage_url_syntax.sql +++ b/tests/queries/0_stateless/01030_storage_url_syntax.sql @@ -1,7 +1,7 @@ drop table if exists test_table_url_syntax ; create table test_table_url_syntax (id UInt32) ENGINE = URL('') -; -- { 
serverError 36 } +; -- { serverError UNSUPPORTED_URI_SCHEME } create table test_table_url_syntax (id UInt32) ENGINE = URL('','','','') ; -- { serverError 42 } drop table if exists test_table_url_syntax @@ -11,7 +11,7 @@ drop table if exists test_table_url ; create table test_table_url(id UInt32) ENGINE = URL('http://localhost/endpoint') -; -- { serverError 36 } +; -- { serverError CANNOT_DETECT_FORMAT } create table test_table_url(id UInt32) ENGINE = URL('http://localhost/endpoint.json'); drop table test_table_url; diff --git a/tests/queries/0_stateless/01193_metadata_loading.sh b/tests/queries/0_stateless/01193_metadata_loading.sh index c25cdf4e970..69178a93d42 100755 --- a/tests/queries/0_stateless/01193_metadata_loading.sh +++ b/tests/queries/0_stateless/01193_metadata_loading.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-parallel, no-fasttest, no-s3-storage +# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-parallel, no-fasttest, no-s3-storage, no-sanitize-coverage CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -8,16 +8,12 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # Check that attaching a database with a large number of tables is not too slow. # it is the worst way of making performance test, nevertheless it can detect significant slowdown and some other issues, that usually found by stress test -db="test_01193_$RANDOM" +db="test_01193_$RANDOM_$RANDOM_$RANDOM_$RANDOM" tables=1000 threads=10 count_multiplier=1 max_time_ms=1500 -debug_or_sanitizer_build=$($CLICKHOUSE_CLIENT -q "WITH ((SELECT value FROM system.build_options WHERE name='BUILD_TYPE') AS build, (SELECT value FROM system.build_options WHERE name='CXX_FLAGS') as flags) SELECT build='Debug' OR flags LIKE '%fsanitize%' OR hasThreadFuzzer()") - -if [[ debug_or_sanitizer_build -eq 1 ]]; then tables=100; count_multiplier=10; max_time_ms=1500; fi - create_tables() { $CLICKHOUSE_CLIENT -q "WITH 'CREATE TABLE $db.table_$1_' AS create1, diff --git a/tests/queries/0_stateless/01297_create_quota.reference b/tests/queries/0_stateless/01297_create_quota.reference index 308bbf79024..456c9fc56bb 100644 --- a/tests/queries/0_stateless/01297_create_quota.reference +++ b/tests/queries/0_stateless/01297_create_quota.reference @@ -57,10 +57,10 @@ q2_01297 local_directory [] [5259492] 0 ['r1_01297','u1_01297'] [] q3_01297 local_directory ['client_key','user_name'] [5259492,15778476] 0 [] [] q4_01297 local_directory [] [604800] 1 [] ['u1_01297'] -- system.quota_limits -q2_01297 5259492 0 100 \N \N 11 1000 10000 1001 10001 2.5 \N -q3_01297 5259492 0 \N \N \N \N 1002 \N \N \N \N \N -q3_01297 15778476 0 100 \N \N 11 \N \N \N \N \N \N -q4_01297 604800 0 \N \N \N \N \N \N \N \N \N \N +q2_01297 5259492 0 100 \N \N 11 1000 10000 1001 10001 2.5 \N \N +q3_01297 5259492 0 \N \N \N \N 1002 \N \N \N \N \N \N +q3_01297 15778476 0 100 \N \N 11 \N \N \N \N \N \N \N +q4_01297 604800 0 \N \N \N \N \N \N \N \N \N \N \N -- query_selects query_inserts CREATE QUOTA q1_01297 KEYED BY user_name FOR INTERVAL 1 minute MAX query_selects = 1 TO r1_01297 CREATE QUOTA q2_01297 KEYED BY user_name FOR INTERVAL 1 minute MAX query_inserts = 1 TO r1_01297 diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh index 209e18e3329..379f83c6271 100755 --- a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh +++ 
b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh @@ -8,6 +8,11 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh +# This test does many invocations of clickhouse-client in a loop, +# leading to "Too many parts" in the system.coverage_log, +# but we are not interested in client-side coverage here. +unset CLICKHOUSE_WRITE_COVERAGE + NUM_REPLICAS=10 for i in $(seq 1 $NUM_REPLICAS); do diff --git a/tests/queries/0_stateless/01465_ttl_recompression.reference b/tests/queries/0_stateless/01465_ttl_recompression.reference index 108df565669..90661a5dc78 100644 --- a/tests/queries/0_stateless/01465_ttl_recompression.reference +++ b/tests/queries/0_stateless/01465_ttl_recompression.reference @@ -13,9 +13,9 @@ CREATE TABLE default.recompression_table\n(\n `dt` DateTime,\n `key` UInt6 1_1_1 LZ4 2_2_2 ZSTD(12) 3_3_3 ZSTD(12) -1_1_1 ['plus(dt, toIntervalDay(1))'] -2_2_2 ['plus(dt, toIntervalDay(1))'] -3_3_3 ['plus(dt, toIntervalDay(1))'] +1_1_1 ['dt + toIntervalDay(1)'] +2_2_2 ['dt + toIntervalDay(1)'] +3_3_3 ['dt + toIntervalDay(1)'] 1_1_1 LZ4 2_2_2 LZ4 3_3_3 LZ4 diff --git a/tests/queries/0_stateless/01557_field_infinite_convert_to_number.sql b/tests/queries/0_stateless/01557_field_infinite_convert_to_number.sql index edc4d5cbc91..cc71c8e6f6c 100644 --- a/tests/queries/0_stateless/01557_field_infinite_convert_to_number.sql +++ b/tests/queries/0_stateless/01557_field_infinite_convert_to_number.sql @@ -1 +1 @@ -SET max_threads = nan; -- { serverError 70 } +SET max_threads = nan; -- { serverError CANNOT_CONVERT_TYPE } diff --git a/tests/queries/0_stateless/01595_countMatches.reference b/tests/queries/0_stateless/01595_countMatches.reference index c65279c0b8e..394c8508430 100644 --- a/tests/queries/0_stateless/01595_countMatches.reference +++ b/tests/queries/0_stateless/01595_countMatches.reference @@ -12,6 +12,7 @@ case sensitive 2 4 4 +2 case insensitive 2 1 @@ -21,4 +22,8 @@ case insensitive 2 4 4 +2 errors +FixedString +2 +2 diff --git a/tests/queries/0_stateless/01595_countMatches.sql b/tests/queries/0_stateless/01595_countMatches.sql index 0b170945d44..0c2982572cd 100644 --- a/tests/queries/0_stateless/01595_countMatches.sql +++ b/tests/queries/0_stateless/01595_countMatches.sql @@ -14,6 +14,7 @@ select countMatches(concat(toString(number), 'foofoo'), 'foo') from numbers(2); select countMatches('foobarbazfoobarbaz', 'foo(bar)(?:baz|)'); select countMatches('foo.com bar.com baz.com bam.com', '([^. ]+)\.([^. ]+)'); select countMatches('foo.com@foo.com bar.com@foo.com baz.com@foo.com bam.com@foo.com', '([^. ]+)\.([^. ]+)@([^. ]+)\.([^. ]+)'); +select countMatches(materialize('foobarfoo'), 'foo'); select 'case insensitive'; select countMatchesCaseInsensitive('foobarfoo', 'FOo'); @@ -23,7 +24,13 @@ select countMatchesCaseInsensitive(concat(toString(number), 'Foofoo'), 'foo') fr select countMatchesCaseInsensitive('foOBarBAZfoobarbaz', 'foo(bar)(?:baz|)'); select countMatchesCaseInsensitive('foo.com BAR.COM baz.com bam.com', '([^. ]+)\.([^. ]+)'); select countMatchesCaseInsensitive('foo.com@foo.com bar.com@foo.com BAZ.com@foo.com bam.com@foo.com', '([^. ]+)\.([^. ]+)@([^. ]+)\.([^. 
]+)'); +select countMatchesCaseInsensitive(materialize('foobarfoo'), 'FOo'); select 'errors'; -select countMatches(1, 'foo') from numbers(1); -- { serverError 43 } -select countMatches('foobarfoo', toString(number)) from numbers(1); -- { serverError 44 } +select countMatches(1, 'foo') from numbers(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +select countMatches('foobarfoo', toString(number)) from numbers(1); -- { serverError ILLEGAL_COLUMN } +select countMatches('foo', materialize('foo')); -- { serverError ILLEGAL_COLUMN } + +select 'FixedString'; +select countMatches(toFixedString('foobarfoo', 9), 'foo'); +select countMatches(materialize(toFixedString('foobarfoo', 9)), 'foo'); diff --git a/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql index c4f26a079f0..dc1e5b37050 100644 --- a/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql +++ b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql @@ -1,2 +1,2 @@ -SELECT intDiv(9223372036854775807, 0.9998999834060669); -- { serverError 153 } -SELECT intDiv(9223372036854775807, 1.); -- { serverError 153 } +SELECT intDiv(18446744073709551615, 0.9998999834060669); -- { serverError 153 } +SELECT intDiv(18446744073709551615, 1.); -- { serverError 153 } diff --git a/tests/queries/0_stateless/01739_index_hint.reference b/tests/queries/0_stateless/01739_index_hint.reference index 21673bf698b..21f4edc0049 100644 --- a/tests/queries/0_stateless/01739_index_hint.reference +++ b/tests/queries/0_stateless/01739_index_hint.reference @@ -35,6 +35,9 @@ SELECT count() FROM XXXX WHERE indexHint(t = toDateTime(0)) SETTINGS optimize_us drop table XXXX; CREATE TABLE XXXX (p Nullable(Int64), k Decimal(76, 39)) ENGINE = MergeTree PARTITION BY toDate(p) ORDER BY k SETTINGS index_granularity = 1, allow_nullable_key = 1; INSERT INTO XXXX FORMAT Values ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3); -SELECT count() FROM XXXX WHERE indexHint(p = 1.) SETTINGS optimize_use_implicit_projections = 1; +SELECT count() FROM XXXX WHERE indexHint(p = 1.) SETTINGS optimize_use_implicit_projections = 1, allow_experimental_analyzer=0; 0 +-- TODO: optimize_use_implicit_projections ignores indexHint (with analyzer) because source columns might be aliased. +SELECT count() FROM XXXX WHERE indexHint(p = 1.) SETTINGS optimize_use_implicit_projections = 1, allow_experimental_analyzer=1; +3 drop table XXXX; diff --git a/tests/queries/0_stateless/01739_index_hint.sql b/tests/queries/0_stateless/01739_index_hint.sql index cde46a5a2bf..1eca65f0892 100644 --- a/tests/queries/0_stateless/01739_index_hint.sql +++ b/tests/queries/0_stateless/01739_index_hint.sql @@ -38,6 +38,8 @@ CREATE TABLE XXXX (p Nullable(Int64), k Decimal(76, 39)) ENGINE = MergeTree PART INSERT INTO XXXX FORMAT Values ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3); -SELECT count() FROM XXXX WHERE indexHint(p = 1.) SETTINGS optimize_use_implicit_projections = 1; +SELECT count() FROM XXXX WHERE indexHint(p = 1.) SETTINGS optimize_use_implicit_projections = 1, allow_experimental_analyzer=0; +-- TODO: optimize_use_implicit_projections ignores indexHint (with analyzer) because source columns might be aliased. +SELECT count() FROM XXXX WHERE indexHint(p = 1.) 
SETTINGS optimize_use_implicit_projections = 1, allow_experimental_analyzer=1; drop table XXXX; diff --git a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference index c8045dd26f5..80bd7dfd8c0 100644 --- a/tests/queries/0_stateless/01763_filter_push_down_bugs.reference +++ b/tests/queries/0_stateless/01763_filter_push_down_bugs.reference @@ -28,7 +28,7 @@ Expression ((Projection + Before ORDER BY)) Expression ((Project names + Projection)) Filter ((WHERE + DROP unused columns after JOIN)) Join (JOIN FillRightFirst) - Expression (Change column names to column identifiers) + Expression ReadFromMergeTree (default.t1) Indexes: PrimaryKey diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.reference b/tests/queries/0_stateless/01786_explain_merge_tree.reference index 096090f8fa1..fd1bc713b08 100644 --- a/tests/queries/0_stateless/01786_explain_merge_tree.reference +++ b/tests/queries/0_stateless/01786_explain_merge_tree.reference @@ -3,21 +3,18 @@ MinMax Keys: y - Condition: (y in [1, +Inf)) Parts: 4/5 Granules: 11/12 Partition Keys: y bitAnd(z, 3) - Condition: and((y in [1, +Inf)), (bitAnd(z, 3) not in [1, 1])) Parts: 3/4 Granules: 10/11 PrimaryKey Keys: x y - Condition: and((x in [11, +Inf)), (y in [1, +Inf))) Parts: 2/3 Granules: 6/10 Skip @@ -37,7 +34,6 @@ { "Type": "MinMax", "Keys": ["y"], - "Condition": "(y in [1, +Inf))", "Initial Parts": 5, "Selected Parts": 4, "Initial Granules": 12, @@ -46,7 +42,6 @@ { "Type": "Partition", "Keys": ["y", "bitAnd(z, 3)"], - "Condition": "and((y in [1, +Inf)), (bitAnd(z, 3) not in [1, 1]))", "Initial Parts": 4, "Selected Parts": 3, "Initial Granules": 11, @@ -55,7 +50,6 @@ { "Type": "PrimaryKey", "Keys": ["x", "y"], - "Condition": "and((x in [11, +Inf)), (y in [1, +Inf)))", "Initial Parts": 3, "Selected Parts": 2, "Initial Granules": 10, @@ -109,21 +103,18 @@ MinMax Keys: y - Condition: (y in [1, +Inf)) Parts: 4/5 Granules: 11/12 Partition Keys: y bitAnd(z, 3) - Condition: and((y in [1, +Inf)), (bitAnd(z, 3) not in [1, 1])) Parts: 3/4 Granules: 10/11 PrimaryKey Keys: x y - Condition: and((x in [11, +Inf)), (y in [1, +Inf))) Parts: 2/3 Granules: 6/10 Skip @@ -138,7 +129,6 @@ { "Type": "MinMax", "Keys": ["y"], - "Condition": "(y in [1, +Inf))", "Initial Parts": 5, "Selected Parts": 4, "Initial Granules": 12, @@ -147,7 +137,6 @@ { "Type": "Partition", "Keys": ["y", "bitAnd(z, 3)"], - "Condition": "and((y in [1, +Inf)), (bitAnd(z, 3) not in [1, 1]))", "Initial Parts": 4, "Selected Parts": 3, "Initial Granules": 11, @@ -156,7 +145,6 @@ { "Type": "PrimaryKey", "Keys": ["x", "y"], - "Condition": "and((x in [11, +Inf)), (y in [1, +Inf)))", "Initial Parts": 3, "Selected Parts": 2, "Initial Granules": 10, diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.sh b/tests/queries/0_stateless/01786_explain_merge_tree.sh index 23537013204..e3b28acdc41 100755 --- a/tests/queries/0_stateless/01786_explain_merge_tree.sh +++ b/tests/queries/0_stateless/01786_explain_merge_tree.sh @@ -17,13 +17,13 @@ do $CH_CLIENT -q " explain indexes = 1 select *, _part from test_index where t % 19 = 16 and y > 0 and bitAnd(z, 3) != 1 and x > 10 and t % 20 > 14; - " | grep -A 100 "ReadFromMergeTree" # | grep -v "Description" + " | grep -A 100 "ReadFromMergeTree" | grep -v "Condition" echo "-----------------" $CH_CLIENT -q " explain indexes = 1, json = 1 select *, _part from test_index where t % 19 = 16 and y > 0 and bitAnd(z, 3) != 1 and x > 10 and t % 20 > 14 format TSVRaw; - " | grep -A 100 
"ReadFromMergeTree" # | grep -v "Description" + " | grep -A 100 "ReadFromMergeTree" | grep -v "Condition" echo "-----------------" diff --git a/tests/queries/0_stateless/01798_having_push_down.sql b/tests/queries/0_stateless/01798_having_push_down.sql index b3a77c8f5b5..c0c3447f5ab 100644 --- a/tests/queries/0_stateless/01798_having_push_down.sql +++ b/tests/queries/0_stateless/01798_having_push_down.sql @@ -8,11 +8,12 @@ SELECT sum(c0 = 0), min(c0 + 1), sum(c0 + 2) FROM t_having GROUP BY c0 HAVING c0 = 0 SETTINGS enable_optimize_predicate_expression=0; +SET enable_positional_arguments=0; + SELECT c0 + -1, sum(intDivOrZero(intDivOrZero(NULL, NULL), '2'), intDivOrZero(10000000000., intDivOrZero(intDivOrZero(intDivOrZero(NULL, NULL), 10), NULL))) FROM t_having GROUP BY c0 = 2, c0 = 10, intDivOrZero(intDivOrZero(intDivOrZero(NULL, NULL), NULL), NULL), c0 HAVING c0 = 2 SETTINGS enable_optimize_predicate_expression = 0; SELECT sum(c0 + 257) FROM t_having GROUP BY c0 = -9223372036854775808, NULL, -2147483649, c0 HAVING c0 = -9223372036854775808 SETTINGS enable_optimize_predicate_expression = 0; -SET enable_positional_arguments=0; SELECT c0 + -2, c0 + -9223372036854775807, c0 = NULL FROM t_having GROUP BY c0 = 0.9998999834060669, 1023, c0 HAVING c0 = 0.9998999834060669 SETTINGS enable_optimize_predicate_expression = 0; DROP TABLE t_having; diff --git a/tests/queries/0_stateless/01852_dictionary_found_rate_long.sql b/tests/queries/0_stateless/01852_dictionary_found_rate_long.sql index 09ca0e2063d..d5108e98510 100644 --- a/tests/queries/0_stateless/01852_dictionary_found_rate_long.sql +++ b/tests/queries/0_stateless/01852_dictionary_found_rate_long.sql @@ -22,7 +22,7 @@ CREATE DICTIONARY simple_key_flat_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); @@ -43,7 +43,7 @@ CREATE DICTIONARY simple_key_direct_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(DIRECT()); -- check that found_rate is 0, not nan @@ -65,7 +65,7 @@ CREATE DICTIONARY simple_key_hashed_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(HASHED()) LIFETIME(MIN 0 MAX 1000); @@ -85,7 +85,7 @@ CREATE DICTIONARY simple_key_sparse_hashed_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(SPARSE_HASHED()) LIFETIME(MIN 0 MAX 1000); @@ -105,7 +105,7 @@ CREATE DICTIONARY simple_key_cache_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(CACHE(SIZE_IN_CELLS 100000)) LIFETIME(MIN 0 MAX 1000); @@ -143,7 +143,7 @@ CREATE DICTIONARY complex_key_hashed_dictionary_01862 value String ) PRIMARY KEY id, id_key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'complex_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'complex_key_source_table_01862')) LAYOUT(COMPLEX_KEY_HASHED()) LIFETIME(MIN 0 MAX 1000); @@ -164,7 +164,7 @@ CREATE DICTIONARY 
complex_key_direct_dictionary_01862 value String ) PRIMARY KEY id, id_key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'complex_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'complex_key_source_table_01862')) LAYOUT(COMPLEX_KEY_DIRECT()); SELECT name, found_rate FROM system.dictionaries WHERE database = currentDatabase() AND name = 'complex_key_direct_dictionary_01862'; @@ -184,7 +184,7 @@ CREATE DICTIONARY complex_key_cache_dictionary_01862 value String ) PRIMARY KEY id, id_key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'complex_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'complex_key_source_table_01862')) LAYOUT(COMPLEX_KEY_CACHE(SIZE_IN_CELLS 100000)) LIFETIME(MIN 0 MAX 1000); @@ -223,7 +223,7 @@ CREATE DICTIONARY simple_key_range_hashed_dictionary_01862 last Date ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'range_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'range_key_source_table_01862')) LAYOUT(RANGE_HASHED()) RANGE(MIN first MAX last) LIFETIME(MIN 0 MAX 1000); @@ -259,13 +259,16 @@ CREATE DICTIONARY ip_trie_dictionary_01862 value String ) PRIMARY KEY prefix -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'ip_trie_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'ip_trie_source_table_01862')) LAYOUT(IP_TRIE()) LIFETIME(MIN 0 MAX 1000); +-- found_rate = 0, because we didn't make any searches. SELECT name, found_rate FROM system.dictionaries WHERE database = currentDatabase() AND name = 'ip_trie_dictionary_01862'; +-- found_rate = 1, because the dictionary covers the 127.0.0.1 address. SELECT dictGet('ip_trie_dictionary_01862', 'value', tuple(toIPv4('127.0.0.1'))) FORMAT Null; SELECT name, found_rate FROM system.dictionaries WHERE database = currentDatabase() AND name = 'ip_trie_dictionary_01862'; +-- found_rate = 0.5, because the dictionary does not cover 1.1.1.1 and we have two lookups in total as of now. 
SELECT dictGet('ip_trie_dictionary_01862', 'value', tuple(toIPv4('1.1.1.1'))) FORMAT Null; SELECT name, found_rate FROM system.dictionaries WHERE database = currentDatabase() AND name = 'ip_trie_dictionary_01862'; @@ -299,7 +302,7 @@ CREATE DICTIONARY polygon_dictionary_01862 name String ) PRIMARY KEY key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'polygons_01862')) +SOURCE(CLICKHOUSE(USER 'default' TABLE 'polygons_01862')) LIFETIME(0) LAYOUT(POLYGON()); diff --git a/tests/queries/0_stateless/01889_sqlite_read_write.reference b/tests/queries/0_stateless/01889_sqlite_read_write.reference index 9f2b382e41e..e605693d95d 100644 --- a/tests/queries/0_stateless/01889_sqlite_read_write.reference +++ b/tests/queries/0_stateless/01889_sqlite_read_write.reference @@ -29,7 +29,7 @@ CREATE TABLE default.sqlite_table3\n(\n `col1` String,\n `col2` Int32\n)\n not a null 2 3 4 -line6 6 +line\'6 6 7 test table function line1 1 diff --git a/tests/queries/0_stateless/01889_sqlite_read_write.sh b/tests/queries/0_stateless/01889_sqlite_read_write.sh index 02b9a649e94..fd0a1df20ac 100755 --- a/tests/queries/0_stateless/01889_sqlite_read_write.sh +++ b/tests/queries/0_stateless/01889_sqlite_read_write.sh @@ -76,7 +76,7 @@ ${CLICKHOUSE_CLIENT} --query='DROP TABLE IF EXISTS sqlite_table3' ${CLICKHOUSE_CLIENT} --query="CREATE TABLE sqlite_table3 (col1 String, col2 Int32) ENGINE = SQLite('${DB_PATH}', 'table3')" ${CLICKHOUSE_CLIENT} --query='SHOW CREATE TABLE sqlite_table3;' | sed -r 's/(.*SQLite)(.*)/\1/' -${CLICKHOUSE_CLIENT} --query="INSERT INTO sqlite_table3 VALUES ('line6', 6);" +${CLICKHOUSE_CLIENT} --query="INSERT INTO sqlite_table3 VALUES ('line\'6', 6);" ${CLICKHOUSE_CLIENT} --query="INSERT INTO sqlite_table3 VALUES (NULL, 7);" ${CLICKHOUSE_CLIENT} --query='SELECT * FROM sqlite_table3 ORDER BY col2' diff --git a/tests/queries/0_stateless/01920_not_chain_format.reference b/tests/queries/0_stateless/01920_not_chain_format.reference index 22abfd17dc7..bb58a0ff146 100644 --- a/tests/queries/0_stateless/01920_not_chain_format.reference +++ b/tests/queries/0_stateless/01920_not_chain_format.reference @@ -1,5 +1,5 @@ -- { echo } EXPLAIN SYNTAX SELECT NOT NOT (NOT (NOT (NULL))); -SELECT NOT (NOT (NOT NOT NULL)) +SELECT NOT (NOT (NOT (NOT NULL))) EXPLAIN SYNTAX SELECT NOT (NOT (NOT NOT NULL)); -SELECT NOT (NOT (NOT NOT NULL)) +SELECT NOT (NOT (NOT (NOT NULL))) diff --git a/tests/queries/0_stateless/01921_not_chain.reference b/tests/queries/0_stateless/01921_not_chain.reference index c29c66f1274..ebd18f4b342 100644 --- a/tests/queries/0_stateless/01921_not_chain.reference +++ b/tests/queries/0_stateless/01921_not_chain.reference @@ -4,6 +4,6 @@ SELECT 1 != (NOT 1); SELECT 1 != NOT 1; 1 EXPLAIN SYNTAX SELECT 1 != (NOT 1); -SELECT 1 != NOT 1 +SELECT 1 != (NOT 1) EXPLAIN SYNTAX SELECT 1 != NOT 1; -SELECT 1 != NOT 1 +SELECT 1 != (NOT 1) diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.reference b/tests/queries/0_stateless/02006_test_positional_arguments.reference index 40100e8d5be..079bd071103 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.reference +++ b/tests/queries/0_stateless/02006_test_positional_arguments.reference @@ -3,18 +3,50 @@ select x3, x2, x1 from test order by 1; 1 100 100 10 1 10 100 10 1 +select x3, x2, x1 from test order by -3; +1 100 100 +10 1 10 +100 10 1 select x3, x2, x1 from test order by x3; 1 100 100 10 1 10 100 10 1 +select x3, x2, x1 from test order by 3; +100 10 1 +10 1 10 +1 100 100 +select x3, x2, x1 from test order 
by -1; +100 10 1 +10 1 10 +1 100 100 +select x3, x2, x1 from test order by x1; +100 10 1 +10 1 10 +1 100 100 select x3, x2, x1 from test order by 1 desc; 100 10 1 10 1 10 1 100 100 +select x3, x2, x1 from test order by -3 desc; +100 10 1 +10 1 10 +1 100 100 select x3, x2, x1 from test order by x3 desc; 100 10 1 10 1 10 1 100 100 +select x3, x2, x1 from test order by 3 desc; +1 100 100 +10 1 10 +100 10 1 +select x3, x2, x1 from test order by -1 desc; +1 100 100 +10 1 10 +100 10 1 +select x3, x2, x1 from test order by x1 desc; +1 100 100 +10 1 10 +100 10 1 insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x3, x2 from test group by x3, x2 order by x3; 1 100 @@ -54,6 +86,20 @@ SELECT x1 FROM test ORDER BY x3 + 1 ASC +explain syntax select x3, x2, x1 from test order by -1; +SELECT + x3, + x2, + x1 +FROM test +ORDER BY x1 ASC +explain syntax select x3 + 1, x2, x1 from test order by -1; +SELECT + x3 + 1, + x2, + x1 +FROM test +ORDER BY x1 ASC explain syntax select x3, x3 - x2, x2, x1 from test order by 2; SELECT x3, @@ -62,6 +108,14 @@ SELECT x1 FROM test ORDER BY x3 - x2 ASC +explain syntax select x3, x3 - x2, x2, x1 from test order by -2; +SELECT + x3, + x3 - x2, + x2, + x1 +FROM test +ORDER BY x2 ASC explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by 2; SELECT x3, @@ -69,12 +123,28 @@ SELECT x1 + x2 FROM test ORDER BY if(x3 > 10, x3, x1 + x2) ASC +explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by -2; +SELECT + x3, + if(x3 > 10, x3, x1 + x2), + x1 + x2 +FROM test +ORDER BY if(x3 > 10, x3, x1 + x2) ASC explain syntax select max(x1), x2 from test group by 2 order by 1, 2; SELECT max(x1), x2 FROM test GROUP BY x2 +ORDER BY + max(x1) ASC, + x2 ASC +explain syntax select max(x1), x2 from test group by -1 order by -2, -1; +SELECT + max(x1), + x2 +FROM test +GROUP BY x2 ORDER BY max(x1) ASC, x2 ASC @@ -83,16 +153,34 @@ SELECT 1 + greatest(x1, 1), x2 FROM test +GROUP BY + 1 + greatest(x1, 1), + x2 +explain syntax select 1 + greatest(x1, 1), x2 from test group by -2, -1; +SELECT + 1 + greatest(x1, 1), + x2 +FROM test GROUP BY 1 + greatest(x1, 1), x2 select max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } select 1 + max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } +select max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } +select 1 + max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } explain syntax select x1 + x3, x3 from test group by 1, 2; SELECT x1 + x3, x3 FROM test +GROUP BY + x1 + x3, + x3 +explain syntax select x1 + x3, x3 from test group by -2, -1; +SELECT + x1 + x3, + x3 +FROM test GROUP BY x1 + x3, x3 @@ -102,8 +190,14 @@ select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, 1 2 10 100 10 20 1 10 100 200 100 1 +select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, -1 desc, -2 asc; +1 2 10 100 +10 20 1 10 +100 200 100 1 select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,4,5,6 order by a; 44 88 13 14 15 16 +select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,-3,-2,-1 order by a; +44 88 13 14 15 16 explain syntax select plus(1, 1) as a group by a; SELECT 1 + 1 AS a GROUP BY a diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.sql b/tests/queries/0_stateless/02006_test_positional_arguments.sql index 159ad6bd427..6f427e0298d 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.sql +++ 
b/tests/queries/0_stateless/02006_test_positional_arguments.sql @@ -9,11 +9,21 @@ insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); -- { echo } select x3, x2, x1 from test order by 1; +select x3, x2, x1 from test order by -3; select x3, x2, x1 from test order by x3; +select x3, x2, x1 from test order by 3; +select x3, x2, x1 from test order by -1; +select x3, x2, x1 from test order by x1; + select x3, x2, x1 from test order by 1 desc; +select x3, x2, x1 from test order by -3 desc; select x3, x2, x1 from test order by x3 desc; +select x3, x2, x1 from test order by 3 desc; +select x3, x2, x1 from test order by -1 desc; +select x3, x2, x1 from test order by x1 desc; + insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x3, x2 from test group by x3, x2 order by x3; select x3, x2 from test group by 1, 2 order by x3; @@ -25,21 +35,32 @@ select x1, x2, x3 from test order by 3 limit 1 by 1; explain syntax select x3, x2, x1 from test order by 1; explain syntax select x3 + 1, x2, x1 from test order by 1; +explain syntax select x3, x2, x1 from test order by -1; +explain syntax select x3 + 1, x2, x1 from test order by -1; explain syntax select x3, x3 - x2, x2, x1 from test order by 2; +explain syntax select x3, x3 - x2, x2, x1 from test order by -2; explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by 2; +explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by -2; explain syntax select max(x1), x2 from test group by 2 order by 1, 2; +explain syntax select max(x1), x2 from test group by -1 order by -2, -1; explain syntax select 1 + greatest(x1, 1), x2 from test group by 1, 2; +explain syntax select 1 + greatest(x1, 1), x2 from test group by -2, -1; select max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } select 1 + max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } +select max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } +select 1 + max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } explain syntax select x1 + x3, x3 from test group by 1, 2; +explain syntax select x1 + x3, x3 from test group by -2, -1; create table test2(x1 Int, x2 Int, x3 Int) engine=Memory; insert into test2 values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, 4 desc, 3 asc; +select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, -1 desc, -2 asc; select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,4,5,6 order by a; +select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,-3,-2,-1 order by a; explain syntax select plus(1, 1) as a group by a; select substr('aaaaaaaaaaaaaa', 8) as a group by a order by a; diff --git a/tests/queries/0_stateless/02008_materialize_column.sql b/tests/queries/0_stateless/02008_materialize_column.sql index a78920d2525..cc7d3096402 100644 --- a/tests/queries/0_stateless/02008_materialize_column.sql +++ b/tests/queries/0_stateless/02008_materialize_column.sql @@ -17,6 +17,7 @@ ALTER TABLE tmp MATERIALIZE COLUMN s; ALTER TABLE tmp MODIFY COLUMN s String DEFAULT toString(x+2); SELECT arraySort(groupArray(x)), groupArray(s) FROM tmp; +ALTER TABLE tmp CLEAR COLUMN s; -- Need to clear because MATERIALIZE COLUMN won't override past values; ALTER TABLE tmp MATERIALIZE COLUMN s; ALTER TABLE tmp MODIFY COLUMN s String DEFAULT toString(x+3); SELECT arraySort(groupArray(x)), groupArray(s) FROM tmp; 
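The 02006_test_positional_arguments changes above extend positional arguments to negative indexes, which count back from the end of the SELECT list, so -1 refers to the last selected expression. A minimal standalone sketch of that behaviour (illustrative only, not part of the patch; the table name is made up and it assumes the enable_positional_arguments setting is enabled, as in the test above):
create table pos_demo (x1 Int, x2 Int, x3 Int) engine = Memory;
insert into pos_demo values (1, 10, 100), (10, 1, 10), (100, 100, 1);
select x3, x2, x1 from pos_demo order by -1;                  -- resolves to ORDER BY x1
select max(x1), x2 from pos_demo group by -1 order by -2, -1; -- resolves to GROUP BY x2 ORDER BY max(x1), x2
drop table pos_demo;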
diff --git a/tests/queries/0_stateless/02015_async_inserts_2.sh b/tests/queries/0_stateless/02015_async_inserts_2.sh index 48523ccd9a9..606d4cc37b6 100755 --- a/tests/queries/0_stateless/02015_async_inserts_2.sh +++ b/tests/queries/0_stateless/02015_async_inserts_2.sh @@ -5,7 +5,9 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -url="${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1&async_insert_busy_timeout_ms=600000&async_insert_max_query_number=3&async_insert_deduplicate=1" +# With adaptive timeout enabled, the asynchronous queue can be flushed synchronously, depending on the time elapsed since the last insert. +# This may result in test flakiness. +url="${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1&async_insert_busy_timeout_ms=600000&async_insert_max_query_number=3&async_insert_deduplicate=1&async_insert_use_adaptive_busy_timeout=0" ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS async_inserts" ${CLICKHOUSE_CLIENT} -q "CREATE TABLE async_inserts (id UInt32, s String) ENGINE = MergeTree ORDER BY id" diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index e89d589857e..e60fb844de8 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -686,6 +686,9 @@ CREATE TABLE system.projection_parts `rows_where_ttl_info.expression` Array(String), `rows_where_ttl_info.min` Array(DateTime), `rows_where_ttl_info.max` Array(DateTime), + `is_broken` UInt8, + `exception_code` Int32, + `exception` String, `bytes` UInt64 ALIAS bytes_on_disk, `marks_size` UInt64 ALIAS marks_bytes, `part_name` String ALIAS name @@ -762,7 +765,8 @@ CREATE TABLE system.quota_limits `max_read_rows` Nullable(UInt64), `max_read_bytes` Nullable(UInt64), `max_execution_time` Nullable(Float64), - `max_written_bytes` Nullable(UInt64) + `max_written_bytes` Nullable(UInt64), + `max_failed_sequential_authentications` Nullable(UInt64) ) ENGINE = SystemQuotaLimits COMMENT 'SYSTEM TABLE is built on the fly.' @@ -792,7 +796,9 @@ CREATE TABLE system.quota_usage `execution_time` Nullable(Float64), `max_execution_time` Nullable(Float64), `written_bytes` Nullable(UInt64), - `max_written_bytes` Nullable(UInt64) + `max_written_bytes` Nullable(UInt64), + `failed_sequential_authentications` Nullable(UInt64), + `max_failed_sequential_authentications` Nullable(UInt64) ) ENGINE = SystemQuotaUsage COMMENT 'SYSTEM TABLE is built on the fly.' @@ -836,7 +842,9 @@ CREATE TABLE system.quotas_usage `execution_time` Nullable(Float64), `max_execution_time` Nullable(Float64), `written_bytes` Nullable(UInt64), - `max_written_bytes` Nullable(UInt64) + `max_written_bytes` Nullable(UInt64), + `failed_sequential_authentications` Nullable(UInt64), + `max_failed_sequential_authentications` Nullable(UInt64) ) ENGINE = SystemQuotasUsage COMMENT 'SYSTEM TABLE is built on the fly.'
@@ -1075,6 +1083,7 @@ CREATE TABLE system.tables `data_paths` Array(String), `metadata_path` String, `metadata_modification_time` DateTime, + `metadata_version` Int32, `dependencies_database` Array(String), `dependencies_table` Array(String), `create_table_query` String, diff --git a/tests/queries/0_stateless/02117_show_create_table_system.sql b/tests/queries/0_stateless/02117_show_create_table_system.sql index 32465abbed7..438f26dcca7 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.sql +++ b/tests/queries/0_stateless/02117_show_create_table_system.sql @@ -1,6 +1,6 @@ /* we will `use system` to bypass style check, because `show create table` statement -cannot fit the requirement in check-sytle, which is as +cannot fit the requirement in check-style, which is as "# Queries to: tables_with_database_column=( diff --git a/tests/queries/0_stateless/02134_async_inserts_formats.sh b/tests/queries/0_stateless/02134_async_inserts_formats.sh index 631809e5dc2..89705bf6415 100755 --- a/tests/queries/0_stateless/02134_async_inserts_formats.sh +++ b/tests/queries/0_stateless/02134_async_inserts_formats.sh @@ -4,7 +4,9 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -url="${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1" +# With adaptive timeout enabled, the asynchronous queue can be flushed synchronously, depending on the time elapsed since the last insert. +# This may result in test flakiness. +url="${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1&async_insert_use_adaptive_busy_timeout=0" ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS async_inserts" ${CLICKHOUSE_CLIENT} -q "CREATE TABLE async_inserts (id UInt32, s String) ENGINE = MergeTree ORDER BY id" diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference index f2a4ef1f634..f3415a34823 100644 --- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference +++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference @@ -76,8 +76,7 @@ ExpressionTransform (Expression) ExpressionTransform (ReadFromMergeTree) - ExpressionTransform - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) 0 → 1 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) 0 → 1 2020-10-11 0 0 2020-10-11 0 10 2020-10-11 0 20 @@ -106,8 +105,7 @@ ExpressionTransform (Expression) ExpressionTransform (ReadFromMergeTree) - ExpressionTransform - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) 0 → 1 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) 0 → 1 2020-10-12 0 2020-10-12 1 2020-10-12 2 @@ -140,9 +138,8 @@ ExpressionTransform (Expression) ExpressionTransform (ReadFromMergeTree) - ExpressionTransform - ReverseTransform - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InReverseOrder) 0 → 1 + ReverseTransform + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InReverseOrder) 0 → 1 2020-10-12 99999 2020-10-12 99998 2020-10-12 99997 diff --git a/tests/queries/0_stateless/02185_orc_corrupted_file.sh b/tests/queries/0_stateless/02185_orc_corrupted_file.sh index 1987f094faa..12510ae3836 100755 --- a/tests/queries/0_stateless/02185_orc_corrupted_file.sh +++ b/tests/queries/0_stateless/02185_orc_corrupted_file.sh @@ -8,4 +8,4 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk
'{gsub("/nonexist.txt","",$9); print $9}') cp $CUR_DIR/data_orc/corrupted.orc $USER_FILES_PATH/ -${CLICKHOUSE_CLIENT} --query="select * from file('corrupted.orc')" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +${CLICKHOUSE_CLIENT} --query="select * from file('corrupted.orc')" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' diff --git a/tests/queries/0_stateless/02235_add_part_offset_virtual_column.sql b/tests/queries/0_stateless/02235_add_part_offset_virtual_column.sql index dc8fceddc52..73ae6eb499f 100644 --- a/tests/queries/0_stateless/02235_add_part_offset_virtual_column.sql +++ b/tests/queries/0_stateless/02235_add_part_offset_virtual_column.sql @@ -52,7 +52,7 @@ SELECT _part_offset, foo FROM t_1 where granule == 0 AND _part_offset >= 100000 SELECT 'PREWHERE'; SELECT count(*), sum(_part_offset), sum(order_0) from t_1 prewhere granule == 0 where _part_offset >= 100000; -SELECT count(*), sum(_part_offset), sum(order_0) from t_1 prewhere _part != '' where granule == 0; -- { serverError 10 } -SELECT count(*), sum(_part_offset), sum(order_0) from t_1 prewhere _part_offset > 100000 where granule == 0; -- { serverError 10 } +SELECT count(*), sum(_part_offset), sum(order_0) from t_1 prewhere _part != '' where granule == 0; -- { serverError 10, 16 } +SELECT count(*), sum(_part_offset), sum(order_0) from t_1 prewhere _part_offset > 100000 where granule == 0; -- { serverError 10, 16 } SELECT _part_offset FROM t_1 PREWHERE order_0 % 10000 == 42 ORDER BY order_0 LIMIT 3; SELECT _part_offset, foo FROM t_1 PREWHERE order_0 % 10000 == 42 ORDER BY order_0 LIMIT 3; diff --git a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh index 954e2e83f27..8ff6e28b123 100755 --- a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh +++ b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh @@ -12,6 +12,6 @@ DATA_FILE=$USER_FILES_PATH/$FILE_NAME cp $CUR_DIR/data_parquet_bad_column/metadata_0.parquet $DATA_FILE -$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "Cannot extract table structure" && echo "OK" || echo "FAIL" +$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "CANNOT_EXTRACT_TABLE_STRUCTURE" && echo "OK" || echo "FAIL" $CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" $CLICKHOUSE_CLIENT -q "select count(*) from file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" diff --git a/tests/queries/0_stateless/02252_jit_profile_events.sql b/tests/queries/0_stateless/02252_jit_profile_events.sql index fbd6040c21c..fb7f806c46b 100644 --- a/tests/queries/0_stateless/02252_jit_profile_events.sql +++ b/tests/queries/0_stateless/02252_jit_profile_events.sql @@ -1,4 +1,4 @@ --- Tags: no-fasttest, no-ubsan, no-asan, no-msan, no-cpu-aarch64 +-- Tags: no-fasttest, no-parallel, no-cpu-aarch64, no-msan SET compile_expressions = 1; SET min_count_to_compile_expression = 0; diff --git a/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql b/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql index 5462d38f1a3..98bf29c32f5 100644 --- a/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql +++ b/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql @@ -1,7 +1,7 @@ -- Tags: no-fasttest insert 
into function file('02268_data.jsonl', 'TSV') select 1; -select * from file('02268_data.jsonl'); --{serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from file('02268_data.jsonl'); --{serverError CANNOT_EXTRACT_TABLE_STRUCTURE} insert into function file('02268_data.jsonCompactEachRow', 'TSV') select 1; -select * from file('02268_data.jsonCompactEachRow'); --{serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from file('02268_data.jsonCompactEachRow'); --{serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02282_array_distance.reference b/tests/queries/0_stateless/02282_array_distance.reference index 9758da9a833..c21e294cb62 100644 --- a/tests/queries/0_stateless/02282_array_distance.reference +++ b/tests/queries/0_stateless/02282_array_distance.reference @@ -80,3 +80,7 @@ nan 5 6 268 2 10.234459893824097 23.15167380558045 536 0.00007815428961455151 6 5 268 2 10.234459893824097 23.15167380558045 536 0.00007815428961455151 6 6 0 0 0 0 0 0 +5.8309517 +0.0003244877 +5.830951894845301 +0.0003245172890904424 diff --git a/tests/queries/0_stateless/02282_array_distance.sql b/tests/queries/0_stateless/02282_array_distance.sql index 9c16071dc1f..2cca853fd67 100644 --- a/tests/queries/0_stateless/02282_array_distance.sql +++ b/tests/queries/0_stateless/02282_array_distance.sql @@ -12,10 +12,10 @@ SELECT cosineDistance([1, 2, 3], [0, 0, 0]); -- Overflows WITH CAST([-547274980, 1790553898, 1981517754, 1908431500, 1352428565, -573412550, -552499284, 2096941042], 'Array(Int32)') AS a SELECT - L1Distance(a,a), - L2Distance(a,a), - L2SquaredDistance(a,a), - LinfDistance(a,a), + L1Distance(a, a), + L2Distance(a, a), + L2SquaredDistance(a, a), + LinfDistance(a, a), cosineDistance(a, a); DROP TABLE IF EXISTS vec1; @@ -88,15 +88,33 @@ SELECT FROM vec2f v1, vec2d v2 WHERE length(v1.v) == length(v2.v); -SELECT L1Distance([0, 0], [1]); -- { serverError 190 } -SELECT L2Distance([1, 2], (3,4)); -- { serverError 43 } -SELECT L2SquaredDistance([1, 2], (3,4)); -- { serverError 43 } -SELECT LpDistance([1, 2], [3,4]); -- { serverError 42 } -SELECT LpDistance([1, 2], [3,4], -1.); -- { serverError 69 } -SELECT LpDistance([1, 2], [3,4], 'aaa'); -- { serverError 43 } -SELECT LpDistance([1, 2], [3,4], materialize(2.7)); -- { serverError 44 } +SELECT L1Distance([0, 0], [1]); -- { serverError SIZES_OF_ARRAYS_DONT_MATCH } +SELECT L2Distance([1, 2], (3,4)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT L2SquaredDistance([1, 2], (3,4)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT LpDistance([1, 2], [3,4]); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT LpDistance([1, 2], [3,4], -1.); -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT LpDistance([1, 2], [3,4], 'aaa'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT LpDistance([1, 2], [3,4], materialize(2.7)); -- { serverError ILLEGAL_COLUMN } DROP TABLE vec1; DROP TABLE vec2; DROP TABLE vec2f; DROP TABLE vec2d; + +-- Queries which trigger manually vectorized implementation + +SELECT L2Distance( + [toFloat32(0.0), toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), 
toFloat32(31.0), toFloat32(32.0), toFloat32(33.0)], + materialize([toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0), toFloat32(34.0)])); + +SELECT cosineDistance( + [toFloat32(0.0), toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0)], + materialize([toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0), toFloat32(34.0)])); + +SELECT L2Distance( + [toFloat64(0.0), toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0)], + materialize([toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0), toFloat64(34.0)])); + +SELECT cosineDistance( + [toFloat64(0.0), toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), 
toFloat64(31.0), toFloat64(32.0), toFloat64(33.0)], + materialize([toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0), toFloat64(34.0)])); diff --git a/tests/queries/0_stateless/02286_mysql_dump_input_format.sh b/tests/queries/0_stateless/02286_mysql_dump_input_format.sh index a3711497ae8..2f6167c3ddf 100755 --- a/tests/queries/0_stateless/02286_mysql_dump_input_format.sh +++ b/tests/queries/0_stateless/02286_mysql_dump_input_format.sh @@ -23,7 +23,7 @@ $CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mys $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test', max_threads=1" $CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test2'" $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test2', max_threads=1" -$CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump, 'x Nullable(Int32)') settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'EMPTY_DATA_PASSED' && echo 'OK' || echo 'FAIL' echo "dump2" diff --git a/tests/queries/0_stateless/02293_formats_json_columns.sh b/tests/queries/0_stateless/02293_formats_json_columns.sh index ce35c4bd878..4eae5a1abb4 100755 --- a/tests/queries/0_stateless/02293_formats_json_columns.sh +++ b/tests/queries/0_stateless/02293_formats_json_columns.sh @@ -88,4 +88,4 @@ echo ' } ' > $DATA_FILE -$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns) settings input_format_max_rows_to_read_for_schema_inference=3, input_format_json_infer_incomplete_types_as_strings=0" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns) settings input_format_max_rows_to_read_for_schema_inference=3, input_format_json_infer_incomplete_types_as_strings=0" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' diff --git a/tests/queries/0_stateless/02322_sql_insert_format.sql b/tests/queries/0_stateless/02322_sql_insert_format.sql index 34cde1e56b6..ccceaee31d9 100644 --- a/tests/queries/0_stateless/02322_sql_insert_format.sql +++ b/tests/queries/0_stateless/02322_sql_insert_format.sql @@ -1,5 +1,7 @@ -- Tags: no-parallel +set schema_inference_use_cache_for_file=0; + select number as x, number % 3 as y, 'Hello' as z from numbers(5) format SQLInsert; select number as x, number % 3 as y, 'Hello' as z from numbers(5) format SQLInsert settings output_format_sql_insert_max_batch_size=1; select number as x, number % 3 as y, 'Hello' as z from numbers(5) format SQLInsert settings 
output_format_sql_insert_max_batch_size=2; diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh index dfc0dedeaf1..650faf6985e 100755 --- a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh @@ -15,11 +15,11 @@ mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR cp -r $CLIENT_SCHEMADIR/02327_* $SCHEMADIR/$SERVER_SCHEMADIR/ -$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; $CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; diff --git a/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql index 0ceed178865..a4a69f4fa40 100644 --- a/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql +++ b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql @@ -1,6 +1,7 @@ -- Tags: no-fasttest set input_format_try_infer_integers=1; +set input_format_try_infer_exponent_floats=1; select 'JSONEachRow'; desc format(JSONEachRow, '{"x" : 123}'); diff --git a/tests/queries/0_stateless/02346_inverted_index_mutation.reference b/tests/queries/0_stateless/02346_inverted_index_bug47393.reference similarity index 100% rename from tests/queries/0_stateless/02346_inverted_index_mutation.reference rename to tests/queries/0_stateless/02346_inverted_index_bug47393.reference diff 
--git a/tests/queries/0_stateless/02346_inverted_index_bug47393.sql b/tests/queries/0_stateless/02346_inverted_index_bug47393.sql new file mode 100644 index 00000000000..166e051b120 --- /dev/null +++ b/tests/queries/0_stateless/02346_inverted_index_bug47393.sql @@ -0,0 +1,25 @@ +SET allow_experimental_inverted_index = 1; + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab +( + id UInt64, + str String, + INDEX idx str TYPE inverted(3) GRANULARITY 1 +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS min_rows_for_wide_part = 1, min_bytes_for_wide_part = 1; + +INSERT INTO tab (str) VALUES ('I am inverted'); + +SELECT data_version FROM system.parts WHERE database = currentDatabase() AND table = 'tab' AND active = 1; + +-- update column synchronously +ALTER TABLE tab UPDATE str = 'I am not inverted' WHERE 1 SETTINGS mutations_sync=1; + +SELECT data_version FROM system.parts WHERE database = currentDatabase() AND table = 'tab' AND active = 1; + +SELECT str FROM tab WHERE str LIKE '%inverted%' SETTINGS force_data_skipping_indices = 'idx'; + +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02346_inverted_index_bug52019.reference b/tests/queries/0_stateless/02346_inverted_index_bug52019.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql b/tests/queries/0_stateless/02346_inverted_index_bug52019.sql similarity index 62% rename from tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql rename to tests/queries/0_stateless/02346_inverted_index_bug52019.sql index 7ba122a7155..c61e17d9cea 100644 --- a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql +++ b/tests/queries/0_stateless/02346_inverted_index_bug52019.sql @@ -1,9 +1,20 @@ --- https://github.com/ClickHouse/ClickHouse/issues/52019 -DROP TABLE IF EXISTS tab; +-- Test for Bug 52019: Undefined behavior + SET allow_experimental_inverted_index=1; -CREATE TABLE tab (`k` UInt64, `s` Map(String, String), INDEX af mapKeys(s) TYPE inverted(2) GRANULARITY 1) ENGINE = MergeTree ORDER BY k SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; + +DROP TABLE IF EXISTS tab; + +CREATE TABLE tab ( + k UInt64, + s Map(String, String), + INDEX idx mapKeys(s) TYPE inverted(2) GRANULARITY 1) +ENGINE = MergeTree +ORDER BY k +SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; + INSERT INTO tab (k) VALUES (0); SELECT * FROM tab PREWHERE (s[NULL]) = 'Click a03' SETTINGS allow_experimental_analyzer=1; SELECT * FROM tab PREWHERE (s[1]) = 'Click a03' SETTINGS allow_experimental_analyzer=1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT * FROM tab PREWHERE (s['foo']) = 'Click a03' SETTINGS allow_experimental_analyzer=1; + DROP TABLE tab; diff --git a/tests/queries/0_stateless/02346_inverted_index_bug59039.reference b/tests/queries/0_stateless/02346_inverted_index_bug59039.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02346_inverted_index_bug59039.sql b/tests/queries/0_stateless/02346_inverted_index_bug59039.sql new file mode 100644 index 00000000000..0ef0cb0c733 --- /dev/null +++ b/tests/queries/0_stateless/02346_inverted_index_bug59039.sql @@ -0,0 +1,20 @@ +-- This is supposed to test that DROP INDEX removes all index related files. Can't test this directly but at least run the statement and +-- check that no bad things happen. 
+ +SET allow_experimental_inverted_index = 1; + +DROP TABLE IF EXISTS tab; + +CREATE TABLE tab +( + id UInt64, + doc String, + INDEX text_idx doc TYPE inverted +) +ENGINE = MergeTree +ORDER BY id +SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi', min_bytes_for_wide_part = 0, min_rows_for_wide_part = 0; + +ALTER TABLE tab DROP INDEX text_idx; + +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02346_inverted_index_detach_attach.reference b/tests/queries/0_stateless/02346_inverted_index_detach_attach.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02696_inverted_idx_checksums.sql b/tests/queries/0_stateless/02346_inverted_index_detach_attach.sql similarity index 75% rename from tests/queries/0_stateless/02696_inverted_idx_checksums.sql rename to tests/queries/0_stateless/02346_inverted_index_detach_attach.sql index 92ffa7a6196..762d78922fe 100644 --- a/tests/queries/0_stateless/02696_inverted_idx_checksums.sql +++ b/tests/queries/0_stateless/02346_inverted_index_detach_attach.sql @@ -2,8 +2,8 @@ SET allow_experimental_inverted_index = 1; CREATE TABLE t ( - `key` UInt64, - `str` String, + key UInt64, + str String, INDEX inv_idx str TYPE inverted(0) GRANULARITY 1 ) ENGINE = MergeTree @@ -13,4 +13,4 @@ INSERT INTO t VALUES (1, 'Hello World'); ALTER TABLE t DETACH PART 'all_1_1_0'; -ALTER TABLE t ATTACH PART 'all_1_1_0'; \ No newline at end of file +ALTER TABLE t ATTACH PART 'all_1_1_0'; diff --git a/tests/queries/0_stateless/02346_inverted_index_experimental_flag.reference b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02895_forbid_create_inverted_index.sql b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql similarity index 72% rename from tests/queries/0_stateless/02895_forbid_create_inverted_index.sql rename to tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql index dc92d9198fb..bf89265372e 100644 --- a/tests/queries/0_stateless/02895_forbid_create_inverted_index.sql +++ b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql @@ -1,4 +1,7 @@ +-- Tests that the inverted index can only be supported when allow_experimental_inverted_index = 1. 
+ SET allow_experimental_inverted_index = 0; + DROP TABLE IF EXISTS tab; CREATE TABLE tab ( diff --git a/tests/queries/0_stateless/02951_inverted_index_support_match.reference b/tests/queries/0_stateless/02346_inverted_index_match_predicate.reference similarity index 100% rename from tests/queries/0_stateless/02951_inverted_index_support_match.reference rename to tests/queries/0_stateless/02346_inverted_index_match_predicate.reference diff --git a/tests/queries/0_stateless/02951_inverted_index_support_match.sql b/tests/queries/0_stateless/02346_inverted_index_match_predicate.sql similarity index 97% rename from tests/queries/0_stateless/02951_inverted_index_support_match.sql rename to tests/queries/0_stateless/02346_inverted_index_match_predicate.sql index 9ebf10412d9..99405c0acf2 100644 --- a/tests/queries/0_stateless/02951_inverted_index_support_match.sql +++ b/tests/queries/0_stateless/02346_inverted_index_match_predicate.sql @@ -1,3 +1,5 @@ +-- Tests that match() utilizes the inverted index + SET allow_experimental_inverted_index = true; DROP TABLE IF EXISTS tab; diff --git a/tests/queries/0_stateless/02346_inverted_index_mutation.sql b/tests/queries/0_stateless/02346_inverted_index_mutation.sql deleted file mode 100644 index 83b73807cd7..00000000000 --- a/tests/queries/0_stateless/02346_inverted_index_mutation.sql +++ /dev/null @@ -1,25 +0,0 @@ -SET allow_experimental_inverted_index=1; - -DROP TABLE IF EXISTS t; -CREATE TABLE t -( - `timestamp` UInt64, - `s` String, - INDEX idx s TYPE inverted(3) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY tuple() -SETTINGS min_rows_for_wide_part = 1, min_bytes_for_wide_part = 1; - -INSERT INTO t (s) VALUES ('I am inverted'); - -SELECT data_version FROM system.parts WHERE database=currentDatabase() AND table='t' AND active=1; - --- do update column synchronously -ALTER TABLE t UPDATE s='I am not inverted' WHERE 1 SETTINGS mutations_sync=1; - -SELECT data_version FROM system.parts WHERE database=currentDatabase() AND table='t' AND active=1; - -SELECT s FROM t WHERE s LIKE '%inverted%' SETTINGS force_data_skipping_indices='idx'; - -DROP TABLE t; diff --git a/tests/queries/0_stateless/02346_full_text_search.reference b/tests/queries/0_stateless/02346_inverted_index_search.reference similarity index 100% rename from tests/queries/0_stateless/02346_full_text_search.reference rename to tests/queries/0_stateless/02346_inverted_index_search.reference diff --git a/tests/queries/0_stateless/02346_full_text_search.sql b/tests/queries/0_stateless/02346_inverted_index_search.sql similarity index 100% rename from tests/queries/0_stateless/02346_full_text_search.sql rename to tests/queries/0_stateless/02346_inverted_index_search.sql diff --git a/tests/queries/0_stateless/02366_kql_create_table.reference b/tests/queries/0_stateless/02366_kql_create_table.reference deleted file mode 100644 index 35136b5ff42..00000000000 --- a/tests/queries/0_stateless/02366_kql_create_table.reference +++ /dev/null @@ -1,4 +0,0 @@ --- test create table -- -Theodore -Diaz -Theodore Diaz 28 diff --git a/tests/queries/0_stateless/02366_kql_create_table.sql b/tests/queries/0_stateless/02366_kql_create_table.sql deleted file mode 100644 index b266679b06a..00000000000 --- a/tests/queries/0_stateless/02366_kql_create_table.sql +++ /dev/null @@ -1,29 +0,0 @@ -DROP TABLE IF EXISTS Customers; -CREATE TABLE Customers -( - FirstName Nullable(String), - LastName String, - Occupation String, - Education String, - Age Nullable(UInt8) -) ENGINE = Memory; - -INSERT INTO Customers VALUES 
('Theodore','Diaz','Skilled Manual','Bachelors',28),('Stephanie','Cox','Management abcd defg','Bachelors',33),('Peter','Nara','Skilled Manual','Graduate Degree',26),('Latoya','Shen','Professional','Graduate Degree',25),('Apple','','Skilled Manual','Bachelors',28),(NULL,'why','Professional','Partial College',38); -Select '-- test create table --' ; -Select * from kql(Customers|project FirstName) limit 1;; -DROP TABLE IF EXISTS kql_table1; -CREATE TABLE kql_table1 ENGINE = Memory AS select *, now() as new_column From kql(Customers | project LastName | filter LastName=='Diaz'); -select LastName from kql_table1 limit 1; -DROP TABLE IF EXISTS kql_table2; -CREATE TABLE kql_table2 -( - FirstName Nullable(String), - LastName String, - Age Nullable(UInt8) -) ENGINE = Memory; -INSERT INTO kql_table2 select * from kql(Customers|project FirstName,LastName,Age | filter FirstName=='Theodore'); -select * from kql_table2 limit 1; --- select * from kql(Customers | where FirstName !in ("test", "test2")); -DROP TABLE IF EXISTS Customers; -DROP TABLE IF EXISTS kql_table1; -DROP TABLE IF EXISTS kql_table2; \ No newline at end of file diff --git a/tests/queries/0_stateless/02366_kql_datatype.reference b/tests/queries/0_stateless/02366_kql_datatype.reference deleted file mode 100644 index fe666f3734c..00000000000 --- a/tests/queries/0_stateless/02366_kql_datatype.reference +++ /dev/null @@ -1,105 +0,0 @@ --- bool -true -\N --- int -123 -\N --- long -123 -255 --1 -\N -456 --- real -0.01 -\N -nan -inf --inf --- datetime -2015-12-31 23:59:59.900000000 -2015-12-31 00:00:00.000000000 -2014-05-25 08:20:03.123456000 -2014-11-08 15:55:55.000000000 -2014-11-08 15:55:00.000000000 -2014-11-08 00:00:00.000000000 -\N -2014-05-25 08:20:03.123456000 -2014-11-08 15:55:55.123456000 --- time -1216984.12345 -45055.123 -86400 --86400 -6.000000000000001e-9 -6e-7 -172800 -259200 --- guid -\N --- timespan (time) -172800 -1800 -10 -0.1 -0.00001 -1e-7 -1120343 --- null -1 -\N \N \N \N \N --- decimal -\N -123.345 -100000 --- dynamic -\N -1 -86400 -[1,2,3] -[[1],[2],[3]] -['a','b','c'] --- cast functions -true -1 --- tobool("false") -false -1 --- tobool(1) -true -1 --- tobool(123) -true -1 --- tobool("abc") -\N -\N --- todouble() -123.4 -\N --- toreal() -123.4 -\N --- toint() -1 -\N --- tostring() -123 -1 --- todatetime() -1 -\N --- make_timespan() -01:12:00 01:12:30 1.12:30:55 --- totimespan() -1e-7 -60 -\N -1120343 --- tolong() -123 -\N --- todecimal() -123.345 -\N -\N diff --git a/tests/queries/0_stateless/02366_kql_datatype.sql b/tests/queries/0_stateless/02366_kql_datatype.sql deleted file mode 100644 index ecd29504298..00000000000 --- a/tests/queries/0_stateless/02366_kql_datatype.sql +++ /dev/null @@ -1,117 +0,0 @@ -set dialect = 'kusto'; - -print '-- bool' -print bool(true); -print bool(true); -print bool(null); -print '-- int'; -print int(123); -print int(null); -print int('4'); -- { clientError BAD_ARGUMENTS } -print '-- long'; -print long(123); -print long(0xff); -print long(-1); -print long(null); -print 456; -print '-- real'; -print real(0.01); -print real(null); -print real(nan); -print real(+inf); -print real(-inf); -print double('4.2'); -- { clientError BAD_ARGUMENTS } -print '-- datetime'; -print datetime(2015-12-31 23:59:59.9); -print datetime(2015-12-31); -print datetime('2014-05-25T08:20:03.123456'); -print datetime('2014-11-08 15:55:55'); -print datetime('2014-11-08 15:55'); -print datetime('2014-11-08'); -print datetime(null); -print datetime('2014-05-25T08:20:03.123456Z'); -print datetime('2014-11-08 
15:55:55.123456Z'); -print '-- time'; -print time('14.02:03:04.12345'); -print time('12:30:55.123'); -print time(1d); -print time(-1d); -print time(6nanoseconds); -print time(6tick); -print time(2); -print time(2) + 1d; -print '-- guid' -print guid(74be27de-1e4e-49d9-b579-fe0b331d3642); -print guid(null); -print '-- timespan (time)'; -print timespan(2d); -- 2 days ---print timespan(1.5h); -- 1.5 hour -print timespan(30m); -- 30 minutes -print timespan(10s); -- 10 seconds ---print timespan(0.1s); -- 0.1 second -print timespan(100ms); -- 100 millisecond -print timespan(10microsecond); -- 10 microseconds -print timespan(1tick); -- 100 nanoseconds ---print timespan(1.5h) / timespan(30m); -print timespan('12.23:12:23') / timespan(1s); -print '-- null'; -print isnull(null); -print bool(null), int(null), long(null), real(null), double(null); -print '-- decimal'; -print decimal(null); -print decimal(123.345); -print decimal(1e5); -print '-- dynamic'; -- no support for mixed types and bags for now -print dynamic(null); -print dynamic(1); -print dynamic(timespan(1d)); -print dynamic([1,2,3]); -print dynamic([[1], [2], [3]]); -print dynamic(['a', "b", 'c']); -print '-- cast functions' -print '--tobool("true")'; -- == true -print tobool('true'); -- == true -print tobool('true') == toboolean('true'); -- == true -print '-- tobool("false")'; -- == false -print tobool('false'); -- == false -print tobool('false') == toboolean('false'); -- == false -print '-- tobool(1)'; -- == true -print tobool(1); -- == true -print tobool(1) == toboolean(1); -- == true -print '-- tobool(123)'; -- == true -print tobool(123); -- == true -print tobool(123) == toboolean(123); -- == true -print '-- tobool("abc")'; -- == null -print tobool('abc'); -- == null -print tobool('abc') == toboolean('abc'); -- == null -print '-- todouble()'; -print todouble('123.4'); -print todouble('abc') == null; -print '-- toreal()'; -print toreal("123.4"); -print toreal('abc') == null; -print '-- toint()'; -print toint("123") == int(123); -print toint('abc'); -print '-- tostring()'; -print tostring(123); -print tostring(null) == ''; -print '-- todatetime()'; -print todatetime("2015-12-24") == datetime(2015-12-24); -print todatetime('abc') == null; -print '-- make_timespan()'; -print v1=make_timespan(1,12), v2=make_timespan(1,12,30), v3=make_timespan(1,12,30,55.123); -print '-- totimespan()'; -print totimespan(1tick); -print totimespan('0.00:01:00'); -print totimespan('abc'); -print totimespan('12.23:12:23') / totimespan(1s); --- print totimespan(strcat('12.', '23', ':12:', '23')) / timespan(1s); -> 1120343 -print '-- tolong()'; -print tolong('123'); -print tolong('abc'); -print '-- todecimal()'; -print todecimal(123.345); -print todecimal(null); -print todecimal('abc'); --- print todecimal(4 * 2 + 3); -> 11 diff --git a/tests/queries/0_stateless/02366_kql_distinct.reference b/tests/queries/0_stateless/02366_kql_distinct.reference deleted file mode 100644 index 2100f44f18c..00000000000 --- a/tests/queries/0_stateless/02366_kql_distinct.reference +++ /dev/null @@ -1,27 +0,0 @@ --- distinct * -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Peter Nara Skilled Manual Graduate Degree 26 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 -\N why Professional Partial College 38 --- distinct one column -- -Skilled Manual -Management abcd defg -Professional --- distinct two column -- -Skilled Manual Bachelors -Management abcd defg Bachelors -Skilled Manual Graduate Degree 
-Professional Graduate Degree -Professional Partial College --- distinct with where -- -Skilled Manual Bachelors -Management abcd defg Bachelors -Skilled Manual Graduate Degree -Professional Graduate Degree -Professional Partial College --- distinct with where, order -- -Skilled Manual Bachelors -Skilled Manual Graduate Degree -Professional Graduate Degree diff --git a/tests/queries/0_stateless/02366_kql_distinct.sql b/tests/queries/0_stateless/02366_kql_distinct.sql deleted file mode 100644 index 3c997eb4865..00000000000 --- a/tests/queries/0_stateless/02366_kql_distinct.sql +++ /dev/null @@ -1,28 +0,0 @@ -DROP TABLE IF EXISTS Customers; -CREATE TABLE Customers -( - FirstName Nullable(String), - LastName String, - Occupation String, - Education String, - Age Nullable(UInt8) -) ENGINE = Memory; - -INSERT INTO Customers VALUES ('Theodore','Diaz','Skilled Manual','Bachelors',28), ('Stephanie','Cox','Management abcd defg','Bachelors',33),('Peter','Nara','Skilled Manual','Graduate Degree',26),('Latoya','Shen','Professional','Graduate Degree',25),('Apple','','Skilled Manual','Bachelors',28),(NULL,'why','Professional','Partial College',38); - -set dialect = 'kusto'; - -print '-- distinct * --'; -Customers | distinct *; - -print '-- distinct one column --'; -Customers | distinct Occupation; - -print '-- distinct two column --'; -Customers | distinct Occupation, Education; - -print '-- distinct with where --'; -Customers where Age <30 | distinct Occupation, Education; - -print '-- distinct with where, order --'; -Customers |where Age <30 | order by Age| distinct Occupation, Education; diff --git a/tests/queries/0_stateless/02366_kql_extend.reference b/tests/queries/0_stateless/02366_kql_extend.reference deleted file mode 100644 index 2936c9ea19c..00000000000 --- a/tests/queries/0_stateless/02366_kql_extend.reference +++ /dev/null @@ -1,32 +0,0 @@ --- extend #1 -- -Aldi Apple 4 2016-09-10 400 -Costco Apple 2 2016-09-11 200 --- extend #2 -- -Apple 200 -Apple 400 --- extend #3 -- -Apple cost 480 on average based on 5 samples. -Snargaluff cost 28080 on average based on 5 samples. 
--- extend #4 -- -1 --- extend #5 -- -Aldi Apple 4 2016-09-10 Apple was purchased from Aldi for $4 on 2016-09-10 400 -Costco Apple 2 2016-09-11 Apple was purchased from Costco for $2 on 2016-09-11 200 --- extend #6 -- -Aldi Apple 2016-09-10 400 -Costco Apple 2016-09-11 200 -Aldi Apple 2016-09-10 600 -Costco Snargaluff 2016-09-12 10000 -Aldi Apple 2016-09-12 700 -Aldi Snargaluff 2016-09-11 40000 -Costco Snargaluff 2016-09-12 10400 -Aldi Apple 2016-09-12 500 -Aldi Snargaluff 2016-09-11 60000 -Costco Snargaluff 2016-09-10 20000 --- extend #7 -- -5 --- extend #8 -- --- extend #9 -- --- extend #10 -- --- extend #11 -- -5 [2,1] diff --git a/tests/queries/0_stateless/02366_kql_extend.sql b/tests/queries/0_stateless/02366_kql_extend.sql deleted file mode 100644 index 0a3c1f3dcd4..00000000000 --- a/tests/queries/0_stateless/02366_kql_extend.sql +++ /dev/null @@ -1,61 +0,0 @@ --- datatable(Supplier:string, Fruit:string, Price: real, Purchase:datetime) --- [ --- 'Aldi','Apple',4,'2016-09-10', --- 'Costco','Apple',2,'2016-09-11', --- 'Aldi','Apple',6,'2016-09-10', --- 'Costco','Snargaluff',100,'2016-09-12', --- 'Aldi','Apple',7,'2016-09-12', --- 'Aldi','Snargaluff',400,'2016-09-11', --- 'Costco','Snargaluff',104,'2016-09-12', --- 'Aldi','Apple',5,'2016-09-12', --- 'Aldi','Snargaluff',600,'2016-09-11', --- 'Costco','Snargaluff',200,'2016-09-10', --- ] - - -DROP TABLE IF EXISTS Ledger; -CREATE TABLE Ledger -( - Supplier Nullable(String), - Fruit String , - Price Float64, - Purchase Date -) ENGINE = Memory; -INSERT INTO Ledger VALUES ('Aldi','Apple',4,'2016-09-10'), ('Costco','Apple',2,'2016-09-11'), ('Aldi','Apple',6,'2016-09-10'), ('Costco','Snargaluff',100,'2016-09-12'), ('Aldi','Apple',7,'2016-09-12'), ('Aldi','Snargaluff',400,'2016-09-11'),('Costco','Snargaluff',104,'2016-09-12'),('Aldi','Apple',5,'2016-09-12'),('Aldi','Snargaluff',600,'2016-09-11'),('Costco','Snargaluff',200,'2016-09-10'); - --- This test requies sorting after some of aggregations but I don't know KQL, sorry -set max_bytes_before_external_group_by = 0; -set dialect = 'kusto'; - -print '-- extend #1 --'; -Ledger | extend PriceInCents = 100 * Price | take 2; - -print '-- extend #2 --'; -Ledger | extend PriceInCents = 100 * Price | sort by PriceInCents asc | project Fruit, PriceInCents | take 2; - -print '-- extend #3 --'; -Ledger | extend PriceInCents = 100 * Price | sort by PriceInCents asc | project Fruit, PriceInCents | summarize AveragePrice = avg(PriceInCents), Purchases = count() by Fruit | extend Sentence = strcat(Fruit, ' cost ', tostring(AveragePrice), ' on average based on ', tostring(Purchases), ' samples.') | project Sentence; - -print '-- extend #4 --'; -Ledger | extend a = Price | extend b = a | extend c = a, d = b + 500 | extend Pass = bool(b == a and c == a and d == b + 500) | summarize binary_all_and(Pass); - -print '-- extend #5 --'; -Ledger | take 2 | extend strcat(Fruit, ' was purchased from ', Supplier, ' for $', tostring(Price), ' on ', tostring(Purchase)) | extend PriceInCents = 100 * Price; - -print '-- extend #6 --'; -Ledger | extend Price = 100 * Price; - -print '-- extend #7 --'; -print a = 4 | extend a = 5; - -print '-- extend #8 --'; --- print x = 5 | extend array_sort_desc(range(0, x), range(1, x + 1)) - -print '-- extend #9 --'; -print x = 19 | extend = 4 + ; -- { clientError SYNTAX_ERROR } - -print '-- extend #10 --'; -Ledger | extend PriceInCents = * Price | sort by PriceInCents asc | project Fruit, PriceInCents | summarize AveragePrice = avg(PriceInCents), Purchases = count() by Fruit | extend 
Sentence = strcat(Fruit, ' cost ', tostring(AveragePrice), ' on average based on ', tostring(Purchases), ' samples.') | project Sentence; -- { clientError SYNTAX_ERROR } - -print '-- extend #11 --'; -- should ideally return this in the future: 5 [2,1] because of the alias ex -print x = 5 | extend ex = array_sort_desc(dynamic([1, 2]), dynamic([3, 4])); diff --git a/tests/queries/0_stateless/02366_kql_func_binary.reference b/tests/queries/0_stateless/02366_kql_func_binary.reference deleted file mode 100644 index 6276cd6d867..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_binary.reference +++ /dev/null @@ -1,7 +0,0 @@ - -- binary functions -4 7 -1 -1 -1 -7 3 -1 diff --git a/tests/queries/0_stateless/02366_kql_func_binary.sql b/tests/queries/0_stateless/02366_kql_func_binary.sql deleted file mode 100644 index 824022b564c..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_binary.sql +++ /dev/null @@ -1,8 +0,0 @@ -set dialect='kusto'; -print ' -- binary functions'; -print binary_and(4,7), binary_or(4,7); -print binary_shift_left(1, 1) == binary_shift_left(1, 65); -print binary_shift_right(2, 1) == binary_shift_right(2, 65); -print binary_shift_right(binary_shift_left(1, 65), 65) == 1; -print binary_xor(2, 5), bitset_count_ones(42); -print bitset_count_ones(binary_shift_left(binary_and(4,7), 1)); diff --git a/tests/queries/0_stateless/02366_kql_func_datetime.reference b/tests/queries/0_stateless/02366_kql_func_datetime.reference deleted file mode 100644 index 40d8d7e19ac..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_datetime.reference +++ /dev/null @@ -1,76 +0,0 @@ --- dayofmonth() -31 --- dayofweek() -4.00:00:00 --- dayofyear() -365 --- getmonth() -10 --- getyear() -2015 --- hoursofday() -23 --- startofday() -2017-01-01 00:00:00.000000000 -2016-12-31 00:00:00.000000000 -2017-01-02 00:00:00.000000000 --- endofday() -2017-01-01 23:59:59.999999000 -2016-12-31 23:59:59.999999000 -2017-01-02 23:59:59.999999000 --- endofmonth() -2017-01-31 23:59:59.999999000 -2016-12-31 23:59:59.999999000 -2017-02-28 23:59:59.999999000 -2022-09-30 23:59:59.999999000 --- startofweek() -2017-01-01 00:00:00.000000000 -2016-12-25 00:00:00.000000000 -2017-01-08 00:00:00.000000000 --- endofweek() -2017-01-07 23:59:59.999999000 -2016-12-31 23:59:59.999999000 -2017-01-14 23:59:59.999999000 --- startofyear() -2017-01-01 00:00:00.000000000 -2016-01-01 00:00:00.000000000 -2018-01-01 00:00:00.000000000 --- endofyear() -2017-12-31 23:59:59.999999000 -2016-12-31 23:59:59.999999000 -2018-12-31 23:59:59.999999000 --- unixtime_seconds_todatetime() -2019-01-01 00:00:00.000000000 -1970-01-02 00:00:00.000000000 -1969-12-31 00:00:00.000000000 --- unixtime_microseconds_todatetime -2019-01-01 00:00:00.000000 --- unixtime_milliseconds_todatetime() -2019-01-01 00:00:00.000 --- unixtime_nanoseconds_todatetime() -2019-01-01 00:00:00.000000000 --- weekofyear() -52 --- monthofyear() -12 --- weekofyear() -52 --- now() -1 --- make_datetime() -1 -2017-10-01 12:10:00.0000000 -2017-10-01 12:11:00.0000000 --- format_datetime -15-12-14 02:03:04.1234500 -17-01-29 [09:00:05] 2017-01-29 [09:00:05] 17-01-29 [09:00:05 AM] --- format_timespan() -02:03:04.1234500 -29.09:00:05:12 --- ago() --- datetime_diff() -17 2 13 4 29 2 5 10 --- datetime_part() -2017 4 10 44 30 303 01 02 03 --- datetime_add() -2018-01-01 00:00:00.0000000 2017-04-01 00:00:00.0000000 2017-02-01 00:00:00.0000000 2017-01-08 00:00:00.0000000 2017-01-02 00:00:00.0000000 2017-01-01 01:00:00.0000000 2017-01-01 00:01:00.0000000 2017-01-01 00:00:01.0000000 diff 
--git a/tests/queries/0_stateless/02366_kql_func_datetime.sql b/tests/queries/0_stateless/02366_kql_func_datetime.sql deleted file mode 100644 index b1fba4166a9..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_datetime.sql +++ /dev/null @@ -1,86 +0,0 @@ -set dialect = 'kusto'; - -print '-- dayofmonth()'; -print dayofmonth(datetime(2015-12-31)); -print '-- dayofweek()'; -print dayofweek(datetime(2015-12-31)); -print '-- dayofyear()'; -print dayofyear(datetime(2015-12-31)); -print '-- getmonth()'; -print getmonth(datetime(2015-10-12)); -print '-- getyear()'; -print getyear(datetime(2015-10-12)); -print '-- hoursofday()'; -print hourofday(datetime(2015-12-31 23:59:59.9)); -print '-- startofday()'; -print startofday(datetime(2017-01-01 10:10:17)); -print startofday(datetime(2017-01-01 10:10:17), -1); -print startofday(datetime(2017-01-01 10:10:17), 1); -print '-- endofday()'; -print endofday(datetime(2017-01-01 10:10:17)); -print endofday(datetime(2017-01-01 10:10:17), -1); -print endofday(datetime(2017-01-01 10:10:17), 1); -print '-- endofmonth()'; -print endofmonth(datetime(2017-01-01 10:10:17)); -print endofmonth(datetime(2017-01-01 10:10:17), -1); -print endofmonth(datetime(2017-01-01 10:10:17), 1); -print endofmonth(datetime(2022-09-23)); -print '-- startofweek()'; -print startofweek(datetime(2017-01-01 10:10:17)); -print startofweek(datetime(2017-01-01 10:10:17), -1); -print startofweek(datetime(2017-01-01 10:10:17), 1); -print '-- endofweek()'; -print endofweek(datetime(2017-01-01 10:10:17)); -print endofweek(datetime(2017-01-01 10:10:17), -1); -print endofweek(datetime(2017-01-01 10:10:17), 1); -print '-- startofyear()'; -print startofyear(datetime(2017-01-01 10:10:17)); -print startofyear(datetime(2017-01-01 10:10:17), -1); -print startofyear(datetime(2017-01-01 10:10:17), 1); -print '-- endofyear()'; -print endofyear(datetime(2017-01-01 10:10:17)); -print endofyear(datetime(2017-01-01 10:10:17), -1); -print endofyear(datetime(2017-01-01 10:10:17), 1); -print '-- unixtime_seconds_todatetime()'; -print unixtime_seconds_todatetime(1546300800); -print unixtime_seconds_todatetime(1d); -print unixtime_seconds_todatetime(-1d); -print '-- unixtime_microseconds_todatetime'; -print unixtime_microseconds_todatetime(1546300800000000); -print '-- unixtime_milliseconds_todatetime()'; -print unixtime_milliseconds_todatetime(1546300800000); -print '-- unixtime_nanoseconds_todatetime()'; -print unixtime_nanoseconds_todatetime(1546300800000000000); -print '-- weekofyear()'; -print week_of_year(datetime(2000-01-01)); -print '-- monthofyear()'; -print monthofyear(datetime(2015-12-31)); -print '-- weekofyear()'; -print week_of_year(datetime(2000-01-01)); -print '-- now()'; -print getyear(now(-2d))>1900; -print '-- make_datetime()'; -print make_datetime(2017,10,01,12,10) == datetime(2017-10-01 12:10:00); -print year_month_day_hour_minute = make_datetime(2017,10,01,12,10); -print year_month_day_hour_minute_second = make_datetime(2017,10,01,12,11,0.1234567); -print '-- format_datetime'; -print format_datetime(datetime(2015-12-14 02:03:04.12345), 'y-M-d h:m:s.fffffff'); -print v1=format_datetime(datetime(2017-01-29 09:00:05),'yy-MM-dd [HH:mm:ss]'), v2=format_datetime(datetime(2017-01-29 09:00:05), 'yyyy-M-dd [H:mm:ss]'), v3=format_datetime(datetime(2017-01-29 09:00:05), 'yy-MM-dd [hh:mm:ss tt]'); -print '-- format_timespan()'; -print format_timespan(time('14.02:03:04.12345'), 'h:m:s.fffffff'); -print v1=format_timespan(time('29.09:00:05.12345'), 'dd.hh:mm:ss:FF'); --- print 
v2=format_timespan(time('29.09:00:05.12345'), 'ddd.h:mm:ss [fffffff]'); == '029.9:00:05 [1234500]' -print '-- ago()'; --- print ago(1d) - now(); -print '-- datetime_diff()'; -print year = datetime_diff('year',datetime(2017-01-01),datetime(2000-12-31)), quarter = datetime_diff('quarter',datetime(2017-07-01),datetime(2017-03-30)), month = datetime_diff('month',datetime(2017-01-01),datetime(2015-12-30)), week = datetime_diff('week',datetime(2017-10-29 00:00),datetime(2017-09-30 23:59)), day = datetime_diff('day',datetime(2017-10-29 00:00),datetime(2017-09-30 23:59)), hour = datetime_diff('hour',datetime(2017-10-31 01:00),datetime(2017-10-30 23:59)), minute = datetime_diff('minute',datetime(2017-10-30 23:05:01),datetime(2017-10-30 23:00:59)), second = datetime_diff('second',datetime(2017-10-30 23:00:10.100),datetime(2017-10-30 23:00:00.900)); --- millisecond = datetime_diff('millisecond',datetime(2017-10-30 23:00:00.200100),datetime(2017-10-30 23:00:00.100900)), --- microsecond = datetime_diff('microsecond',datetime(2017-10-30 23:00:00.1009001),datetime(2017-10-30 23:00:00.1008009)), --- nanosecond = datetime_diff('nanosecond',datetime(2017-10-30 23:00:00.0000000),datetime(2017-10-30 23:00:00.0000007)) -print '-- datetime_part()'; -print year = datetime_part("year", datetime(2017-10-30 01:02:03.7654321)),quarter = datetime_part("quarter", datetime(2017-10-30 01:02:03.7654321)),month = datetime_part("month", datetime(2017-10-30 01:02:03.7654321)),weekOfYear = datetime_part("week_of_year", datetime(2017-10-30 01:02:03.7654321)),day = datetime_part("day", datetime(2017-10-30 01:02:03.7654321)),dayOfYear = datetime_part("dayOfYear", datetime(2017-10-30 01:02:03.7654321)),hour = datetime_part("hour", datetime(2017-10-30 01:02:03.7654321)),minute = datetime_part("minute", datetime(2017-10-30 01:02:03.7654321)),second = datetime_part("second", datetime(2017-10-30 01:02:03.7654321)); --- millisecond = datetime_part("millisecond", dt), --- microsecond = datetime_part("microsecond", dt), --- nanosecond = datetime_part("nanosecond", dt) -print '-- datetime_add()'; -print year = datetime_add('year',1,make_datetime(2017,1,1)),quarter = datetime_add('quarter',1,make_datetime(2017,1,1)),month = datetime_add('month',1,make_datetime(2017,1,1)),week = datetime_add('week',1,make_datetime(2017,1,1)),day = datetime_add('day',1,make_datetime(2017,1,1)),hour = datetime_add('hour',1,make_datetime(2017,1,1)),minute = datetime_add('minute',1,make_datetime(2017,1,1)),second = datetime_add('second',1,make_datetime(2017,1,1)); \ No newline at end of file diff --git a/tests/queries/0_stateless/02366_kql_func_dynamic.reference b/tests/queries/0_stateless/02366_kql_func_dynamic.reference deleted file mode 100644 index 564f1eebc4b..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_dynamic.reference +++ /dev/null @@ -1,152 +0,0 @@ --- constant index value -1 c ['A',NULL,'C'] --- array_length() -1 -1 --- array_sum() -1 -1 --- array_index_of() -3 -1 --- array_iif() -[1,5,3] -[1,5,3] -[1,5,NULL] -[NULL,NULL,NULL] --- array_concat() -[1,2,3,4,5,6] --- array_reverse() -[] -[1] -[4,3,2,1] -['example','an','is','this'] --- array_rotate_left() -[] -[] -[] -[3,4,5,1,2] -[1,2,3,4,5] -[3,4,5,1,2] -[4,5,1,2,3] -[1,2,3,4,5] -[4,5,1,2,3] --- array_rotate_right() -[] -[] -[] -[4,5,1,2,3] -[1,2,3,4,5] -[4,5,1,2,3] -[3,4,5,1,2] -[1,2,3,4,5] -[3,4,5,1,2] --- array_shift_left() -[] -[] -[] -[3,4,5,NULL,NULL] -[NULL,NULL,1,2,3] -[3,4,5,-1,-1] -['c','',''] --- array_shift_right() -[] -[] -[] -[3,4,5,NULL,NULL] -[NULL,NULL,1,2,3] 
-[3,4,5,-1,-1] -['c','',''] --- array_slice() -[3,4] --- array_split() -[[1],[2,3],[4,5]] -[[1,2],[3,4,5]] -[[1],[2,3],[4,5]] -[[1,2,3,4],[],[4,5]] --- array_sort_asc() -(['a','c','c','d',NULL]) -([1,2,3,4]) -['a','b','c'] -(['p','q','r'],['hello','clickhouse','world']) -([NULL,'a','c','c','d']) -([NULL,'a','c','c','d']) -([NULL,NULL,NULL]) -[1,2,3,NULL,NULL] -['a','e','b','c','d'] -(['George','John','Paul','Ringo']) -(['blue','green','yellow',NULL,NULL]) -([NULL,NULL,'blue','green','yellow']) --- array_sort_desc() -(['d','c','c','a',NULL]) -([4,3,2,1]) -['c','b','a'] -(['r','q','p'],['world','clickhouse','hello']) -([NULL,'d','c','c','a']) -([NULL,'d','c','c','a']) -([NULL,NULL,NULL]) -[3,2,1,NULL,NULL] -['d','c','b','e','a'] -(['Ringo','Paul','John','George']) -(['yellow','green','blue',NULL,NULL]) -([NULL,NULL,'yellow','green','blue']) --- jaccard_index() -0.75 -0 -0 -nan -0 -0.75 -0.25 --- pack_array() -1 2 4 [1,2,4] -['ab','0.0.0.42','4.2'] --- repeat() -[] -[1,1,1] -['asd','asd','asd'] -[86400,86400,86400] -[true,true,true] -[NULL] -[NULL] --- set_difference() -[] -[] -[] -[] -[4,5,6] -[4] -[1,3] -[1,2,3] -['d','s'] -['Chewbacca','Han Solo'] --- set_has_element() -0 -1 -0 -1 -0 --- set_intersect() -[] -[1,2,3] -[1,2,3] -[] -[5] -[] -['a'] -['Darth Vader'] --- set_union() -[] -[1,2,3] -[1,2,3,4,5,6] -[1,2,3,4] -[1,2,3,4,5] -[1,2,3] -['a','d','f','s'] -['Chewbacca','Darth Sidious','Darth Vader','Han Solo'] --- zip() -[] -[[1,2],[3,4],[5,6]] -[['Darth','Vader','has a suit'],['Master','Yoda','doesn\'t have a suit']] -[[1,10],[2,20],[3,NULL]] -[[NULL,1],[NULL,2],[NULL,3]] diff --git a/tests/queries/0_stateless/02366_kql_func_dynamic.sql b/tests/queries/0_stateless/02366_kql_func_dynamic.sql deleted file mode 100644 index b0956f032d0..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_dynamic.sql +++ /dev/null @@ -1,161 +0,0 @@ -DROP TABLE IF EXISTS array_test; -CREATE TABLE array_test (floats Array(Float64), - strings Array(String), - nullable_strings Array(Nullable(String)) - ) ENGINE=Memory; -INSERT INTO array_test VALUES([1.0, 2.5], ['a', 'c'], ['A', NULL, 'C']); -set dialect = 'kusto'; -print '-- constant index value'; -array_test | project floats[0], strings[1], nullable_strings; -print '-- array_length()'; -print array_length(dynamic(['John', 'Denver', 'Bob', 'Marley'])) == 4; -print array_length(dynamic([1, 2, 3])) == 3; -print '-- array_sum()'; -print array_sum(dynamic([2, 5, 3])) == 10; -print array_sum(dynamic([2.5, 5.5, 3])) == 11; -print '-- array_index_of()'; -print array_index_of(dynamic(['John', 'Denver', 'Bob', 'Marley']), 'Marley'); -print array_index_of(dynamic([1, 2, 3]), 2); -print '-- array_iif()'; -print array_iif(dynamic([true,false,true]), dynamic([1,2,3]), dynamic([4,5,6])); -print array_iif(dynamic([1,0,1]), dynamic([1,2,3]), dynamic([4,5,6])); -print array_iif(dynamic([true,false,true]), dynamic([1,2]), dynamic([4,5,6])); -print array_iif(dynamic(['a','b','c']), dynamic([1,2,3]), dynamic([4,5,6])); -print '-- array_concat()'; -print array_concat(dynamic([1,2,3]),dynamic([4,5,6])); -print '-- array_reverse()'; -print array_reverse(dynamic([])); -print array_reverse(dynamic([1])); -print array_reverse(dynamic([1,2,3,4])); -print array_reverse(dynamic(["this", "is", "an", "example"])); -print '-- array_rotate_left()'; -print array_rotate_left(dynamic([]), 0); -print array_rotate_left(dynamic([]), 500); -print array_rotate_left(dynamic([]), -500); -print array_rotate_left(dynamic([1,2,3,4,5]), 2); -print array_rotate_left(dynamic([1,2,3,4,5]), 5); -print 
array_rotate_left(dynamic([1,2,3,4,5]), 7); -print array_rotate_left(dynamic([1,2,3,4,5]), -2); -print array_rotate_left(dynamic([1,2,3,4,5]), -5); -print array_rotate_left(dynamic([1,2,3,4,5]), -7); -print '-- array_rotate_right()'; -print array_rotate_right(dynamic([]), 0); -print array_rotate_right(dynamic([]), 500); -print array_rotate_right(dynamic([]), -500); -print array_rotate_right(dynamic([1,2,3,4,5]), 2); -print array_rotate_right(dynamic([1,2,3,4,5]), 5); -print array_rotate_right(dynamic([1,2,3,4,5]), 7); -print array_rotate_right(dynamic([1,2,3,4,5]), -2); -print array_rotate_right(dynamic([1,2,3,4,5]), -5); -print array_rotate_right(dynamic([1,2,3,4,5]), -7); -print '-- array_shift_left()'; -print array_shift_left(dynamic([]), 0); -print array_shift_left(dynamic([]), 555); -print array_shift_left(dynamic([]), -555); -print array_shift_left(dynamic([1,2,3,4,5]), 2); -print array_shift_left(dynamic([1,2,3,4,5]), -2); -print array_shift_left(dynamic([1,2,3,4,5]), 2, -1); -print array_shift_left(dynamic(['a', 'b', 'c']), 2); -print '-- array_shift_right()'; -print array_shift_left(dynamic([]), 0); -print array_shift_left(dynamic([]), 555); -print array_shift_left(dynamic([]), -555); -print array_shift_right(dynamic([1,2,3,4,5]), -2); -print array_shift_right(dynamic([1,2,3,4,5]), 2); -print array_shift_right(dynamic([1,2,3,4,5]), -2, -1); -print array_shift_right(dynamic(['a', 'b', 'c']), -2); -print '-- array_slice()'; ---print array_slice(dynamic([1,2,3]), 1, 2); -- will enable whe analyzer dixed -print array_slice(dynamic([1,2,3,4,5]), -3, -2); -print '-- array_split()'; -print array_split(dynamic([1,2,3,4,5]), dynamic([1,-2])); -print array_split(dynamic([1,2,3,4,5]), 2); -print array_split(dynamic([1,2,3,4,5]), dynamic([1,3])); -print array_split(dynamic([1,2,3,4,5]), dynamic([-1,-2])); -print '-- array_sort_asc()'; -print array_sort_asc(dynamic([null, 'd', 'a', 'c', 'c'])); -print array_sort_asc(dynamic([4, 1, 3, 2])); -print array_sort_asc(dynamic(['b', 'a', 'c']), dynamic(['q', 'p', 'r']))[0]; -print array_sort_asc(dynamic(['q', 'p', 'r']), dynamic(['clickhouse','hello', 'world'])); -print array_sort_asc( dynamic(['d', null, 'a', 'c', 'c']) , false); -print array_sort_asc( dynamic(['d', null, 'a', 'c', 'c']) , 1 > 2); -print array_sort_asc( dynamic([null, null, null]) , false); -print array_sort_asc(dynamic([2, 1, null,3, null]), dynamic([20, 10, 40, 30, 50]), 1 < 2)[0]; -print array_sort_asc(dynamic(['1','3','4','5','2']),dynamic(["a","b","c","d","e"]), dynamic(["a","b","c","d","e"]), dynamic(["a","b","c","d","e"]))[3]; -print array_sort_asc(split("John,Paul,George,Ringo", ",")); -print array_sort_asc(dynamic([null,"blue","yellow","green",null])); -print array_sort_asc(dynamic([null,"blue","yellow","green",null]), false); -print '-- array_sort_desc()'; -print array_sort_desc(dynamic([null, 'd', 'a', 'c', 'c'])); -print array_sort_desc(dynamic([4, 1, 3, 2])); -print array_sort_desc(dynamic(['b', 'a', 'c']), dynamic(['q', 'p', 'r']))[0]; -print array_sort_desc(dynamic(['q', 'p', 'r']), dynamic(['clickhouse','hello', 'world'])); -print array_sort_desc( dynamic(['d', null, 'a', 'c', 'c']) , false); -print array_sort_desc( dynamic(['d', null, 'a', 'c', 'c']) , 1 > 2); -print array_sort_desc( dynamic([null, null, null]) , false); -print array_sort_desc(dynamic([2, 1, null,3, null]), dynamic([20, 10, 40, 30, 50]), 1 < 2)[0]; -print array_sort_desc(dynamic(['1','3','4','5','2']),dynamic(["a","b","c","d","e"]), dynamic(["a","b","c","d","e"]), dynamic(["a","b","c","d","e"]))[3]; 
-print array_sort_desc(split("John,Paul,George,Ringo", ",")); -print array_sort_desc(dynamic([null,"blue","yellow","green",null])); -print array_sort_desc(dynamic([null,"blue","yellow","green",null]), false); -print '-- jaccard_index()'; -print jaccard_index(dynamic([1, 1, 2, 2, 3, 3]), dynamic([1, 2, 3, 4, 4, 4])); -print jaccard_index(dynamic([1, 2, 3]), dynamic([])); -print jaccard_index(dynamic([]), dynamic([1, 2, 3, 4])); -print jaccard_index(dynamic([]), dynamic([])); -print jaccard_index(dynamic([1, 2, 3]), dynamic([4, 5, 6, 7])); -print jaccard_index(dynamic(['a', 's', 'd']), dynamic(['f', 'd', 's', 'a'])); -print jaccard_index(dynamic(['Chewbacca', 'Darth Vader', 'Han Solo']), dynamic(['Darth Sidious', 'Darth Vader'])); -print '-- pack_array()'; -print pack_array(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -print x = 1 | extend y = x * 2 | extend z = y * 2 | extend pack_array(x,y,z); -print pack_array(strcat('a', 'b'), format_ipv4(42), tostring(4.2)); -print '-- repeat()'; -print repeat(1, 0); -print repeat(1, 3); -print repeat("asd", 3); -print repeat(timespan(1d), 3); -print repeat(true, 3); -print repeat(1, -3); -print repeat(6.7,-4); -print '-- set_difference()'; -print set_difference(dynamic([]), dynamic([])); -print set_difference(dynamic([]), dynamic([9])); -print set_difference(dynamic([]), dynamic(["asd"])); -print set_difference(dynamic([1, 1, 2, 2, 3, 3]), dynamic([1, 2, 3])); -print array_sort_asc(set_difference(dynamic([1, 4, 2, 3, 5, 4, 6]), dynamic([1, 2, 3])))[0]; -print set_difference(dynamic([4]), dynamic([1, 2, 3])); -print array_sort_asc(set_difference(dynamic([1, 2, 3, 4, 5]), dynamic([5]), dynamic([2, 4])))[0]; -print array_sort_asc(set_difference(dynamic([1, 2, 3]), dynamic([])))[0]; -print array_sort_asc(set_difference(dynamic(['a', 's', 'd']), dynamic(['a', 'f'])))[0]; -print array_sort_asc(set_difference(dynamic(['Chewbacca', 'Darth Vader', 'Han Solo']), dynamic(['Darth Sidious', 'Darth Vader'])))[0]; -print '-- set_has_element()'; -print set_has_element(dynamic([]), 9); -print set_has_element(dynamic(["this", "is", "an", "example"]), "example"); -print set_has_element(dynamic(["this", "is", "an", "example"]), "examplee"); -print set_has_element(dynamic([1, 2, 3]), 2); -print set_has_element(dynamic([1, 2, 3, 4.2]), 4); -print '-- set_intersect()'; -print set_intersect(dynamic([]), dynamic([])); -print array_sort_asc(set_intersect(dynamic([1, 1, 2, 2, 3, 3]), dynamic([1, 2, 3])))[0]; -print array_sort_asc(set_intersect(dynamic([1, 4, 2, 3, 5, 4, 6]), dynamic([1, 2, 3])))[0]; -print set_intersect(dynamic([4]), dynamic([1, 2, 3])); -print set_intersect(dynamic([1, 2, 3, 4, 5]), dynamic([1, 3, 5]), dynamic([2, 5])); -print set_intersect(dynamic([1, 2, 3]), dynamic([])); -print set_intersect(dynamic(['a', 's', 'd']), dynamic(['a', 'f'])); -print set_intersect(dynamic(['Chewbacca', 'Darth Vader', 'Han Solo']), dynamic(['Darth Sidious', 'Darth Vader'])); -print '-- set_union()'; -print set_union(dynamic([]), dynamic([])); -print array_sort_asc(set_union(dynamic([1, 1, 2, 2, 3, 3]), dynamic([1, 2, 3])))[0]; -print array_sort_asc(set_union(dynamic([1, 4, 2, 3, 5, 4, 6]), dynamic([1, 2, 3])))[0]; -print array_sort_asc(set_union(dynamic([4]), dynamic([1, 2, 3])))[0]; -print array_sort_asc(set_union(dynamic([1, 3, 4]), dynamic([5]), dynamic([2, 4])))[0]; -print array_sort_asc(set_union(dynamic([1, 2, 3]), dynamic([])))[0]; -print array_sort_asc(set_union(dynamic(['a', 's', 'd']), dynamic(['a', 'f'])))[0]; -print 
array_sort_asc(set_union(dynamic(['Chewbacca', 'Darth Vader', 'Han Solo']), dynamic(['Darth Sidious', 'Darth Vader'])))[0]; -print '-- zip()'; -print zip(dynamic([]), dynamic([])); -print zip(dynamic([1,3,5]), dynamic([2,4,6])); -print zip(dynamic(['Darth','Master']), dynamic(['Vader','Yoda']), dynamic(['has a suit','doesn\'t have a suit'])); -print zip(dynamic([1,2,3]), dynamic([10,20])); -print zip(dynamic([]), dynamic([1,2,3])); \ No newline at end of file diff --git a/tests/queries/0_stateless/02366_kql_func_ip.reference b/tests/queries/0_stateless/02366_kql_func_ip.reference deleted file mode 100644 index 2a0bbf53fff..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_ip.reference +++ /dev/null @@ -1,123 +0,0 @@ --- ipv4_is_private(\'127.0.0.1\') -0 --- ipv4_is_private(\'10.1.2.3\') -1 --- ipv4_is_private(\'192.168.1.1/24\') -1 -ipv4_is_private(strcat(\'192.\',\'168.\',\'1.\',\'1\',\'/24\')) -1 --- ipv4_is_private(\'abc\') -\N --- ipv4_netmask_suffix(\'192.168.1.1/24\') -24 --- ipv4_netmask_suffix(\'192.168.1.1\') -32 --- ipv4_netmask_suffix(\'127.0.0.1/16\') -16 --- ipv4_netmask_suffix(\'abc\') -\N -ipv4_netmask_suffix(strcat(\'127.\', \'0.\', \'0.1/16\')) -16 --- ipv4_is_in_range(\'127.0.0.1\', \'127.0.0.1\') -1 --- ipv4_is_in_range(\'192.168.1.6\', \'192.168.1.1/24\') -1 --- ipv4_is_in_range(\'192.168.1.1\', \'192.168.2.1/24\') -0 --- ipv4_is_in_range(strcat(\'192.\',\'168.\', \'1.1\'), \'192.168.2.1/24\') -0 --- ipv4_is_in_range(\'abc\', \'127.0.0.1\') -\N --- parse_ipv6(127.0.0.1) -0000:0000:0000:0000:0000:ffff:7f00:0001 --- parse_ipv6(fe80::85d:e82c:9446:7994) -fe80:0000:0000:0000:085d:e82c:9446:7994 --- parse_ipv4(\'127.0.0.1\') -2130706433 --- parse_ipv4(\'192.1.168.1\') < parse_ipv4(\'192.1.168.2\') -1 --- parse_ipv4(arrayStringConcat([\'127\', \'0\', \'0\', \'1\'], \'.\')) --- parse_ipv4_mask(\'127.0.0.1\', 24) == 2130706432 -2130706432 --- parse_ipv4_mask(\'abc\', 31) -\N -\N --- parse_ipv4_mask(\'192.1.168.2\', 31) == parse_ipv4_mask(\'192.1.168.3\', 31) -3221334018 -3221334018 --- ipv4_is_match(\'127.0.0.1\', \'127.0.0.1\') -1 --- ipv4_is_match(\'192.168.1.1\', \'192.168.1.255\') -0 --- ipv4_is_match(\'192.168.1.1/24\', \'192.168.1.255/24\') -1 --- ipv4_is_match(\'192.168.1.1\', \'192.168.1.255\', 24) -1 --- ipv4_is_match(\'abc\', \'def\', 24) -\N --- ipv4_compare() -0 --1 -1 -0 -0 -0 -0 -0 -0 -0 -0 --- format_ipv4() -192.168.1.0 -192.168.1.1 -192.168.1.0 -192.168.1.0 -1 -1 -127.0.0.0 --- format_ipv4_mask() -192.168.1.0/24 -192.168.1.0/24 -192.168.1.0/24 -192.168.1.1/32 -192.168.1.0/24 -1 -1 -127.0.0.0/24 --- parse_ipv6_mask() -0000:0000:0000:0000:0000:0000:0000:0000 -fe80:0000:0000:0000:085d:e82c:9446:7900 -0000:0000:0000:0000:0000:ffff:c0a8:ff00 -0000:0000:0000:0000:0000:ffff:c0a8:ff00 -0000:0000:0000:0000:0000:ffff:ffff:ffff -fe80:0000:0000:0000:085d:e82c:9446:7994 -fe80:0000:0000:0000:085d:e82c:9446:7900 -0000:0000:0000:0000:0000:ffff:c0a8:ffff -0000:0000:0000:0000:0000:ffff:c0a8:ff00 --- ipv6_is_match() -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 diff --git a/tests/queries/0_stateless/02366_kql_func_ip.sql b/tests/queries/0_stateless/02366_kql_func_ip.sql deleted file mode 100644 index c9b335f203a..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_ip.sql +++ /dev/null @@ -1,131 +0,0 @@ -set dialect='kusto'; -print '-- ipv4_is_private(\'127.0.0.1\')'; -print ipv4_is_private('127.0.0.1'); -print '-- ipv4_is_private(\'10.1.2.3\')'; -print ipv4_is_private('10.1.2.3'); -print '-- ipv4_is_private(\'192.168.1.1/24\')'; 
-print ipv4_is_private('192.168.1.1/24'); -print 'ipv4_is_private(strcat(\'192.\',\'168.\',\'1.\',\'1\',\'/24\'))'; -print ipv4_is_private(strcat('192.','168.','1.','1','/24')); -print '-- ipv4_is_private(\'abc\')'; -print ipv4_is_private('abc'); -- == null - -print '-- ipv4_netmask_suffix(\'192.168.1.1/24\')'; -print ipv4_netmask_suffix('192.168.1.1/24'); -- == 24 -print '-- ipv4_netmask_suffix(\'192.168.1.1\')'; -print ipv4_netmask_suffix('192.168.1.1'); -- == 32 -print '-- ipv4_netmask_suffix(\'127.0.0.1/16\')'; -print ipv4_netmask_suffix('127.0.0.1/16'); -- == 16 -print '-- ipv4_netmask_suffix(\'abc\')'; -print ipv4_netmask_suffix('abc'); -- == null -print 'ipv4_netmask_suffix(strcat(\'127.\', \'0.\', \'0.1/16\'))'; -print ipv4_netmask_suffix(strcat('127.', '0.', '0.1/16')); -- == 16 - -print '-- ipv4_is_in_range(\'127.0.0.1\', \'127.0.0.1\')'; -print ipv4_is_in_range('127.0.0.1', '127.0.0.1'); -- == true -print '-- ipv4_is_in_range(\'192.168.1.6\', \'192.168.1.1/24\')'; -print ipv4_is_in_range('192.168.1.6', '192.168.1.1/24'); -- == true -print '-- ipv4_is_in_range(\'192.168.1.1\', \'192.168.2.1/24\')'; -print ipv4_is_in_range('192.168.1.1', '192.168.2.1/24'); -- == false -print '-- ipv4_is_in_range(strcat(\'192.\',\'168.\', \'1.1\'), \'192.168.2.1/24\')'; -print ipv4_is_in_range(strcat('192.','168.', '1.1'), '192.168.2.1/24'); -- == false -print '-- ipv4_is_in_range(\'abc\', \'127.0.0.1\')'; -- == null -print ipv4_is_in_range('abc', '127.0.0.1'); - -print '-- parse_ipv6(127.0.0.1)'; -print parse_ipv6('127.0.0.1'); -print '-- parse_ipv6(fe80::85d:e82c:9446:7994)'; -print parse_ipv6('fe80::85d:e82c:9446:7994'); -print '-- parse_ipv4(\'127.0.0.1\')'; -print parse_ipv4('127.0.0.1'); -print '-- parse_ipv4(\'192.1.168.1\') < parse_ipv4(\'192.1.168.2\')'; -print parse_ipv4('192.1.168.1') < parse_ipv4('192.1.168.2'); -print '-- parse_ipv4(arrayStringConcat([\'127\', \'0\', \'0\', \'1\'], \'.\'))'; -print parse_ipv4(arrayStringConcat(['127', '0', '0', '1'], '.')); -- { clientError UNKNOWN_FUNCTION } - -print '-- parse_ipv4_mask(\'127.0.0.1\', 24) == 2130706432'; -print parse_ipv4_mask('127.0.0.1', 24); -print '-- parse_ipv4_mask(\'abc\', 31)'; -print parse_ipv4_mask('abc', 31) -print '-- parse_ipv4_mask(\'192.1.168.2\', 1000)'; -print parse_ipv4_mask('192.1.168.2', 1000); -print '-- parse_ipv4_mask(\'192.1.168.2\', 31) == parse_ipv4_mask(\'192.1.168.3\', 31)'; ---print parse_ipv4_mask('192.1.168.2', 31) == parse_ipv4_mask('192.1.168.3', 31); // this qual failed in analyzer 3221334018 -print parse_ipv4_mask('192.1.168.2', 31); -print parse_ipv4_mask('192.1.168.3', 31); -print '-- ipv4_is_match(\'127.0.0.1\', \'127.0.0.1\')'; -print ipv4_is_match('127.0.0.1', '127.0.0.1'); -print '-- ipv4_is_match(\'192.168.1.1\', \'192.168.1.255\')'; -print ipv4_is_match('192.168.1.1', '192.168.1.255'); -print '-- ipv4_is_match(\'192.168.1.1/24\', \'192.168.1.255/24\')'; -print ipv4_is_match('192.168.1.1/24', '192.168.1.255/24'); -print '-- ipv4_is_match(\'192.168.1.1\', \'192.168.1.255\', 24)'; -print ipv4_is_match('192.168.1.1', '192.168.1.255', 24); -print '-- ipv4_is_match(\'abc\', \'def\', 24)'; -print ipv4_is_match('abc', 'dev', 24); -print '-- ipv4_compare()'; -print ipv4_compare('127.0.0.1', '127.0.0.1'); -print ipv4_compare('192.168.1.1', '192.168.1.255'); -print ipv4_compare('192.168.1.255', '192.168.1.1'); -print ipv4_compare('192.168.1.1/24', '192.168.1.255/24'); -print ipv4_compare('192.168.1.1', '192.168.1.255', 24); -print ipv4_compare('192.168.1.1/24', '192.168.1.255'); -print 
ipv4_compare('192.168.1.1', '192.168.1.255/24'); -print ipv4_compare('192.168.1.1/30', '192.168.1.255/24'); -print ipv4_compare('192.168.1.1', '192.168.1.0', 31); -print ipv4_compare('192.168.1.1/24', '192.168.1.255', 31); -print ipv4_compare('192.168.1.1', '192.168.1.255', 24); -print '-- format_ipv4()'; -print format_ipv4('192.168.1.255', 24); -print format_ipv4('192.168.1.1', 32); -print format_ipv4('192.168.1.1/24', 32); -print format_ipv4(3232236031, 24); -print format_ipv4('192.168.1.1/24', -1) == ''; -print format_ipv4('abc', 24) == ''; -print format_ipv4(strcat('127.0', '.0.', '1', '/32'), 12 + 12); -print '-- format_ipv4_mask()'; -print format_ipv4_mask('192.168.1.255', 24); -print format_ipv4_mask(3232236031, 24); -print format_ipv4_mask('192.168.1.1', 24); -print format_ipv4_mask('192.168.1.1', 32); -print format_ipv4_mask('192.168.1.1/24', 32); -print format_ipv4_mask('192.168.1.1/24', -1) == ''; -print format_ipv4_mask('abc', 24) == ''; -print format_ipv4_mask(strcat('127.0', '.0.', '1', '/32'), 12 + 12); -print '-- parse_ipv6_mask()'; -print parse_ipv6_mask("127.0.0.1", 24); -print parse_ipv6_mask("fe80::85d:e82c:9446:7994", 120); -print parse_ipv6_mask("192.168.255.255", 120); -print parse_ipv6_mask("192.168.255.255/24", 124); -print parse_ipv6_mask("255.255.255.255", 128); -print parse_ipv6_mask("fe80::85d:e82c:9446:7994", 128); -print parse_ipv6_mask("fe80::85d:e82c:9446:7994/120", 124); -print parse_ipv6_mask("::192.168.255.255", 128); -print parse_ipv6_mask("::192.168.255.255/24", 128); -print '-- ipv6_is_match()'; -print ipv6_is_match('::ffff:7f00:1', '127.0.0.1') == true; -print ipv6_is_match('fe80::85d:e82c:9446:7994', 'fe80::85d:e82c:9446:7995') == false; -print ipv6_is_match('192.168.1.1/24', '192.168.1.255/24') == true; -print ipv6_is_match('fe80::85d:e82c:9446:7994/127', 'fe80::85d:e82c:9446:7995/127') == true; -print ipv6_is_match('fe80::85d:e82c:9446:7994', 'fe80::85d:e82c:9446:7995', 127) == true; -print ipv6_is_match('192.168.1.1', '192.168.1.1'); -- // Equal IPs -print ipv6_is_match('192.168.1.1/24', '192.168.1.255'); -- // 24 bit IP4-prefix is used for comparison -print ipv6_is_match('192.168.1.1', '192.168.1.255/24'); -- // 24 bit IP4-prefix is used for comparison -print ipv6_is_match('192.168.1.1/30', '192.168.1.255/24'); -- // 24 bit IP4-prefix is used for comparison -print ipv6_is_match('fe80::85d:e82c:9446:7994', 'fe80::85d:e82c:9446:7994'); -- // Equal IPs -print ipv6_is_match('fe80::85d:e82c:9446:7994/120', 'fe80::85d:e82c:9446:7998'); -- // 120 bit IP6-prefix is used for comparison -print ipv6_is_match('fe80::85d:e82c:9446:7994', 'fe80::85d:e82c:9446:7998/120'); -- // 120 bit IP6-prefix is used for comparison -print ipv6_is_match('fe80::85d:e82c:9446:7994/120', 'fe80::85d:e82c:9446:7998/120'); -- // 120 bit IP6-prefix is used for comparison -print ipv6_is_match('192.168.1.1', '::ffff:c0a8:0101'); -- // Equal IPs -print ipv6_is_match('192.168.1.1/24', '::ffff:c0a8:01ff'); -- // 24 bit IP-prefix is used for comparison -print ipv6_is_match('::ffff:c0a8:0101', '192.168.1.255/24'); -- // 24 bit IP-prefix is used for comparison -print ipv6_is_match('::192.168.1.1/30', '192.168.1.255/24'); -- // 24 bit IP-prefix is used for comparison -print ipv6_is_match('192.168.1.1', '192.168.1.0', 31); -- // 31 bit IP4-prefix is used for comparison -print ipv6_is_match('192.168.1.1/24', '192.168.1.255', 31); -- // 24 bit IP4-prefix is used for comparison -print ipv6_is_match('192.168.1.1', '192.168.1.255', 24); -- // 24 bit IP4-prefix is used for comparison -print 
ipv6_is_match('fe80::85d:e82c:9446:7994', 'fe80::85d:e82c:9446:7995', 127); -- // 127 bit IP6-prefix is used for comparison -print ipv6_is_match('fe80::85d:e82c:9446:7994/127', 'fe80::85d:e82c:9446:7998', 120); -- // 120 bit IP6-prefix is used for comparison -print ipv6_is_match('fe80::85d:e82c:9446:7994/120', 'fe80::85d:e82c:9446:7998', 127); -- // 120 bit IP6-prefix is used for comparison -print ipv6_is_match('192.168.1.1/24', '::ffff:c0a8:01ff', 127); -- // 127 bit IP6-prefix is used for comparison -print ipv6_is_match('::ffff:c0a8:0101', '192.168.1.255', 120); -- // 120 bit IP6-prefix is used for comparison -print ipv6_is_match('::192.168.1.1/30', '192.168.1.255/24', 127); -- // 120 bit IP6-prefix is used for comparison \ No newline at end of file diff --git a/tests/queries/0_stateless/02366_kql_func_math.reference b/tests/queries/0_stateless/02366_kql_func_math.reference deleted file mode 100644 index 92f283abcb6..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_math.reference +++ /dev/null @@ -1,4 +0,0 @@ --- isnan -- -1 -0 -0 diff --git a/tests/queries/0_stateless/02366_kql_func_math.sql b/tests/queries/0_stateless/02366_kql_func_math.sql deleted file mode 100644 index 4e83622eb6b..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_math.sql +++ /dev/null @@ -1,7 +0,0 @@ -set dialect = 'kusto'; -print '-- isnan --'; -print isnan(double(nan)); -print isnan(4.2); -print isnan(4); -- { serverError FUNCTION_THROW_IF_VALUE_IS_NON_ZERO } -print isnan(real(+inf)); -print isnan(dynamic(null)); -- { serverError FUNCTION_THROW_IF_VALUE_IS_NON_ZERO } diff --git a/tests/queries/0_stateless/02366_kql_func_scalar.reference b/tests/queries/0_stateless/02366_kql_func_scalar.reference deleted file mode 100644 index b7fa62c5d43..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_scalar.reference +++ /dev/null @@ -1,16 +0,0 @@ --- bin_at() -4.5 --12:0:0 -2017-05-14 12:00:00.000000000 -2017-05-14 00:00:00.000000000 -2018-02-25 15:14:00.000000000 5 -2018-02-24 15:14:00.000000000 3 -2018-02-23 15:14:00.000000000 4 --- bin() -4 -1970-05-11 00:00:00.000000000 -336:0:0 -1970-05-11 13:45:07.345000000 -1970-05-11 13:45:07.345623000 -2022-09-26 10:13:23.987232000 -1970-05-11 13:45:07.456336000 diff --git a/tests/queries/0_stateless/02366_kql_func_scalar.sql b/tests/queries/0_stateless/02366_kql_func_scalar.sql deleted file mode 100644 index d7e94cfd9d1..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_scalar.sql +++ /dev/null @@ -1,26 +0,0 @@ -DROP TABLE IF EXISTS Bin_at_test; -CREATE TABLE Bin_at_test -( - `Date` DateTime('UTC'), - Num Nullable(UInt8) -) ENGINE = Memory; -INSERT INTO Bin_at_test VALUES ('2018-02-24T15:14:01',3), ('2018-02-23T16:14:01',4), ('2018-02-26T15:14:01',5); - -set dialect = 'kusto'; -print '-- bin_at()'; -print bin_at(6.5, 2.5, 7); -print bin_at(1h, 1d, 12h); -print bin_at(datetime(2017-05-15 10:20:00.0), 1d, datetime(1970-01-01 12:00:00.0)); -print bin_at(datetime(2017-05-17 10:20:00.0), 7d, datetime(2017-06-04 00:00:00.0)); -Bin_at_test | summarize sum(Num) by d = todatetime(bin_at(Date, 1d, datetime('2018-02-24 15:14:00'))) | order by d; -print '-- bin()'; -print bin(4.5, 1); -print bin(datetime(1970-05-11 13:45:07), 1d); -print bin(16d, 7d); -print bin(datetime(1970-05-11 13:45:07.345623), 1ms); --- print bin(datetime(2022-09-26 10:13:23.987234), 6ms); -> 2022-09-26 10:13:23.982000000 -print bin(datetime(1970-05-11 13:45:07.345623), 1microsecond); -print bin(datetime(2022-09-26 10:13:23.987234), 6microseconds); -print bin(datetime(1970-05-11 
13:45:07.456345672), 16microseconds); --- print bin(datetime(2022-09-26 10:13:23.987234128), 1tick); -> 2022-09-26 10:13:23.987234100 --- print bin(datetime(2022-09-26 10:13:23.987234128), 99nanosecond); -> null diff --git a/tests/queries/0_stateless/02366_kql_func_string.reference b/tests/queries/0_stateless/02366_kql_func_string.reference deleted file mode 100644 index 9bdd38ca5db..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_string.reference +++ /dev/null @@ -1,360 +0,0 @@ --- test String Functions -- --- Customers |where Education contains \'degree\' -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 - --- Customers |where Education !contains \'degree\' -\N why Professional Partial College 38 -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - --- Customers |where Education contains \'Degree\' -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 - --- Customers |where Education !contains \'Degree\' -\N why Professional Partial College 38 -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - --- Customers | where FirstName endswith \'RE\' -Theodore Diaz Skilled Manual Bachelors 28 - --- Customers | where ! FirstName endswith \'RE\' -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - ---Customers | where FirstName endswith_cs \'re\' -Theodore Diaz Skilled Manual Bachelors 28 - --- Customers | where FirstName !endswith_cs \'re\' -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - --- Customers | where Occupation == \'Skilled Manual\' -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Apple Skilled Manual Bachelors 28 - --- Customers | where Occupation != \'Skilled Manual\' -\N why Professional Partial College 38 -Latoya Shen Professional Graduate Degree 25 -Stephanie Cox Management abcd defg Bachelors 33 - --- Customers | where Occupation has \'skilled\' -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Apple Skilled Manual Bachelors 28 - --- Customers | where Occupation !has \'skilled\' -\N why Professional Partial College 38 -Latoya Shen Professional Graduate Degree 25 -Stephanie Cox Management abcd defg Bachelors 33 - --- Customers | where Occupation has \'Skilled\' -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Apple Skilled Manual Bachelors 28 - --- Customers | where Occupation !has \'Skilled\' -\N why Professional Partial College 38 -Latoya Shen Professional Graduate Degree 25 -Stephanie Cox Management abcd defg Bachelors 33 - --- Customers | where Occupation hasprefix_cs \'Ab\' - --- Customers | where Occupation !hasprefix_cs \'Ab\' -\N why Professional Partial College 38 -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - --- Customers | where Occupation hasprefix_cs \'ab\' -Stephanie Cox Management abcd defg Bachelors 33 - --- Customers | where Occupation !hasprefix_cs \'ab\' -\N why Professional Partial 
College 38 -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Apple Skilled Manual Bachelors 28 - --- Customers | where Occupation hassuffix \'Ent\' -Stephanie Cox Management abcd defg Bachelors 33 - --- Customers | where Occupation !hassuffix \'Ent\' -\N why Professional Partial College 38 -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Apple Skilled Manual Bachelors 28 - --- Customers | where Occupation hassuffix \'ent\' -Stephanie Cox Management abcd defg Bachelors 33 - --- Customers | where Occupation hassuffix \'ent\' -Stephanie Cox Management abcd defg Bachelors 33 - --- Customers |where Education in (\'Bachelors\',\'High School\') -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - --- Customers | where Education !in (\'Bachelors\',\'High School\') -\N why Professional Partial College 38 -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 - --- Customers | where FirstName matches regex \'P.*r\' -Peter Nara Skilled Manual Graduate Degree 26 - --- Customers | where FirstName startswith \'pet\' -Peter Nara Skilled Manual Graduate Degree 26 - --- Customers | where FirstName !startswith \'pet\' -Latoya Shen Professional Graduate Degree 25 -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - --- Customers | where FirstName startswith_cs \'pet\' - --- Customers | where FirstName !startswith_cs \'pet\' -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - --- Customers | where isempty(LastName) -Apple Skilled Manual Bachelors 28 - --- Customers | where isnotempty(LastName) -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Peter Nara Skilled Manual Graduate Degree 26 -Latoya Shen Professional Graduate Degree 25 -\N why Professional Partial College 38 - --- Customers | where isnotnull(FirstName) -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 - --- Customers | where isnull(FirstName) -\N why Professional Partial College 38 - --- Customers | project url_decode(\'https%3A%2F%2Fwww.test.com%2Fhello%20word\') | take 1 -https://www.test.com/hello word - --- Customers | project url_encode(\'https://www.test.com/hello word\') | take 1 -https%3A%2F%2Fwww.test.com%2Fhello%20word - --- Customers | project name_abbr = strcat(substring(FirstName,0,3), \' \', substring(LastName,2)) -\N -Lat en -Pet ra -The az -Ste x -App - --- Customers | project name = strcat(FirstName, \' \', LastName) -\N -Latoya Shen -Peter Nara -Theodore Diaz -Stephanie Cox -Apple - --- Customers | project FirstName, strlen(FirstName) -\N \N -Latoya 6 -Peter 5 -Theodore 8 -Stephanie 9 -Apple 5 - --- Customers | project strrep(FirstName,2,\'_\') -\N -Latoya_Latoya -Peter_Peter -Theodore_Theodore -Stephanie_Stephanie -Apple_Apple - --- Customers | project toupper(FirstName) -\N -LATOYA -PETER -THEODORE -STEPHANIE -APPLE - --- Customers | project tolower(FirstName) -\N -latoya -peter -theodore -stephanie 
-apple - --- support subquery for in orerator (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/in-cs-operator) (subquery need to be wraped with bracket inside bracket); TODO: case-insensitive not supported yet -Latoya Shen Professional Graduate Degree 25 -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Apple Skilled Manual Bachelors 28 - --- has_all (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/has-all-operator); TODO: subquery not supported yet -Peter Nara Skilled Manual Graduate Degree 26 -Theodore Diaz Skilled Manual Bachelors 28 -Apple Skilled Manual Bachelors 28 - --- has_any (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/has-anyoperator); TODO: subquery not supported yet -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Peter Nara Skilled Manual Graduate Degree 26 -Apple Skilled Manual Bachelors 28 - --- countof (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/countoffunction) -3 -3 -1 - --- extract ( https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/extractfunction) -PINEAPPLE ice cream is 20 -PINEAPPLE -20 - -20 -\N -\N -\N -\N -\N -45.6 -45.6 - --- extract_all (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/extractallfunction); TODO: captureGroups not supported yet -[['T','h','e'],['p','ric','e'],['P','INEAPPL','E'],['i','c','e'],['c','rea','m']] - --- extract_json (https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/extractjsonfunction) - - -John -iPhone -\N -26 -26 -26 -26 -\N - --- split (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/splitfunction) -['aa','bb'] -['bbb'] -[''] -['a','','b'] -['aa','cc'] -['aabbcc'] -['aaa','bbb','ccc'] -[NULL] - --- strcat_delim (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/strcat-delimfunction); TODO: only support string now. 
-1-2-Ab - --- indexof (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/indexoffunction); TODO: length and occurrence not supported yet -2 -2 --1 --- base64_encode_fromguid() -8jMxriJurkmwahbmqbIS6w== --- base64_decode_toarray() -[] -[75,117,115,116,111] --- base64_decode_toguid() -10e99626-bc2b-4c75-bb3e-fe606de25700 -1 --- base64_encode_tostring - -S3VzdG8x --- base64_decode_tostring - -Kusto1 --- parse_url() -{"Scheme":"scheme","Host":"","Port":"0","Path":"/this/is/a/path","Username":"username","Password":"password","Query Parameters":{"k1":"v1","k2":"v2"},"Fragment":"fragment"} --- parse_urlquery() -{"Query Parameters":{"k1":"v1","k2":"v2","k3":"v3"}} --- strcmp() -0 1 -1 1 --- substring() -CD --- translate() -kusto xxx --- trim() -https://www.ibm.com -Te st1 - asd -asd -sd --- trim_start() -www.ibm.com -Te st1// $ -asdw - -asd --- trim_end() -https -- Te st1 -wasd - -asd --- trim, trim_start, trim_end all at once ---https://bing.com-- -- https://bing.com-- --https://bing.com https://bing.com --- replace_regex -Number was: 1 --- has_any_index() -0 1 -1 -1 --- parse_version() -1000000020000000300000040 -1000000020000000000000000 -1000000020000000000000000 -\N -\N -\N -\N -1000000020000000300000004 -1000000020000000000000000 -1000000020000000300000000 -1000000000000000000000000 --- parse_json() -[1,2,3] -[{"a":123.5,"b":"{\\"c\\":456}"}] --- parse_command_line() -[NULL] -[NULL] --- reverse() -321 -43.321 - -dsa -][ -]3,2,1[ -]\'redaV\',\'htraD\'[ -000000000.00:00:21 51-01-7102 -Peter Nara Skilled Manual Graduate Degree 26 -Latoya Shen Professional Graduate Degree 25 --- parse_csv() -[''] -['aaa'] -['aa','b','cc'] -['record1','a','b','c'] diff --git a/tests/queries/0_stateless/02366_kql_func_string.sql b/tests/queries/0_stateless/02366_kql_func_string.sql deleted file mode 100644 index d251b04e08b..00000000000 --- a/tests/queries/0_stateless/02366_kql_func_string.sql +++ /dev/null @@ -1,313 +0,0 @@ --- Tags: no-fasttest - -DROP TABLE IF EXISTS Customers; -CREATE TABLE Customers -( - FirstName Nullable(String), - LastName String, - Occupation String, - Education String, - Age Nullable(UInt8) -) ENGINE = Memory; - -INSERT INTO Customers VALUES ('Theodore','Diaz','Skilled Manual','Bachelors',28), ('Stephanie','Cox','Management abcd defg','Bachelors',33),('Peter','Nara','Skilled Manual','Graduate Degree',26),('Latoya','Shen','Professional','Graduate Degree',25),('Apple','','Skilled Manual','Bachelors',28),(NULL,'why','Professional','Partial College',38); - --- datatable (Version:string) [ --- '1.2.3.4', --- '1.2', --- '1.2.3', --- '1' --- ] - -DROP TABLE IF EXISTS Versions; -CREATE TABLE Versions -( - Version String -) ENGINE = Memory; -INSERT INTO Versions VALUES ('1.2.3.4'),('1.2'),('1.2.3'),('1'); - - -set dialect='kusto'; -print '-- test String Functions --'; - -print '-- Customers |where Education contains \'degree\''; -Customers |where Education contains 'degree' | order by LastName; -print ''; -print '-- Customers |where Education !contains \'degree\''; -Customers |where Education !contains 'degree' | order by LastName; -print ''; -print '-- Customers |where Education contains \'Degree\''; -Customers |where Education contains 'Degree' | order by LastName; -print ''; -print '-- Customers |where Education !contains \'Degree\''; -Customers |where Education !contains 'Degree' | order by LastName; -print ''; -print '-- Customers | where FirstName endswith \'RE\''; -Customers | where FirstName endswith 'RE' | order by LastName; -print ''; -print '-- Customers | where ! 
FirstName endswith \'RE\''; -Customers | where FirstName ! endswith 'RE' | order by LastName; -print ''; -print '--Customers | where FirstName endswith_cs \'re\''; -Customers | where FirstName endswith_cs 're' | order by LastName; -print ''; -print '-- Customers | where FirstName !endswith_cs \'re\''; -Customers | where FirstName !endswith_cs 're' | order by LastName; -print ''; -print '-- Customers | where Occupation == \'Skilled Manual\''; -Customers | where Occupation == 'Skilled Manual' | order by LastName; -print ''; -print '-- Customers | where Occupation != \'Skilled Manual\''; -Customers | where Occupation != 'Skilled Manual' | order by LastName; -print ''; -print '-- Customers | where Occupation has \'skilled\''; -Customers | where Occupation has 'skilled' | order by LastName; -print ''; -print '-- Customers | where Occupation !has \'skilled\''; -Customers | where Occupation !has 'skilled' | order by LastName; -print ''; -print '-- Customers | where Occupation has \'Skilled\''; -Customers | where Occupation has 'Skilled'| order by LastName; -print ''; -print '-- Customers | where Occupation !has \'Skilled\''; -Customers | where Occupation !has 'Skilled'| order by LastName; -print ''; -print '-- Customers | where Occupation hasprefix_cs \'Ab\''; -Customers | where Occupation hasprefix_cs 'Ab'| order by LastName; -print ''; -print '-- Customers | where Occupation !hasprefix_cs \'Ab\''; -Customers | where Occupation !hasprefix_cs 'Ab'| order by LastName; -print ''; -print '-- Customers | where Occupation hasprefix_cs \'ab\''; -Customers | where Occupation hasprefix_cs 'ab'| order by LastName; -print ''; -print '-- Customers | where Occupation !hasprefix_cs \'ab\''; -Customers | where Occupation !hasprefix_cs 'ab'| order by LastName; -print ''; -print '-- Customers | where Occupation hassuffix \'Ent\''; -Customers | where Occupation hassuffix 'Ent'| order by LastName; -print ''; -print '-- Customers | where Occupation !hassuffix \'Ent\''; -Customers | where Occupation !hassuffix 'Ent'| order by LastName; -print ''; -print '-- Customers | where Occupation hassuffix \'ent\''; -Customers | where Occupation hassuffix 'ent'| order by LastName; -print ''; -print '-- Customers | where Occupation hassuffix \'ent\''; -Customers | where Occupation hassuffix 'ent'| order by LastName; -print ''; -print '-- Customers |where Education in (\'Bachelors\',\'High School\')'; -Customers |where Education in ('Bachelors','High School')| order by LastName; -print ''; -print '-- Customers | where Education !in (\'Bachelors\',\'High School\')'; -Customers | where Education !in ('Bachelors','High School')| order by LastName; -print ''; -print '-- Customers | where FirstName matches regex \'P.*r\''; -Customers | where FirstName matches regex 'P.*r'| order by LastName; -print ''; -print '-- Customers | where FirstName startswith \'pet\''; -Customers | where FirstName startswith 'pet'| order by LastName; -print ''; -print '-- Customers | where FirstName !startswith \'pet\''; -Customers | where FirstName !startswith 'pet'| order by LastName; -print ''; -print '-- Customers | where FirstName startswith_cs \'pet\''; -Customers | where FirstName startswith_cs 'pet'| order by LastName; -print ''; -print '-- Customers | where FirstName !startswith_cs \'pet\''; -Customers | where FirstName !startswith_cs 'pet'| order by LastName; -print ''; -print '-- Customers | where isempty(LastName)'; -Customers | where isempty(LastName); -print ''; -print '-- Customers | where isnotempty(LastName)'; -Customers | where 
isnotempty(LastName); -print ''; -print '-- Customers | where isnotnull(FirstName)'; -Customers | where isnotnull(FirstName)| order by LastName; -print ''; -print '-- Customers | where isnull(FirstName)'; -Customers | where isnull(FirstName)| order by LastName; -print ''; -print '-- Customers | project url_decode(\'https%3A%2F%2Fwww.test.com%2Fhello%20word\') | take 1'; -Customers | project url_decode('https%3A%2F%2Fwww.test.com%2Fhello%20word') | take 1; -print ''; -print '-- Customers | project url_encode(\'https://www.test.com/hello word\') | take 1'; -Customers | project url_encode('https://www.test.com/hello word') | take 1; -print ''; -print '-- Customers | project name_abbr = strcat(substring(FirstName,0,3), \' \', substring(LastName,2))'; -Customers | project name_abbr = strcat(substring(FirstName,0,3), ' ', substring(LastName,2))| order by LastName; -print ''; -print '-- Customers | project name = strcat(FirstName, \' \', LastName)'; -Customers | project name = strcat(FirstName, ' ', LastName)| order by LastName; -print ''; -print '-- Customers | project FirstName, strlen(FirstName)'; -Customers | project FirstName, strlen(FirstName)| order by LastName; -print ''; -print '-- Customers | project strrep(FirstName,2,\'_\')'; -Customers | project strrep(FirstName,2,'_')| order by LastName; -print ''; -print '-- Customers | project toupper(FirstName)'; -Customers | project toupper(FirstName)| order by LastName; -print ''; -print '-- Customers | project tolower(FirstName)'; -Customers | project tolower(FirstName)| order by LastName; -print ''; -print '-- support subquery for in orerator (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/in-cs-operator) (subquery need to be wraped with bracket inside bracket); TODO: case-insensitive not supported yet'; -Customers | where Age in ((Customers|project Age|where Age < 30)) | order by LastName; --- Customer | where LastName in~ ("diaz", "cox") -print ''; -print '-- has_all (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/has-all-operator); TODO: subquery not supported yet'; -Customers | where Occupation has_all ('manual', 'skilled') | order by LastName; -print ''; -print '-- has_any (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/has-anyoperator); TODO: subquery not supported yet'; -Customers|where Occupation has_any ('Skilled','abcd'); -print ''; -print '-- countof (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/countoffunction)'; -Customers | project countof('The cat sat on the mat', 'at') | take 1; -Customers | project countof('The cat sat on the mat', 'at', 'normal') | take 1; -Customers | project countof('The cat sat on the mat', '\\s.he', 'regex') | take 1; -print ''; -print '-- extract ( https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/extractfunction)'; -print extract('(\\b[A-Z]+\\b).+(\\b\\d+)', 0, 'The price of PINEAPPLE ice cream is 20'); -print extract('(\\b[A-Z]+\\b).+(\\b\\d+)', 1, 'The price of PINEAPPLE ice cream is 20'); -print extract('(\\b[A-Z]+\\b).+(\\b\\d+)', 2, 'The price of PINEAPPLE ice cream is 20'); -print extract('(\\b[A-Z]+\\b).+(\\b\\d+)', 3, 'The price of PINEAPPLE ice cream is 20'); -print extract('(\\b[A-Z]+\\b).+(\\b\\d+)', 2, 'The price of PINEAPPLE ice cream is 20', typeof(real)); -print extract("x=([0-9.]+)", 1, "hello x=45.6|wo" , typeof(bool)); -print extract("x=([0-9.]+)", 1, "hello x=45.6|wo" , typeof(date)); -print extract("x=([0-9.]+)", 1, "hello x=45.6|wo" , typeof(guid)); -print extract("x=([0-9.]+)", 1, "hello 
x=45.6|wo" , typeof(int)); -print extract("x=([0-9.]+)", 1, "hello x=45.6|wo" , typeof(long)); -print extract("x=([0-9.]+)", 1, "hello x=45.6|wo" , typeof(real)); -print extract("x=([0-9.]+)", 1, "hello x=45.6|wo" , typeof(decimal)); -print ''; -print '-- extract_all (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/extractallfunction); TODO: captureGroups not supported yet'; -Customers | project extract_all('(\\w)(\\w+)(\\w)','The price of PINEAPPLE ice cream is 20') | take 1; -print ''; -print '-- extract_json (https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/extractjsonfunction)'; -print extract_json('', ''); -- { serverError BAD_ARGUMENTS } -print extract_json('a', ''); -- { serverError BAD_ARGUMENTS } -print extract_json('$.firstName', ''); -print extract_json('$.phoneNumbers[0].type', ''); -print extractjson('$.firstName', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}'); -print extract_json('$.phoneNumbers[0].type', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}', typeof(string)); -print extract_json('$.phoneNumbers[0].type', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}', typeof(int)); -print extract_json('$.age', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}'); -print extract_json('$.age', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}', typeof(int)); -print extract_json('$.age', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}', typeof(long)); --- print extract_json('$.age', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}', typeof(bool)); -> true -print extract_json('$.age', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}', typeof(double)); -print extract_json('$.age', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}', typeof(guid)); --- print extract_json('$.phoneNumbers', '{"firstName":"John","lastName":"doe","age":26,"address":{"streetAddress":"naist 
street","city":"Nara","postalCode":"630-0192"},"phoneNumbers":[{"type":"iPhone","number":"0123-4567-8888"},{"type":"home","number":"0123-4567-8910"}]}', typeof(dynamic)); we won't be able to handle this particular case for a while, because it should return a dictionary -print ''; -print '-- split (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/splitfunction)'; -Customers | project split('aa_bb', '_') | take 1; -Customers | project split('aaa_bbb_ccc', '_', 1) | take 1; -Customers | project split('', '_') | take 1; -Customers | project split('a__b', '_') | take 1; -Customers | project split('aabbcc', 'bb') | take 1; -Customers | project split('aabbcc', '') | take 1; -Customers | project split('aaa_bbb_ccc', '_', -1) | take 1; -Customers | project split('aaa_bbb_ccc', '_', 10) | take 1; -print ''; -print '-- strcat_delim (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/strcat-delimfunction); TODO: only support string now.'; -Customers | project strcat_delim('-', '1', '2', strcat('A','b')) | take 1; --- Customers | project strcat_delim('-', '1', '2', 'A' , 1s); -print ''; -print '-- indexof (https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/indexoffunction); TODO: length and occurrence not supported yet'; -Customers | project indexof('abcdefg','cde') | take 1; -Customers | project indexof('abcdefg','cde',2) | take 1; -Customers | project indexof('abcdefg','cde',6) | take 1; -print '-- base64_encode_fromguid()'; --- print base64_encode_fromguid(guid(null)); -print base64_encode_fromguid(guid('ae3133f2-6e22-49ae-b06a-16e6a9b212eb')); -print base64_encode_fromguid(dynamic(null)); -- { serverError FUNCTION_THROW_IF_VALUE_IS_NON_ZERO } -print base64_encode_fromguid("abcd1231"); -- { serverError FUNCTION_THROW_IF_VALUE_IS_NON_ZERO } -print '-- base64_decode_toarray()'; -print base64_decode_toarray(''); -print base64_decode_toarray('S3VzdG8='); -print '-- base64_decode_toguid()'; -print base64_decode_toguid("JpbpECu8dUy7Pv5gbeJXAA=="); -print base64_decode_toguid(base64_encode_fromguid(guid('ae3133f2-6e22-49ae-b06a-16e6a9b212eb'))) == guid('ae3133f2-6e22-49ae-b06a-16e6a9b212eb'); -print '-- base64_encode_tostring'; -print base64_encode_tostring(''); -print base64_encode_tostring('Kusto1'); -print '-- base64_decode_tostring'; -print base64_decode_tostring(''); -print base64_decode_tostring('S3VzdG8x'); -print '-- parse_url()'; -print parse_url('scheme://username:password@host:1234/this/is/a/path?k1=v1&k2=v2#fragment'); -print '-- parse_urlquery()'; -print parse_urlquery('k1=v1&k2=v2&k3=v3'); -print '-- strcmp()'; -print strcmp('ABC','ABC'), strcmp('abc','ABC'), strcmp('ABC','abc'), strcmp('abcde','abc'); -print '-- substring()'; -print substring("ABCD", -2, 2); -print '-- translate()'; -print translate('krasp', 'otsku', 'spark'), translate('abc', '', 'ab'), translate('abc', 'x', 'abc'); -print '-- trim()'; -print trim("--", "--https://www.ibm.com--"); -print trim("[^\w]+", strcat("- ","Te st", "1", "// $")); -print trim("", " asd "); -print trim("a$", "asd"); -print trim("^a", "asd"); -print '-- trim_start()'; -print trim_start("https://", "https://www.ibm.com"); -print trim_start("[^\w]+", strcat("- ","Te st", "1", "// $")); -print trim_start("asd$", "asdw"); -print trim_start("asd$", "asd"); -print trim_start("d$", "asd"); -print '-- trim_end()'; -print trim_end("://www.ibm.com", "https://www.ibm.com"); -print trim_end("[^\w]+", strcat("- ","Te st", "1", "// $")); -print trim_end("^asd", "wasd"); -print trim_end("^asd", "asd"); -print trim_end("^a", 
"asd"); -print '-- trim, trim_start, trim_end all at once'; -print str = "--https://bing.com--", pattern = '--' | extend start = trim_start(pattern, str), end = trim_end(pattern, str), both = trim(pattern, str); -print '-- replace_regex'; -print replace_regex(strcat('Number is ', '1'), 'is (\d+)', 'was: \1'); -print '-- has_any_index()'; -print has_any_index('this is an example', dynamic(['this', 'example'])), has_any_index("this is an example", dynamic(['not', 'example'])), has_any_index("this is an example", dynamic(['not', 'found'])), has_any_index("this is an example", dynamic([])); -print '-- parse_version()'; -print parse_version(42); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } --- print parse_version(''); -> NULL -print parse_version('1.2.3.40'); -print parse_version('1.2'); -print parse_version(strcat('1.', '2')); -print parse_version('1.2.4.5.6'); -print parse_version('moo'); -print parse_version('moo.boo.foo'); -print parse_version(strcat_delim('.', 'moo', 'boo', 'foo')); -Versions | project parse_version(Version); -print '-- parse_json()'; -print parse_json(dynamic([1, 2, 3])); -print parse_json('{"a":123.5, "b":"{\\"c\\":456}"}'); -print '-- parse_command_line()'; -print parse_command_line(55, 'windows'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } --- print parse_command_line((52 + 3) * 4 % 2, 'windows'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -print parse_command_line('', 'windows'); -print parse_command_line(strrep(' ', 6), 'windows'); --- print parse_command_line('echo \"hello world!\" print$?', 'windows'); -> ["echo","hello world!","print$?"] --- print parse_command_line("yolo swag 'asd bcd' \"moo moo \"", 'windows'); -> ["yolo","swag","'asd","bcd'","moo moo "] --- print parse_command_line(strcat_delim(' ', "yolo", "swag", "\'asd bcd\'", "\"moo moo \""), 'windows'); -> ["yolo","swag","'asd","bcd'","moo moo "] -print '-- reverse()'; -print reverse(123); -print reverse(123.34); -print reverse(''); -print reverse("asd"); -print reverse(dynamic([])); -print reverse(dynamic([1, 2, 3])); -print reverse(dynamic(['Darth', "Vader"])); -print reverse(datetime(2017-10-15 12:00)); --- print reverse(timespan(3h)); -> 00:00:30 -Customers | where Education contains 'degree' | order by reverse(FirstName); -print '-- parse_csv()'; -print parse_csv(''); -print parse_csv(65); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -print parse_csv('aaa'); -print result=parse_csv('aa,b,cc'); -print result_multi_record=parse_csv('record1,a,b,c\nrecord2,x,y,z'); --- print result=parse_csv('aa,"b,b,b",cc,"Escaping quotes: ""Title""","line1\nline2"'); -> ["aa","b,b,b","cc","Escaping quotes: \"Title\"","line1\nline2"] --- print parse_csv(strcat(strcat_delim(',', 'aa', '"b,b,b"', 'cc', '"Escaping quotes: ""Title"""', '"line1\nline2"'), '\r\n', strcat_delim(',', 'asd', 'qcf'))); -> ["aa","b,b,b","cc","Escaping quotes: \"Title\"","line1\nline2"] diff --git a/tests/queries/0_stateless/02366_kql_makeseries.reference b/tests/queries/0_stateless/02366_kql_makeseries.reference deleted file mode 100644 index 8e7fde997bf..00000000000 --- a/tests/queries/0_stateless/02366_kql_makeseries.reference +++ /dev/null @@ -1,60 +0,0 @@ --- from to -Costco Snargaluff ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [200,0,102] -Costco Apple ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [0,2,0] -Aldi Snargaluff ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [0,500,0] -Aldi 
Apple ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [5,0,6] --- from -Costco Snargaluff ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [200,0,102] -Costco Apple ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000'] [0,2] -Aldi Snargaluff ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000'] [0,500] -Aldi Apple ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [5,0,6] --- to -Costco Snargaluff ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [200,0,102] -Costco Apple ['2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [2,0] -Aldi Snargaluff ['2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [500,0] -Aldi Apple ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [5,0,6] --- without from/to -Costco Snargaluff ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [200,0,102] -Costco Apple ['2016-09-11 00:00:00.000000000'] [2] -Aldi Snargaluff ['2016-09-11 00:00:00.000000000'] [500] -Aldi Apple ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [5,0,6] --- without by -['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [70,334,54] --- without aggregation alias -Costco Snargaluff ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [200,0,102] -Aldi Snargaluff ['2016-09-11 00:00:00.000000000'] [500] -Aldi Apple ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [5,0,6] -Costco Apple ['2016-09-11 00:00:00.000000000'] [2] --- assign group alias -Costco Snargaluff ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [200,0,102] -Aldi Snargaluff ['2016-09-11 00:00:00.000000000'] [500] -Aldi Apple ['2016-09-10 00:00:00.000000000','2016-09-11 00:00:00.000000000','2016-09-12 00:00:00.000000000'] [5,0,6] -Costco Apple ['2016-09-11 00:00:00.000000000'] [2] --- 3d step -Costco Snargaluff ['2016-09-10 00:00:00.000000000'] [134.66666666666666] -Costco Apple ['2016-09-10 00:00:00.000000000'] [2] -Aldi Snargaluff ['2016-09-10 00:00:00.000000000'] [500] -Aldi Apple ['2016-09-10 00:00:00.000000000'] [5.5] --- numeric column -Costco Snargaluff [10,11,12,13,14] [200,0,102,0,0] -Aldi Snargaluff [10,11,12,13,14] [0,500,0,0,0] -Aldi Apple [10,11,12,13,14] [5,0,6,0,0] -Costco Apple [10,11,12,13,14] [0,2,0,0,0] --- from -Costco Snargaluff [10,11,12] [200,0,102] -Aldi Snargaluff [10,11] [0,500] -Aldi Apple [10,11,12] [5,0,6] -Costco Apple [10,11] [0,2] --- to -Costco Snargaluff [8,12,16] [200,102,0] -Aldi Snargaluff [8,12,16] [500,0,0] -Aldi Apple [8,12,16] [5,6,0] -Costco Apple [8,12,16] [2,0,0] --- without from/to -Costco Snargaluff [10,12] [200,102] -Aldi Snargaluff [10] [500] -Aldi Apple [10,12] [5,6] -Costco Apple [10] [2] --- without by -[10,12] [202,54] -['2017-01-01 00:00:00.000000000','2017-01-02 00:00:00.000000000','2017-01-03 00:00:00.000000000','2017-01-04 00:00:00.000000000','2017-01-05 00:00:00.000000000','2017-01-06 00:00:00.000000000','2017-01-07 00:00:00.000000000','2017-01-08 00:00:00.000000000','2017-01-09 00:00:00.000000000'] [4,3,5,0,10.5,4,3,8,6.5] diff --git a/tests/queries/0_stateless/02366_kql_makeseries.sql 
b/tests/queries/0_stateless/02366_kql_makeseries.sql deleted file mode 100644 index c9ca91c0be0..00000000000 --- a/tests/queries/0_stateless/02366_kql_makeseries.sql +++ /dev/null @@ -1,77 +0,0 @@ --- Azure Data Explore Test Data --- let make_series_test_table = datatable (Supplier:string, Fruit:string, Price: real, Purchase:datetime) --- [ --- 'Aldi','Apple',4,'2016-09-10', --- 'Costco','Apple',2,'2016-09-11', --- 'Aldi','Apple',6,'2016-09-10', --- 'Costco','Snargaluff',100,'2016-09-12', --- 'Aldi','Apple',7,'2016-09-12', --- 'Aldi','Snargaluff',400,'2016-09-11', --- 'Costco','Snargaluff',104,'2016-09-12', --- 'Aldi','Apple',5,'2016-09-12', --- 'Aldi','Snargaluff',600,'2016-09-11', --- 'Costco','Snargaluff',200,'2016-09-10', --- ]; -DROP TABLE IF EXISTS make_series_test_table; -CREATE TABLE make_series_test_table -( - Supplier Nullable(String), - Fruit String , - Price Float64, - Purchase Date -) ENGINE = Memory; -INSERT INTO make_series_test_table VALUES ('Aldi','Apple',4,'2016-09-10'), ('Costco','Apple',2,'2016-09-11'), ('Aldi','Apple',6,'2016-09-10'), ('Costco','Snargaluff',100,'2016-09-12'), ('Aldi','Apple',7,'2016-09-12'), ('Aldi','Snargaluff',400,'2016-09-11'),('Costco','Snargaluff',104,'2016-09-12'),('Aldi','Apple',5,'2016-09-12'),('Aldi','Snargaluff',600,'2016-09-11'),('Costco','Snargaluff',200,'2016-09-10'); -DROP TABLE IF EXISTS make_series_test_table2; -CREATE TABLE make_series_test_table2 -( - Supplier Nullable(String), - Fruit String , - Price Int32, - Purchase Int32 -) ENGINE = Memory; -INSERT INTO make_series_test_table2 VALUES ('Aldi','Apple',4,10),('Costco','Apple',2,11),('Aldi','Apple',6,10),('Costco','Snargaluff',100,12),('Aldi','Apple',7,12),('Aldi','Snargaluff',400,11),('Costco','Snargaluff',104,12),('Aldi','Apple',5,12),('Aldi','Snargaluff',600,11),('Costco','Snargaluff',200,10); -DROP TABLE IF EXISTS make_series_test_table3; -CREATE TABLE make_series_test_table3 -( - timestamp datetime, - metric Float64, -) ENGINE = Memory; -INSERT INTO make_series_test_table3 VALUES (parseDateTimeBestEffort('2016-12-31T06:00', 'UTC'), 50), (parseDateTimeBestEffort('2017-01-01', 'UTC'), 4), (parseDateTimeBestEffort('2017-01-02', 'UTC'), 3), (parseDateTimeBestEffort('2017-01-03', 'UTC'), 4), (parseDateTimeBestEffort('2017-01-03T03:00', 'UTC'), 6), (parseDateTimeBestEffort('2017-01-05', 'UTC'), 8), (parseDateTimeBestEffort('2017-01-05T13:40', 'UTC'), 13), (parseDateTimeBestEffort('2017-01-06', 'UTC'), 4), (parseDateTimeBestEffort('2017-01-07', 'UTC'), 3), (parseDateTimeBestEffort('2017-01-08', 'UTC'), 8), (parseDateTimeBestEffort('2017-01-08T21:00', 'UTC'), 8), (parseDateTimeBestEffort('2017-01-09', 'UTC'), 2), (parseDateTimeBestEffort('2017-01-09T12:00', 'UTC'), 11), (parseDateTimeBestEffort('2017-01-10T05:00', 'UTC'), 5); - --- This test requies sorting after some of aggregations but I don't know KQL, sorry -set max_bytes_before_external_group_by = 0; -set dialect = 'kusto'; - -print '-- from to'; -make_series_test_table | make-series PriceAvg = avg(Price) default=0 on Purchase from datetime(2016-09-10) to datetime(2016-09-13) step 1d by Supplier, Fruit | order by Supplier, Fruit; -print '-- from'; -make_series_test_table | make-series PriceAvg = avg(Price) default=0 on Purchase from datetime(2016-09-10) step 1d by Supplier, Fruit | order by Supplier, Fruit; -print '-- to'; -make_series_test_table | make-series PriceAvg = avg(Price) default=0 on Purchase to datetime(2016-09-13) step 1d by Supplier, Fruit | order by Supplier, Fruit; -print '-- without from/to'; 
-make_series_test_table | make-series PriceAvg = avg(Price) default=0 on Purchase step 1d by Supplier, Fruit | order by Supplier, Fruit; -print '-- without by'; -make_series_test_table | make-series PriceAvg = avg(Price) default=0 on Purchase step 1d; -print '-- without aggregation alias'; -make_series_test_table | make-series avg(Price) default=0 on Purchase step 1d by Supplier, Fruit; -print '-- assign group alias'; -make_series_test_table | make-series avg(Price) default=0 on Purchase step 1d by Supplier_Name = Supplier, Fruit; -print '-- 3d step'; -make_series_test_table | make-series PriceAvg = avg(Price) default=0 on Purchase from datetime(2016-09-10) to datetime(2016-09-13) step 3d by Supplier, Fruit | order by Supplier, Fruit; - -print '-- numeric column' -print '-- from to'; -make_series_test_table2 | make-series PriceAvg=avg(Price) default=0 on Purchase from 10 to 15 step 1.0 by Supplier, Fruit; -print '-- from'; -make_series_test_table2 | make-series PriceAvg=avg(Price) default=0 on Purchase from 10 step 1.0 by Supplier, Fruit; -print '-- to'; -make_series_test_table2 | make-series PriceAvg=avg(Price) default=0 on Purchase to 18 step 4.0 by Supplier, Fruit; -print '-- without from/to'; -make_series_test_table2 | make-series PriceAvg=avg(Price) default=0 on Purchase step 2.0 by Supplier, Fruit; -print '-- without by'; -make_series_test_table2 | make-series PriceAvg=avg(Price) default=0 on Purchase step 2.0; - -make_series_test_table3 | make-series avg(metric) default=0 on timestamp from datetime(2017-01-01) to datetime(2017-01-10) step 1d - --- print '-- summarize --' --- make_series_test_table | summarize count() by format_datetime(bin(Purchase, 1d), 'yy-MM-dd'); diff --git a/tests/queries/0_stateless/02366_kql_mvexpand.reference b/tests/queries/0_stateless/02366_kql_mvexpand.reference deleted file mode 100644 index 25be070eb0b..00000000000 --- a/tests/queries/0_stateless/02366_kql_mvexpand.reference +++ /dev/null @@ -1,65 +0,0 @@ --- mv-expand -- --- mv_expand_test_table | mv-expand c -- -1 ['Salmon','Steak','Chicken'] 1 [5,6,7,8] -1 ['Salmon','Steak','Chicken'] 2 [5,6,7,8] -1 ['Salmon','Steak','Chicken'] 3 [5,6,7,8] -1 ['Salmon','Steak','Chicken'] 4 [5,6,7,8] --- mv_expand_test_table | mv-expand c, d -- -1 ['Salmon','Steak','Chicken'] 1 5 -1 ['Salmon','Steak','Chicken'] 2 6 -1 ['Salmon','Steak','Chicken'] 3 7 -1 ['Salmon','Steak','Chicken'] 4 8 --- mv_expand_test_table | mv-expand b | mv-expand c -- -1 Salmon 1 [5,6,7,8] -1 Salmon 2 [5,6,7,8] -1 Salmon 3 [5,6,7,8] -1 Salmon 4 [5,6,7,8] -1 Steak 1 [5,6,7,8] -1 Steak 2 [5,6,7,8] -1 Steak 3 [5,6,7,8] -1 Steak 4 [5,6,7,8] -1 Chicken 1 [5,6,7,8] -1 Chicken 2 [5,6,7,8] -1 Chicken 3 [5,6,7,8] -1 Chicken 4 [5,6,7,8] --- mv_expand_test_table | mv-expand with_itemindex=index b, c, d -- -0 1 Salmon 1 5 -1 1 Steak 2 6 -2 1 Chicken 3 7 -3 1 4 8 --- mv_expand_test_table | mv-expand array_concat(c,d) -- -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 1 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 2 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 3 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 4 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 5 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 6 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 7 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 8 --- mv_expand_test_table | mv-expand x = c, y = d -- -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 1 5 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 2 6 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 3 
7 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 4 8 --- mv_expand_test_table | mv-expand xy = array_concat(c, d) -- -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 1 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 2 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 3 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 4 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 5 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 6 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 7 -1 ['Salmon','Steak','Chicken'] [1,2,3,4] [5,6,7,8] 8 --- mv_expand_test_table | mv-expand xy = array_concat(c, d) limit 2| summarize count() by xy -- -1 1 -2 1 --- mv_expand_test_table | mv-expand with_itemindex=index c,d to typeof(bool) -- -0 1 ['Salmon','Steak','Chicken'] 1 true -1 1 ['Salmon','Steak','Chicken'] 2 true -2 1 ['Salmon','Steak','Chicken'] 3 true -3 1 ['Salmon','Steak','Chicken'] 4 true --- mv_expand_test_table | mv-expand c to typeof(bool) -- -1 ['Salmon','Steak','Chicken'] [5,6,7,8] true -1 ['Salmon','Steak','Chicken'] [5,6,7,8] true -1 ['Salmon','Steak','Chicken'] [5,6,7,8] true -1 ['Salmon','Steak','Chicken'] [5,6,7,8] true diff --git a/tests/queries/0_stateless/02366_kql_mvexpand.sql b/tests/queries/0_stateless/02366_kql_mvexpand.sql deleted file mode 100644 index e7798609646..00000000000 --- a/tests/queries/0_stateless/02366_kql_mvexpand.sql +++ /dev/null @@ -1,35 +0,0 @@ --- datatable(a: int, b: dynamic, c: dynamic, d: dynamic) [ --- 1, dynamic(['Salmon', 'Steak', 'Chicken']), dynamic([1, 2, 3, 4]), dynamic([5, 6, 7, 8]) --- ] - -DROP TABLE IF EXISTS mv_expand_test_table; -CREATE TABLE mv_expand_test_table -( - a UInt8, - b Array(String), - c Array(Int8), - d Array(Int8) -) ENGINE = Memory; -INSERT INTO mv_expand_test_table VALUES (1, ['Salmon', 'Steak','Chicken'],[1,2,3,4],[5,6,7,8]); -set dialect='kusto'; -print '-- mv-expand --'; -print '-- mv_expand_test_table | mv-expand c --'; -mv_expand_test_table | mv-expand c; -print '-- mv_expand_test_table | mv-expand c, d --'; -mv_expand_test_table | mv-expand c, d; -print '-- mv_expand_test_table | mv-expand b | mv-expand c --'; -mv_expand_test_table | mv-expand b | mv-expand c; -print '-- mv_expand_test_table | mv-expand with_itemindex=index b, c, d --'; -mv_expand_test_table | mv-expand with_itemindex=index b, c, d; -print '-- mv_expand_test_table | mv-expand array_concat(c,d) --'; -mv_expand_test_table | mv-expand array_concat(c,d); -print '-- mv_expand_test_table | mv-expand x = c, y = d --'; -mv_expand_test_table | mv-expand x = c, y = d; -print '-- mv_expand_test_table | mv-expand xy = array_concat(c, d) --'; -mv_expand_test_table | mv-expand xy = array_concat(c, d); -print '-- mv_expand_test_table | mv-expand xy = array_concat(c, d) limit 2| summarize count() by xy --'; -mv_expand_test_table | mv-expand xy = array_concat(c, d) limit 2| summarize count() by xy; -print '-- mv_expand_test_table | mv-expand with_itemindex=index c,d to typeof(bool) --'; -mv_expand_test_table | mv-expand with_itemindex=index c,d to typeof(bool); -print '-- mv_expand_test_table | mv-expand c to typeof(bool) --'; -mv_expand_test_table | mv-expand c to typeof(bool); diff --git a/tests/queries/0_stateless/02366_kql_native_interval_format.reference b/tests/queries/0_stateless/02366_kql_native_interval_format.reference deleted file mode 100644 index 8a12c6885c4..00000000000 --- a/tests/queries/0_stateless/02366_kql_native_interval_format.reference +++ /dev/null @@ -1,23 +0,0 @@ -numeric -kusto -00:00:00 -00:00:00.0000001 -00:00:00.0010000 -00:00:42 
-01:06:00 -2.18:00:00 -5.00:00:00 -7.00:00:00 -14.00:00:00 -('00:01:12','21.00:00:00','00:00:00.0000002') -numeric -99 -100 -1 -42 -66 -66 -5 -1 -2 -(72,3,200) diff --git a/tests/queries/0_stateless/02366_kql_native_interval_format.sql.j2 b/tests/queries/0_stateless/02366_kql_native_interval_format.sql.j2 deleted file mode 100644 index 0731687222d..00000000000 --- a/tests/queries/0_stateless/02366_kql_native_interval_format.sql.j2 +++ /dev/null @@ -1,16 +0,0 @@ -select value from system.settings where name = 'interval_output_format'; - -{% for format in ['kusto', 'numeric'] -%} -select '{{ format }}'; -set interval_output_format = '{{ format }}'; -select toIntervalNanosecond(99); -select toIntervalNanosecond(100); -select toIntervalMillisecond(1); -select toIntervalSecond(42); -select toIntervalMinute(66); -select toIntervalHour(66); -select toIntervalDay(5); -select toIntervalWeek(1); -select toIntervalWeek(2); -select toIntervalSecond(72) + toIntervalWeek(3) + toIntervalNanosecond(200); -{% endfor -%} diff --git a/tests/queries/0_stateless/02366_kql_operator_in_sql.reference b/tests/queries/0_stateless/02366_kql_operator_in_sql.reference deleted file mode 100644 index 4e0987aa5c3..00000000000 --- a/tests/queries/0_stateless/02366_kql_operator_in_sql.reference +++ /dev/null @@ -1,60 +0,0 @@ --- #1 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Apple Skilled Manual Bachelors 28 --- #2 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Peter Nara Skilled Manual Graduate Degree 26 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #3 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #4 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #5 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #6 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #7 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #8 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #9 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Peter Nara Skilled Manual Graduate Degree 26 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #10 -- --- #11 -- --- #12 -- --- #13 -- --- #14 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 --- #15 -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management abcd defg Bachelors 33 -Latoya Shen Professional Graduate Degree 25 -Apple Skilled Manual Bachelors 28 diff --git a/tests/queries/0_stateless/02366_kql_operator_in_sql.sql b/tests/queries/0_stateless/02366_kql_operator_in_sql.sql deleted file mode 
100644 index 0b02faa0680..00000000000 --- a/tests/queries/0_stateless/02366_kql_operator_in_sql.sql +++ /dev/null @@ -1,42 +0,0 @@ -DROP TABLE IF EXISTS Customers; -CREATE TABLE Customers -( - FirstName Nullable(String), - LastName String, - Occupation String, - Education String, - Age Nullable(UInt8) -) ENGINE = Memory; - -INSERT INTO Customers VALUES ('Theodore','Diaz','Skilled Manual','Bachelors',28),('Stephanie','Cox','Management abcd defg','Bachelors',33),('Peter','Nara','Skilled Manual','Graduate Degree',26),('Latoya','Shen','Professional','Graduate Degree',25),('Apple','','Skilled Manual','Bachelors',28),(NULL,'why','Professional','Partial College',38); -Select '-- #1 --' ; -select * from kql($$Customers | where FirstName !in ('Peter', 'Latoya')$$); -Select '-- #2 --' ; -select * from kql($$Customers | where FirstName !in ("test", "test2")$$); -Select '-- #3 --' ; -select * from kql($$Customers | where FirstName !contains 'Pet'$$); -Select '-- #4 --' ; -select * from kql($$Customers | where FirstName !contains_cs 'Pet'$$); -Select '-- #5 --' ; -select * from kql($$Customers | where FirstName !endswith 'ter'$$); -Select '-- #6 --' ; -select * from kql($$Customers | where FirstName !endswith_cs 'ter'$$); -Select '-- #7 --' ; -select * from kql($$Customers | where FirstName != 'Peter'$$); -Select '-- #8 --' ; -select * from kql($$Customers | where FirstName !has 'Peter'$$); -Select '-- #9 --' ; -select * from kql($$Customers | where FirstName !has_cs 'peter'$$); -Select '-- #10 --' ; --- select * from kql($$Customers | where FirstName !hasprefix 'Peter'$$); -- will enable when analyzer fixed `and` issue -Select '-- #11 --' ; ---select * from kql($$Customers | where FirstName !hasprefix_cs 'Peter'$$); -Select '-- #12 --' ; ---select * from kql($$Customers | where FirstName !hassuffix 'Peter'$$); -Select '-- #13 --' ; ---select * from kql($$Customers | where FirstName !hassuffix_cs 'Peter'$$); -Select '-- #14 --' ; -select * from kql($$Customers | where FirstName !startswith 'Peter'$$); -Select '-- #15 --' ; -select * from kql($$Customers | where FirstName !startswith_cs 'Peter'$$); -DROP TABLE IF EXISTS Customers; diff --git a/tests/queries/0_stateless/02366_kql_summarize.reference b/tests/queries/0_stateless/02366_kql_summarize.reference deleted file mode 100644 index aeb42feb6be..00000000000 --- a/tests/queries/0_stateless/02366_kql_summarize.reference +++ /dev/null @@ -1,92 +0,0 @@ --- test summarize -- -12 25 46 32.416666666666664 389 -Skilled Manual 5 26 36 30.2 151 -Professional 6 25 46 34.166666666666664 205 -Management abcd defg 1 33 33 33 33 -Skilled Manual 0 -Professional 2 -Management abcd defg 0 -Skilled Manual 36 -Professional 38 -Management abcd defg 33 -Skilled Manual 26 -Professional 25 -Management abcd defg 33 -Skilled Manual 30.2 -Professional 29.25 -Management abcd defg 33 -Skilled Manual 151 -Professional 117 -Management abcd defg 33 -4 -2 -40 2 -30 4 -20 6 -Skilled Manual 5 -Professional 6 -Management abcd defg 1 --- make_list() -- -Skilled Manual ['Bachelors','Graduate Degree','High School','Partial College','Bachelors'] -Professional ['Graduate Degree','Partial College','Partial College','Partial College','Partial College','Partial College'] -Management abcd defg ['Bachelors'] -Skilled Manual ['Bachelors','Graduate Degree'] -Professional ['Graduate Degree','Partial College'] -Management abcd defg ['Bachelors'] --- make_list_if() -- -Skilled Manual ['Edward','Christine'] -Professional ['Dalton','Angel'] -Management abcd defg ['Stephanie'] -Skilled Manual ['Edward'] 
-Professional ['Dalton'] -Management abcd defg ['Stephanie'] --- make_set() -- -Skilled Manual ['Graduate Degree','High School','Partial College','Bachelors'] -Professional ['Graduate Degree','Partial College'] -Management abcd defg ['Bachelors'] -Skilled Manual ['Graduate Degree','Bachelors'] -Professional ['Graduate Degree','Partial College'] -Management abcd defg ['Bachelors'] --- make_set_if() -- -Skilled Manual ['Partial College','High School'] -Professional ['Partial College'] -Management abcd defg ['Bachelors'] -Skilled Manual ['High School'] -Professional ['Partial College'] -Management abcd defg ['Bachelors'] --- stdev() -- -6.855102059227432 --- stdevif() -- -7.557189365836421 --- binary_all_and -- -42 --- binary_all_or -- -46 --- binary_all_xor -- -4 -43.8 -25.55 30.5 43.8 -30.5 -35 -[25,35,45] --- Summarize following sort -- -Skilled Manual 5 -Professional 6 -Management abcd defg 1 --- summarize with bin -- -0 1 -245000 2 -0 1 -245 2 -0 1 -245 2 -2015-10-12 00:00:00.000000000 -2016-10-12 00:00:00.000000000 --- make_list_with_nulls -- -['Theodore','Stephanie','Peter','Latoya','Joshua','Edward','Dalton','Christine','Cameron','Angel','Apple',NULL] -Skilled Manual ['Theodore','Peter','Edward','Christine','Apple'] -Professional ['Latoya','Joshua','Dalton','Cameron','Angel',NULL] -Management abcd defg ['Stephanie'] -Skilled Manual ['Theodore','Peter','Edward','Christine','Apple'] [28,26,36,33,28] -Professional ['Latoya','Joshua','Dalton','Cameron','Angel',NULL] [25,26,42,28,46,38] -Management abcd defg ['Stephanie'] [33] diff --git a/tests/queries/0_stateless/02366_kql_summarize.sql b/tests/queries/0_stateless/02366_kql_summarize.sql deleted file mode 100644 index bb12d1f251f..00000000000 --- a/tests/queries/0_stateless/02366_kql_summarize.sql +++ /dev/null @@ -1,102 +0,0 @@ --- datatable(FirstName:string, LastName:string, Occupation:string, Education:string, Age:int) [ --- 'Theodore', 'Diaz', 'Skilled Manual', 'Bachelors', 28, --- 'Stephanie', 'Cox', 'Management abcd defg', 'Bachelors', 33, --- 'Peter', 'Nara', 'Skilled Manual', 'Graduate Degree', 26, --- 'Latoya', 'Shen', 'Professional', 'Graduate Degree', 25, --- 'Joshua', 'Lee', 'Professional', 'Partial College', 26, --- 'Edward', 'Hernandez', 'Skilled Manual', 'High School', 36, --- 'Dalton', 'Wood', 'Professional', 'Partial College', 42, --- 'Christine', 'Nara', 'Skilled Manual', 'Partial College', 33, --- 'Cameron', 'Rodriguez', 'Professional', 'Partial College', 28, --- 'Angel', 'Stewart', 'Professional', 'Partial College', 46, --- 'Apple', '', 'Skilled Manual', 'Bachelors', 28, --- dynamic(null), 'why', 'Professional', 'Partial College', 38 --- ] - -DROP TABLE IF EXISTS Customers; -CREATE TABLE Customers -( - FirstName Nullable(String), - LastName String, - Occupation String, - Education String, - Age Nullable(UInt8) -) ENGINE = Memory; - -INSERT INTO Customers VALUES ('Theodore','Diaz','Skilled Manual','Bachelors',28),('Stephanie','Cox','Management abcd defg','Bachelors',33),('Peter','Nara','Skilled Manual','Graduate Degree',26),('Latoya','Shen','Professional','Graduate Degree',25),('Joshua','Lee','Professional','Partial College',26),('Edward','Hernandez','Skilled Manual','High School',36),('Dalton','Wood','Professional','Partial College',42),('Christine','Nara','Skilled Manual','Partial College',33),('Cameron','Rodriguez','Professional','Partial College',28),('Angel','Stewart','Professional','Partial College',46),('Apple','','Skilled Manual','Bachelors',28),(NULL,'why','Professional','Partial College',38); - -drop table if 
exists EventLog; -create table EventLog -( - LogEntry String, - Created Int64 -) ENGINE = Memory; - -insert into EventLog values ('Darth Vader has entered the room.', 546), ('Rambo is suspciously looking at Darth Vader.', 245234), ('Darth Sidious electrocutes both using Force Lightning.', 245554); - -drop table if exists Dates; -create table Dates -( - EventTime DateTime, -) ENGINE = Memory; - -Insert into Dates VALUES ('2015-10-12') , ('2016-10-12') -Select '-- test summarize --' ; -set dialect='kusto'; -Customers | summarize count(), min(Age), max(Age), avg(Age), sum(Age); -Customers | summarize count(), min(Age), max(Age), avg(Age), sum(Age) by Occupation | order by Occupation; -Customers | summarize countif(Age>40) by Occupation | order by Occupation; -Customers | summarize MyMax = maxif(Age, Age<40) by Occupation | order by Occupation; -Customers | summarize MyMin = minif(Age, Age<40) by Occupation | order by Occupation; -Customers | summarize MyAvg = avgif(Age, Age<40) by Occupation | order by Occupation; -Customers | summarize MySum = sumif(Age, Age<40) by Occupation | order by Occupation; -Customers | summarize dcount(Education); -Customers | summarize dcountif(Education, Occupation=='Professional'); -Customers | summarize count_ = count() by bin(Age, 10) | order by count_ asc; -Customers | summarize job_count = count() by Occupation | where job_count > 0 | order by Occupation; -Customers | summarize 'Edu Count'=count() by Education | sort by 'Edu Count' desc; -- { clientError 62 } - -print '-- make_list() --'; -Customers | summarize f_list = make_list(Education) by Occupation | sort by Occupation; -Customers | summarize f_list = make_list(Education, 2) by Occupation | sort by Occupation; -print '-- make_list_if() --'; -Customers | summarize f_list = make_list_if(FirstName, Age>30) by Occupation | sort by Occupation; -Customers | summarize f_list = make_list_if(FirstName, Age>30, 1) by Occupation | sort by Occupation; -print '-- make_set() --'; -Customers | summarize f_list = make_set(Education) by Occupation | sort by Occupation; -Customers | summarize f_list = make_set(Education, 2) by Occupation | sort by Occupation; -print '-- make_set_if() --'; -Customers | summarize f_list = make_set_if(Education, Age>30) by Occupation | sort by Occupation; -Customers | summarize f_list = make_set_if(Education, Age>30, 1) by Occupation | sort by Occupation; -print '-- stdev() --'; -Customers | project Age | summarize stdev(Age); -print '-- stdevif() --'; -Customers | project Age | summarize stdevif(Age, Age%2==0); -print '-- binary_all_and --'; -Customers | project Age | where Age > 40 | summarize binary_all_and(Age); -print '-- binary_all_or --'; -Customers | project Age | where Age > 40 | summarize binary_all_or(Age); -print '-- binary_all_xor --'; -Customers | project Age | where Age > 40 | summarize binary_all_xor(Age); - -Customers | project Age | summarize percentile(Age, 95); -Customers | project Age | summarize percentiles(Age, 5, 50, 95)|project round(percentiles_Age[0],2),round(percentiles_Age[1],2),round(percentiles_Age[2],2); -Customers | project Age | summarize percentiles(Age, 5, 50, 95)[1]; -Customers | summarize w=count() by AgeBucket=bin(Age, 5) | summarize percentilew(AgeBucket, w, 75); -Customers | summarize w=count() by AgeBucket=bin(Age, 5) | summarize percentilesw(AgeBucket, w, 50, 75, 99.9); - -print '-- Summarize following sort --'; -Customers | sort by FirstName | summarize count() by Occupation | sort by Occupation; - -print '-- summarize with bin --'; -EventLog | 
summarize count=count() by bin(Created, 1000) | sort by count asc; -EventLog | summarize count=count() by bin(unixtime_seconds_todatetime(Created/1000), 1s) | sort by count asc; -EventLog | summarize count=count() by time_label=bin(Created/1000, 1s) | sort by count asc; -Dates | project bin(datetime(EventTime), 1m); -print '-- make_list_with_nulls --'; -Customers | summarize t = make_list_with_nulls(FirstName); -Customers | summarize f_list = make_list_with_nulls(FirstName) by Occupation | sort by Occupation; -Customers | summarize f_list = make_list_with_nulls(FirstName), a_list = make_list_with_nulls(Age) by Occupation | sort by Occupation; --- TODO: --- arg_max() --- arg_min() diff --git a/tests/queries/0_stateless/02366_kql_tabular.reference b/tests/queries/0_stateless/02366_kql_tabular.reference deleted file mode 100644 index e70c02ce34f..00000000000 --- a/tests/queries/0_stateless/02366_kql_tabular.reference +++ /dev/null @@ -1,139 +0,0 @@ --- test Query only has table name: -- -Theodore Diaz Skilled Manual Bachelors 28 -Stephanie Cox Management Bachelors 33 -Peter Nara Skilled Manual Graduate Degree 26 -Latoya Shen Professional Graduate Degree 25 -Joshua Lee Professional Partial College 26 -Edward Hernandez Skilled Manual High School 36 -Dalton Wood Professional Partial College 42 -Christine Nara Skilled Manual Partial College 33 -Cameron Rodriguez Professional Partial College 28 -Angel Stewart Professional Partial College 46 --- Query has Column Selection -- -Theodore Diaz Skilled Manual -Stephanie Cox Management -Peter Nara Skilled Manual -Latoya Shen Professional -Joshua Lee Professional -Edward Hernandez Skilled Manual -Dalton Wood Professional -Christine Nara Skilled Manual -Cameron Rodriguez Professional -Angel Stewart Professional --- Query has limit -- -Theodore Diaz Skilled Manual -Stephanie Cox Management -Peter Nara Skilled Manual -Latoya Shen Professional -Joshua Lee Professional -Theodore Diaz Skilled Manual -Stephanie Cox Management -Peter Nara Skilled Manual -Latoya Shen Professional -Joshua Lee Professional --- Query has second limit with bigger value -- -Theodore Diaz Skilled Manual -Stephanie Cox Management -Peter Nara Skilled Manual -Latoya Shen Professional -Joshua Lee Professional --- Query has second limit with smaller value -- -Theodore Diaz Skilled Manual -Stephanie Cox Management -Peter Nara Skilled Manual --- Query has second Column selection -- -Theodore Diaz -Stephanie Cox -Peter Nara --- Query has second Column selection with extra column -- --- Query with desc sort -- -Theodore -Stephanie -Peter -Latoya -Joshua -Skilled Manual -Skilled Manual -Professional -Professional -Management --- Query with asc sort -- -Management -Professional -Professional -Skilled Manual -Skilled Manual --- Query with sort (without keyword asc desc) -- -Theodore -Stephanie -Peter -Latoya -Joshua -Skilled Manual -Skilled Manual -Professional -Professional -Management --- Query with sort 2 Columns with different direction -- -Stephanie Cox Management -Latoya Shen Professional -Joshua Lee Professional -Peter Nara Skilled Manual -Theodore Diaz Skilled Manual --- Query with second sort -- -Stephanie Cox Management -Latoya Shen Professional -Joshua Lee Professional -Peter Nara Skilled Manual -Theodore Diaz Skilled Manual --- Test String Equals (==) -- -Theodore Diaz Skilled Manual -Peter Nara Skilled Manual -Edward Hernandez Skilled Manual -Christine Nara Skilled Manual --- Test String Not equals (!=) -- -Stephanie Cox Management -Latoya Shen Professional -Joshua Lee Professional 
-Dalton Wood Professional -Cameron Rodriguez Professional -Angel Stewart Professional --- Test Filter using a list (in) -- -Theodore Diaz Skilled Manual Bachelors -Stephanie Cox Management Bachelors -Edward Hernandez Skilled Manual High School --- Test Filter using a list (!in) -- -Peter Nara Skilled Manual Graduate Degree -Latoya Shen Professional Graduate Degree -Joshua Lee Professional Partial College -Dalton Wood Professional Partial College -Christine Nara Skilled Manual Partial College -Cameron Rodriguez Professional Partial College -Angel Stewart Professional Partial College --- Test Filter using common string operations (contains_cs) -- -Joshua Lee Professional Partial College -Dalton Wood Professional Partial College -Christine Nara Skilled Manual Partial College -Cameron Rodriguez Professional Partial College -Angel Stewart Professional Partial College --- Test Filter using common string operations (startswith_cs) -- -Latoya Shen Professional Graduate Degree -Joshua Lee Professional Partial College -Dalton Wood Professional Partial College -Cameron Rodriguez Professional Partial College -Angel Stewart Professional Partial College --- Test Filter using common string operations (endswith_cs) -- -Latoya Shen Professional Graduate Degree -Joshua Lee Professional Partial College --- Test Filter using numerical equal (==) -- -Peter Nara Skilled Manual Graduate Degree 26 -Joshua Lee Professional Partial College 26 --- Test Filter using numerical great and less (> , <) -- -Stephanie Cox Management Bachelors 33 -Edward Hernandez Skilled Manual High School 36 -Christine Nara Skilled Manual Partial College 33 --- Test Filter using multi where -- -Dalton Wood Professional Partial College 42 -Angel Stewart Professional Partial College 46 --- Complex query with unknown function -- --- Missing column in front of startsWith -- diff --git a/tests/queries/0_stateless/02366_kql_tabular.sql b/tests/queries/0_stateless/02366_kql_tabular.sql deleted file mode 100644 index f73c4c09cca..00000000000 --- a/tests/queries/0_stateless/02366_kql_tabular.sql +++ /dev/null @@ -1,88 +0,0 @@ -DROP TABLE IF EXISTS Customers; -CREATE TABLE Customers -( - FirstName Nullable(String), - LastName String, - Occupation String, - Education String, - Age Nullable(UInt8) -) ENGINE = Memory; - -INSERT INTO Customers VALUES ('Theodore','Diaz','Skilled Manual','Bachelors',28), ('Stephanie','Cox','Management','Bachelors',33), ('Peter','Nara','Skilled Manual','Graduate Degree',26), ('Latoya','Shen','Professional','Graduate Degree',25), ('Joshua','Lee','Professional','Partial College',26), ('Edward','Hernandez','Skilled Manual','High School',36), ('Dalton','Wood','Professional','Partial College',42), ('Christine','Nara','Skilled Manual','Partial College',33), ('Cameron','Rodriguez','Professional','Partial College',28), ('Angel','Stewart','Professional','Partial College',46); - -set dialect='kusto'; -print '-- test Query only has table name: --'; -Customers; - -print '-- Query has Column Selection --'; -Customers | project FirstName,LastName,Occupation; - -print '-- Query has limit --'; -Customers | project FirstName,LastName,Occupation | take 5; -Customers | project FirstName,LastName,Occupation | limit 5; - -print '-- Query has second limit with bigger value --'; -Customers | project FirstName,LastName,Occupation | take 5 | take 7; - -print '-- Query has second limit with smaller value --'; -Customers | project FirstName,LastName,Occupation | take 5 | take 3; - -print '-- Query has second Column selection --'; -Customers | 
project FirstName,LastName,Occupation | take 3 | project FirstName,LastName; - -print '-- Query has second Column selection with extra column --'; -Customers| project FirstName,LastName,Occupation | take 3 | project FirstName,LastName,Education;-- { serverError 47 } - -print '-- Query with desc sort --'; -Customers | project FirstName | take 5 | sort by FirstName desc; -Customers | project Occupation | take 5 | order by Occupation desc; - -print '-- Query with asc sort --'; -Customers | project Occupation | take 5 | sort by Occupation asc; - -print '-- Query with sort (without keyword asc desc) --'; -Customers | project FirstName | take 5 | sort by FirstName; -Customers | project Occupation | take 5 | order by Occupation; - -print '-- Query with sort 2 Columns with different direction --'; -Customers | project FirstName,LastName,Occupation | take 5 | sort by Occupation asc, LastName desc; - -print '-- Query with second sort --'; -Customers | project FirstName,LastName,Occupation | take 5 | sort by Occupation desc |sort by Occupation asc, LastName desc; - -print '-- Test String Equals (==) --'; -Customers | project FirstName,LastName,Occupation | where Occupation == 'Skilled Manual'; - -print '-- Test String Not equals (!=) --'; -Customers | project FirstName,LastName,Occupation | where Occupation != 'Skilled Manual'; - -print '-- Test Filter using a list (in) --'; -Customers | project FirstName,LastName,Occupation,Education | where Education in ('Bachelors','High School'); - -print '-- Test Filter using a list (!in) --'; -set dialect='kusto'; -Customers | project FirstName,LastName,Occupation,Education | where Education !in ('Bachelors','High School'); - -print '-- Test Filter using common string operations (contains_cs) --'; -Customers | project FirstName,LastName,Occupation,Education | where Education contains_cs 'Coll'; - -print '-- Test Filter using common string operations (startswith_cs) --'; -Customers | project FirstName,LastName,Occupation,Education | where Occupation startswith_cs 'Prof'; - -print '-- Test Filter using common string operations (endswith_cs) --'; -Customers | project FirstName,LastName,Occupation,Education | where FirstName endswith_cs 'a'; - -print '-- Test Filter using numerical equal (==) --'; -Customers | project FirstName,LastName,Occupation,Education,Age | where Age == 26; - -print '-- Test Filter using numerical great and less (> , <) --'; -Customers | project FirstName,LastName,Occupation,Education,Age | where Age > 30 and Age < 40; - -print '-- Test Filter using multi where --'; -Customers | project FirstName,LastName,Occupation,Education,Age | where Age > 30 | where Occupation == 'Professional'; - -print '-- Complex query with unknown function --'; -hits | where CounterID == 62 and EventDate >= '2013-07-14' and EventDate <= '2013-07-15' and IsRefresh == 0 and DontCountHits == 0 | summarize count() by d=bin(poopoo(EventTime), 1m) | order by d | limit 10; -- { clientError UNKNOWN_FUNCTION } - -print '-- Missing column in front of startsWith --'; -StormEvents | where startswith "W" | summarize Count=count() by State; -- { clientError SYNTAX_ERROR } diff --git a/tests/queries/0_stateless/02416_json_object_inference.sql b/tests/queries/0_stateless/02416_json_object_inference.sql index 91137c0243c..3022ee026d0 100644 --- a/tests/queries/0_stateless/02416_json_object_inference.sql +++ b/tests/queries/0_stateless/02416_json_object_inference.sql @@ -2,5 +2,5 @@ set allow_experimental_object_type=1; desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : 
"str"}}}'); set allow_experimental_object_type=0, input_format_json_read_objects_as_strings=0, input_format_json_try_infer_named_tuples_from_objects=0, input_format_json_read_numbers_as_strings=0; -desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); -- {serverError 652} +desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02424_pod_array_overflow.sql b/tests/queries/0_stateless/02424_pod_array_overflow.sql index 4b85d5be029..50c46cf19f1 100644 --- a/tests/queries/0_stateless/02424_pod_array_overflow.sql +++ b/tests/queries/0_stateless/02424_pod_array_overflow.sql @@ -1 +1 @@ -SELECT * FROM format(Native, '\x02\x02\x02\x6b\x30\x1a\x4d\x61\x70\x28\x46\x69\x78\x65\x64\x53\x74\x72\x69\x6e\x67\x28\x31\x29\x2c\x20\x49\x6e\x74\x36\x34\x29\x01\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x7f\x00\x7f\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x64\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xcf\x31\x3f\x56\x69\x11\x89\x25'); -- { serverError 128 } +SELECT * FROM format(Native, '\x02\x02\x02\x6b\x30\x1a\x4d\x61\x70\x28\x46\x69\x78\x65\x64\x53\x74\x72\x69\x6e\x67\x28\x31\x29\x2c\x20\x49\x6e\x74\x36\x34\x29\x01\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x7f\x00\x7f\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x64\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xcf\x31\x3f\x56\x69\x11\x89\x25'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02426_pod_array_overflow_2.sql b/tests/queries/0_stateless/02426_pod_array_overflow_2.sql index 52a00730227..6a0d97acee3 100644 --- a/tests/queries/0_stateless/02426_pod_array_overflow_2.sql +++ b/tests/queries/0_stateless/02426_pod_array_overflow_2.sql @@ -1 +1 @@ -SELECT * FROM format(Native, 'k0\x23Array(Tuple(FixedString(1), Int64))\0\0\0\0\0\0\0�����\0����������������\0�\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0d\0\0\0\0\0\0\0\0\0\0\0\0\0�1?Vi�%'); -- { serverError 128 } +SELECT * FROM format(Native, 'k0\x23Array(Tuple(FixedString(1), Int64))\0\0\0\0\0\0\0�����\0����������������\0�\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0d\0\0\0\0\0\0\0\0\0\0\0\0\0�1?Vi�%'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02426_pod_array_overflow_3.sql b/tests/queries/0_stateless/02426_pod_array_overflow_3.sql index 857ba2ca28e..caabf7d1679 100644 --- a/tests/queries/0_stateless/02426_pod_array_overflow_3.sql +++ b/tests/queries/0_stateless/02426_pod_array_overflow_3.sql @@ -1 +1 @@ -SELECT * FROM format(Native, '\x01\x01\x01x\x0CArray(UInt8)\x01\x00\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF'); -- { serverError 128 } +SELECT * FROM format(Native, '\x01\x01\x01x\x0CArray(UInt8)\x01\x00\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02428_parameterized_view.reference b/tests/queries/0_stateless/02428_parameterized_view.reference index 422fdaa4983..fd77e6ed8df 100644 --- a/tests/queries/0_stateless/02428_parameterized_view.reference +++ b/tests/queries/0_stateless/02428_parameterized_view.reference @@ -23,6 +23,7 @@ ERROR 20 20 ERROR +20 30 20 30 diff --git a/tests/queries/0_stateless/02428_parameterized_view.sh b/tests/queries/0_stateless/02428_parameterized_view.sh index ad9c672f4c5..c6f0927db36 100755 --- 
a/tests/queries/0_stateless/02428_parameterized_view.sh +++ b/tests/queries/0_stateless/02428_parameterized_view.sh @@ -37,7 +37,7 @@ $CLICKHOUSE_CLIENT -q "CREATE VIEW test_02428_pv1 AS SELECT * FROM test_02428_Ca $CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1(price=20)" $CLICKHOUSE_CLIENT -q "SELECT Price FROM \`test_02428_pv1\`(price=20)" -$CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1" 2>&1 | grep -Fq "UNKNOWN_QUERY_PARAMETER" && echo 'ERROR' || echo 'OK' +$CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1" 2>&1 | grep -q "UNKNOWN_QUERY_PARAMETER\|UNKNOWN_IDENTIFIER" && echo 'ERROR' || echo 'OK' $CLICKHOUSE_CLIENT --param_p 10 -q "SELECT Price FROM test_02428_pv1(price={p:UInt64})" $CLICKHOUSE_CLIENT --param_l 1 -q "SELECT Price FROM test_02428_pv1(price=50) LIMIT ({l:UInt64})" @@ -72,7 +72,8 @@ $CLICKHOUSE_CLIENT -q "INSERT INTO ${CLICKHOUSE_TEST_UNIQUE_NAME}.Catalog VALUES $CLICKHOUSE_CLIENT -q "INSERT INTO ${CLICKHOUSE_TEST_UNIQUE_NAME}.Catalog VALUES ('Paper', 20, 1)" $CLICKHOUSE_CLIENT -q "CREATE VIEW ${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1 AS SELECT * FROM ${CLICKHOUSE_TEST_UNIQUE_NAME}.Catalog WHERE Price={price:UInt64}" $CLICKHOUSE_CLIENT -q "SELECT Price FROM ${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1(price=20)" -$CLICKHOUSE_CLIENT -q "SELECT Price FROM \`${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1\`(price=20)" 2>&1 | grep -Fq "UNKNOWN_FUNCTION" && echo 'ERROR' || echo 'OK' +$CLICKHOUSE_CLIENT -q "SELECT Price FROM \`${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1\`(price=20) SETTINGS allow_experimental_analyzer = 0" 2>&1 | grep -Fq "UNKNOWN_FUNCTION" && echo 'ERROR' || echo 'OK' +$CLICKHOUSE_CLIENT -q "SELECT Price FROM \`${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1\`(price=20) SETTINGS allow_experimental_analyzer = 1" $CLICKHOUSE_CLIENT -q "INSERT INTO test_02428_Catalog VALUES ('Book2', 30, 8)" diff --git a/tests/queries/0_stateless/02450_kill_distributed_query_deadlock.sh b/tests/queries/0_stateless/02450_kill_distributed_query_deadlock.sh index 03c43843d3a..0cd520d8d5d 100755 --- a/tests/queries/0_stateless/02450_kill_distributed_query_deadlock.sh +++ b/tests/queries/0_stateless/02450_kill_distributed_query_deadlock.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long +# Tags: long, no-random-settings, no-debug CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql b/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql index 626a4d7034e..f67e5496a98 100644 --- a/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql +++ b/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql @@ -1,7 +1,7 @@ -- Tags: no-fasttest -desc format(JSONEachRow, '{"x" : 1, "x" : 2}'); -- {serverError INCORRECT_DATA} -desc format(JSONEachRow, '{"x" : 1, "y" : 2}\n{"x" : 2, "x" : 3}'); -- {serverError INCORRECT_DATA} -desc format(CSVWithNames, 'a,b,a\n1,2,3'); -- {serverError INCORRECT_DATA} -desc format(CSV, '1,2,3') settings column_names_for_schema_inference='a, b, a'; -- {serverError INCORRECT_DATA} +desc format(JSONEachRow, '{"x" : 1, "x" : 2}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(JSONEachRow, '{"x" : 1, "y" : 2}\n{"x" : 2, "x" : 3}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(CSVWithNames, 'a,b,a\n1,2,3'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(CSV, '1,2,3') settings column_names_for_schema_inference='a, b, a'; -- {serverError 
CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql index ac549a7faf1..71a2381d7b6 100644 --- a/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql +++ b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql @@ -10,14 +10,14 @@ set input_format_json_infer_incomplete_types_as_strings=0; insert into test select * from file(02458_data.jsonl); insert into test select x, 1 from file(02458_data.jsonl); insert into test select x, y from file(02458_data.jsonl); -insert into test select x + 1, y from file(02458_data.jsonl); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x + 1, y from file(02458_data.jsonl); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} insert into test select x, z from file(02458_data.jsonl); insert into test select * from file(02458_data.jsoncompacteachrow); -insert into test select x, 1 from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x, y from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x + 1, y from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x, z from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x, 1 from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x, y from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x + 1, y from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x, z from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} insert into test select * from input() format CSV 1,2 insert into test select x, y from input() format CSV 1,2 -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02475_bson_each_row_format.sh b/tests/queries/0_stateless/02475_bson_each_row_format.sh index aa58d27fa50..f5c48608639 100755 --- a/tests/queries/0_stateless/02475_bson_each_row_format.sh +++ b/tests/queries/0_stateless/02475_bson_each_row_format.sh @@ -5,6 +5,12 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh +# In case of parallel parsing and small block +# (--min_chunk_bytes_for_parallel_parsing) we may have multiple blocks, and +# this will break sorting order, so let's limit number of threads to avoid +# reordering. 
+CLICKHOUSE_CLIENT+="--allow_repeated_settings --max_threads 1" + echo "Integers" $CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select number::Bool as bool, number::Int8 as int8, number::UInt8 as uint8, number::Int16 as int16, number::UInt16 as uint16, number::Int32 as int32, number::UInt32 as uint32, number::Int64 as int64, number::UInt64 as uint64 from numbers(5) settings engine_file_truncate_on_insert=1" $CLICKHOUSE_CLIENT -q "select * from file(02475_data.bsonEachRow, auto, 'bool Bool, int8 Int8, uint8 UInt8, int16 Int16, uint16 UInt16, int32 Int32, uint32 UInt32, int64 Int64, uint64 UInt64')" diff --git a/tests/queries/0_stateless/02479_mysql_connect_to_self.reference b/tests/queries/0_stateless/02479_mysql_connect_to_self.reference index 6838dacc3b3..8057b945c5a 100644 --- a/tests/queries/0_stateless/02479_mysql_connect_to_self.reference +++ b/tests/queries/0_stateless/02479_mysql_connect_to_self.reference @@ -67,6 +67,6 @@ SELECT __table1.a AS a, __table1.b AS b, __table1.c AS c -FROM mysql(\'127.0.0.1:9004\', \'default\', foo, \'default\', \'\', SETTINGS connection_wait_timeout = 123, connect_timeout = 40123002, read_write_timeout = 40123001, connection_pool_size = 3) AS __table1 +FROM mysql(\'127.0.0.1:9004\', _CAST(\'default\', \'String\'), foo, \'default\', \'\', SETTINGS connection_wait_timeout = 123, connect_timeout = 40123002, read_write_timeout = 40123001, connection_pool_size = 3) AS __table1 --- 5 diff --git a/tests/queries/0_stateless/02495_concat_with_separator.reference b/tests/queries/0_stateless/02495_concat_with_separator.reference index 8f0ea917f4b..ebff5deb6aa 100644 --- a/tests/queries/0_stateless/02495_concat_with_separator.reference +++ b/tests/queries/0_stateless/02495_concat_with_separator.reference @@ -14,6 +14,45 @@ 1 1 1 +1 +1 \N \N \N +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +0 diff --git a/tests/queries/0_stateless/02495_concat_with_separator.sql b/tests/queries/0_stateless/02495_concat_with_separator.sql index 916c4cda1b7..7167d48a1da 100644 --- a/tests/queries/0_stateless/02495_concat_with_separator.sql +++ b/tests/queries/0_stateless/02495_concat_with_separator.sql @@ -1,27 +1,72 @@ -select concatWithSeparator('|', 'a', 'b') == 'a|b'; -select concatWithSeparator('|', 'a', materialize('b')) == 'a|b'; -select concatWithSeparator('|', materialize('a'), 'b') == 'a|b'; -select concatWithSeparator('|', materialize('a'), materialize('b')) == 'a|b'; +SET allow_suspicious_low_cardinality_types=1; -select concatWithSeparator('|', 'a', toFixedString('b', 1)) == 'a|b'; -select concatWithSeparator('|', 'a', materialize(toFixedString('b', 1))) == 'a|b'; -select concatWithSeparator('|', materialize('a'), toFixedString('b', 1)) == 'a|b'; -select concatWithSeparator('|', materialize('a'), materialize(toFixedString('b', 1))) == 'a|b'; +-- negative tests +SELECT concatWithSeparator(materialize('|'), 'a', 'b'); -- { serverError ILLEGAL_COLUMN } +SELECT concatWithSeparator(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -select concatWithSeparator('|', toFixedString('a', 1), 'b') == 'a|b'; -select concatWithSeparator('|', toFixedString('a', 1), materialize('b')) == 'a|b'; -select concatWithSeparator('|', materialize(toFixedString('a', 1)), 'b') == 'a|b'; -select concatWithSeparator('|', materialize(toFixedString('a', 1)), materialize('b')) == 'a|b'; +-- special cases +SELECT concatWithSeparator('|') = ''; +SELECT concatWithSeparator('|', 'a') == 'a'; -select 
concatWithSeparator('|', toFixedString('a', 1), toFixedString('b', 1)) == 'a|b'; -select concatWithSeparator('|', toFixedString('a', 1), materialize(toFixedString('b', 1))) == 'a|b'; -select concatWithSeparator('|', materialize(toFixedString('a', 1)), toFixedString('b', 1)) == 'a|b'; -select concatWithSeparator('|', materialize(toFixedString('a', 1)), materialize(toFixedString('b', 1))) == 'a|b'; +SELECT concatWithSeparator('|', 'a', 'b') == 'a|b'; +SELECT concatWithSeparator('|', 'a', materialize('b')) == 'a|b'; +SELECT concatWithSeparator('|', materialize('a'), 'b') == 'a|b'; +SELECT concatWithSeparator('|', materialize('a'), materialize('b')) == 'a|b'; -select concatWithSeparator(null, 'a', 'b') == null; -select concatWithSeparator('1', null, 'b') == null; -select concatWithSeparator('1', 'a', null) == null; +SELECT concatWithSeparator('|', 'a', toFixedString('b', 1)) == 'a|b'; +SELECT concatWithSeparator('|', 'a', materialize(toFixedString('b', 1))) == 'a|b'; +SELECT concatWithSeparator('|', materialize('a'), toFixedString('b', 1)) == 'a|b'; +SELECT concatWithSeparator('|', materialize('a'), materialize(toFixedString('b', 1))) == 'a|b'; -select concatWithSeparator(materialize('|'), 'a', 'b'); -- { serverError 44 } -select concatWithSeparator(); -- { serverError 42 } -select concatWithSeparator('|', 'a', 100); -- { serverError 43 } +SELECT concatWithSeparator('|', toFixedString('a', 1), 'b') == 'a|b'; +SELECT concatWithSeparator('|', toFixedString('a', 1), materialize('b')) == 'a|b'; +SELECT concatWithSeparator('|', materialize(toFixedString('a', 1)), 'b') == 'a|b'; +SELECT concatWithSeparator('|', materialize(toFixedString('a', 1)), materialize('b')) == 'a|b'; + +SELECT concatWithSeparator('|', toFixedString('a', 1), toFixedString('b', 1)) == 'a|b'; +SELECT concatWithSeparator('|', toFixedString('a', 1), materialize(toFixedString('b', 1))) == 'a|b'; +SELECT concatWithSeparator('|', materialize(toFixedString('a', 1)), toFixedString('b', 1)) == 'a|b'; +SELECT concatWithSeparator('|', materialize(toFixedString('a', 1)), materialize(toFixedString('b', 1))) == 'a|b'; + +SELECT concatWithSeparator(null, 'a', 'b') == null; +SELECT concatWithSeparator('1', null, 'b') == null; +SELECT concatWithSeparator('1', 'a', null) == null; + +-- Const String + non-const non-String/non-FixedString type' +SELECT concatWithSeparator('|', 'a', materialize(42 :: Int8)) == 'a|42'; +SELECT concatWithSeparator('|', 'a', materialize(43 :: Int16)) == 'a|43'; +SELECT concatWithSeparator('|', 'a', materialize(44 :: Int32)) == 'a|44'; +SELECT concatWithSeparator('|', 'a', materialize(45 :: Int64)) == 'a|45'; +SELECT concatWithSeparator('|', 'a', materialize(46 :: Int128)) == 'a|46'; +SELECT concatWithSeparator('|', 'a', materialize(47 :: Int256)) == 'a|47'; +SELECT concatWithSeparator('|', 'a', materialize(48 :: UInt8)) == 'a|48'; +SELECT concatWithSeparator('|', 'a', materialize(49 :: UInt16)) == 'a|49'; +SELECT concatWithSeparator('|', 'a', materialize(50 :: UInt32)) == 'a|50'; +SELECT concatWithSeparator('|', 'a', materialize(51 :: UInt64)) == 'a|51'; +SELECT concatWithSeparator('|', 'a', materialize(52 :: UInt128)) == 'a|52'; +SELECT concatWithSeparator('|', 'a', materialize(53 :: UInt256)) == 'a|53'; +SELECT concatWithSeparator('|', 'a', materialize(42.42 :: Float32)) == 'a|42.42'; +SELECT concatWithSeparator('|', 'a', materialize(43.43 :: Float64)) == 'a|43.43'; +SELECT concatWithSeparator('|', 'a', materialize(44.44 :: Decimal(2))) == 'a|44'; +SELECT concatWithSeparator('|', 'a', materialize(true :: Bool)) == 
'a|true'; +SELECT concatWithSeparator('|', 'a', materialize(false :: Bool)) == 'a|false'; +SELECT concatWithSeparator('|', 'a', materialize('foo' :: String)) == 'a|foo'; +SELECT concatWithSeparator('|', 'a', materialize('bar' :: FixedString(3))) == 'a|bar'; +SELECT concatWithSeparator('|', 'a', materialize('foo' :: Nullable(String))) == 'a|foo'; +SELECT concatWithSeparator('|', 'a', materialize('bar' :: Nullable(FixedString(3)))) == 'a|bar'; +SELECT concatWithSeparator('|', 'a', materialize('foo' :: LowCardinality(String))) == 'a|foo'; +SELECT concatWithSeparator('|', 'a', materialize('bar' :: LowCardinality(FixedString(3)))) == 'a|bar'; +SELECT concatWithSeparator('|', 'a', materialize('foo' :: LowCardinality(Nullable(String)))) == 'a|foo'; +SELECT concatWithSeparator('|', 'a', materialize('bar' :: LowCardinality(Nullable(FixedString(3))))) == 'a|bar'; +SELECT concatWithSeparator('|', 'a', materialize(42 :: LowCardinality(Nullable(UInt32)))) == 'a|42'; +SELECT concatWithSeparator('|', 'a', materialize(42 :: LowCardinality(UInt32))) == 'a|42'; +SELECT concatWithSeparator('|', 'a', materialize('fae310ca-d52a-4923-9e9b-02bf67f4b009' :: UUID)) == 'a|fae310ca-d52a-4923-9e9b-02bf67f4b009'; +SELECT concatWithSeparator('|', 'a', materialize('2023-11-14' :: Date)) == 'a|2023-11-14'; +SELECT concatWithSeparator('|', 'a', materialize('2123-11-14' :: Date32)) == 'a|2123-11-14'; +SELECT concatWithSeparator('|', 'a', materialize('2023-11-14 05:50:12' :: DateTime('Europe/Amsterdam'))) == 'a|2023-11-14 05:50:12'; +SELECT concatWithSeparator('|', 'a', materialize('hallo' :: Enum('hallo' = 1))) == 'a|hallo'; +SELECT concatWithSeparator('|', 'a', materialize(['foo', 'bar'] :: Array(String))) == 'a|[\'foo\',\'bar\']'; +SELECT concatWithSeparator('|', 'a', materialize((42, 'foo') :: Tuple(Int32, String))) == 'a|(42,\'foo\')'; +SELECT concatWithSeparator('|', 'a', materialize(map(42, 'foo') :: Map(Int32, String))) == 'a|{42:\'foo\'}'; +SELECT concatWithSeparator('|', 'a', materialize('122.233.64.201' :: IPv4)) == 'a|122.233.64.201'; +SELECT concatWithSeparator('|', 'a', materialize('2001:0001:130F:0002:0003:09C0:876A:130B' :: IPv6)) == 'a|2001:0001:130F:0002:0003:09C0:876A:130B'; diff --git a/tests/queries/0_stateless/02497_schema_inference_nulls.sql b/tests/queries/0_stateless/02497_schema_inference_nulls.sql index a25060e8182..b78b5709dbb 100644 --- a/tests/queries/0_stateless/02497_schema_inference_nulls.sql +++ b/tests/queries/0_stateless/02497_schema_inference_nulls.sql @@ -4,7 +4,7 @@ set input_format_json_try_infer_named_tuples_from_objects=0; set input_format_json_read_objects_as_strings=0; set input_format_json_infer_incomplete_types_as_strings=0; set input_format_json_read_numbers_as_strings=0; -desc format(JSONEachRow, '{"x" : 1234}, {"x" : "String"}') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH } +desc format(JSONEachRow, '{"x" : 1234}, {"x" : "String"}') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } desc format(JSONEachRow, '{"x" : [null, 1]}'); desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : []}'); desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : [null]}'); @@ -26,7 +26,7 @@ desc format(JSONEachRow, '{"x" : [1, 2]}, {"x" : [null]}'); select 'JSONCompactEachRow'; set schema_inference_make_columns_nullable=1; -desc format(JSONCompactEachRow, '[1234], ["String"]') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH } +desc 
format(JSONCompactEachRow, '[1234], ["String"]') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } desc format(JSONCompactEachRow, '[[null, 1]]'); desc format(JSONCompactEachRow, '[[null, 1]], [[]]'); desc format(JSONCompactEachRow, '[[null, 1]], [[null]]'); diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index f646583bbd3..9a0cfdffcb5 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -293,7 +293,7 @@ SELECT {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} -- { echoOn } -SET extract_kvp_max_pairs_per_row = 2; +SET extract_key_value_pairs_max_pairs_per_row = 2; -- Should be allowed because it no longer exceeds the max number of pairs -- expected output: {'key1':'value1','key2':'value2'} WITH @@ -307,7 +307,7 @@ WITH SELECT x; {'key1':'value1','key2':'value2'} -SET extract_kvp_max_pairs_per_row = 0; +SET extract_key_value_pairs_max_pairs_per_row = 0; -- Should be allowed because max pairs per row is set to 0 (unlimited) -- expected output: {'key1':'value1','key2':'value2'} WITH diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 9277ba6d7ec..4f3db3f166b 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -415,7 +415,7 @@ SELECT x; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} -- Should fail allowed because it exceeds the max number of pairs -SET extract_kvp_max_pairs_per_row = 1; +SET extract_key_value_pairs_max_pairs_per_row = 1; WITH extractKeyValuePairs('key1:value1,key2:value2') AS s_map, CAST( @@ -429,7 +429,7 @@ SELECT -- { echoOn } -SET extract_kvp_max_pairs_per_row = 2; +SET extract_key_value_pairs_max_pairs_per_row = 2; -- Should be allowed because it no longer exceeds the max number of pairs -- expected output: {'key1':'value1','key2':'value2'} WITH @@ -443,7 +443,7 @@ WITH SELECT x; -SET extract_kvp_max_pairs_per_row = 0; +SET extract_key_value_pairs_max_pairs_per_row = 0; -- Should be allowed because max pairs per row is set to 0 (unlimited) -- expected output: {'key1':'value1','key2':'value2'} WITH diff --git a/tests/queries/0_stateless/02500_numbers_inference.sh b/tests/queries/0_stateless/02500_numbers_inference.sh index ce9cd5bdc9f..5d863bd616f 100755 --- a/tests/queries/0_stateless/02500_numbers_inference.sh +++ b/tests/queries/0_stateless/02500_numbers_inference.sh @@ -8,10 +8,10 @@ $CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1.2}')"; echo '{"x" : 1.2}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; $CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1}')"; echo '{"x" : 1}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; -$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1e10}')"; -echo '{"x" : 1e10}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; -$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : [1, 42.42, 1, 1e10]}')"; -echo '{"x" : [1, 42.42, 1, 1e10]}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; 
+$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1e10}')" --input_format_try_infer_exponent_floats=1; +echo '{"x" : 1e10}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test" --input_format_try_infer_exponent_floats=1; +$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : [1, 42.42, 1, 1e10]}')" --input_format_try_infer_exponent_floats=1; +echo '{"x" : [1, 42.42, 1, 1e10]}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test" --input_format_try_infer_exponent_floats=1; $CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : [1, 42.42, false]}')"; echo '{"x" : [1, 42.42, false]}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; @@ -19,10 +19,10 @@ $CLICKHOUSE_LOCAL -q "desc format(TSV, '1.2')"; echo '1.2' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; $CLICKHOUSE_LOCAL -q "desc format(TSV, '1')"; echo '1' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; -$CLICKHOUSE_LOCAL -q "desc format(TSV, '1e10')"; -echo '1e10' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; -$CLICKHOUSE_LOCAL -q "desc format(TSV, '[1, 42.42, 1, 1e10]')"; -echo '[1, 42.42, 1, 1e10]' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; +$CLICKHOUSE_LOCAL -q "desc format(TSV, '1e10')" --input_format_try_infer_exponent_floats=1; +echo '1e10' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test" --input_format_try_infer_exponent_floats=1; +$CLICKHOUSE_LOCAL -q "desc format(TSV, '[1, 42.42, 1, 1e10]')" --input_format_try_infer_exponent_floats=1; +echo '[1, 42.42, 1, 1e10]' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test" --input_format_try_infer_exponent_floats=1; $CLICKHOUSE_LOCAL -q "desc format(TSV, '[1, 42.42, false]')"; echo '[1, 42.42, false]' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; diff --git a/tests/queries/0_stateless/02502_bad_values_schema_inference.sql b/tests/queries/0_stateless/02502_bad_values_schema_inference.sql index 4c796842c0d..67ac09832de 100644 --- a/tests/queries/0_stateless/02502_bad_values_schema_inference.sql +++ b/tests/queries/0_stateless/02502_bad_values_schema_inference.sql @@ -1,2 +1,2 @@ -desc format(Values, '(\'abc)'); -- { serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED } +desc format(Values, '(\'abc)'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02521_aggregation_by_partitions.reference b/tests/queries/0_stateless/02521_aggregation_by_partitions.reference index 67a131ff853..87b2d5c3430 100644 --- a/tests/queries/0_stateless/02521_aggregation_by_partitions.reference +++ b/tests/queries/0_stateless/02521_aggregation_by_partitions.reference @@ -1,3 +1,5 @@ +-- { echoOn } +explain pipeline select a from t1 group by a; (Expression) ExpressionTransform × 16 (Aggregating) @@ -15,6 +17,8 @@ ExpressionTransform × 16 Resize 3 → 1 MergeTreeSelect(pool: ReadPool, algorithm: Thread) × 3 0 → 1 1000000 +-- { echoOn } +explain pipeline select a from t2 group by a; (Expression) ExpressionTransform × 16 (Aggregating) @@ -40,6 +44,8 @@ ExpressionTransform × 16 Resize 2 → 1 MergeTreeSelect(pool: ReadPool, algorithm: Thread) × 2 0 → 1 1000000 +-- { echoOn } +explain pipeline select a from t3 group by a; (Expression) ExpressionTransform × 16 (Aggregating) @@ -82,6 +88,8 @@ ExpressionTransform × 16 MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 1000000 1000000 +-- { echoOn } +explain 
pipeline select a from t4 group by a settings read_in_order_two_level_merge_threshold = 1e12; (Expression) ExpressionTransform × 16 (Aggregating) @@ -91,20 +99,21 @@ ExpressionTransform × 16 (Expression) ExpressionTransform × 4 (ReadFromMergeTree) - ExpressionTransform × 4 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 1000000 +-- { echoOn } +explain pipeline select a from t5 group by a settings read_in_order_two_level_merge_threshold = 1e12; (Expression) ExpressionTransform × 16 (Aggregating) @@ -114,41 +123,6 @@ ExpressionTransform × 16 (Expression) ExpressionTransform × 8 (ReadFromMergeTree) - ExpressionTransform × 8 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 -1000000 -(Expression) -ExpressionTransform × 16 - (Aggregating) - FinalizeAggregatedTransform × 16 - AggregatingInOrderTransform × 16 - (Expression) - ExpressionTransform × 16 - (ReadFromMergeTree) - ExpressionTransform × 16 MergingSortedTransform 2 → 1 ExpressionTransform × 2 MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 @@ -173,30 +147,65 @@ ExpressionTransform × 16 MergingSortedTransform 2 → 1 ExpressionTransform × 2 MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 
2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 - MergingSortedTransform 2 → 1 - ExpressionTransform × 2 - MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 +1000000 +-- { echoOn } +explain pipeline select a from t6 group by a settings read_in_order_two_level_merge_threshold = 1e12; +(Expression) +ExpressionTransform × 16 + (Aggregating) + FinalizeAggregatedTransform × 16 + AggregatingInOrderTransform × 16 + (Expression) + ExpressionTransform × 16 + (ReadFromMergeTree) + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 + MergingSortedTransform 2 → 1 + ExpressionTransform × 2 + MergeTreeSelect(pool: ReadPoolInOrder, algorithm: InOrder) × 2 0 → 1 1000000 Skip merging: 1 Skip merging: 1 diff --git a/tests/queries/0_stateless/02521_aggregation_by_partitions.sql b/tests/queries/0_stateless/02521_aggregation_by_partitions.sql index 87317e5fba4..5b013ca5aef 100644 --- a/tests/queries/0_stateless/02521_aggregation_by_partitions.sql +++ b/tests/queries/0_stateless/02521_aggregation_by_partitions.sql @@ -15,7 +15,9 @@ system stop merges t1; insert into t1 select number from numbers_mt(1e6); insert into t1 select number from numbers_mt(1e6); +-- { echoOn } explain pipeline select a from t1 group by a; +-- { echoOff } 
select count() from (select throwIf(count() != 2) from t1 group by a); @@ -28,7 +30,9 @@ system stop merges t2; insert into t2 select number from numbers_mt(1e6); insert into t2 select number from numbers_mt(1e6); +-- { echoOn } explain pipeline select a from t2 group by a; +-- { echoOff } select count() from (select throwIf(count() != 2) from t2 group by a); @@ -41,7 +45,9 @@ system stop merges t3; insert into t3 select number from numbers_mt(1e6); insert into t3 select number from numbers_mt(1e6); +-- { echoOn } explain pipeline select a from t3 group by a; +-- { echoOff } select count() from (select throwIf(count() != 2) from t3 group by a); @@ -63,7 +69,9 @@ system stop merges t4; insert into t4 select number from numbers_mt(1e6); insert into t4 select number from numbers_mt(1e6); +-- { echoOn } explain pipeline select a from t4 group by a settings read_in_order_two_level_merge_threshold = 1e12; +-- { echoOff } select count() from (select throwIf(count() != 2) from t4 group by a); @@ -76,7 +84,9 @@ system stop merges t5; insert into t5 select number from numbers_mt(1e6); insert into t5 select number from numbers_mt(1e6); +-- { echoOn } explain pipeline select a from t5 group by a settings read_in_order_two_level_merge_threshold = 1e12; +-- { echoOff } select count() from (select throwIf(count() != 2) from t5 group by a); @@ -89,7 +99,9 @@ system stop merges t6; insert into t6 select number from numbers_mt(1e6); insert into t6 select number from numbers_mt(1e6); +-- { echoOn } explain pipeline select a from t6 group by a settings read_in_order_two_level_merge_threshold = 1e12; +-- { echoOff } select count() from (select throwIf(count() != 2) from t6 group by a); diff --git a/tests/queries/0_stateless/02590_interserver_mode_client_info_initial_query_start_time.reference b/tests/queries/0_stateless/02590_interserver_mode_client_info_initial_query_start_time.reference index fbce8ae2026..eafcfc23fb8 100644 --- a/tests/queries/0_stateless/02590_interserver_mode_client_info_initial_query_start_time.reference +++ b/tests/queries/0_stateless/02590_interserver_mode_client_info_initial_query_start_time.reference @@ -1,8 +1,8 @@ SELECT -3 0 0 -3 0 0 +3 1 +3 1 INSERT CHECK 1 2 -6 0 2 +6 2 diff --git a/tests/queries/0_stateless/02590_interserver_mode_client_info_initial_query_start_time.sh b/tests/queries/0_stateless/02590_interserver_mode_client_info_initial_query_start_time.sh index 5da643bd17b..3b0d2309784 100755 --- a/tests/queries/0_stateless/02590_interserver_mode_client_info_initial_query_start_time.sh +++ b/tests/queries/0_stateless/02590_interserver_mode_client_info_initial_query_start_time.sh @@ -33,10 +33,10 @@ query_id="$(get_query_id)" $CLICKHOUSE_CLIENT --prefer_localhost_replica=0 --query_id "$query_id" -q "select * from dist" $CLICKHOUSE_CLIENT -nm --param_query_id "$query_id" -q " system flush logs; - select count(), countIf(initial_query_start_time_microseconds != query_start_time_microseconds), countIf(event_time - initial_query_start_time > 3) from system.query_log where type = 'QueryFinish' and initial_query_id = {query_id:String}; + select count(), count(distinct initial_query_start_time_microseconds) from system.query_log where type = 'QueryFinish' and initial_query_id = {query_id:String}; " -sleep 6 +sleep 1 query_id="$(get_query_id)" # this query (and all subsequent) should reuse the previous connection (at least most of the time) @@ -44,7 +44,7 @@ $CLICKHOUSE_CLIENT --prefer_localhost_replica=0 --query_id "$query_id" -q "selec $CLICKHOUSE_CLIENT -nm --param_query_id 
"$query_id" -q " system flush logs; - select count(), countIf(initial_query_start_time_microseconds != query_start_time_microseconds), countIf(event_time - initial_query_start_time > 3) from system.query_log where type = 'QueryFinish' and initial_query_id = {query_id:String}; + select count(), count(distinct initial_query_start_time_microseconds) from system.query_log where type = 'QueryFinish' and initial_query_id = {query_id:String}; " echo "INSERT" @@ -54,7 +54,7 @@ $CLICKHOUSE_CLIENT --prefer_localhost_replica=0 --query_id "$query_id" -nm -q " select * from data; " -sleep 3 +sleep 1 $CLICKHOUSE_CLIENT -nm --param_query_id "$query_id" -q "system flush distributed dist_dist" sleep 1 $CLICKHOUSE_CLIENT -nm --param_query_id "$query_id" -q "system flush distributed dist" @@ -63,5 +63,5 @@ echo "CHECK" $CLICKHOUSE_CLIENT -nm --param_query_id "$query_id" -q " select * from data order by key; system flush logs; - select count(), countIf(initial_query_start_time_microseconds != query_start_time_microseconds), countIf(event_time - initial_query_start_time > 3) from system.query_log where type = 'QueryFinish' and initial_query_id = {query_id:String}; + select count(), count(distinct initial_query_start_time_microseconds) from system.query_log where type = 'QueryFinish' and initial_query_id = {query_id:String}; " diff --git a/tests/queries/0_stateless/02661_quantile_approx.reference b/tests/queries/0_stateless/02661_quantile_approx.reference index 8369363aa9b..0ee846a268b 100644 --- a/tests/queries/0_stateless/02661_quantile_approx.reference +++ b/tests/queries/0_stateless/02661_quantile_approx.reference @@ -19,6 +19,20 @@ select quantilesGK(1000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(numbe [99,199,249,313,776] select quantilesGK(10000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000); [100,200,250,314,777] +SELECT quantileGKMerge(100, 0.5)(x) +FROM +( + SELECT quantileGKState(100, 0.5)(number + 1) AS x + FROM numbers(49999) +); +24902 +SELECT quantilesGKMerge(100, 0.5, 0.9, 0.99)(x) +FROM +( + SELECT quantilesGKState(100, 0.5, 0.9, 0.99)(number + 1) AS x + FROM numbers(49999) +); +[24902,44518,49999] select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS } select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 1; -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } select quantileGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS } diff --git a/tests/queries/0_stateless/02661_quantile_approx.sql b/tests/queries/0_stateless/02661_quantile_approx.sql index 52c2979ad44..c0004260fa1 100644 --- a/tests/queries/0_stateless/02661_quantile_approx.sql +++ b/tests/queries/0_stateless/02661_quantile_approx.sql @@ -15,6 +15,19 @@ select quantilesGK(100, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number select quantilesGK(1000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000); select quantilesGK(10000, 100/1000, 200/1000, 250/1000, 314/1000, 777/1000)(number + 1) from numbers(1000); +SELECT quantileGKMerge(100, 0.5)(x) +FROM +( + SELECT quantileGKState(100, 0.5)(number + 1) AS x + FROM numbers(49999) +); + +SELECT quantilesGKMerge(100, 0.5, 0.9, 0.99)(x) +FROM +( + SELECT quantilesGKState(100, 0.5, 0.9, 0.99)(number + 1) AS x + FROM numbers(49999) +); select medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 0; -- { serverError BAD_ARGUMENTS } select 
medianGK()(number) from numbers(10) SETTINGS allow_experimental_analyzer = 1; -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } diff --git a/tests/queries/0_stateless/02676_trailing_commas.reference b/tests/queries/0_stateless/02676_trailing_commas.reference index 76d173ca23e..cfb2ccd6a0f 100644 --- a/tests/queries/0_stateless/02676_trailing_commas.reference +++ b/tests/queries/0_stateless/02676_trailing_commas.reference @@ -3,3 +3,6 @@ 1 1 2 0 1 +(1,'foo') +(1,'foo') +(1,(2,'foo')) diff --git a/tests/queries/0_stateless/02676_trailing_commas.sql b/tests/queries/0_stateless/02676_trailing_commas.sql index 048405c4d20..7fb64bb57a3 100644 --- a/tests/queries/0_stateless/02676_trailing_commas.sql +++ b/tests/queries/0_stateless/02676_trailing_commas.sql @@ -3,3 +3,7 @@ SELECT 1, FROM numbers(1); WITH 1 as a SELECT a, FROM numbers(1); WITH 1 as from SELECT from, from + from, from in [0], FROM numbers(1); SELECT n, FROM (SELECT 1 AS n); +SELECT (1, 'foo')::Tuple(a Int, b String,); +SELECT (1, 'foo')::Tuple(a Int, b String,,); -- { clientError SYNTAX_ERROR } +SELECT (1, 'foo')::Tuple(Int, String,); +SELECT (1, (2,'foo'))::Tuple(Int, Tuple(Int, String,),); diff --git a/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference b/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference index dd2c30cc9f8..d00491fd7e5 100644 --- a/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference +++ b/tests/queries/0_stateless/02720_row_policy_column_with_dots.reference @@ -1 +1 @@ -2024-01-01 Hello World +1 diff --git a/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql b/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql index 361bd0e0ec7..fcb0bf62859 100644 --- a/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql +++ b/tests/queries/0_stateless/02720_row_policy_column_with_dots.sql @@ -1,6 +1,6 @@ -CREATE table if not exists table_with_dot_column (date Date, regular_column String, `other_column.2` String) ENGINE = MergeTree() ORDER BY date; -INSERT INTO table_with_dot_column select '2020-01-01', 'Hello', 'World'; -INSERT INTO table_with_dot_column select '2024-01-01', 'Hello', 'World'; +CREATE TABLE IF NOT EXISTS table_with_dot_column (date Date, regular_column String, `other_column.2` String) ENGINE = MergeTree() ORDER BY date; +INSERT INTO table_with_dot_column SELECT '2020-01-01', 'Hello', 'World'; +INSERT INTO table_with_dot_column SELECT toDate(now() + 48*3600), 'Hello', 'World'; CREATE ROW POLICY IF NOT EXISTS row_policy ON table_with_dot_column USING toDate(date) >= today() - 30 TO ALL; -SELECT * FROM table_with_dot_column; +SELECT count(*) FROM table_with_dot_column; DROP TABLE table_with_dot_column; diff --git a/tests/queries/0_stateless/02723_jit_aggregation_bug_48120.sql b/tests/queries/0_stateless/02723_jit_aggregation_bug_48120.sql index a9a6d3058b2..88561f9d895 100644 --- a/tests/queries/0_stateless/02723_jit_aggregation_bug_48120.sql +++ b/tests/queries/0_stateless/02723_jit_aggregation_bug_48120.sql @@ -1,4 +1,4 @@ --- Tags: no-fasttest, no-ubsan, no-msan, no-cpu-aarch64 +-- Tags: no-fasttest, no-cpu-aarch64, no-msan drop table if exists dummy; CREATE TABLE dummy ( num1 Int32, num2 Enum8('foo' = 0, 'bar' = 1, 'tar' = 2) ) diff --git a/tests/queries/0_stateless/02723_parallelize_output_setting.reference b/tests/queries/0_stateless/02723_parallelize_output_setting.reference index 0f2a396f471..36e4e68ecd5 100644 --- a/tests/queries/0_stateless/02723_parallelize_output_setting.reference +++ 
b/tests/queries/0_stateless/02723_parallelize_output_setting.reference @@ -5,3 +5,7 @@ select startsWith(trimLeft(explain),'Resize') as resize from (explain pipeline s -- no Resize in pipeline set parallelize_output_from_storages=0; select startsWith(trimLeft(explain),'Resize') as resize from (explain pipeline select * from file(data_02723.csv)) where resize; +-- Data from URL source is immediately resized to max_threads streams, before any ExpressionTransform. +set parallelize_output_from_storages=1; +select match(arrayStringConcat(groupArray(explain), ''), '.*Resize 1 → 2 *URL 0 → 1 *$') from (explain pipeline select x, count() from url('https://example.com', Parquet, 'x Int64') group by x order by count() limit 10); +1 diff --git a/tests/queries/0_stateless/02723_parallelize_output_setting.sql b/tests/queries/0_stateless/02723_parallelize_output_setting.sql index 7db28ca4dec..86e6d4b4e3d 100644 --- a/tests/queries/0_stateless/02723_parallelize_output_setting.sql +++ b/tests/queries/0_stateless/02723_parallelize_output_setting.sql @@ -1,4 +1,4 @@ --- Tags: no-parallel +-- Tags: no-parallel, no-fasttest insert into function file(data_02723.csv) select number from numbers(5) settings engine_file_truncate_on_insert=1; @@ -10,3 +10,6 @@ select startsWith(trimLeft(explain),'Resize') as resize from (explain pipeline s set parallelize_output_from_storages=0; select startsWith(trimLeft(explain),'Resize') as resize from (explain pipeline select * from file(data_02723.csv)) where resize; +-- Data from URL source is immediately resized to max_threads streams, before any ExpressionTransform. +set parallelize_output_from_storages=1; +select match(arrayStringConcat(groupArray(explain), ''), '.*Resize 1 → 2 *URL 0 → 1 *$') from (explain pipeline select x, count() from url('https://example.com', Parquet, 'x Int64') group by x order by count() limit 10); \ No newline at end of file diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh index 13b627c0342..80b47282146 100755 --- a/tests/queries/0_stateless/02724_database_s3.sh +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -46,7 +46,7 @@ DROP DATABASE IF EXISTS test3; CREATE DATABASE test3 ENGINE = S3; USE test3; SELECT * FROM \"http://localhost:11111/test/a.myext\" -""" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: +""" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "S3_ERROR" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ USE test3; diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index b4e081f6de0..d62f928e947 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -58,7 +58,7 @@ SELECT * FROM \"abacaba/file.tsv\" """ 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE" ${CLICKHOUSE_CLIENT} -q "SELECT * FROM test_hdfs_4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e 
"CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: diff --git a/tests/queries/0_stateless/02726_async_insert_flush_queue.sql b/tests/queries/0_stateless/02726_async_insert_flush_queue.sql index 98e78045b85..1ae24e4f3da 100644 --- a/tests/queries/0_stateless/02726_async_insert_flush_queue.sql +++ b/tests/queries/0_stateless/02726_async_insert_flush_queue.sql @@ -6,7 +6,8 @@ CREATE TABLE t_async_inserts_flush (a UInt64) ENGINE = Memory; SET async_insert = 1; SET wait_for_async_insert = 0; -SET async_insert_busy_timeout_ms = 1000000; +SET async_insert_busy_timeout_min_ms = 1000000; +SET async_insert_busy_timeout_max_ms = 10000000; INSERT INTO t_async_inserts_flush VALUES (1) (2); INSERT INTO t_async_inserts_flush FORMAT JSONEachRow {"a": 10} {"a": 20}; diff --git a/tests/queries/0_stateless/02731_parallel_replicas_join_subquery.reference b/tests/queries/0_stateless/02731_parallel_replicas_join_subquery.reference index ec4928bc325..028cc744170 100644 --- a/tests/queries/0_stateless/02731_parallel_replicas_join_subquery.reference +++ b/tests/queries/0_stateless/02731_parallel_replicas_join_subquery.reference @@ -31,7 +31,7 @@ 29 2j&S)ba?XG QuQj 17163829389637435056 3 UlI+1 14144472852965836438 =============== QUERIES EXECUTED BY PARALLEL INNER QUERY ALONE =============== -0 3 SELECT `__table1`.`key` AS `key`, `__table1`.`value1` AS `value1`, `__table1`.`value2` AS `value2`, toUInt64(min(`__table1`.`time`)) AS `start_ts` FROM `default`.`join_inner_table` AS `__table1` PREWHERE (`__table1`.`id` = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (`__table1`.`number` > 1610517366120) GROUP BY `__table1`.`key`, `__table1`.`value1`, `__table1`.`value2` ORDER BY `__table1`.`key` ASC, `__table1`.`value1` ASC, `__table1`.`value2` ASC LIMIT _CAST(10, \'UInt64\') SETTINGS allow_experimental_parallel_reading_from_replicas = 1, allow_experimental_analyzer = 1 +0 3 SELECT `__table1`.`key` AS `key`, `__table1`.`value1` AS `value1`, `__table1`.`value2` AS `value2`, toUInt64(min(`__table1`.`time`)) AS `start_ts` FROM `default`.`join_inner_table` AS `__table1` PREWHERE (`__table1`.`id` = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (`__table1`.`number` > _CAST(1610517366120, \'UInt64\')) GROUP BY `__table1`.`key`, `__table1`.`value1`, `__table1`.`value2` ORDER BY `__table1`.`key` ASC, `__table1`.`value1` ASC, `__table1`.`value2` ASC LIMIT _CAST(10, \'UInt64\') SETTINGS allow_experimental_parallel_reading_from_replicas = 1, allow_experimental_analyzer = 1 0 3 SELECT `key`, `value1`, `value2`, toUInt64(min(`time`)) AS `start_ts` FROM `default`.`join_inner_table` PREWHERE (`id` = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (`number` > toUInt64(\'1610517366120\')) GROUP BY `key`, `value1`, `value2` ORDER BY `key` ASC, `value1` ASC, `value2` ASC LIMIT 10 1 1 -- Parallel inner query alone\nSELECT\n key,\n value1,\n value2,\n toUInt64(min(time)) AS start_ts\nFROM join_inner_table\nPREWHERE (id = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (number > toUInt64(\'1610517366120\'))\nGROUP BY key, value1, value2\nORDER BY key, value1, value2\nLIMIT 10\nSETTINGS 
allow_experimental_parallel_reading_from_replicas = 1, allow_experimental_analyzer=0; 1 1 -- Parallel inner query alone\nSELECT\n key,\n value1,\n value2,\n toUInt64(min(time)) AS start_ts\nFROM join_inner_table\nPREWHERE (id = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (number > toUInt64(\'1610517366120\'))\nGROUP BY key, value1, value2\nORDER BY key, value1, value2\nLIMIT 10\nSETTINGS allow_experimental_parallel_reading_from_replicas = 1, allow_experimental_analyzer=1; @@ -58,8 +58,7 @@ U c 10 UlI+1 10 bX?}ix [ Ny]2 G 10 t 1610517366120) GROUP BY `__table1`.`key`, `__table1`.`value1`, `__table1`.`value2` -0 3 SELECT `__table2`.`value1` AS `value1`, `__table2`.`value2` AS `value2`, count() AS `count` FROM `default`.`join_outer_table` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` USING (`key`) GROUP BY `__table1`.`key`, `__table2`.`value1`, `__table2`.`value2` +0 3 SELECT `__table2`.`value1` AS `value1`, `__table2`.`value2` AS `value2`, count() AS `count` FROM `default`.`join_outer_table` AS `__table1` ALL INNER JOIN (SELECT `__table3`.`key` AS `key`, `__table3`.`value1` AS `value1`, `__table3`.`value2` AS `value2` FROM `default`.`join_inner_table` AS `__table3` PREWHERE (`__table3`.`id` = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (`__table3`.`number` > _CAST(1610517366120, \'UInt64\')) GROUP BY `__table3`.`key`, `__table3`.`value1`, `__table3`.`value2`) AS `__table2` USING (`key`) GROUP BY `__table1`.`key`, `__table2`.`value1`, `__table2`.`value2` 0 3 SELECT `key`, `value1`, `value2` FROM `default`.`join_inner_table` PREWHERE (`id` = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (`number` > toUInt64(\'1610517366120\')) GROUP BY `key`, `value1`, `value2` 0 3 SELECT `value1`, `value2`, count() AS `count` FROM `default`.`join_outer_table` ALL INNER JOIN `_data_` USING (`key`) GROUP BY `key`, `value1`, `value2` 1 1 -- Parallel full query\nSELECT\n value1,\n value2,\n avg(count) AS avg\nFROM\n (\n SELECT\n key,\n value1,\n value2,\n count() AS count\n FROM join_outer_table\n INNER JOIN\n (\n SELECT\n key,\n value1,\n value2,\n toUInt64(min(time)) AS start_ts\n FROM join_inner_table\n PREWHERE (id = \'833c9e22-c245-4eb5-8745-117a9a1f26b1\') AND (number > toUInt64(\'1610517366120\'))\n GROUP BY key, value1, value2\n ) USING (key)\n GROUP BY key, value1, value2\n )\nGROUP BY value1, value2\nORDER BY value1, value2\nSETTINGS allow_experimental_parallel_reading_from_replicas = 1, allow_experimental_analyzer=0; diff --git a/tests/queries/0_stateless/02769_parallel_replicas_unavailable_shards.sql b/tests/queries/0_stateless/02769_parallel_replicas_unavailable_shards.sql index 38d592201e3..1a75e000349 100644 --- a/tests/queries/0_stateless/02769_parallel_replicas_unavailable_shards.sql +++ b/tests/queries/0_stateless/02769_parallel_replicas_unavailable_shards.sql @@ -2,14 +2,13 @@ DROP TABLE IF EXISTS test_parallel_replicas_unavailable_shards; CREATE TABLE test_parallel_replicas_unavailable_shards (n UInt64) ENGINE=MergeTree() ORDER BY tuple(); INSERT INTO test_parallel_replicas_unavailable_shards SELECT * FROM numbers(10); -SYSTEM FLUSH LOGS; - SET allow_experimental_parallel_reading_from_replicas=2, max_parallel_replicas=11, cluster_for_parallel_replicas='parallel_replicas', parallel_replicas_for_non_replicated_merge_tree=1; SET send_logs_level='error'; -SELECT count() FROM test_parallel_replicas_unavailable_shards WHERE NOT ignore(*); +SELECT count() FROM test_parallel_replicas_unavailable_shards WHERE NOT ignore(*) SETTINGS log_comment = 
'02769_7b513191-5082-4073-8568-53b86a49da79'; SYSTEM FLUSH LOGS; -SELECT count() > 0 FROM system.text_log WHERE yesterday() <= event_date AND message LIKE '%Replica number 10 is unavailable%'; +SET allow_experimental_parallel_reading_from_replicas=0; +SELECT ProfileEvents['ParallelReplicasUnavailableCount'] FROM system.query_log WHERE yesterday() <= event_date AND query_id in (select query_id from system.query_log where log_comment = '02769_7b513191-5082-4073-8568-53b86a49da79' and current_database = currentDatabase()) and type = 'QueryFinish' and query_id == initial_query_id; DROP TABLE test_parallel_replicas_unavailable_shards; diff --git a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference index fcede2caf2a..33df18c8801 100644 --- a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference +++ b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.reference @@ -38,3 +38,40 @@ Description: minmax GRANULARITY 1 Parts: 0/0 Granules: 0/0 + ReadFromMergeTree (default.data_02771) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 + Skip + Name: xy_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 + ReadFromMergeTree (default.data_02771) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 1/1 + Skip + Name: x_idx + Description: minmax GRANULARITY 1 + Parts: 0/1 + Granules: 0/1 + Skip + Name: y_idx + Description: minmax GRANULARITY 1 + Parts: 0/0 + Granules: 0/0 diff --git a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql index a49239e9de2..951d87fd2c0 100644 --- a/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql +++ b/tests/queries/0_stateless/02771_ignore_data_skipping_indices.sql @@ -1,5 +1,3 @@ -SET allow_experimental_analyzer = 0; - DROP TABLE IF EXISTS data_02771; @@ -24,6 +22,14 @@ SELECT * FROM data_02771 SETTINGS ignore_data_skipping_indices='na_idx'; SELECT * FROM data_02771 WHERE x = 1 AND y = 1 SETTINGS ignore_data_skipping_indices='xy_idx',force_data_skipping_indices='xy_idx' ; -- { serverError 277 } SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx'; + +SET allow_experimental_analyzer = 0; + +SELECT * from ( EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 ) WHERE explain NOT LIKE '%Expression%' AND explain NOT LIKE '%Filter%'; +SELECT * from ( EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx' ) WHERE explain NOT LIKE '%Expression%' AND explain NOT LIKE '%Filter%'; + +SET allow_experimental_analyzer = 1; + SELECT * from ( EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 ) WHERE explain NOT LIKE '%Expression%' AND explain NOT LIKE '%Filter%'; SELECT * from ( EXPLAIN indexes = 1 SELECT * FROM data_02771 WHERE x = 1 AND y = 2 SETTINGS ignore_data_skipping_indices='xy_idx' ) WHERE explain NOT LIKE '%Expression%' AND explain NOT LIKE '%Filter%'; diff --git a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql index b4165e8e80a..ef0381df1a6 100644 --- 
a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql +++ b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql @@ -1,5 +1,5 @@ set input_format_max_rows_to_read_for_schema_inference=2; set input_format_json_infer_incomplete_types_as_strings=0; -desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=20; diff --git a/tests/queries/0_stateless/02784_parallel_replicas_automatic_decision_join.sh b/tests/queries/0_stateless/02784_parallel_replicas_automatic_decision_join.sh index 1a74c3230c6..ef3e6000903 100755 --- a/tests/queries/0_stateless/02784_parallel_replicas_automatic_decision_join.sh +++ b/tests/queries/0_stateless/02784_parallel_replicas_automatic_decision_join.sh @@ -64,6 +64,7 @@ function run_query_with_pure_parallel_replicas () { --query_id "${1}_pure" \ --max_parallel_replicas 3 \ --prefer_localhost_replica 1 \ + --parallel_replicas_prefer_local_join 0 \ --cluster_for_parallel_replicas "parallel_replicas" \ --allow_experimental_parallel_reading_from_replicas 1 \ --parallel_replicas_for_non_replicated_merge_tree 1 \ diff --git a/tests/queries/0_stateless/02810_async_insert_dedup_replicated_collapsing.sh b/tests/queries/0_stateless/02810_async_insert_dedup_replicated_collapsing.sh index 804cd894ebc..57950af8975 100755 --- a/tests/queries/0_stateless/02810_async_insert_dedup_replicated_collapsing.sh +++ b/tests/queries/0_stateless/02810_async_insert_dedup_replicated_collapsing.sh @@ -7,7 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS 02810_async_insert_dedup_collapsing" ${CLICKHOUSE_CLIENT} -q "CREATE TABLE 02810_async_insert_dedup_collapsing (stringvalue String, sign Int8) ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/{database}/02810_async_insert_dedup', 'r1', sign) ORDER BY stringvalue" -url="${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1&async_insert_busy_timeout_ms=3000&async_insert_deduplicate=1" +url="${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1&async_insert_busy_timeout_ms=3000&async_insert_use_adaptive_busy_timeout=0&async_insert_deduplicate=1" # insert value with same key and sign so it's collapsed on insert ${CLICKHOUSE_CURL} -sS "$url" -d "INSERT INTO 02810_async_insert_dedup_collapsing VALUES ('string1', 1)" & @@ -36,4 +36,4 @@ wait ${CLICKHOUSE_CLIENT} -q "SELECT stringvalue FROM 02810_async_insert_dedup_collapsing ORDER BY stringvalue" ${CLICKHOUSE_CLIENT} -q "SELECT '------------'" -${CLICKHOUSE_CLIENT} -q "DROP TABLE 02810_async_insert_dedup_collapsing" \ No newline at end of file +${CLICKHOUSE_CLIENT} -q "DROP TABLE 02810_async_insert_dedup_collapsing" diff --git a/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference b/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference index dc30e7f8371..28dae705335 100644 --- a/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference +++ b/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference @@ -1,4 +1,4 @@ 
-[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0]] -[[4.04452e-8,-1.7846537e-8,-5.9488454e-9,0,0,0,0,0,0,-1.9868216e-8,-9.5297715e-8,2.2540547e-9,3.4229203e-8,8.573613e-8],[1.9999999,2,2,2,2,2,2,2,2,2,2,2,1.9999996,1.9999996],[1.1920929e-7,0,0,0,0,0,0,0,0,0,0,0,3.5762787e-7,2.3841858e-7]] -[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0]] -[[53.946846,-4.8119445,43.525013,-23.71359,-42.472305,-51.636955,-50.458298,-51.982674,37.62072,-15.9006605,56.65076,-5.809669,57.143845,-2.0370207,54.050922,-4.897961,43.954018,-23.808758,-42.651337,-51.86827,-50.709732,-52.18156,37.734905,-15.853402,56.91643,-5.8815174,57.253094,-2.012879,54.157806,-4.9817176,44.384747,-23.902956,-42.830154,-52.10025,-50.96271,-52.3829,37.84573,-15.81032,57.177113,-5.958963,57.356136,-1.9952412,54.27533,-5.066312,44.878296,-23.956438,-42.993656,-52.337124,-51.208073,-52.615646,37.91102,-15.8062525,57.49891,-6.056076,57.45604,-1.9797823,54.39525,-5.1483474,45.374573],[88.028534,88.95315,89.87776,90.802376,91.64913,92.49588,93.342636,94.19737,95.0521,95.90684,96.712975,97.51912,98.32526,98.36342,98.40158,98.43974,98.36777,98.29579,98.223816,98.536446,98.849075,99.161705,99.7552,100.348694,100.94219,101.53184,102.12149,102.711136,103.79921,104.88729,105.975365,107.50462,109.033875,110.56313,111.79767,113.032196,114.26673,115.02128,115.775826,116.53037,117.15541,117.78044,118.40548,118.86489,119.3243,119.783714,120.04031,120.29691,120.55351,120.78621,121.01891,121.25161,121.533585,121.81555,122.09753,122.41821,122.7389,123.059586,123.39267],[-2.97538,2.8587952,-23.402771,0.91121674,4.8231735,9.141075,8.115662,10.785301,0.32717896,5.99382,-12.363731,5.29055,0.53089905,-2.3264008,-3.4524994,1.4582214,-2.321785,2.51297,5.4275208,3.3318253,5.8606567,0.019859314,-4.4901123,-12.495293,-5.8586197,-1.650322,-11.374588,4.3017426,4.042984,1.094429,9.639885,3.3983307,-3.20372,-5.462883,-5.834961,-6.649292,-1.1124649,3.7890396,16.047066,-2.5714111,8.488449,-2.785202,2.319191,-0.79857635,13.797401,-5.827278,-6.0466614,-5.9597855,-7.3454437,-3.1705627,6.0700684,3.5546417,1.9675064,-0.7594757,2.446434,0.5615692,0.86585236,-3.9112396,1.2327576]] 
+[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0],[10.1,20.449999,40.340004,10.100001,20.45,40.34,10.100001,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.100002,20.45,40.34]] +[[4.04452e-8,-1.7846537e-8,-5.9488454e-9,0,0,0,0,0,0,-1.9868216e-8,-9.5297715e-8,2.2540547e-9,3.4229203e-8,8.573613e-8],[1.9999999,2,2,2,2,2,2,2,2,2,2,2,1.9999996,1.9999996],[1.1920929e-7,0,0,0,0,0,0,0,0,0,0,0,3.5762787e-7,2.3841858e-7],[1.9999999,2,2,2,2,2,2,2,2,2,1.9999999,2,1.9999996,1.9999998]] +[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0],[10.1,20.449999,40.340004,10.100001,20.45,40.34,10.100001,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.100002,20.45,40.34]] +[[53.946846,-4.8119445,43.525013,-23.71359,-42.472305,-51.636955,-50.458298,-51.982674,37.62072,-15.9006605,56.65076,-5.809669,57.143845,-2.0370207,54.050922,-4.897961,43.954018,-23.808758,-42.651337,-51.86827,-50.709732,-52.18156,37.734905,-15.853402,56.91643,-5.8815174,57.253094,-2.012879,54.157806,-4.9817176,44.384747,-23.902956,-42.830154,-52.10025,-50.96271,-52.3829,37.84573,-15.81032,57.177113,-5.958963,57.356136,-1.9952412,54.27533,-5.066312,44.878296,-23.956438,-42.993656,-52.337124,-51.208073,-52.615646,37.91102,-15.8062525,57.49891,-6.056076,57.45604,-1.9797823,54.39525,-5.1483474,45.374573],[88.028534,88.95315,89.87776,90.802376,91.64913,92.49588,93.342636,94.19737,95.0521,95.90684,96.712975,97.51912,98.32526,98.36342,98.40158,98.43974,98.36777,98.29579,98.223816,98.536446,98.849075,99.161705,99.7552,100.348694,100.94219,101.53184,102.12149,102.711136,103.79921,104.88729,105.975365,107.50462,109.033875,110.56313,111.79767,113.032196,114.26673,115.02128,115.775826,116.53037,117.15541,117.78044,118.40548,118.86489,119.3243,119.783714,120.04031,120.29691,120.55351,120.78621,121.01891,121.25161,121.533585,121.81555,122.09753,122.41821,122.7389,123.059586,123.39267],[-2.97538,2.8587952,-23.402771,0.91121674,4.8231735,9.141075,8.115662,10.785301,0.32717896,5.99382,-12.363731,5.29055,0.53089905,-2.3264008,-3.4524994,1.4582214,-2.321785,2.51297,5.4275208,3.3318253,5.8606567,0.019859314,-4.4901123,-12.495293,-5.8586197,-1.650322,-11.374588,4.3017426,4.042984,1.094429,9.639885,3.3983307,-3.20372,-5.462883,-5.834961,-6.649292,-1.1124649,3.7890396,16.047066,-2.5714111,8.488449,-2.785202,2.319191,-0.79857635,13.797401,-5.827278,-6.0466614,-5.9597855,-7.3454437,-3.1705627,6.0700684,3.5546417,1.9675064,-0.7594757,2.446434,0.5615692,0.86585236,-3.9112396,1.2327576],[141.97537,84.141205,133.40277,67.08878,49.1
76826,40.858925,42.88434,42.2147,132.67282,80.00618,153.36374,91.70945,155.4691,96.3264,152.4525,93.54178,142.32178,74.48703,55.57248,46.668175,48.139343,46.980145,137.49011,84.49529,157.85863,95.65032,159.37459,100.69826,157.95702,99.90557,150.3601,83.60167,66.20372,58.462883,60.834957,60.649296,152.11246,99.21096,172.95294,110.57141,174.51155,115.7852,172.68082,113.79858,164.2026,95.82728,77.04666,67.95979,69.34544,68.17056,158.92993,105.44536,179.0325,115.759476,179.55356,120.43843,177.13416,117.91124,168.76724]] diff --git a/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference new file mode 100644 index 00000000000..85c65ab10ba --- /dev/null +++ b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference @@ -0,0 +1,12 @@ +[-4.475000000000001,0,6.925000000000001,0,0,0,0,0,0,0,0,7.925000000000001,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.975,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] +[-2.4999999999999996,0,5.1,0,0,0,0,0,2.0999999999999996,50.1,2.0999999999999996,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0] diff --git a/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql new file mode 100644 index 00000000000..ca116e8b7ed --- /dev/null +++ b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql @@ -0,0 +1,32 @@ +-- Tags: no-cpu-aarch64 +-- Tag no-cpu-aarch64: values generated are slightly different on aarch64 + +DROP TABLE IF EXISTS tb1; + +CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; +INSERT INTO tb1 VALUES (1, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 3, 4, 5, 16, 7, 5, 5, 4]), (2, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 45, 12, 3.40, 3, 4, 5, 6]); + +-- non-const inputs +SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n; +SELECT seriesOutliersDetectTukey(a,10,90,1.5) FROM tb1 ORDER BY n; +DROP TABLE IF EXISTS tb1; + +-- const inputs +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6]); +SELECT seriesOutliersDetectTukey([-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 60, 12, 3.40, 3, 4, 5, 6, 3.40, 2.7]); + +-- const inputs with optional arguments +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 25, 75, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 10, 90, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 2, 98, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 2, 98, 1.5); +SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, 3); + +-- negative tests +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, -1); -- { serverError BAD_ARGUMENTS} +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33, 53); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN} +SELECT 
seriesOutliersDetectTukey([]); -- { serverError ILLEGAL_COLUMN} +SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS} \ No newline at end of file diff --git a/tests/queries/0_stateless/02832_alter_max_sessions_for_user.sh b/tests/queries/0_stateless/02832_alter_max_sessions_for_user.sh index 546c54a4de9..a3b0d17f1be 100755 --- a/tests/queries/0_stateless/02832_alter_max_sessions_for_user.sh +++ b/tests/queries/0_stateless/02832_alter_max_sessions_for_user.sh @@ -23,14 +23,14 @@ function test_alter_profile() ${CLICKHOUSE_CLIENT} -q $"ALTER SETTINGS PROFILE ${PROFILE} SETTINGS max_sessions_for_user = ${max_session_count}" - # Create sesssions with $max_session_count resriction + # Create sessions with $max_session_count restriction for ((i = 1 ; i <= ${max_session_count} ; i++)); do local session_id="${SESSION_ID_PREFIX}_${i}" # Skip output from this query ${CLICKHOUSE_CURL} -sS -X POST "${CLICKHOUSE_URL}&user=${USER}&session_id=${session_id}&session_check=0" --data-binary "SELECT 1" > /dev/null done - # Update resriction to $alter_sessions_count + # Update restriction to $alter_sessions_count ${CLICKHOUSE_CLIENT} -q $"ALTER SETTINGS PROFILE ${PROFILE} SETTINGS max_sessions_for_user = ${alter_sessions_count}" # Simultaneous sessions should use max settings from profile ($alter_sessions_count) diff --git a/tests/queries/0_stateless/02841_group_array_sorted.reference b/tests/queries/0_stateless/02841_group_array_sorted.reference new file mode 100644 index 00000000000..1043f949590 --- /dev/null +++ b/tests/queries/0_stateless/02841_group_array_sorted.reference @@ -0,0 +1,12 @@ +[0,1,2,3,4] +[0,1,2,3,4] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99] +['0','1','10','11','12','13','14','15','16','17','18','19','2','20','21','22','23','24','25','26','27','28','29','3','4','5','6','7','8','9'] +[0,0,1,1,2,2,3,3,4,4] +[[1,2,3,4],[2,3,4,5],[3,4,5,6]] +[(2,1),(15,25),(30,60),(100,200)] +[0.2,2.2,6.6,12.5] +['AAA','Aaa','aaa','abc','bbc'] +1000000 +1000000 +[0,1] diff --git a/tests/queries/0_stateless/02841_group_array_sorted.sql b/tests/queries/0_stateless/02841_group_array_sorted.sql new file mode 100644 index 00000000000..a8cd6791ff3 --- /dev/null +++ b/tests/queries/0_stateless/02841_group_array_sorted.sql @@ -0,0 +1,41 @@ +SELECT groupArraySorted(5)(number) FROM numbers(100); + +SELECT groupArraySorted(10)(number) FROM numbers(5); + +SELECT groupArraySorted(100)(number) FROM numbers(1000); + +SELECT groupArraySorted(30)(str) FROM (SELECT toString(number) as str FROM numbers(30)); + +SELECT groupArraySorted(10)(toInt64(number/2)) FROM numbers(100); + +DROP TABLE IF EXISTS test; +CREATE TABLE test (a Array(UInt64)) engine=MergeTree ORDER BY a; +INSERT INTO test VALUES ([3,4,5,6]), ([1,2,3,4]), ([2,3,4,5]); +SELECT groupArraySorted(3)(a) FROM test; +DROP TABLE test; + +CREATE TABLE IF NOT EXISTS test (id Int32, data Tuple(Int32, Int32)) ENGINE = MergeTree() ORDER BY id; +INSERT INTO test (id, data) VALUES (1, (100, 200)), (2, (15, 25)), (3, (2, 1)), (4, (30, 60)); +SELECT groupArraySorted(4)(data) FROM test; +DROP TABLE test; + +CREATE TABLE IF NOT EXISTS test (id Int32, data Decimal32(2)) ENGINE = MergeTree() ORDER BY id; +INSERT INTO test (id, data) VALUES (1, 12.5), (2, 0.2), (3, 6.6), (4, 2.2); +SELECT groupArraySorted(4)(data) 
FROM test; +DROP TABLE test; + +CREATE TABLE IF NOT EXISTS test (id Int32, data FixedString(3)) ENGINE = MergeTree() ORDER BY id; +INSERT INTO test (id, data) VALUES (1, 'AAA'), (2, 'bbc'), (3, 'abc'), (4, 'aaa'), (5, 'Aaa'); +SELECT groupArraySorted(5)(data) FROM test; +DROP TABLE test; + +CREATE TABLE test (id Decimal(76, 53), str String) ENGINE = MergeTree ORDER BY id; +INSERT INTO test SELECT number, 'test' FROM numbers(1000000); +SELECT count(id) FROM test; +SELECT count(concat(toString(id), 'a')) FROM test; +DROP TABLE test; + +CREATE TABLE test (id UInt64, agg AggregateFunction(groupArraySorted(2), UInt64)) engine=MergeTree ORDER BY id; +INSERT INTO test SELECT 1, groupArraySortedState(2)(number) FROM numbers(10); +SELECT groupArraySortedMerge(2)(agg) FROM test; +DROP TABLE test; diff --git a/tests/queries/0_stateless/02841_not_ready_set_bug.sh b/tests/queries/0_stateless/02841_not_ready_set_bug.sh index fd7f62d28bf..3aaffe51578 100755 --- a/tests/queries/0_stateless/02841_not_ready_set_bug.sh +++ b/tests/queries/0_stateless/02841_not_ready_set_bug.sh @@ -9,3 +9,4 @@ $CLICKHOUSE_CLIENT -q "create table t1 (number UInt64) engine = MergeTree order $CLICKHOUSE_CLIENT -q "insert into t1 select number from numbers(10);" $CLICKHOUSE_CLIENT --max_threads=2 --max_result_rows=1 --result_overflow_mode=break -q "with tab as (select min(number) from t1 prewhere number in (select number from view(select number, row_number() OVER (partition by number % 2 ORDER BY number DESC) from numbers_mt(1e4)) where number != 2 order by number)) select number from t1 union all select * from tab;" > /dev/null +$CLICKHOUSE_CLIENT -q "SELECT * FROM system.tables WHERE 1 in (SELECT number from numbers(2)) AND database = currentDatabase() format Null" diff --git a/tests/queries/0_stateless/02870_per_column_settings.reference b/tests/queries/0_stateless/02870_per_column_settings.reference index 144c8c5ee2e..c2ae34928bd 100644 --- a/tests/queries/0_stateless/02870_per_column_settings.reference +++ b/tests/queries/0_stateless/02870_per_column_settings.reference @@ -1,10 +1,14 @@ CREATE TABLE default.tab\n(\n `id` UInt64,\n `long_string` String SETTINGS (min_compress_block_size = 163840, max_compress_block_size = 163840),\n `v1` String,\n `v2` UInt64,\n `v3` Float32,\n `v4` Float64\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/default/tab/2870\', \'r1\')\nORDER BY id\nSETTINGS min_bytes_for_wide_part = 1, index_granularity = 8192 1000 +ALTER TABLE tab\n MODIFY COLUMN `long_string` MODIFY SETTING min_compress_block_size = 8192 CREATE TABLE default.tab\n(\n `id` UInt64,\n `long_string` String SETTINGS (min_compress_block_size = 8192, max_compress_block_size = 163840),\n `v1` String,\n `v2` UInt64,\n `v3` Float32,\n `v4` Float64\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/default/tab/2870\', \'r1\')\nORDER BY id\nSETTINGS min_bytes_for_wide_part = 1, index_granularity = 8192 +ALTER TABLE tab\n MODIFY COLUMN `long_string` RESET SETTING min_compress_block_size CREATE TABLE default.tab\n(\n `id` UInt64,\n `long_string` String SETTINGS (max_compress_block_size = 163840),\n `v1` String,\n `v2` UInt64,\n `v3` Float32,\n `v4` Float64\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/default/tab/2870\', \'r1\')\nORDER BY id\nSETTINGS min_bytes_for_wide_part = 1, index_granularity = 8192 +ALTER TABLE tab\n MODIFY COLUMN `long_string` REMOVE SETTINGS CREATE TABLE default.tab\n(\n `id` UInt64,\n `long_string` String,\n `v1` String,\n `v2` UInt64,\n `v3` Float32,\n `v4` Float64\n)\nENGINE = 
ReplicatedMergeTree(\'/clickhouse/tables/default/tab/2870\', \'r1\')\nORDER BY id\nSETTINGS min_bytes_for_wide_part = 1, index_granularity = 8192 +ALTER TABLE tab\n MODIFY COLUMN `long_string` String SETTINGS (min_compress_block_size = 163840, max_compress_block_size = 163840) CREATE TABLE default.tab\n(\n `id` UInt64,\n `long_string` String SETTINGS (min_compress_block_size = 163840, max_compress_block_size = 163840),\n `v1` String,\n `v2` UInt64,\n `v3` Float32,\n `v4` Float64\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/default/tab/2870\', \'r1\')\nORDER BY id\nSETTINGS min_bytes_for_wide_part = 1, index_granularity = 8192 ---- +--- (0,0) 0 (1,1) 1 (2,2) 2 @@ -15,4 +19,4 @@ CREATE TABLE default.tab\n(\n `id` UInt64,\n `long_string` String SETTINGS (7,7) 7 (8,8) 8 (9,9) 9 ---- +--- diff --git a/tests/queries/0_stateless/02870_per_column_settings.sql b/tests/queries/0_stateless/02870_per_column_settings.sql index 345cf5cc744..d242ebe6c61 100644 --- a/tests/queries/0_stateless/02870_per_column_settings.sql +++ b/tests/queries/0_stateless/02870_per_column_settings.sql @@ -23,21 +23,25 @@ SHOW CREATE tab; INSERT INTO TABLE tab SELECT number, randomPrintableASCII(1000), randomPrintableASCII(10), rand(number), rand(number+1), rand(number+2) FROM numbers(1000); SELECT count() FROM tab; +SELECT formatQuery('ALTER TABLE tab MODIFY COLUMN long_string MODIFY SETTING min_compress_block_size = 8192;'); ALTER TABLE tab MODIFY COLUMN long_string MODIFY SETTING min_compress_block_size = 8192; SHOW CREATE tab; +SELECT formatQuery('ALTER TABLE tab MODIFY COLUMN long_string RESET SETTING min_compress_block_size;'); ALTER TABLE tab MODIFY COLUMN long_string RESET SETTING min_compress_block_size; SHOW CREATE tab; +SELECT formatQuery('ALTER TABLE tab MODIFY COLUMN long_string REMOVE SETTINGS;'); ALTER TABLE tab MODIFY COLUMN long_string REMOVE SETTINGS; SHOW CREATE tab; +SELECT formatQuery('ALTER TABLE tab MODIFY COLUMN long_string String SETTINGS (min_compress_block_size = 163840, max_compress_block_size = 163840);'); ALTER TABLE tab MODIFY COLUMN long_string String SETTINGS (min_compress_block_size = 163840, max_compress_block_size = 163840); SHOW CREATE tab; DROP TABLE tab; -SELECT '--- '; +SELECT '---'; SET allow_experimental_object_type = 1; @@ -56,7 +60,7 @@ SELECT tup, json.key AS key FROM tab ORDER BY key LIMIT 10; DROP TABLE tab; -SELECT '--- '; +SELECT '---'; -- Unsupported column-level settings are rejected CREATE TABLE tab diff --git a/tests/queries/0_stateless/02880_indexHint__partition_id.reference b/tests/queries/0_stateless/02880_indexHint__partition_id.reference index 365e7b676c7..2cdd2cc1954 100644 --- a/tests/queries/0_stateless/02880_indexHint__partition_id.reference +++ b/tests/queries/0_stateless/02880_indexHint__partition_id.reference @@ -1,9 +1,10 @@ -- { echoOn } select * from data prewhere indexHint(_partition_id = '1'); 1 -select count() from data prewhere indexHint(_partition_id = '1'); +-- TODO: optimize_use_implicit_projections ignores indexHint (with analyzer) because source columns might be aliased. 
+select count() from data prewhere indexHint(_partition_id = '1') settings optimize_use_implicit_projections = 0; 1 select * from data where indexHint(_partition_id = '1'); 1 -select count() from data where indexHint(_partition_id = '1'); +select count() from data where indexHint(_partition_id = '1') settings optimize_use_implicit_projections = 0; 1 diff --git a/tests/queries/0_stateless/02880_indexHint__partition_id.sql b/tests/queries/0_stateless/02880_indexHint__partition_id.sql index d15b3f4ccea..9d5dc7bcbc2 100644 --- a/tests/queries/0_stateless/02880_indexHint__partition_id.sql +++ b/tests/queries/0_stateless/02880_indexHint__partition_id.sql @@ -4,6 +4,7 @@ insert into data values (1)(2); -- { echoOn } select * from data prewhere indexHint(_partition_id = '1'); -select count() from data prewhere indexHint(_partition_id = '1'); +-- TODO: optimize_use_implicit_projections ignores indexHint (with analyzer) because source columns might be aliased. +select count() from data prewhere indexHint(_partition_id = '1') settings optimize_use_implicit_projections = 0; select * from data where indexHint(_partition_id = '1'); -select count() from data where indexHint(_partition_id = '1'); +select count() from data where indexHint(_partition_id = '1') settings optimize_use_implicit_projections = 0; diff --git a/tests/queries/0_stateless/02882_primary_key_index_in_function_different_types.reference b/tests/queries/0_stateless/02882_primary_key_index_in_function_different_types.reference index f34aad737d4..7a5e798359b 100644 --- a/tests/queries/0_stateless/02882_primary_key_index_in_function_different_types.reference +++ b/tests/queries/0_stateless/02882_primary_key_index_in_function_different_types.reference @@ -1,6 +1,6 @@ -CreatingSets (Create sets before main query execution) - Expression ((Projection + Before ORDER BY)) - ReadFromMergeTree (default.test_table) +CreatingSets + Expression + ReadFromMergeTree Indexes: PrimaryKey Keys: @@ -9,9 +9,9 @@ CreatingSets (Create sets before main query execution) Condition: and((id in (-Inf, 10]), (value in 1-element set)) Parts: 1/1 Granules: 1/1 -CreatingSets (Create sets before main query execution) - Expression ((Projection + Before ORDER BY)) - ReadFromMergeTree (default.test_table) +CreatingSets + Expression + ReadFromMergeTree Indexes: PrimaryKey Keys: @@ -20,9 +20,9 @@ CreatingSets (Create sets before main query execution) Condition: and((id in (-Inf, 10]), (value in 1-element set)) Parts: 1/1 Granules: 1/1 -CreatingSets (Create sets before main query execution) - Expression ((Projection + Before ORDER BY)) - ReadFromMergeTree (default.test_table) +CreatingSets + Expression + ReadFromMergeTree Indexes: PrimaryKey Keys: @@ -31,53 +31,9 @@ CreatingSets (Create sets before main query execution) Condition: and((id in (-Inf, 10]), (value in 5-element set)) Parts: 1/1 Granules: 1/1 -CreatingSets (Create sets before main query execution) - Expression ((Projection + Before ORDER BY)) - ReadFromMergeTree (default.test_table) - Indexes: - PrimaryKey - Keys: - id - value - Condition: and((id in (-Inf, 10]), (value in 5-element set)) - Parts: 1/1 - Granules: 1/1 -CreatingSets (Create sets before main query execution) - Expression ((Project names + Projection)) - ReadFromMergeTree (default.test_table) - Indexes: - PrimaryKey - Keys: - id - value - Condition: and((id in (-Inf, 10]), (value in 1-element set)) - Parts: 1/1 - Granules: 1/1 -CreatingSets (Create sets before main query execution) - Expression ((Project names + Projection)) - ReadFromMergeTree 
(default.test_table) - Indexes: - PrimaryKey - Keys: - id - value - Condition: and((id in (-Inf, 10]), (value in 1-element set)) - Parts: 1/1 - Granules: 1/1 -CreatingSets (Create sets before main query execution) - Expression ((Project names + Projection)) - ReadFromMergeTree (default.test_table) - Indexes: - PrimaryKey - Keys: - id - value - Condition: and((id in (-Inf, 10]), (value in 5-element set)) - Parts: 1/1 - Granules: 1/1 -CreatingSets (Create sets before main query execution) - Expression ((Project names + Projection)) - ReadFromMergeTree (default.test_table) +CreatingSets + Expression + ReadFromMergeTree Indexes: PrimaryKey Keys: @@ -86,3 +42,51 @@ CreatingSets (Create sets before main query execution) Condition: and((id in (-Inf, 10]), (value in 5-element set)) Parts: 1/1 Granules: 1/1 +CreatingSets + Expression + Expression + ReadFromMergeTree + Indexes: + PrimaryKey + Keys: + id + value + Condition: and((value in 1-element set), (id in (-Inf, 10])) + Parts: 1/1 + Granules: 1/1 +CreatingSets + Expression + Expression + ReadFromMergeTree + Indexes: + PrimaryKey + Keys: + id + value + Condition: and((value in 1-element set), (id in (-Inf, 10])) + Parts: 1/1 + Granules: 1/1 +CreatingSets + Expression + Expression + ReadFromMergeTree + Indexes: + PrimaryKey + Keys: + id + value + Condition: and((value in 5-element set), (id in (-Inf, 10])) + Parts: 1/1 + Granules: 1/1 +CreatingSets + Expression + Expression + ReadFromMergeTree + Indexes: + PrimaryKey + Keys: + id + value + Condition: and((value in 5-element set), (id in (-Inf, 10])) + Parts: 1/1 + Granules: 1/1 diff --git a/tests/queries/0_stateless/02882_primary_key_index_in_function_different_types.sql b/tests/queries/0_stateless/02882_primary_key_index_in_function_different_types.sql index 077c49fb22e..1b1a7607344 100644 --- a/tests/queries/0_stateless/02882_primary_key_index_in_function_different_types.sql +++ b/tests/queries/0_stateless/02882_primary_key_index_in_function_different_types.sql @@ -7,18 +7,18 @@ CREATE TABLE test_table INSERT INTO test_table SELECT number, number FROM numbers(10); -SET allow_experimental_analyzer = 0; +set allow_experimental_analyzer = 0; -EXPLAIN indexes = 1 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT 5); -EXPLAIN indexes = 1 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT '5'); -EXPLAIN indexes = 1 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT toUInt8(number) FROM numbers(5)); -EXPLAIN indexes = 1 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT toString(number) FROM numbers(5)); +EXPLAIN indexes = 1, description=0 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT 5); +EXPLAIN indexes = 1, description=0 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT '5'); +EXPLAIN indexes = 1, description=0 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT toUInt8(number) FROM numbers(5)); +EXPLAIN indexes = 1, description=0 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT toString(number) FROM numbers(5)); -SET allow_experimental_analyzer = 1; +set allow_experimental_analyzer = 1; -EXPLAIN indexes = 1 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT 5); -EXPLAIN indexes = 1 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT '5'); -EXPLAIN indexes = 1 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT toUInt8(number) FROM numbers(5)); -EXPLAIN indexes = 1 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT toString(number) FROM numbers(5)); +EXPLAIN 
indexes = 1, description=0 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT 5); +EXPLAIN indexes = 1, description=0 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT '5'); +EXPLAIN indexes = 1, description=0 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT toUInt8(number) FROM numbers(5)); +EXPLAIN indexes = 1, description=0 SELECT id FROM test_table WHERE id <= 10 AND value IN (SELECT toString(number) FROM numbers(5)); DROP TABLE test_table; diff --git a/tests/queries/0_stateless/02884_async_insert_native_protocol_1.sh b/tests/queries/0_stateless/02884_async_insert_native_protocol_1.sh index 82e2bb709f9..7f583087336 100755 --- a/tests/queries/0_stateless/02884_async_insert_native_protocol_1.sh +++ b/tests/queries/0_stateless/02884_async_insert_native_protocol_1.sh @@ -12,7 +12,7 @@ $CLICKHOUSE_CLIENT -n -q " CREATE TABLE t_async_insert_native_1 (id UInt64, s String) ENGINE = MergeTree ORDER BY id; " -async_insert_options="--async_insert 1 --wait_for_async_insert 0 --async_insert_busy_timeout_ms 1000000" +async_insert_options="--async_insert 1 --wait_for_async_insert 0 --async_insert_busy_timeout_min_ms 1000000 --async_insert_busy_timeout_max_ms 10000000" echo '{"id": 1, "s": "aaa"} {"id": 2, "s": "bbb"}' | $CLICKHOUSE_CLIENT $async_insert_options -q 'INSERT INTO t_async_insert_native_1 FORMAT JSONEachRow' $CLICKHOUSE_CLIENT $async_insert_options -q 'INSERT INTO t_async_insert_native_1 FORMAT JSONEachRow {"id": 3, "s": "ccc"}' diff --git a/tests/queries/0_stateless/02884_async_insert_native_protocol_3.sh b/tests/queries/0_stateless/02884_async_insert_native_protocol_3.sh index abe6be9e2bc..c9d399607d0 100755 --- a/tests/queries/0_stateless/02884_async_insert_native_protocol_3.sh +++ b/tests/queries/0_stateless/02884_async_insert_native_protocol_3.sh @@ -12,7 +12,7 @@ $CLICKHOUSE_CLIENT -n -q " CREATE TABLE t_async_insert_native_3 (id UInt64, s String) ENGINE = MergeTree ORDER BY id; " -async_insert_options="--async_insert 1 --wait_for_async_insert 0 --async_insert_busy_timeout_ms 1000000" +async_insert_options="--async_insert 1 --wait_for_async_insert 0 --async_insert_busy_timeout_min_ms 1000000 --async_insert_busy_timeout_max_ms 10000000" echo '{"id": 1, "s": "aaa"} {"id": 2, "s": "bbb"}' | $CLICKHOUSE_CLIENT $async_insert_options -q 'INSERT INTO t_async_insert_native_3 FORMAT JSONEachRow' echo "(3, 'ccc') (4, 'ddd') (5, 'eee')" | $CLICKHOUSE_CLIENT $async_insert_options -q 'INSERT INTO t_async_insert_native_3 FORMAT Values' diff --git a/tests/queries/0_stateless/02884_async_insert_skip_settings.sql b/tests/queries/0_stateless/02884_async_insert_skip_settings.sql index facd39d1079..9bc689fb4ec 100644 --- a/tests/queries/0_stateless/02884_async_insert_skip_settings.sql +++ b/tests/queries/0_stateless/02884_async_insert_skip_settings.sql @@ -9,7 +9,8 @@ ORDER BY id; SET async_insert = 1; SET async_insert_deduplicate = 1; SET wait_for_async_insert = 0; -SET async_insert_busy_timeout_ms = 100000; +SET async_insert_busy_timeout_min_ms = 100000; +SET async_insert_busy_timeout_max_ms = 1000000; SET insert_deduplication_token = '1'; SET log_comment = 'async_insert_skip_settings_1'; diff --git a/tests/queries/0_stateless/02884_authentication_quota.reference b/tests/queries/0_stateless/02884_authentication_quota.reference new file mode 100644 index 00000000000..638034bab82 --- /dev/null +++ b/tests/queries/0_stateless/02884_authentication_quota.reference @@ -0,0 +1,54 @@ +> Drop the user, quota, and role if those were created. 
+> Create the user with quota with the maximum single authentication attempt.
+> Check if the quota has been created.
+1
+> Try to login to the user account with correct password
+> Login to the user account using the wrong password.
+password is incorrect
+> Quota is exceeded 1 >= 1. Login with correct password should fail.
+QUOTA_EXCEEDED
+> Check the failed_sequential_authentications, max_failed_sequential_authentications fields.
+2 1
+> Alter the quota with MAX FAILED SEQUENTIAL AUTHENTICATIONS = 4
+> Try to login to the user account with correct password
+> Successful login should reset failed authentications counter. Check the failed_sequential_authentications, max_failed_sequential_authentications fields.
+0 4
+> Login to the user account using the wrong password before exceeding the quota.
+password is incorrect
+password is incorrect
+password is incorrect
+password is incorrect
+QUOTA_EXCEEDED
+> Also try to login with correct password. Quota should stay exceeded.
+QUOTA_EXCEEDED
+> Check the failed_sequential_authentications, max_failed_sequential_authentications fields.
+6 4
+> Reset the quota by increasing MAX FAILED SEQUENTIAL AUTHENTICATIONS and successful login
+> and check failed_sequential_authentications, max_failed_sequential_authentications.
+0 7
+ ---------------------------------------------------------------------------
+> Create the role with quota with the maximum single authentication attempt.
+> Try to login to the user account with correct password
+> Login to the user account using the wrong password.
+password is incorrect
+> Quota is exceeded 1 >= 1. Login with correct password should fail.
+QUOTA_EXCEEDED
+> Check the failed_sequential_authentications, max_failed_sequential_authentications fields.
+2 1
+> Alter the quota with MAX FAILED SEQUENTIAL AUTHENTICATIONS = 4
+> Try to login to the user account with correct password
+> Successful login should reset failed authentications counter. Check the failed_sequential_authentications, max_failed_sequential_authentications fields.
+0 4
+> Login to the user account using the wrong password before exceeding the quota.
+password is incorrect
+password is incorrect
+password is incorrect
+password is incorrect
+QUOTA_EXCEEDED
+> Also try to login with correct password. Quota should stay exceeded.
+QUOTA_EXCEEDED
+> Check the failed_sequential_authentications, max_failed_sequential_authentications fields.
+6 4
+> Reset the quota by increasing MAX FAILED SEQUENTIAL AUTHENTICATIONS and successful login
+> and check failed_sequential_authentications, max_failed_sequential_authentications.
+0 7
diff --git a/tests/queries/0_stateless/02884_authentication_quota.sh b/tests/queries/0_stateless/02884_authentication_quota.sh
new file mode 100755
index 00000000000..f013bb4d639
--- /dev/null
+++ b/tests/queries/0_stateless/02884_authentication_quota.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+# Tags: no-parallel
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+QUOTA="2884_quota_$$"
+USER="2884_user_$$"
+ROLE="2884_role_$$"
+
+
+function login_test()
+{
+    echo "> Try to login to the user account with correct password"
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "pass" --query "select 1 format Null"
+
+    echo "> Login to the user account using the wrong password."
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "wrong_pass" --query "select 1 format Null" 2>&1 | grep -m1 -o 'password is incorrect'
+
+    echo "> Quota is exceeded 1 >= 1. Login with correct password should fail."
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "pass" --query "select 1 format Null" 2>&1 | grep -m1 -o 'QUOTA_EXCEEDED'
+
+    echo "> Check the failed_sequential_authentications, max_failed_sequential_authentications fields."
+    ${CLICKHOUSE_CLIENT} -q "SELECT failed_sequential_authentications, max_failed_sequential_authentications FROM system.quotas_usage WHERE quota_name = '${QUOTA}'"
+
+    echo "> Alter the quota with MAX FAILED SEQUENTIAL AUTHENTICATIONS = 4"
+    ${CLICKHOUSE_CLIENT} -q "ALTER QUOTA ${QUOTA} FOR INTERVAL 100 YEAR MAX FAILED SEQUENTIAL AUTHENTICATIONS = 4 TO ${USER}"
+
+    echo "> Try to login to the user account with correct password"
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "pass" --query "select 1 format Null"
+
+    echo "> Successful login should reset failed authentications counter. Check the failed_sequential_authentications, max_failed_sequential_authentications fields."
+    ${CLICKHOUSE_CLIENT} -q "SELECT failed_sequential_authentications, max_failed_sequential_authentications FROM system.quotas_usage WHERE quota_name = '${QUOTA}'"
+
+    echo "> Login to the user account using the wrong password before exceeding the quota."
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "wrong_pass" --query "select 1 format Null" 2>&1 | grep -m1 -o 'password is incorrect'
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "wrong_pass" --query "select 1 format Null" 2>&1 | grep -m1 -o 'password is incorrect'
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "wrong_pass" --query "select 1 format Null" 2>&1 | grep -m1 -o 'password is incorrect'
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "wrong_pass" --query "select 1 format Null" 2>&1 | grep -m1 -o 'password is incorrect'
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "wrong_pass" --query "select 1 format Null" 2>&1 | grep -m1 -o 'QUOTA_EXCEEDED'
+
+    echo "> Also try to login with correct password. Quota should stay exceeded."
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "pass" --query "select 1 format Null" 2>&1 | grep -m1 -o 'QUOTA_EXCEEDED'
+
+    echo "> Check the failed_sequential_authentications, max_failed_sequential_authentications fields."
+    ${CLICKHOUSE_CLIENT} -q "SELECT failed_sequential_authentications, max_failed_sequential_authentications FROM system.quotas_usage WHERE quota_name = '${QUOTA}'"
+
+    echo "> Reset the quota by increasing MAX FAILED SEQUENTIAL AUTHENTICATIONS and successful login"
+    echo "> and check failed_sequential_authentications, max_failed_sequential_authentications."
+    ${CLICKHOUSE_CLIENT} -q "ALTER QUOTA ${QUOTA} FOR INTERVAL 100 YEAR MAX FAILED SEQUENTIAL AUTHENTICATIONS = 7 TO ${USER}"
+    ${CLICKHOUSE_CLIENT} --user ${USER} --password "pass" --query "select 1 format Null"
+    ${CLICKHOUSE_CLIENT} -q "SELECT failed_sequential_authentications, max_failed_sequential_authentications FROM system.quotas_usage WHERE quota_name = '${QUOTA}'"
+}
+
+echo "> Drop the user, quota, and role if those were created."
+${CLICKHOUSE_CLIENT} -q "DROP USER IF EXISTS ${USER}"
+${CLICKHOUSE_CLIENT} -q "DROP QUOTA IF EXISTS ${QUOTA}"
+${CLICKHOUSE_CLIENT} -q "DROP ROLE IF EXISTS ${ROLE}"
+
+echo "> Create the user with quota with the maximum single authentication attempt."
+${CLICKHOUSE_CLIENT} -q "CREATE USER ${USER} IDENTIFIED WITH plaintext_password BY 'pass'"
+${CLICKHOUSE_CLIENT} -q "CREATE QUOTA ${QUOTA} FOR INTERVAL 100 YEAR MAX FAILED SEQUENTIAL AUTHENTICATIONS = 1 TO ${USER}"
+
+echo "> Check if the quota has been created."
+${CLICKHOUSE_CLIENT} -q "SELECT COUNT(*) FROM system.quotas WHERE name = '${QUOTA}'" + +login_test + +echo " ---------------------------------------------------------------------------" +echo "> Create the role with quota with the maximum single authentication attempt." +${CLICKHOUSE_CLIENT} -q "CREATE ROLE ${ROLE}" +${CLICKHOUSE_CLIENT} -q "GRANT ALL ON *.* TO ${ROLE}" +${CLICKHOUSE_CLIENT} -q "GRANT ${ROLE} to ${USER}" +${CLICKHOUSE_CLIENT} -q "ALTER QUOTA ${QUOTA} FOR INTERVAL 100 YEAR MAX FAILED SEQUENTIAL AUTHENTICATIONS = 1 TO ${ROLE}" + +login_test + +${CLICKHOUSE_CLIENT} -q "DROP USER IF EXISTS ${USER}" +${CLICKHOUSE_CLIENT} -q "DROP QUOTA IF EXISTS ${QUOTA}" +${CLICKHOUSE_CLIENT} -q "DROP ROLE IF EXISTS ${ROLE}" diff --git a/tests/queries/0_stateless/02896_leading_zeroes_no_octal.reference b/tests/queries/0_stateless/02896_leading_zeroes_no_octal.reference new file mode 100644 index 00000000000..7c69b7e02aa --- /dev/null +++ b/tests/queries/0_stateless/02896_leading_zeroes_no_octal.reference @@ -0,0 +1,154 @@ +Leading zeroes into Int64 (1XXX without input_format_values_interpret_expressions and 1XXXX with) +1 1000 0 0 0 Single zero +1 1001 00 0 0 Double zero +1 1002 000000000000000 0 0 Mutliple redundant zeroes +1 1003 01 1 1 Octal like, interpret as decimal +1 1004 08 8 8 Octal like, interpret as decimal +1 1005 0100 100 100 Octal like, interpret as decimal +1 1006 0000000000100 100 100 Octal like, interpret as decimal, multiple leading zeroes +1 1010 -0 0 0 Single zero negative +1 1011 -00 0 0 Double zero negative +1 1012 -000000000000000 0 0 Mutliple redundant zeroes negative +1 1013 -01 -1 -1 Octal like, interpret as decimal negative +1 1014 -08 -8 -8 Octal like, interpret as decimal negative +1 1015 -0100 -100 -100 Octal like, interpret as decimal negative +1 1016 -0000000000100 -100 -100 Octal like, interpret as decimal, multiple leading zeroes negative +1 1020 +0 0 0 Single zero positive +1 1021 +00 0 0 Double zero negpositiveative +1 1022 +000000000000000 0 0 Mutliple redundant zeroes positive +1 1023 +01 1 1 Octal like, interpret as decimal positive +1 1024 +08 8 8 Octal like, interpret as decimal positive +1 1025 +0100 100 100 Octal like, interpret as decimal positive +1 1026 +0000000000100 100 100 Octal like, interpret as decimal, multiple leading zeroes positive +1 1030 0000.008 0 0 Floating point should work... +1 1031 -0000.008 0 0 Floating point should work... +1 1032 +0000.008 0 0 Floating point should work... +1 1033 0000.008e3 8 8 Floating point should work... +1 1034 -0000.008e3 -8 -8 Floating point should work... +1 1035 +0000.008e3 8 8 Floating point should work... +1 1036 08000.008e-3 8 8 Floating point should work... +1 1037 -08000.008e-3 -8 -8 Floating point should work... +1 1038 +08000.008e-3 8 8 Floating point should work... 
+1 1060 0x0abcd 43981 43981 Hex should be parsed +1 1061 -0x0abcd -43981 -43981 Hex should be parsed +1 1062 +0x0abcd 43981 43981 Hex should be parsed +1 1063 0x0abcdP1 87962 87962 Hex should be parsed +1 1064 0x0abcdP+1 87962 87962 Hex should be parsed +1 1065 0x0abcdP-1 21990 21990 Hex should be parsed +1 1066 0x0abcdP01 87962 87962 Hex should be parsed +1 1067 0x0abcdP+01 87962 87962 Hex should be parsed +1 1068 0x0abcdP-01 21990 21990 Hex should be parsed +1 11000 0 0 0 Single zero +1 11001 00 0 0 Double zero +1 11002 000000000000000 0 0 Mutliple redundant zeroes +1 11003 01 1 1 Octal like, interpret as decimal +1 11004 08 8 8 Octal like, interpret as decimal +1 11005 0100 100 100 Octal like, interpret as decimal +1 11006 0000000000100 100 100 Octal like, interpret as decimal, multiple leading zeroes +1 11010 -0 0 0 Single zero negative +1 11011 -00 0 0 Double zero negative +1 11012 -000000000000000 0 0 Mutliple redundant zeroes negative +1 11013 -01 -1 -1 Octal like, interpret as decimal negative +1 11014 -08 -8 -8 Octal like, interpret as decimal negative +1 11015 -0100 -100 -100 Octal like, interpret as decimal negative +1 11016 -0000000000100 -100 -100 Octal like, interpret as decimal, multiple leading zeroes negative +1 11020 +0 0 0 Single zero positive +1 11021 +00 0 0 Double zero negpositiveative +1 11022 +000000000000000 0 0 Mutliple redundant zeroes positive +1 11023 +01 1 1 Octal like, interpret as decimal positive +1 11024 +08 8 8 Octal like, interpret as decimal positive +1 11025 +0100 100 100 Octal like, interpret as decimal positive +1 11026 +0000000000100 100 100 Octal like, interpret as decimal, multiple leading zeroes positive +1 11030 0000.008 0 0 Floating point should work... +1 11031 -0000.008 0 0 Floating point should work... +1 11032 +0000.008 0 0 Floating point should work... +1 11033 0000.008e3 8 8 Floating point should work... +1 11034 -0000.008e3 -8 -8 Floating point should work... +1 11035 +0000.008e3 8 8 Floating point should work... +1 11036 08000.008e-3 8 8 Floating point should work... +1 11037 -08000.008e-3 -8 -8 Floating point should work... +1 11038 +08000.008e-3 8 8 Floating point should work... 
+1 11050 0b10000 16 16 Binary should be parsed +1 11051 -0b10000 -16 -16 Binary should be parsed +1 11052 +0b10000 16 16 Binary should be parsed +1 11060 0x0abcd 43981 43981 Hex should be parsed +1 11061 -0x0abcd -43981 -43981 Hex should be parsed +1 11062 +0x0abcd 43981 43981 Hex should be parsed +1 11063 0x0abcdP1 87962 87962 Hex should be parsed +1 11064 0x0abcdP+1 87962 87962 Hex should be parsed +1 11065 0x0abcdP-1 21990 21990 Hex should be parsed +1 11066 0x0abcdP01 87962 87962 Hex should be parsed +1 11067 0x0abcdP+01 87962 87962 Hex should be parsed +1 11068 0x0abcdP-01 21990 21990 Hex should be parsed +Leading zeroes into Float64 (2XXX without input_format_values_interpret_expressions and 2XXXX with) +1 2000 0 0 0 Single zero +1 2001 00 0 0 Double zero +1 2002 000000000000000 0 0 Mutliple redundant zeroes +1 2003 01 1 1 Octal like, interpret as decimal +1 2004 08 8 8 Octal like, interpret as decimal +1 2005 0100 100 100 Octal like, interpret as decimal +1 2006 0000000000100 100 100 Octal like, interpret as decimal, multiple leading zeroes +1 2013 -01 -1 -1 Octal like, interpret as decimal negative +1 2014 -08 -8 -8 Octal like, interpret as decimal negative +1 2015 -0100 -100 -100 Octal like, interpret as decimal negative +1 2016 -0000000000100 -100 -100 Octal like, interpret as decimal, multiple leading zeroes negative +1 2020 +0 0 0 Single zero positive +1 2021 +00 0 0 Double zero negpositiveative +1 2022 +000000000000000 0 0 Mutliple redundant zeroes positive +1 2023 +01 1 1 Octal like, interpret as decimal positive +1 2024 +08 8 8 Octal like, interpret as decimal positive +1 2025 +0100 100 100 Octal like, interpret as decimal positive +1 2026 +0000000000100 100 100 Octal like, interpret as decimal, multiple leading zeroes positive +1 2030 0000.008 0.008 0.008 Floating point should work... +1 2031 -0000.008 -0.008 -0.008 Floating point should work... +1 2032 +0000.008 0.008 0.008 Floating point should work... +1 2033 0000.008e3 8 8 Floating point should work... +1 2034 -0000.008e3 -8 -8 Floating point should work... +1 2035 +0000.008e3 8 8 Floating point should work... +1 2036 08.5e-3 0.0085 0.0085 Floating point should work... +1 2037 -08.5e-3 -0.0085 -0.0085 Floating point should work... +1 2038 +08.5e-3 0.0085 0.0085 Floating point should work... 
+1 2063 0x0abcdP1 87962 87962 Hex should be parsed +1 2064 0x0abcdP+1 87962 87962 Hex should be parsed +1 2065 0x0abcdP-1 21990.5 21990.5 Hex should be parsed +1 2066 0x0abcdP01 87962 87962 Hex should be parsed +1 2067 0x0abcdP+01 87962 87962 Hex should be parsed +1 2068 0x0abcdP-01 21990.5 21990.5 Hex should be parsed +1 2069 0x01P-01 0.5 0.5 Hex should be parsed +1 12000 0 0 0 Single zero +1 12001 00 0 0 Double zero +1 12002 000000000000000 0 0 Mutliple redundant zeroes +1 12003 01 1 1 Octal like, interpret as decimal +1 12004 08 8 8 Octal like, interpret as decimal +1 12005 0100 100 100 Octal like, interpret as decimal +1 12006 0000000000100 100 100 Octal like, interpret as decimal, multiple leading zeroes +1 12013 -01 -1 -1 Octal like, interpret as decimal negative +1 12014 -08 -8 -8 Octal like, interpret as decimal negative +1 12015 -0100 -100 -100 Octal like, interpret as decimal negative +1 12016 -0000000000100 -100 -100 Octal like, interpret as decimal, multiple leading zeroes negative +1 12020 +0 0 0 Single zero positive +1 12021 +00 0 0 Double zero negpositiveative +1 12022 +000000000000000 0 0 Mutliple redundant zeroes positive +1 12023 +01 1 1 Octal like, interpret as decimal positive +1 12024 +08 8 8 Octal like, interpret as decimal positive +1 12025 +0100 100 100 Octal like, interpret as decimal positive +1 12026 +0000000000100 100 100 Octal like, interpret as decimal, multiple leading zeroes positive +1 12030 0000.008 0.008 0.008 Floating point should work... +1 12031 -0000.008 -0.008 -0.008 Floating point should work... +1 12032 +0000.008 0.008 0.008 Floating point should work... +1 12033 0000.008e3 8 8 Floating point should work... +1 12034 -0000.008e3 -8 -8 Floating point should work... +1 12035 +0000.008e3 8 8 Floating point should work... +1 12036 08.5e-3 0.0085 0.0085 Floating point should work... +1 12037 -08.5e-3 -0.0085 -0.0085 Floating point should work... +1 12038 +08.5e-3 0.0085 0.0085 Floating point should work... 
+1 12050 0b10000 16 16 Binary should be parsed +1 12051 -0b10000 -16 -16 Binary should be parsed +1 12052 +0b10000 16 16 Binary should be parsed +1 12063 0x0abcdP1 87962 87962 Hex should be parsed +1 12064 0x0abcdP+1 87962 87962 Hex should be parsed +1 12065 0x0abcdP-1 21990.5 21990.5 Hex should be parsed +1 12066 0x0abcdP01 87962 87962 Hex should be parsed +1 12067 0x0abcdP+01 87962 87962 Hex should be parsed +1 12068 0x0abcdP-01 21990.5 21990.5 Hex should be parsed +1 12069 0x01P-01 0.5 0.5 Hex should be parsed diff --git a/tests/queries/0_stateless/02896_leading_zeroes_no_octal.sql b/tests/queries/0_stateless/02896_leading_zeroes_no_octal.sql new file mode 100644 index 00000000000..05d841ce01f --- /dev/null +++ b/tests/queries/0_stateless/02896_leading_zeroes_no_octal.sql @@ -0,0 +1,223 @@ +DROP TABLE IF EXISTS t_leading_zeroes; +DROP TABLE IF EXISTS t_leading_zeroes_f; + +CREATE TABLE t_leading_zeroes(id Int64, input String, val Int64, expected Int64, comment String) ENGINE=MergeTree ORDER BY id; +CREATE TABLE t_leading_zeroes_f(id Int64, input String, val Float64, expected Float64, comment String) ENGINE=MergeTree ORDER BY id; + +SET input_format_values_interpret_expressions = 0; + +INSERT INTO t_leading_zeroes VALUES (1000, '0', 0, 0, 'Single zero'); +INSERT INTO t_leading_zeroes VALUES (1001, '00', 00, 0, 'Double zero'); +INSERT INTO t_leading_zeroes VALUES (1002, '000000000000000', 000000000000000, 0, 'Mutliple redundant zeroes'); +INSERT INTO t_leading_zeroes VALUES (1003, '01', 01, 1, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes VALUES (1004, '08', 08, 8, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes VALUES (1005, '0100', 0100, 100, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes VALUES (1006, '0000000000100', 0000000000100, 100, 'Octal like, interpret as decimal, multiple leading zeroes'); + +INSERT INTO t_leading_zeroes VALUES (1010, '-0', -0, 0, 'Single zero negative'); +INSERT INTO t_leading_zeroes VALUES (1011, '-00', -00, 0, 'Double zero negative'); +INSERT INTO t_leading_zeroes VALUES (1012, '-000000000000000', -000000000000000, 0, 'Mutliple redundant zeroes negative'); +INSERT INTO t_leading_zeroes VALUES (1013, '-01', -01, -1, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes VALUES (1014, '-08', -08, -8, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes VALUES (1015, '-0100', -0100, -100, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes VALUES (1016, '-0000000000100', -0000000000100, -100, 'Octal like, interpret as decimal, multiple leading zeroes negative'); + +INSERT INTO t_leading_zeroes VALUES (1020, '+0', +0, 0, 'Single zero positive'); +INSERT INTO t_leading_zeroes VALUES (1021, '+00', +00, 0, 'Double zero negpositiveative'); +INSERT INTO t_leading_zeroes VALUES (1022, '+000000000000000', +000000000000000, 0, 'Mutliple redundant zeroes positive'); +INSERT INTO t_leading_zeroes VALUES (1023, '+01', +01, 1, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes VALUES (1024, '+08', +08, 8, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes VALUES (1025, '+0100', +0100, 100, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes VALUES (1026, '+0000000000100', +0000000000100, 100, 'Octal like, interpret as decimal, multiple leading zeroes positive'); + +INSERT INTO t_leading_zeroes VALUES (1030, '0000.008', 0000.008, 0, 'Floating point should work...'); +INSERT 
INTO t_leading_zeroes VALUES (1031, '-0000.008', -0000.008, 0, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (1032, '+0000.008', +0000.008, 0, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (1033, '0000.008e3', 0000.008e3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (1034, '-0000.008e3', -0000.008e3, -8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (1035, '+0000.008e3', 0000.008e3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (1036, '08000.008e-3', 08000.008e-3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (1037, '-08000.008e-3', -08000.008e-3, -8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (1038, '+08000.008e-3', 08000.008e-3, 8, 'Floating point should work...'); + +INSERT INTO t_leading_zeroes VALUES (1060, '0x0abcd', 0x0abcd, 43981, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (1061, '-0x0abcd', -0x0abcd, -43981, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (1062, '+0x0abcd', +0x0abcd, 43981, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (1063, '0x0abcdP1', 0x0abcdP1, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (1064, '0x0abcdP+1', 0x0abcdP+1, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (1065, '0x0abcdP-1', 0x0abcdP-1, 21990, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (1066, '0x0abcdP01', 0x0abcdP01, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (1067, '0x0abcdP+01', 0x0abcdP+01, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (1068, '0x0abcdP-01', 0x0abcdP-01, 21990, 'Hex should be parsed'); + + +-- Floating point numbers go via readFloatTextFastImpl - so should not be affected + +INSERT INTO t_leading_zeroes_f VALUES (2000, '0', 0, 0, 'Single zero'); +INSERT INTO t_leading_zeroes_f VALUES (2001, '00', 00, 0, 'Double zero'); +INSERT INTO t_leading_zeroes_f VALUES (2002, '000000000000000', 000000000000000, 0, 'Mutliple redundant zeroes'); +INSERT INTO t_leading_zeroes_f VALUES (2003, '01', 01, 1, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes_f VALUES (2004, '08', 08, 8, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes_f VALUES (2005, '0100', 0100, 100, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes_f VALUES (2006, '0000000000100', 0000000000100, 100, 'Octal like, interpret as decimal, multiple leading zeroes'); + +-- Float negative zero is machine/context dependent +-- INSERT INTO t_leading_zeroes_f VALUES (2010, '-0', -0, 0, 'Single zero negative'); +-- INSERT INTO t_leading_zeroes_f VALUES (2011, '-00', -00, 0, 'Double zero negative'); +-- INSERT INTO t_leading_zeroes_f VALUES (2012, '-000000000000000', -000000000000000, 0, 'Mutliple redundant zeroes negative'); +INSERT INTO t_leading_zeroes_f VALUES (2013, '-01', -01, -1, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes_f VALUES (2014, '-08', -08, -8, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes_f VALUES (2015, '-0100', -0100, -100, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes_f VALUES (2016, '-0000000000100', -0000000000100, -100, 'Octal like, interpret as decimal, multiple leading zeroes negative'); + +INSERT INTO t_leading_zeroes_f VALUES (2020, '+0', +0, 0, 'Single zero positive'); +INSERT INTO t_leading_zeroes_f VALUES (2021, '+00', 
+00, 0, 'Double zero negpositiveative'); +INSERT INTO t_leading_zeroes_f VALUES (2022, '+000000000000000', +000000000000000, 0, 'Mutliple redundant zeroes positive'); +INSERT INTO t_leading_zeroes_f VALUES (2023, '+01', +01, 1, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes_f VALUES (2024, '+08', +08, 8, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes_f VALUES (2025, '+0100', +0100, 100, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes_f VALUES (2026, '+0000000000100', +0000000000100, 100, 'Octal like, interpret as decimal, multiple leading zeroes positive'); + +INSERT INTO t_leading_zeroes_f VALUES (2030, '0000.008', 0000.008, 0.008, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (2031, '-0000.008', -0000.008, -0.008, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (2032, '+0000.008', +0000.008, 0.008, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (2033, '0000.008e3', 0000.008e3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (2034, '-0000.008e3', -0000.008e3, -8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (2035, '+0000.008e3', 0000.008e3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (2036, '08.5e-3', 08.5e-3, 0.0085, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (2037, '-08.5e-3', -08.5e-3, -0.0085, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (2038, '+08.5e-3', 08.5e-3, 0.0085, 'Floating point should work...'); + +INSERT INTO t_leading_zeroes_f VALUES (2063, '0x0abcdP1', 0x0abcdP1, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (2064, '0x0abcdP+1', 0x0abcdP+1, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (2065, '0x0abcdP-1', 0x0abcdP-1, 21990.5, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (2066, '0x0abcdP01', 0x0abcdP01, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (2067, '0x0abcdP+01', 0x0abcdP+01, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (2068, '0x0abcdP-01', 0x0abcdP-01, 21990.5, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (2069, '0x01P-01', 0x01P-01, 0.5, 'Hex should be parsed'); + +-- Coincidentally, the following result in 9 rather than 9e9 because of readFloatTextFastImpl +-- using readUIntTextUpToNSignificantDigits<4>(exponent, in) +-- INSERT INTO t_leading_zeroes_f VALUES (2070, '00009e00009', 00009e00009, 9e9, '???'); + +-- Binary should not work with input_format_values_interpret_expressions = 0 + +INSERT INTO t_leading_zeroes_f VALUES (2050, '0b10000', 0b10000, 16, 'Binary should not be parsed'); -- { clientError SYNTAX_ERROR } +INSERT INTO t_leading_zeroes_f VALUES (2051, '-0b10000', -0b10000, -16, 'Binary should not be parsed'); -- { clientError SYNTAX_ERROR } +INSERT INTO t_leading_zeroes_f VALUES (2052, '+0b10000', +0b10000, 16, 'Binary should not be parsed'); -- { clientError SYNTAX_ERROR } + +INSERT INTO t_leading_zeroes VALUES (1050, '0b10000', 0b10000, 16, 'Binary should not be parsed'); -- { clientError SYNTAX_ERROR } +INSERT INTO t_leading_zeroes VALUES (1051, '-0b10000', -0b10000, -16, 'Binary should not be parsed'); -- { clientError SYNTAX_ERROR } +INSERT INTO t_leading_zeroes VALUES (1052, '+0b10000', +0b10000, 16, 'Binary should not be parsed'); -- { clientError SYNTAX_ERROR } + + + +SET 
input_format_values_interpret_expressions = 1; + +INSERT INTO t_leading_zeroes VALUES (11000, '0', 0, 0, 'Single zero'); +INSERT INTO t_leading_zeroes VALUES (11001, '00', 00, 0, 'Double zero'); +INSERT INTO t_leading_zeroes VALUES (11002, '000000000000000', 000000000000000, 0, 'Mutliple redundant zeroes'); +INSERT INTO t_leading_zeroes VALUES (11003, '01', 01, 1, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes VALUES (11004, '08', 08, 8, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes VALUES (11005, '0100', 0100, 100, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes VALUES (11006, '0000000000100', 0000000000100, 100, 'Octal like, interpret as decimal, multiple leading zeroes'); + +INSERT INTO t_leading_zeroes VALUES (11010, '-0', -0, 0, 'Single zero negative'); +INSERT INTO t_leading_zeroes VALUES (11011, '-00', -00, 0, 'Double zero negative'); +INSERT INTO t_leading_zeroes VALUES (11012, '-000000000000000', -000000000000000, 0, 'Mutliple redundant zeroes negative'); +INSERT INTO t_leading_zeroes VALUES (11013, '-01', -01, -1, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes VALUES (11014, '-08', -08, -8, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes VALUES (11015, '-0100', -0100, -100, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes VALUES (11016, '-0000000000100', -0000000000100, -100, 'Octal like, interpret as decimal, multiple leading zeroes negative'); + +INSERT INTO t_leading_zeroes VALUES (11020, '+0', +0, 0, 'Single zero positive'); +INSERT INTO t_leading_zeroes VALUES (11021, '+00', +00, 0, 'Double zero negpositiveative'); +INSERT INTO t_leading_zeroes VALUES (11022, '+000000000000000', +000000000000000, 0, 'Mutliple redundant zeroes positive'); +INSERT INTO t_leading_zeroes VALUES (11023, '+01', +01, 1, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes VALUES (11024, '+08', +08, 8, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes VALUES (11025, '+0100', +0100, 100, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes VALUES (11026, '+0000000000100', +0000000000100, 100, 'Octal like, interpret as decimal, multiple leading zeroes positive'); + +INSERT INTO t_leading_zeroes VALUES (11030, '0000.008', 0000.008, 0, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (11031, '-0000.008', -0000.008, 0, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (11032, '+0000.008', +0000.008, 0, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (11033, '0000.008e3', 0000.008e3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (11034, '-0000.008e3', -0000.008e3, -8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (11035, '+0000.008e3', 0000.008e3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (11036, '08000.008e-3', 08000.008e-3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (11037, '-08000.008e-3', -08000.008e-3, -8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes VALUES (11038, '+08000.008e-3', 08000.008e-3, 8, 'Floating point should work...'); + +INSERT INTO t_leading_zeroes VALUES (11050, '0b10000', 0b10000, 16, 'Binary should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11051, '-0b10000', -0b10000, -16, 'Binary should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11052, '+0b10000', 
+0b10000, 16, 'Binary should be parsed'); + +INSERT INTO t_leading_zeroes VALUES (11060, '0x0abcd', 0x0abcd, 43981, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11061, '-0x0abcd', -0x0abcd, -43981, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11062, '+0x0abcd', +0x0abcd, 43981, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11063, '0x0abcdP1', 0x0abcdP1, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11064, '0x0abcdP+1', 0x0abcdP+1, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11065, '0x0abcdP-1', 0x0abcdP-1, 21990, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11066, '0x0abcdP01', 0x0abcdP01, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11067, '0x0abcdP+01', 0x0abcdP+01, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes VALUES (11068, '0x0abcdP-01', 0x0abcdP-01, 21990, 'Hex should be parsed'); + +-- Floating point numbers go via readFloatTextFastImpl - so should not be affected + +INSERT INTO t_leading_zeroes_f VALUES (12000, '0', 0, 0, 'Single zero'); +INSERT INTO t_leading_zeroes_f VALUES (12001, '00', 00, 0, 'Double zero'); +INSERT INTO t_leading_zeroes_f VALUES (12002, '000000000000000', 000000000000000, 0, 'Mutliple redundant zeroes'); +INSERT INTO t_leading_zeroes_f VALUES (12003, '01', 01, 1, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes_f VALUES (12004, '08', 08, 8, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes_f VALUES (12005, '0100', 0100, 100, 'Octal like, interpret as decimal'); +INSERT INTO t_leading_zeroes_f VALUES (12006, '0000000000100', 0000000000100, 100, 'Octal like, interpret as decimal, multiple leading zeroes'); + +-- Float negative zero is machine/context dependent +-- INSERT INTO t_leading_zeroes_f VALUES (12010, '-0', -0, 0, 'Single zero negative'); +-- INSERT INTO t_leading_zeroes_f VALUES (12011, '-00', -00, 0, 'Double zero negative'); +-- INSERT INTO t_leading_zeroes_f VALUES (12012, '-000000000000000', -000000000000000, 0, 'Mutliple redundant zeroes negative'); +INSERT INTO t_leading_zeroes_f VALUES (12013, '-01', -01, -1, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes_f VALUES (12014, '-08', -08, -8, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes_f VALUES (12015, '-0100', -0100, -100, 'Octal like, interpret as decimal negative'); +INSERT INTO t_leading_zeroes_f VALUES (12016, '-0000000000100', -0000000000100, -100, 'Octal like, interpret as decimal, multiple leading zeroes negative'); + +INSERT INTO t_leading_zeroes_f VALUES (12020, '+0', +0, 0, 'Single zero positive'); +INSERT INTO t_leading_zeroes_f VALUES (12021, '+00', +00, 0, 'Double zero negpositiveative'); +INSERT INTO t_leading_zeroes_f VALUES (12022, '+000000000000000', +000000000000000, 0, 'Mutliple redundant zeroes positive'); +INSERT INTO t_leading_zeroes_f VALUES (12023, '+01', +01, 1, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes_f VALUES (12024, '+08', +08, 8, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes_f VALUES (12025, '+0100', +0100, 100, 'Octal like, interpret as decimal positive'); +INSERT INTO t_leading_zeroes_f VALUES (12026, '+0000000000100', +0000000000100, 100, 'Octal like, interpret as decimal, multiple leading zeroes positive'); + +INSERT INTO t_leading_zeroes_f VALUES (12030, '0000.008', 0000.008, 0.008, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f 
VALUES (12031, '-0000.008', -0000.008, -0.008, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (12032, '+0000.008', +0000.008, 0.008, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (12033, '0000.008e3', 0000.008e3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (12034, '-0000.008e3', -0000.008e3, -8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (12035, '+0000.008e3', 0000.008e3, 8, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (12036, '08.5e-3', 08.5e-3, 0.0085, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (12037, '-08.5e-3', -08.5e-3, -0.0085, 'Floating point should work...'); +INSERT INTO t_leading_zeroes_f VALUES (12038, '+08.5e-3', 08.5e-3, 0.0085, 'Floating point should work...'); + +INSERT INTO t_leading_zeroes_f VALUES (12050, '0b10000', 0b10000, 16, 'Binary should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (12051, '-0b10000', -0b10000, -16, 'Binary should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (12052, '+0b10000', +0b10000, 16, 'Binary should be parsed'); + +INSERT INTO t_leading_zeroes_f VALUES (12063, '0x0abcdP1', 0x0abcdP1, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (12064, '0x0abcdP+1', 0x0abcdP+1, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (12065, '0x0abcdP-1', 0x0abcdP-1, 21990.5, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (12066, '0x0abcdP01', 0x0abcdP01, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (12067, '0x0abcdP+01', 0x0abcdP+01, 87962, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (12068, '0x0abcdP-01', 0x0abcdP-01, 21990.5, 'Hex should be parsed'); +INSERT INTO t_leading_zeroes_f VALUES (12069, '0x01P-01', 0x01P-01, 0.5, 'Hex should be parsed'); + +SELECT 'Leading zeroes into Int64 (1XXX without input_format_values_interpret_expressions and 1XXXX with)'; +SELECT t.val == t.expected AS ok, * FROM t_leading_zeroes t ORDER BY id; + + +SELECT 'Leading zeroes into Float64 (2XXX without input_format_values_interpret_expressions and 2XXXX with)'; +SELECT t.val == t.expected AS ok, * FROM t_leading_zeroes_f t ORDER BY id; + + +DROP TABLE IF EXISTS t_leading_zeroes; +DROP TABLE IF EXISTS t_leading_zeroes_f; diff --git a/tests/queries/0_stateless/02897_alter_partition_parameters.reference b/tests/queries/0_stateless/02897_alter_partition_parameters.reference index bc6ff2b709c..d4b70c58ae5 100644 --- a/tests/queries/0_stateless/02897_alter_partition_parameters.reference +++ b/tests/queries/0_stateless/02897_alter_partition_parameters.reference @@ -7,3 +7,8 @@ 0 0 0 +0 +0 +0 +0 +0 diff --git a/tests/queries/0_stateless/02897_alter_partition_parameters.sql b/tests/queries/0_stateless/02897_alter_partition_parameters.sql index 62ceb9d9768..0be7308ed1a 100644 --- a/tests/queries/0_stateless/02897_alter_partition_parameters.sql +++ b/tests/queries/0_stateless/02897_alter_partition_parameters.sql @@ -10,6 +10,24 @@ PARTITION BY toMonday(EventDate); INSERT INTO test VALUES(toDate('2023-10-09')); +ALTER TABLE test DROP PARTITION ('2023-10-09'); + +SELECT count() FROM test; + +INSERT INTO test VALUES(toDate('2023-10-09')); + +ALTER TABLE test DROP PARTITION (('2023-10-09')); + +SELECT count() FROM test; + +INSERT INTO test VALUES(toDate('2023-10-09')); + +ALTER TABLE test DROP PARTITION '2023-10-09'; + +SELECT count() FROM test; + +INSERT INTO test VALUES(toDate('2023-10-09')); + SET 
param_partition='2023-10-09'; ALTER TABLE test DROP PARTITION {partition:String}; @@ -51,6 +69,17 @@ ENGINE = MergeTree ORDER BY tuple() PARTITION BY (a * b, b * b); +INSERT INTO test2 VALUES(1, 2); + +ALTER TABLE test2 DROP PARTITION tuple(2, 4); + +SELECT count() FROM test2; + +INSERT INTO test2 VALUES(1, 2); + +ALTER TABLE test2 DROP PARTITION (2, 4); + +SELECT count() FROM test2; INSERT INTO test2 VALUES(1, 2); diff --git a/tests/queries/0_stateless/02900_union_schema_inference_mode.sh b/tests/queries/0_stateless/02900_union_schema_inference_mode.sh index dc0dd8ae1f4..a0fdb5276e0 100755 --- a/tests/queries/0_stateless/02900_union_schema_inference_mode.sh +++ b/tests/queries/0_stateless/02900_union_schema_inference_mode.sh @@ -39,13 +39,13 @@ desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/archive.tar :: data{1,2,3}.jsonl'); " echo 'Error' > $CLICKHOUSE_TEST_UNIQUE_NAME/data4.jsonl -$CLICKHOUSE_LOCAL -q "desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl') settings schema_inference_mode='union'" 2>&1 | grep -c -F "Cannot extract table structure" +$CLICKHOUSE_LOCAL -q "desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl') settings schema_inference_mode='union'" 2>&1 | grep -c -F "CANNOT_EXTRACT_TABLE_STRUCTURE" $CLICKHOUSE_LOCAL -nm -q " set schema_inference_mode = 'union'; desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{2,3}.jsonl'); desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl'); -" 2>&1 | grep -c -F "Cannot extract table structure" +" 2>&1 | grep -c -F "CANNOT_EXTRACT_TABLE_STRUCTURE" echo 42 > $CLICKHOUSE_TEST_UNIQUE_NAME/data1.csv echo 42, 43 > $CLICKHOUSE_TEST_UNIQUE_NAME/data2.csv diff --git a/tests/queries/0_stateless/02919_ddsketch_quantile.sql b/tests/queries/0_stateless/02919_ddsketch_quantile.sql index 99eace15d2d..d98978c117e 100644 --- a/tests/queries/0_stateless/02919_ddsketch_quantile.sql +++ b/tests/queries/0_stateless/02919_ddsketch_quantile.sql @@ -1,23 +1,23 @@ SELECT '1'; -- simple test -SELECT round(quantileDDSketch(0.01, 0.5)(number), 2) FROM numbers(200); -SELECT round(quantileDDSketch(0.0001, 0.69)(number), 2) FROM numbers(500); -SELECT round(quantileDDSketch(0.003, 0.42)(number), 2) FROM numbers(200); -SELECT round(quantileDDSketch(0.02, 0.99)(number), 2) FROM numbers(500); +SELECT round(quantileDD(0.01, 0.5)(number), 2) FROM numbers(200); +SELECT round(quantileDD(0.0001, 0.69)(number), 2) FROM numbers(500); +SELECT round(quantileDD(0.003, 0.42)(number), 2) FROM numbers(200); +SELECT round(quantileDD(0.02, 0.99)(number), 2) FROM numbers(500); SELECT '2'; -- median is close to 0 -SELECT round(quantileDDSketch(0.01, 0.5)(number), 2) +SELECT round(quantileDD(0.01, 0.5)(number), 2) FROM ( SELECT arrayJoin([toInt64(number), number - 10]) AS number FROM numbers(0, 10) ); -SELECT round(quantileDDSketch(0.01, 0.5)(number - 10), 2) FROM numbers(21); +SELECT round(quantileDD(0.01, 0.5)(number - 10), 2) FROM numbers(21); SELECT '3'; -- all values are negative -SELECT round(quantileDDSketch(0.01, 0.99)(-number), 2) FROM numbers(1, 500); +SELECT round(quantileDD(0.01, 0.99)(-number), 2) FROM numbers(1, 500); SELECT '4'; -- min and max values of integer types (-2^63, 2^63-1) -SELECT round(quantileDDSketch(0.01, 0.5)(number), 2) +SELECT round(quantileDD(0.01, 0.5)(number), 2) FROM ( SELECT arrayJoin([toInt64(number), number - 9223372036854775808, toInt64(number + 9223372036854775798)]) AS number @@ -25,7 +25,7 @@ FROM ); SELECT '5'; -- min and max values of floating point types -SELECT round(quantileDDSketch(0.01, 0.42)(number), 2) +SELECT 
round(quantileDD(0.01, 0.42)(number), 2) FROM ( SELECT arrayJoin([toFloat32(number), number - 3.4028235e+38, toFloat32(number + 3.4028235e+38)]) AS number @@ -33,7 +33,7 @@ FROM ); SELECT '6'; -- denormalized floats -SELECT round(quantileDDSketch(0.01, 0.69)(number), 2) +SELECT round(quantileDD(0.01, 0.69)(number), 2) FROM ( SELECT arrayJoin([toFloat32(number), number - 1.1754944e-38, toFloat32(number + 1.1754944e-38)]) AS number @@ -41,7 +41,7 @@ FROM ); SELECT '7'; -- NaNs -SELECT round(quantileDDSketch(0.01, 0.5)(number), 2) +SELECT round(quantileDD(0.01, 0.5)(number), 2) FROM ( SELECT arrayJoin([toFloat32(number), NaN * number]) AS number @@ -50,7 +50,7 @@ FROM SELECT '8'; -- sparse sketch -SELECT round(quantileDDSketch(0.01, 0.75)(number), 2) +SELECT round(quantileDD(0.01, 0.75)(number), 2) FROM ( SELECT number * 1e7 AS number @@ -63,11 +63,11 @@ DROP TABLE IF EXISTS `02919_ddsketch_quantile`; CREATE TABLE `02919_ddsketch_quantile` ENGINE = Log AS -SELECT quantilesDDSketchState(0.001, 0.9)(number) AS sketch +SELECT quantilesDDState(0.001, 0.9)(number) AS sketch FROM numbers(1000); -INSERT INTO `02919_ddsketch_quantile` SELECT quantilesDDSketchState(0.001, 0.9)(number + 1000) +INSERT INTO `02919_ddsketch_quantile` SELECT quantilesDDState(0.001, 0.9)(number + 1000) FROM numbers(1000); -SELECT arrayMap(a -> round(a, 2), (quantilesDDSketchMerge(0.001, 0.9)(sketch))) +SELECT arrayMap(a -> round(a, 2), (quantilesDDMerge(0.001, 0.9)(sketch))) FROM `02919_ddsketch_quantile`; diff --git a/tests/queries/0_stateless/02931_max_num_to_warn.reference b/tests/queries/0_stateless/02931_max_num_to_warn.reference index c0ad7354039..7de998eebfa 100644 --- a/tests/queries/0_stateless/02931_max_num_to_warn.reference +++ b/tests/queries/0_stateless/02931_max_num_to_warn.reference @@ -1,3 +1,3 @@ -The number of attached tables is more than 10 -The number of attached databases is more than 10 +The number of attached tables is more than 5 +The number of attached databases is more than 2 The number of active parts is more than 10 diff --git a/tests/queries/0_stateless/02931_max_num_to_warn.sql b/tests/queries/0_stateless/02931_max_num_to_warn.sql index 49b981fc355..23f04816d5a 100644 --- a/tests/queries/0_stateless/02931_max_num_to_warn.sql +++ b/tests/queries/0_stateless/02931_max_num_to_warn.sql @@ -37,7 +37,7 @@ INSERT INTO test_max_num_to_warn_02931.test_max_num_to_warn_9 VALUES (1, 'Hello' INSERT INTO test_max_num_to_warn_02931.test_max_num_to_warn_10 VALUES (1, 'Hello'); INSERT INTO test_max_num_to_warn_02931.test_max_num_to_warn_11 VALUES (1, 'Hello'); -SELECT * FROM system.warnings where message in ('The number of attached tables is more than 10', 'The number of attached databases is more than 10', 'The number of active parts is more than 10'); +SELECT * FROM system.warnings where message in ('The number of attached tables is more than 5', 'The number of attached databases is more than 2', 'The number of active parts is more than 10'); DROP DATABASE IF EXISTS test_max_num_to_warn_02931; DROP DATABASE IF EXISTS test_max_num_to_warn_1; diff --git a/tests/queries/0_stateless/02931_rewrite_sum_column_and_constant.reference b/tests/queries/0_stateless/02931_rewrite_sum_column_and_constant.reference index 8f29910e9ae..3124698d218 100644 --- a/tests/queries/0_stateless/02931_rewrite_sum_column_and_constant.reference +++ b/tests/queries/0_stateless/02931_rewrite_sum_column_and_constant.reference @@ -47,24 +47,24 @@ SELECT sum(uint64) + (1 * count(uint64)) FROM test_table WHERE ((uint64 + 1) AS i) > 0 EXPLAIN 
SYNTAX (SELECT sum(uint64 + 1) AS j from test_table having j > 0); -SELECT sum(uint64) + (1 * count(uint64)) +SELECT sum(uint64) + (1 * count(uint64)) AS j FROM test_table -HAVING (sum(uint64) + (1 * count(uint64))) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum(uint64 + 1 AS i) j from test_table where i > 0 having j > 0); -SELECT sum(uint64) + (1 * count(uint64)) +SELECT sum(uint64) + (1 * count(uint64)) AS j FROM test_table WHERE ((uint64 + 1) AS i) > 0 -HAVING (sum(uint64) + (1 * count(uint64))) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum((uint64 AS m) + (1 AS n)) j from test_table where m > 0 and n > 0 having j > 0); -SELECT sum(uint64) + ((1 AS n) * count(uint64)) +SELECT sum(uint64) + ((1 AS n) * count(uint64)) AS j FROM test_table WHERE ((uint64 AS m) > 0) AND (n > 0) -HAVING (sum(uint64) + (n * count(uint64))) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum(((uint64 AS m) + (1 AS n)) AS i) j from test_table where m > 0 and n > 0 and i > 0 having j > 0); -SELECT sum(uint64) + ((1 AS n) * count(uint64)) +SELECT sum(uint64) + ((1 AS n) * count(uint64)) AS j FROM test_table WHERE ((uint64 AS m) > 0) AND (n > 0) AND (((m + n) AS i) > 0) -HAVING (sum(uint64) + (n * count(uint64))) > 0 +HAVING j > 0 SELECT sum(1 + uint64 AS i) from test_table where i > 0; 20 SELECT sum(1 + uint64) AS j from test_table having j > 0; @@ -80,24 +80,24 @@ SELECT (1 * count(uint64)) + sum(uint64) FROM test_table WHERE ((1 + uint64) AS i) > 0 EXPLAIN SYNTAX (SELECT sum(1 + uint64) AS j from test_table having j > 0); -SELECT (1 * count(uint64)) + sum(uint64) +SELECT (1 * count(uint64)) + sum(uint64) AS j FROM test_table -HAVING ((1 * count(uint64)) + sum(uint64)) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum(1 + uint64 AS i) j from test_table where i > 0 having j > 0); -SELECT (1 * count(uint64)) + sum(uint64) +SELECT (1 * count(uint64)) + sum(uint64) AS j FROM test_table WHERE ((1 + uint64) AS i) > 0 -HAVING ((1 * count(uint64)) + sum(uint64)) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum((1 AS m) + (uint64 AS n)) j from test_table where m > 0 and n > 0 having j > 0); -SELECT ((1 AS m) * count(uint64)) + sum(uint64) +SELECT ((1 AS m) * count(uint64)) + sum(uint64) AS j FROM test_table WHERE (m > 0) AND ((uint64 AS n) > 0) -HAVING ((m * count(uint64)) + sum(uint64)) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum(((1 AS m) + (uint64 AS n)) AS i) j from test_table where m > 0 and n > 0 and i > 0 having j > 0); -SELECT ((1 AS m) * count(uint64)) + sum(uint64) +SELECT ((1 AS m) * count(uint64)) + sum(uint64) AS j FROM test_table WHERE (m > 0) AND ((uint64 AS n) > 0) AND (((m + n) AS i) > 0) -HAVING ((m * count(uint64)) + sum(uint64)) > 0 +HAVING j > 0 SELECT sum(uint64 - 1 AS i) from test_table where i > 0; 10 SELECT sum(uint64 - 1) AS j from test_table having j > 0; @@ -113,24 +113,24 @@ SELECT sum(uint64) - (1 * count(uint64)) FROM test_table WHERE ((uint64 - 1) AS i) > 0 EXPLAIN SYNTAX (SELECT sum(uint64 - 1) AS j from test_table having j > 0); -SELECT sum(uint64) - (1 * count(uint64)) +SELECT sum(uint64) - (1 * count(uint64)) AS j FROM test_table -HAVING (sum(uint64) - (1 * count(uint64))) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum(uint64 - 1 AS i) j from test_table where i > 0 having j > 0); -SELECT sum(uint64) - (1 * count(uint64)) +SELECT sum(uint64) - (1 * count(uint64)) AS j FROM test_table WHERE ((uint64 - 1) AS i) > 0 -HAVING (sum(uint64) - (1 * count(uint64))) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum((uint64 AS m) - (1 AS n)) j from test_table where m > 0 and n > 0 having j > 0); -SELECT sum(uint64) - ((1 
AS n) * count(uint64)) +SELECT sum(uint64) - ((1 AS n) * count(uint64)) AS j FROM test_table WHERE ((uint64 AS m) > 0) AND (n > 0) -HAVING (sum(uint64) - (n * count(uint64))) > 0 +HAVING j > 0 EXPLAIN SYNTAX (SELECT sum(((uint64 AS m) - (1 AS n)) AS i) j from test_table where m > 0 and n > 0 and i > 0 having j > 0); -SELECT sum(uint64) - ((1 AS n) * count(uint64)) +SELECT sum(uint64) - ((1 AS n) * count(uint64)) AS j FROM test_table WHERE ((uint64 AS m) > 0) AND (n > 0) AND (((m - n) AS i) > 0) -HAVING (sum(uint64) - (n * count(uint64))) > 0 +HAVING j > 0 SELECT sum(1 - uint64 AS i) from test_table; -10 SELECT sum(1 - uint64) AS j from test_table; @@ -146,24 +146,24 @@ SELECT (1 * count(uint64)) - sum(uint64) FROM test_table WHERE ((1 - uint64) AS i) > 0 EXPLAIN SYNTAX (SELECT sum(1 - uint64) AS j from test_table having j < 0); -SELECT (1 * count(uint64)) - sum(uint64) +SELECT (1 * count(uint64)) - sum(uint64) AS j FROM test_table -HAVING ((1 * count(uint64)) - sum(uint64)) < 0 +HAVING j < 0 EXPLAIN SYNTAX (SELECT sum(1 - uint64 AS i) j from test_table where i > 0 having j < 0); -SELECT (1 * count(uint64)) - sum(uint64) +SELECT (1 * count(uint64)) - sum(uint64) AS j FROM test_table WHERE ((1 - uint64) AS i) > 0 -HAVING ((1 * count(uint64)) - sum(uint64)) < 0 +HAVING j < 0 EXPLAIN SYNTAX (SELECT sum((1 AS m) - (uint64 AS n)) j from test_table where m > 0 and n > 0 having j < 0); -SELECT ((1 AS m) * count(uint64)) - sum(uint64) +SELECT ((1 AS m) * count(uint64)) - sum(uint64) AS j FROM test_table WHERE (m > 0) AND ((uint64 AS n) > 0) -HAVING ((m * count(uint64)) - sum(uint64)) < 0 +HAVING j < 0 EXPLAIN SYNTAX (SELECT sum(((1 AS m) - (uint64 AS n)) AS i) j from test_table where m > 0 and n > 0 and i < 0 having j < 0); -SELECT ((1 AS m) * count(uint64)) - sum(uint64) +SELECT ((1 AS m) * count(uint64)) - sum(uint64) AS j FROM test_table WHERE (m > 0) AND ((uint64 AS n) > 0) AND (((m - n) AS i) < 0) -HAVING ((m * count(uint64)) - sum(uint64)) < 0 +HAVING j < 0 SELECT sum(uint64 + 2.11) From test_table; 25.549999999999997 SELECT sum(2.11 + uint64) From test_table; @@ -474,3 +474,11 @@ FROM test_table EXPLAIN SYNTAX (SELECT (2 * count(decimal32) - sum(decimal32)) + (3 * count(decimal32) - sum(decimal32)) From test_table); SELECT ((2 * count(decimal32)) - sum(decimal32)) + ((3 * count(decimal32)) - sum(decimal32)) FROM test_table +-- https://github.com/ClickHouse/ClickHouse/issues/59414 +SELECT sum(uint64 + 2) as j, j + 5 as t from test_table; +25 30 +EXPLAIN SYNTAX SELECT sum(uint64 + 2) as j, j + 5 as t from test_table; +SELECT + sum(uint64) + (2 * count(uint64)) AS j, + j + 5 AS t +FROM test_table diff --git a/tests/queries/0_stateless/02931_rewrite_sum_column_and_constant.sql b/tests/queries/0_stateless/02931_rewrite_sum_column_and_constant.sql index b29407d7208..c7b0ff82442 100644 --- a/tests/queries/0_stateless/02931_rewrite_sum_column_and_constant.sql +++ b/tests/queries/0_stateless/02931_rewrite_sum_column_and_constant.sql @@ -204,6 +204,11 @@ EXPLAIN SYNTAX (SELECT (sum(decimal32) + 2 * count(decimal32)) - (sum(decimal32) EXPLAIN SYNTAX (SELECT (sum(decimal32) - 2 * count(decimal32)) + (sum(decimal32) - 3 * count(decimal32)) From test_table); EXPLAIN SYNTAX (SELECT (sum(decimal32) - 2 * count(decimal32)) - (sum(decimal32) - 3 * count(decimal32)) From test_table); EXPLAIN SYNTAX (SELECT (2 * count(decimal32) - sum(decimal32)) + (3 * count(decimal32) - sum(decimal32)) From test_table); + +-- https://github.com/ClickHouse/ClickHouse/issues/59414 +SELECT sum(uint64 + 2) as j, j + 5 as t from 
test_table; +EXPLAIN SYNTAX SELECT sum(uint64 + 2) as j, j + 5 as t from test_table; -- { echoOff } + DROP TABLE IF EXISTS test_table; diff --git a/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql b/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql index 0c28c120d40..603c7783ef8 100644 --- a/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql +++ b/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql @@ -1,5 +1,6 @@ -- https://github.com/ClickHouse/ClickHouse/issues/43202 -- Queries are generated by the fuzzer, so don't expect them to make sense +SET enable_positional_arguments=0; SELECT NULL, '' FROM (SELECT toNullable(''), NULL AS key GROUP BY GROUPING SETS ((NULL))) AS s1 ALL LEFT JOIN (SELECT '' AS key, NULL AS value GROUP BY GROUPING SETS (('')) WITH TOTALS UNION ALL SELECT NULL AS key, toNullable(NULL) AS value GROUP BY '', NULL, '' WITH TOTALS) AS s2 USING (key); SELECT NULL GROUP BY NULL WITH TOTALS; SELECT 1048575, NULL, b FROM (SELECT '25.5' AS a, NULL, NULL AS b GROUP BY GROUPING SETS ((0.0001)) WITH TOTALS) AS js1 ANY RIGHT JOIN (SELECT NULL AS a, NULL AS b WHERE NULL GROUP BY NULL, -9223372036854775807 WITH CUBE WITH TOTALS UNION ALL SELECT NULL AS a, NULL AS b GROUP BY 1, '21474836.46' WITH TOTALS) AS js2 USING (a, b) ORDER BY nan DESC NULLS LAST, '9223372036854775807' DESC NULLS LAST, a ASC NULLS LAST; diff --git a/tests/queries/0_stateless/02932_set_ttl_where.reference b/tests/queries/0_stateless/02932_set_ttl_where.reference new file mode 100644 index 00000000000..bb0b1cf658d --- /dev/null +++ b/tests/queries/0_stateless/02932_set_ttl_where.reference @@ -0,0 +1,3 @@ +0 +0 +0 diff --git a/tests/queries/0_stateless/02932_set_ttl_where.sql b/tests/queries/0_stateless/02932_set_ttl_where.sql new file mode 100644 index 00000000000..ee8473e1af2 --- /dev/null +++ b/tests/queries/0_stateless/02932_set_ttl_where.sql @@ -0,0 +1,18 @@ +-- Tags: no-ordinary-database + +create or replace table t_temp ( + a UInt32, + timestamp DateTime +) +engine = MergeTree +order by a +TTL timestamp + INTERVAL 2 SECOND WHERE a in (select number from system.numbers limit 100_000); + +select sleep(1); +insert into t_temp select rand(), now() from system.numbers limit 1_000_000; +select sleep(1); +insert into t_temp select rand(), now() from system.numbers limit 1_000_000; +select sleep(1); +optimize table t_temp final; + +DROP TABLE t_temp; diff --git a/tests/queries/0_stateless/02940_variant_text_deserialization.reference b/tests/queries/0_stateless/02940_variant_text_deserialization.reference new file mode 100644 index 00000000000..8836e6c4e57 --- /dev/null +++ b/tests/queries/0_stateless/02940_variant_text_deserialization.reference @@ -0,0 +1,516 @@ +JSON +String +{"v":null,"variantElement(v, 'String')":null} +{"v":"string","variantElement(v, 'String')":"string"} +{"v":"42","variantElement(v, 'String')":null} +FixedString +{"v":null,"variantElement(v, 'FixedString(4)')":null} +{"v":"string","variantElement(v, 'FixedString(4)')":null} +{"v":"abcd","variantElement(v, 'FixedString(4)')":"abcd"} +Bool +{"v":null,"variantElement(v, 'Bool')":null} +{"v":"string","variantElement(v, 'Bool')":null} +{"v":true,"variantElement(v, 'Bool')":true} +Integers +{"v":null,"variantElement(v, 'Int8')":null} +{"v":"string","variantElement(v, 'Int8')":null} +{"v":-1,"variantElement(v, 'Int8')":-1} +{"v":0,"variantElement(v, 'Int8')":0} +{"v":"10000000000","variantElement(v, 'Int8')":null} +{"v":null,"variantElement(v, 'UInt8')":null} +{"v":"string","variantElement(v, 'UInt8')":null} 
+{"v":"-1","variantElement(v, 'UInt8')":null} +{"v":0,"variantElement(v, 'UInt8')":0} +{"v":"10000000000","variantElement(v, 'UInt8')":null} +{"v":null,"variantElement(v, 'Int16')":null} +{"v":"string","variantElement(v, 'Int16')":null} +{"v":-1,"variantElement(v, 'Int16')":-1} +{"v":0,"variantElement(v, 'Int16')":0} +{"v":"10000000000","variantElement(v, 'Int16')":null} +{"v":null,"variantElement(v, 'UInt16')":null} +{"v":"string","variantElement(v, 'UInt16')":null} +{"v":"-1","variantElement(v, 'UInt16')":null} +{"v":0,"variantElement(v, 'UInt16')":0} +{"v":"10000000000","variantElement(v, 'UInt16')":null} +{"v":null,"variantElement(v, 'Int32')":null} +{"v":"string","variantElement(v, 'Int32')":null} +{"v":-1,"variantElement(v, 'Int32')":-1} +{"v":0,"variantElement(v, 'Int32')":0} +{"v":"10000000000","variantElement(v, 'Int32')":null} +{"v":null,"variantElement(v, 'UInt32')":null} +{"v":"string","variantElement(v, 'UInt32')":null} +{"v":"-1","variantElement(v, 'UInt32')":null} +{"v":0,"variantElement(v, 'UInt32')":0} +{"v":"10000000000","variantElement(v, 'UInt32')":null} +{"v":null,"variantElement(v, 'Int64')":null} +{"v":"string","variantElement(v, 'Int64')":null} +{"v":"-1","variantElement(v, 'Int64')":"-1"} +{"v":"0","variantElement(v, 'Int64')":"0"} +{"v":"10000000000000000000000","variantElement(v, 'Int64')":null} +{"v":null,"variantElement(v, 'UInt64')":null} +{"v":"string","variantElement(v, 'UInt64')":null} +{"v":"-1","variantElement(v, 'UInt64')":null} +{"v":"0","variantElement(v, 'UInt64')":"0"} +{"v":"10000000000000000000000","variantElement(v, 'UInt64')":null} +{"v":null,"variantElement(v, 'Int128')":null} +{"v":"string","variantElement(v, 'Int128')":null} +{"v":"-1","variantElement(v, 'Int128')":"-1"} +{"v":"0","variantElement(v, 'Int128')":"0"} +{"v":null,"variantElement(v, 'UInt128')":null} +{"v":"string","variantElement(v, 'UInt128')":null} +{"v":"-1","variantElement(v, 'UInt128')":null} +{"v":"0","variantElement(v, 'UInt128')":"0"} +Floats +{"v":null,"variantElement(v, 'Float32')":null} +{"v":"string","variantElement(v, 'Float32')":null} +{"v":42.42,"variantElement(v, 'Float32')":42.42} +{"v":null,"variantElement(v, 'Float64')":null} +{"v":"string","variantElement(v, 'Float64')":null} +{"v":42.42,"variantElement(v, 'Float64')":42.42} +Decimals +{"v":null,"variantElement(v, 'Decimal32(6)')":null} +{"v":"string","variantElement(v, 'Decimal32(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal32(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal32(6)')":null} +{"v":null,"variantElement(v, 'Decimal64(6)')":null} +{"v":"string","variantElement(v, 'Decimal64(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal64(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal64(6)')":null} +{"v":null,"variantElement(v, 'Decimal128(6)')":null} +{"v":"string","variantElement(v, 'Decimal128(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal128(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal128(6)')":null} +{"v":null,"variantElement(v, 'Decimal256(6)')":null} +{"v":"string","variantElement(v, 'Decimal256(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal256(6)')":42.42} 
+{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal256(6)')":null} +Dates and DateTimes +{"v":null,"variantElement(v, 'Date')":null} +{"v":"string","variantElement(v, 'Date')":null} +{"v":"2020-01-01","variantElement(v, 'Date')":"2020-01-01"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'Date')":null} +{"v":null,"variantElement(v, 'Date32')":null} +{"v":"string","variantElement(v, 'Date32')":null} +{"v":"1900-01-01","variantElement(v, 'Date32')":"1900-01-01"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'Date32')":null} +{"v":null,"variantElement(v, 'DateTime')":null} +{"v":"string","variantElement(v, 'DateTime')":null} +{"v":"2020-01-01 00:00:00","variantElement(v, 'DateTime')":"2020-01-01 00:00:00"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'DateTime')":null} +{"v":null,"variantElement(v, 'DateTime64')":null} +{"v":"string","variantElement(v, 'DateTime64')":null} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'DateTime64')":"2020-01-01 00:00:00.999"} +{"v":"2020-01-01 00:00:00.999999999 ABC","variantElement(v, 'DateTime64')":null} +UUID +{"v":null,"variantElement(v, 'UUID')":null} +{"v":"string","variantElement(v, 'UUID')":null} +{"v":"c8619cca-0caa-445e-ae76-1d4f6e0b3927","variantElement(v, 'UUID')":"c8619cca-0caa-445e-ae76-1d4f6e0b3927"} +IPv4 +{"v":null,"variantElement(v, 'IPv4')":null} +{"v":"string","variantElement(v, 'IPv4')":null} +{"v":"127.0.0.1","variantElement(v, 'IPv4')":"127.0.0.1"} +IPv6 +{"v":null,"variantElement(v, 'IPv6')":null} +{"v":"string","variantElement(v, 'IPv6')":null} +{"v":"2001:db8:85a3::8a2e:370:7334","variantElement(v, 'IPv6')":"2001:db8:85a3::8a2e:370:7334"} +Enum +{"v":null,"variantElement(v, 'Enum(\\'a\\' = 1)')":null} +{"v":"string","variantElement(v, 'Enum(\\'a\\' = 1)')":null} +{"v":"a","variantElement(v, 'Enum(\\'a\\' = 1)')":"a"} +{"v":"a","variantElement(v, 'Enum(\\'a\\' = 1)')":"a"} +{"v":2,"variantElement(v, 'Enum(\\'a\\' = 1)')":null} +Map +{"v":null,"variantElement(v, 'Map(String, UInt64)')":{}} +{"v":"string","variantElement(v, 'Map(String, UInt64)')":{}} +{"v":{"a":"42","b":"43","c":"0"},"variantElement(v, 'Map(String, UInt64)')":{"a":"42","b":"43","c":"0"}} +{"v":"{\"c\" : 44, \"d\" : [1,2,3]}","variantElement(v, 'Map(String, UInt64)')":{}} +Tuple +{"v":null,"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"0","b":"0"}} +{"v":"string","variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"0","b":"0"}} +{"v":{"a":"42","b":"0"},"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"42","b":"0"}} +{"v":{"a":"44","b":"0"},"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"44","b":"0"}} +\N (0,0) +string (0,0) +(42,0) (42,0) +{"a" : 44, "d" : 32} (0,0) +Array +{"v":null,"variantElement(v, 'Array(UInt64)')":[]} +{"v":"string","variantElement(v, 'Array(UInt64)')":[]} +{"v":["1","2","3"],"variantElement(v, 'Array(UInt64)')":["1","2","3"]} +{"v":["0","0","0"],"variantElement(v, 'Array(UInt64)')":["0","0","0"]} +{"v":"[1, 2, \"hello\"]","variantElement(v, 'Array(UInt64)')":[]} +LowCardinality +{"v":null,"variantElement(v, 'LowCardinality(String)')":null} +{"v":"string","variantElement(v, 'LowCardinality(String)')":"string"} +{"v":"42","variantElement(v, 'LowCardinality(String)')":null} +{"v":null,"variantElement(v, 'Array(LowCardinality(Nullable(String)))')":[]} +{"v":["string",null],"variantElement(v, 'Array(LowCardinality(Nullable(String)))')":["string",null]} +{"v":"42","variantElement(v, 
'Array(LowCardinality(Nullable(String)))')":[]} +Nullable +{"v":null,"variantElement(v, 'Array(Nullable(String))')":[]} +{"v":"string","variantElement(v, 'Array(Nullable(String))')":[]} +{"v":["hello",null,"world"],"variantElement(v, 'Array(Nullable(String))')":["hello",null,"world"]} +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +CSV +String +\N,\N +"string","string" +"string","string" +42,\N +FixedString +\N,\N +"string",\N +"string",\N +"abcd","abcd" +Bool +\N,\N +"Truee",\N +true,true +Integers +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000000000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000000000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +"42d42",\N +Floats +\N,\N +"string",\N +42.42,42.42 +"42.d42",\N +\N,\N +"string",\N +42.42,42.42 +"42.d42",\N +Decimals +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +Dates and DateTimes +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01","2020-01-01" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"1900-01-01","1900-01-01" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01 00:00:00","2020-01-01 00:00:00" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01 00:00:00.999","2020-01-01 00:00:00.999" +"2020-01-01 00:00:00.999999999 ABC",\N +UUID +\N,\N +"string",\N +"c8619cca-0caa-445e-ae76-1d4f6e0b3927","c8619cca-0caa-445e-ae76-1d4f6e0b3927" +"c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA",\N +IPv4 +\N,\N +"string",\N +"127.0.0.1","127.0.0.1" +"127.0.0.1AAA",\N +IPv6 +\N,\N +"string",\N +"2001:db8:85a3::8a2e:370:7334","2001:db8:85a3::8a2e:370:7334" +"2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA",\N +Enum +\N,\N +"string",\N +"a","a" +"a","a" +2,\N +"aa",\N +Map +\N,"{}" +"string","{}" +"{'a':42,'b':43,'c':0}","{'a':42,'b':43,'c':0}" +"{'c' : 44, 'd' : [1,2,3]}","{}" +"{'c' : 44","{}" +Array +\N,"[]" +"string","[]" +"[1,2,3]","[1,2,3]" +"[0,0,0]","[0,0,0]" +"[1, 2, 'hello']","[]" +"[1, 2","[]" +LowCardinality +\N,\N +"string","string" +42,\N +\N,"[]" +"['string',NULL]","['string',NULL]" +"['string', nul]","[]" +42,"[]" +Nullable +\N,"[]" +"string","[]" +"['hello',NULL,'world']","['hello',NULL,'world']" +"['hello', nul]","[]" +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +TSV +String +\N \N +string string +42 \N +FixedString +\N \N +string \N +abcd abcd +Bool +\N \N +Truee \N +true true +Integers +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 
\N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000000000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000000000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +42d42 \N +\N \N +string \N +-1 \N +0 0 +42d42 \N +Floats +\N \N +string \N +42.42 42.42 +42.d42 \N +\N \N +string \N +42.42 42.42 +42.d42 \N +Decimals +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +Dates and DateTimes +\N \N +string \N +2020-01-d1 \N +2020-01-01 2020-01-01 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +1900-01-01 1900-01-01 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +2020-01-01 00:00:00 2020-01-01 00:00:00 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +2020-01-01 00:00:00.999 2020-01-01 00:00:00.999 +2020-01-01 00:00:00.999999999 ABC \N +UUID +\N \N +string \N +c8619cca-0caa-445e-ae76-1d4f6e0b3927 c8619cca-0caa-445e-ae76-1d4f6e0b3927 +c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA \N +IPv4 +\N \N +string \N +127.0.0.1 127.0.0.1 +127.0.0.1AAA \N +IPv6 +\N \N +string \N +2001:db8:85a3::8a2e:370:7334 2001:db8:85a3::8a2e:370:7334 +2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA \N +Enum +\N \N +string \N +a a +a a +2 \N +aa \N +Map +\N {} +string {} +{'a':42,'b':43,'c':0} {'a':42,'b':43,'c':0} +{\'c\' : 44, \'d\' : [1,2,3]} {} +{\'c\' : 44 {} +Array +\N [] +string [] +[1,2,3] [1,2,3] +[0,0,0] [0,0,0] +[1, 2, \'hello\'] [] +[1, 2 [] +LowCardinality +\N \N +string string +42 \N +\N [] +['string',NULL] ['string',NULL] +[\'string\', nul] [] +42 [] +Nullable +\N [] +string [] +['hello',NULL,'world'] ['hello',NULL,'world'] +[\'hello\', nul] [] +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +Values +String +(NULL,NULL),('string','string'),(42,NULL)FixedString +(NULL,NULL),('string',NULL),('abcd','abcd')Bool +(NULL,NULL),(true,true)Integers +(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0)(NULL,NULL),('string',NULL),(-1,NULL),(0,0)Floats +(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Decimals 
+(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Dates and DateTimes +(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01','2020-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('1900-01-01','1900-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01 00:00:00','2020-01-01 00:00:00'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01 00:00:00.999','2020-01-01 00:00:00.999'),('2020-01-01 00:00:00.999999999 ABC',NULL)UUID +(NULL,NULL),('string',NULL),('c8619cca-0caa-445e-ae76-1d4f6e0b3927','c8619cca-0caa-445e-ae76-1d4f6e0b3927'),('c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA',NULL)IPv4 +(NULL,NULL),('string',NULL),('127.0.0.1','127.0.0.1'),('127.0.0.1AAA',NULL)IPv6 +(NULL,NULL),('string',NULL),('2001:db8:85a3::8a2e:370:7334','2001:db8:85a3::8a2e:370:7334'),('2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA',NULL)Enum +(NULL,NULL),('string',NULL),('a','a'),(1,NULL),(2,NULL),('aa',NULL)Map +(NULL,{}),('string',{}),({'a':42,'b':43,'c':0},{'a':42,'b':43,'c':0})Array +(NULL,[]),('string',[]),([1,2,3],[1,2,3]),([0,0,0],[0,0,0])LowCardinality +(NULL,NULL),('string','string'),(42,NULL)(NULL,[]),(['string',NULL],['string',NULL]),(42,[])Nullable +(NULL,[]),('string',[]),(['hello',NULL,'world'],['hello',NULL,'world']) diff --git a/tests/queries/0_stateless/02940_variant_text_deserialization.sql b/tests/queries/0_stateless/02940_variant_text_deserialization.sql new file mode 100644 index 00000000000..041d02088ef --- /dev/null +++ b/tests/queries/0_stateless/02940_variant_text_deserialization.sql @@ -0,0 +1,266 @@ +set allow_experimental_variant_type = 1; +set session_timezone = 'UTC'; + +select 'JSON'; +select 'String'; +select v, variantElement(v, 'String') from format(JSONEachRow, 'v Variant(String, UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : 42}') format JSONEachRow; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(JSONEachRow, 'v Variant(String, FixedString(4))', '{"v" : null}, {"v" : "string"}, {"v" : "abcd"}') format JSONEachRow; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(JSONEachRow, 'v Variant(String, Bool)', '{"v" : null}, {"v" : "string"}, {"v" : true}') format JSONEachRow; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(JSONEachRow, 'v Variant(String, Int8, UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt8') from format(JSONEachRow, 'v Variant(String, UInt8, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int16') from format(JSONEachRow, 'v Variant(String, Int16, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt16') from format(JSONEachRow, 'v Variant(String, UInt16, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int32') from format(JSONEachRow, 'v Variant(String, Int32, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt32') from format(JSONEachRow, 'v Variant(String, UInt32, Int64)', '{"v" : null}, {"v" : "string"}, {"v" 
: -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int64') from format(JSONEachRow, 'v Variant(String, Int64, Int128)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000000000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt64') from format(JSONEachRow, 'v Variant(String, UInt64, Int128)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000000000000000}') format JSONEachRow; +select v, variantElement(v, 'Int128') from format(JSONEachRow, 'v Variant(String, Int128, Int256)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}') format JSONEachRow; +select v, variantElement(v, 'UInt128') from format(JSONEachRow, 'v Variant(String, UInt128, Int256)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}') format JSONEachRow; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(JSONEachRow, 'v Variant(String, Float32)', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}') format JSONEachRow; +select v, variantElement(v, 'Float64') from format(JSONEachRow, 'v Variant(String, Float64)', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}') format JSONEachRow; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(JSONEachRow, 'v Variant(String, Decimal32(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal64(6)') from format(JSONEachRow, 'v Variant(String, Decimal64(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal128(6)') from format(JSONEachRow, 'v Variant(String, Decimal128(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal256(6)') from format(JSONEachRow, 'v Variant(String, Decimal256(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(JSONEachRow, 'v Variant(String, Date, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'Date32') from format(JSONEachRow, 'v Variant(String, Date32, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "1900-01-01"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'DateTime') from format(JSONEachRow, 'v Variant(String, DateTime, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01 00:00:00"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'DateTime64') from format(JSONEachRow, 'v Variant(String, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01 00:00:00.999"}, {"v" : "2020-01-01 00:00:00.999999999 ABC"}') format JSONEachRow; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(JSONEachRow, 'v Variant(String, UUID)', '{"v" : null}, {"v" : "string"}, {"v" : "c8619cca-0caa-445e-ae76-1d4f6e0b3927"}') format JSONEachRow; + +select 'IPv4'; 
+select v, variantElement(v, 'IPv4') from format(JSONEachRow, 'v Variant(String, IPv4)', '{"v" : null}, {"v" : "string"}, {"v" : "127.0.0.1"}') format JSONEachRow; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(JSONEachRow, 'v Variant(String, IPv6)', '{"v" : null}, {"v" : "string"}, {"v" : "2001:0db8:85a3:0000:0000:8a2e:0370:7334"}') format JSONEachRow; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(JSONEachRow, 'v Variant(String, UInt32, Enum(''a'' = 1))', '{"v" : null}, {"v" : "string"}, {"v" : "a"}, {"v" : 1}, {"v" : 2}') format JSONEachRow; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(JSONEachRow, 'v Variant(String, Map(String, UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : 43, "c" : null}}, {"v" : {"c" : 44, "d" : [1,2,3]}}') format JSONEachRow; + +select 'Tuple'; +select v, variantElement(v, 'Tuple(a UInt64, b UInt64)') from format(JSONEachRow, 'v Variant(String, Tuple(a UInt64, b UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : null}}, {"v" : {"a" : 44, "d" : 32}}') format JSONEachRow; +select v, variantElement(v, 'Tuple(a UInt64, b UInt64)') from format(JSONEachRow, 'v Variant(String, Tuple(a UInt64, b UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : null}}, {"v" : {"a" : 44, "d" : 32}}') settings input_format_json_defaults_for_missing_elements_in_named_tuple=0; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(JSONEachRow, 'v Variant(String, Array(UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : [1, 2, 3]}, {"v" : [null, null, null]} {"v" : [1, 2, "hello"]}') format JSONEachRow; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(JSONEachRow, 'v Variant(LowCardinality(String), UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : 42}') format JSONEachRow; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(JSONEachRow, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64)', '{"v" : null}, {"v" : ["string", null]}, {"v" : 42}') format JSONEachRow; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(JSONEachRow, 'v Variant(String, Array(Nullable(String)))', '{"v" : null}, {"v" : "string"}, {"v" : ["hello", null, "world"]}') format JSONEachRow; + +select repeat('-', 80) format JSONEachRow; + +select 'CSV'; +select 'String'; +select v, variantElement(v, 'String') from format(CSV, 'v Variant(String, UInt64)', '\\N\n"string"\nstring\n42') format CSV; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(CSV, 'v Variant(String, FixedString(4))', '\\N\n"string"\nstring\n"abcd"') format CSV; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(CSV, 'v Variant(String, Bool)', '\\N\nTruee\nTrue') format CSV; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(CSV, 'v Variant(String, Int8, UInt64)', '\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt8') from format(CSV, 'v Variant(String, UInt8, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int16') from format(CSV, 'v Variant(String, Int16, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt16') from format(CSV, 'v Variant(String, UInt16, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int32') from format(CSV, 'v 
Variant(String, Int32, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt32') from format(CSV, 'v Variant(String, UInt32, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int64') from format(CSV, 'v Variant(String, Int64, Int128)', '\\N\n"string"\n-1\n0\n10000000000000000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt64') from format(CSV, 'v Variant(String, UInt64, Int128)', '\\N\n"string"\n-1\n0\n10000000000000000000000\n42d42') format CSV; +select v, variantElement(v, 'Int128') from format(CSV, 'v Variant(String, Int128, Int256)', '\\N\n"string"\n-1\n0\n42d42') format CSV; +select v, variantElement(v, 'UInt128') from format(CSV, 'v Variant(String, UInt128, Int256)', '\\N\n"string"\n-1\n0\n42d42') format CSV; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(CSV, 'v Variant(String, Float32)', '\\N\n"string"\n42.42\n42.d42') format CSV; +select v, variantElement(v, 'Float64') from format(CSV, 'v Variant(String, Float64)', '\\N\n"string"\n42.42\n42.d42') format CSV; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(CSV, 'v Variant(String, Decimal32(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal64(6)') from format(CSV, 'v Variant(String, Decimal64(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal128(6)') from format(CSV, 'v Variant(String, Decimal128(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal256(6)') from format(CSV, 'v Variant(String, Decimal256(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(CSV, 'v Variant(String, Date, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'Date32') from format(CSV, 'v Variant(String, Date32, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"1900-01-01"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'DateTime') from format(CSV, 'v Variant(String, DateTime, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01 00:00:00"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'DateTime64') from format(CSV, 'v Variant(String, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01 00:00:00.999"\n"2020-01-01 00:00:00.999999999 ABC"') format CSV; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(CSV, 'v Variant(String, UUID)', '\\N\n"string"\n"c8619cca-0caa-445e-ae76-1d4f6e0b3927"\nc8619cca-0caa-445e-ae76-1d4f6e0b3927AAA') format CSV; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(CSV, 'v Variant(String, IPv4)', '\\N\n"string"\n"127.0.0.1"\n"127.0.0.1AAA"') format CSV; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(CSV, 'v Variant(String, IPv6)', '\\N\n"string"\n"2001:0db8:85a3:0000:0000:8a2e:0370:7334"\n2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA') format CSV; + +select 'Enum'; +select v, 
variantElement(v, 'Enum(''a'' = 1)') from format(CSV, 'v Variant(String, UInt32, Enum(''a'' = 1))', '\\N\n"string"\n"a"\n1\n2\naa') format CSV; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(CSV, 'v Variant(String, Map(String, UInt64))', '\\N\n"string"\n"{''a'' : 42, ''b'' : 43, ''c'' : null}"\n"{''c'' : 44, ''d'' : [1,2,3]}"\n"{''c'' : 44"') format CSV; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(CSV, 'v Variant(String, Array(UInt64))', '\\N\n"string"\n"[1, 2, 3]"\n"[null, null, null]"\n"[1, 2, ''hello'']"\n"[1, 2"') format CSV; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(CSV, 'v Variant(LowCardinality(String), UInt64)', '\\N\n"string"\n42') format CSV; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(CSV, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '\\N\n"[''string'', null]"\n"[''string'', nul]"\n42') format CSV; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(CSV, 'v Variant(String, Array(Nullable(String)))', '\\N\n"string"\n"[''hello'', null, ''world'']"\n"[''hello'', nul]"') format CSV; + +select repeat('-', 80) format JSONEachRow; + +select 'TSV'; +select 'String'; +select v, variantElement(v, 'String') from format(TSV, 'v Variant(String, UInt64)', '\\N\nstring\n42') format TSV; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(TSV, 'v Variant(String, FixedString(4))', '\\N\nstring\nabcd') format TSV; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(TSV, 'v Variant(String, Bool)', '\\N\nTruee\nTrue') format TSV; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(TSV, 'v Variant(String, Int8, UInt64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt8') from format(TSV, 'v Variant(String, UInt8, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int16') from format(TSV, 'v Variant(String, Int16, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt16') from format(TSV, 'v Variant(String, UInt16, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int32') from format(TSV, 'v Variant(String, Int32, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt32') from format(TSV, 'v Variant(String, UInt32, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int64') from format(TSV, 'v Variant(String, Int64, Int128)', '\\N\nstring\n-1\n0\n10000000000000000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt64') from format(TSV, 'v Variant(String, UInt64, Int128)', '\\N\nstring\n-1\n0\n10000000000000000000000\n42d42') format TSV; +select v, variantElement(v, 'Int128') from format(TSV, 'v Variant(String, Int128, Int256)', '\\N\nstring\n-1\n0\n42d42') format TSV; +select v, variantElement(v, 'UInt128') from format(TSV, 'v Variant(String, UInt128, Int256)', '\\N\nstring\n-1\n0\n42d42') format TSV; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(TSV, 'v Variant(String, Float32)', '\\N\nstring\n42.42\n42.d42') format TSV; +select v, variantElement(v, 'Float64') from format(TSV, 'v Variant(String, Float64)', '\\N\nstring\n42.42\n42.d42') format TSV; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(TSV, 'v 
Variant(String, Decimal32(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal64(6)') from format(TSV, 'v Variant(String, Decimal64(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal128(6)') from format(TSV, 'v Variant(String, Decimal128(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal256(6)') from format(TSV, 'v Variant(String, Decimal256(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(TSV, 'v Variant(String, Date, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'Date32') from format(TSV, 'v Variant(String, Date32, DateTime64)', '\\N\nstring\n2020-01-d1\n1900-01-01\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'DateTime') from format(TSV, 'v Variant(String, DateTime, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01 00:00:00\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'DateTime64') from format(TSV, 'v Variant(String, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01 00:00:00.999\n2020-01-01 00:00:00.999999999 ABC') format TSV; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(TSV, 'v Variant(String, UUID)', '\\N\nstring\nc8619cca-0caa-445e-ae76-1d4f6e0b3927\nc8619cca-0caa-445e-ae76-1d4f6e0b3927AAA') format TSV; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(TSV, 'v Variant(String, IPv4)', '\\N\nstring\n127.0.0.1\n127.0.0.1AAA') format TSV; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(TSV, 'v Variant(String, IPv6)', '\\N\nstring\n2001:0db8:85a3:0000:0000:8a2e:0370:7334\n2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA') format TSV; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(TSV, 'v Variant(String, UInt32, Enum(''a'' = 1))', '\\N\nstring\na\n1\n2\naa') format TSV; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(TSV, 'v Variant(String, Map(String, UInt64))', '\\N\nstring\n{''a'' : 42, ''b'' : 43, ''c'' : null}\n{''c'' : 44, ''d'' : [1,2,3]}\n{''c'' : 44') format TSV; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(TSV, 'v Variant(String, Array(UInt64))', '\\N\nstring\n[1, 2, 3]\n[null, null, null]\n[1, 2, ''hello'']\n[1, 2') format TSV; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(TSV, 'v Variant(LowCardinality(String), UInt64)', '\\N\nstring\n42') format TSV; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(TSV, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '\\N\n[''string'', null]\n[''string'', nul]\n42') format TSV; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(TSV, 'v Variant(String, Array(Nullable(String)))', '\\N\nstring\n[''hello'', null, ''world'']\n[''hello'', nul]') format TSV; + +select repeat('-', 80) format JSONEachRow; + +select 
'Values'; +select 'String'; +select v, variantElement(v, 'String') from format(Values, 'v Variant(String, UInt64)', '(NULL), (''string''), (42)') format Values; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(Values, 'v Variant(String, FixedString(4))', '(NULL), (''string''), (''abcd'')') format Values; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(Values, 'v Variant(String, Bool)', '(NULL), (True)') format Values; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(Values, 'v Variant(String, Int8, UInt64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt8') from format(Values, 'v Variant(String, UInt8, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int16') from format(Values, 'v Variant(String, Int16, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt16') from format(Values, 'v Variant(String, UInt16, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int32') from format(Values, 'v Variant(String, Int32, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt32') from format(Values, 'v Variant(String, UInt32, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int64') from format(Values, 'v Variant(String, Int64, Int128)', '(NULL), (''string''), (-1), (0), (10000000000000000000000)') format Values; +select v, variantElement(v, 'UInt64') from format(Values, 'v Variant(String, UInt64, Int128)', '(NULL), (''string''), (-1), (0), (10000000000000000000000)') format Values; +select v, variantElement(v, 'Int128') from format(Values, 'v Variant(String, Int128, Int256)', '(NULL), (''string''), (-1), (0)') format Values; +select v, variantElement(v, 'UInt128') from format(Values, 'v Variant(String, UInt128, Int256)', '(NULL), (''string''), (-1), (0)') format Values; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(Values, 'v Variant(String, Float32)', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Float64') from format(Values, 'v Variant(String, Float64)', '(NULL), (''string''), (42.42)') format Values; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(Values, 'v Variant(String, Decimal32(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal64(6)') from format(Values, 'v Variant(String, Decimal64(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal128(6)') from format(Values, 'v Variant(String, Decimal128(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal256(6)') from format(Values, 'v Variant(String, Decimal256(6))', '(NULL), (''string''), (42.42)') format Values; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(Values, 'v Variant(String, Date, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''2020-01-01''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'Date32') from format(Values, 'v Variant(String, Date32, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''1900-01-01''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'DateTime') from format(Values, 'v Variant(String, DateTime, DateTime64)', '(NULL), 
(''string''), (''2020-01-d1''), (''2020-01-01 00:00:00''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'DateTime64') from format(Values, 'v Variant(String, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''2020-01-01 00:00:00.999''), (''2020-01-01 00:00:00.999999999 ABC'')') format Values; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(Values, 'v Variant(String, UUID)', '(NULL), (''string''), (''c8619cca-0caa-445e-ae76-1d4f6e0b3927''), (''c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA'')') format Values; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(Values, 'v Variant(String, IPv4)', '(NULL), (''string''), (''127.0.0.1''), (''127.0.0.1AAA'')') format Values; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(Values, 'v Variant(String, IPv6)', '(NULL), (''string''), (''2001:0db8:85a3:0000:0000:8a2e:0370:7334''), (''2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA'')') format Values; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(Values, 'v Variant(String, UInt32, Enum(''a'' = 1))', '(NULL), (''string''), (''a''), (1), (2), (''aa'')') format Values; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(Values, 'v Variant(String, Map(String, UInt64))', '(NULL), (''string''), ({''a'' : 42, ''b'' : 43, ''c'' : null})') format Values; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(Values, 'v Variant(String, Array(UInt64))', '(NULL), (''string''), ([1, 2, 3]), ([null, null, null])') format Values; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(Values, 'v Variant(LowCardinality(String), UInt64)', '(NULL), (''string''), (42)') format Values; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(Values, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '(NULL), ([''string'', null]), (42)') format Values; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(Values, 'v Variant(String, Array(Nullable(String)))', '(NULL), (''string''), ([''hello'', null, ''world''])') format Values; + +select ''; \ No newline at end of file diff --git a/tests/queries/0_stateless/02941_variant_type_1.reference b/tests/queries/0_stateless/02941_variant_type_1.reference new file mode 100644 index 00000000000..8a6e77d4f6d --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_1.reference @@ -0,0 +1,2472 @@ +Memory +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) 
+[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 
+----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N 
+\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +MergeTree wide +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) 
+(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02941_variant_type_1.sh b/tests/queries/0_stateless/02941_variant_type_1.sh new file mode 100755 index 00000000000..ed365bbd244 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_1.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test1_insert() +{ + echo "test1 insert" + $CH_CLIENT -nmq "insert into test select number, NULL from numbers(3); +insert into test select number + 3, number from numbers(3); +insert into test select number + 6, 'str_' || toString(number) from numbers(3); +insert into test select number + 9, ('lc_str_' || toString(number))::LowCardinality(String) from numbers(3); +insert into test select number + 12, tuple(number, number + 1)::Tuple(a UInt32, b UInt32) from numbers(3); +insert into test select number + 15, range(number + 1)::Array(UInt64) from numbers(3);" +} + +function test1_select() +{ + echo "test1 select" + $CH_CLIENT -nmq "select v from test order by id; +select v.String from test order by id; +select v.UInt64 from test order by id; +select v.\`LowCardinality(String)\` from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\` from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id; +select v.\`Array(UInt64)\` from test order by id; +select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test2_insert() +{ + echo "test2 insert" + $CH_CLIENT -nmq "insert into test select number, NULL from numbers(3); +insert into test select number + 3, number % 2 ? NULL : number from numbers(3); +insert into test select number + 6, number % 2 ? NULL : 'str_' || toString(number) from numbers(3); +insert into test select number + 9, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(('lc_str_' || toString(number))::LowCardinality(String), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3); +insert into test select number + 12, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3); +insert into test select number + 15, number % 2 ? 
CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(range(number + 1)::Array(UInt64), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3);" +} + +function test2_select() +{ + echo "test2 select" + $CH_CLIENT -nmq "select v from test order by id; +select v.String from test order by id; +select v.UInt64 from test order by id; +select v.\`LowCardinality(String)\` from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\` from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id; +select v.\`Array(UInt64)\` from test order by id; +select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test3_insert() +{ + echo "test3 insert" + $CH_CLIENT -q "insert into test with 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))' as type select number, multiIf(number % 6 == 0, CAST(NULL, type), number % 6 == 1, CAST('str_' || toString(number), type), number % 6 == 2, CAST(number, type), number % 6 == 3, CAST(('lc_str_' || toString(number))::LowCardinality(String), type), number % 6 == 4, CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), type), CAST(range(number + 1)::Array(UInt64), type)) as res from numbers(18);" +} + +function test3_select() +{ + echo "test3 select" + $CH_CLIENT -nmq "select v from test order by id; +select v.String from test order by id; +select v.UInt64 from test order by id; +select v.\`LowCardinality(String)\` from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\` from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id; +select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id; +select v.\`Array(UInt64)\` from test order by id; +select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test1_insert + test1_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test1_select + fi + $CH_CLIENT -q "truncate table test;" + test2_insert + test2_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test2_select + fi + $CH_CLIENT -q "truncate table test;" + test3_insert + test3_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test3_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git 
a/tests/queries/0_stateless/02941_variant_type_2.reference b/tests/queries/0_stateless/02941_variant_type_2.reference new file mode 100644 index 00000000000..4b6d53c52ac --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_2.reference @@ -0,0 +1,51 @@ +Memory +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +MergeTree compact +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +MergeTree wide +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 diff --git a/tests/queries/0_stateless/02941_variant_type_2.sh b/tests/queries/0_stateless/02941_variant_type_2.sh new file mode 100755 index 00000000000..23666a9b4a8 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_2.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test4_insert() +{ + echo "test4 insert" + $CH_CLIENT -nmq "insert into test select number, NULL from numbers(200000); +insert into test select number + 200000, number from numbers(200000); +insert into test select number + 400000, 'str_' || toString(number) from numbers(200000); +insert into test select number + 600000, ('lc_str_' || toString(number))::LowCardinality(String) from numbers(200000); +insert into test select number + 800000, tuple(number, number + 1)::Tuple(a UInt32, b UInt32) from numbers(200000); +insert into test select number + 1000000, range(number % 20 + 1)::Array(UInt64) from numbers(200000);" +} + +function test4_select +{ + echo "test4 select" + $CH_CLIENT -nmq "select v from test format Null; +select count() from test where isNotNull(v); +select v.String from test format Null; +select count() from test where isNotNull(v.String); +select v.UInt64 from test format Null; +select count() from test where isNotNull(v.UInt64); +select v.\`LowCardinality(String)\` from test format Null; +select count() from test where isNotNull(v.\`LowCardinality(String)\`); +select v.\`Tuple(a UInt32, b UInt32)\` from test format Null; +select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null; +select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a); +select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null; +select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b); +select v.\`Array(UInt64)\` from test format Null; +select count() from test where not empty(v.\`Array(UInt64)\`); +select v.\`Array(UInt64)\`.size0 from test format Null; +select count() from test where isNotNull(v.\`Array(UInt64)\`.size0);" +} + +function run() +{ + test4_insert + test4_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test4_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a 
UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_3.reference b/tests/queries/0_stateless/02941_variant_type_3.reference new file mode 100644 index 00000000000..1ccdb3acdff --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_3.reference @@ -0,0 +1,51 @@ +Memory +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +MergeTree compact +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +MergeTree wide +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 diff --git a/tests/queries/0_stateless/02941_variant_type_3.sh b/tests/queries/0_stateless/02941_variant_type_3.sh new file mode 100755 index 00000000000..d6309e26414 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_3.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test5_insert() +{ + echo "test5 insert" + $CH_CLIENT -nmq " +insert into test select number, NULL from numbers(200000); +insert into test select number + 200000, number % 2 ? NULL : number from numbers(200000); +insert into test select number + 400000, number % 2 ? NULL : 'str_' || toString(number) from numbers(200000); +insert into test select number + 600000, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(('lc_str_' || toString(number))::LowCardinality(String), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000); +insert into test select number + 800000, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000); +insert into test select number + 1000000, number % 2 ? 
CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(range(number % 20 + 1)::Array(UInt64), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000);" +} + +function test5_select() +{ + echo "test5 select" + $CH_CLIENT -nmq " +select v from test format Null; +select count() from test where isNotNull(v); +select v.String from test format Null; +select count() from test where isNotNull(v.String); +select v.UInt64 from test format Null; +select count() from test where isNotNull(v.UInt64); +select v.\`LowCardinality(String)\` from test format Null; +select count() from test where isNotNull(v.\`LowCardinality(String)\`); +select v.\`Tuple(a UInt32, b UInt32)\` from test format Null; +select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null; +select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a); +select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null; +select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b); +select v.\`Array(UInt64)\` from test format Null; +select count() from test where not empty(v.\`Array(UInt64)\`); +select v.\`Array(UInt64)\`.size0 from test format Null; +select count() from test where isNotNull(v.\`Array(UInt64)\`.size0);" +} + +function run() +{ + test5_insert + test5_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test5_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_4.reference b/tests/queries/0_stateless/02941_variant_type_4.reference new file mode 100644 index 00000000000..e13d5820343 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_4.reference @@ -0,0 +1,56 @@ +Memory +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +MergeTree wide +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +test6 select +1000000 +200000 +200000 +200000 +200000 
+200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02941_variant_type_4.sh b/tests/queries/0_stateless/02941_variant_type_4.sh new file mode 100755 index 00000000000..5ea04db4bb4 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_4.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test6_insert() +{ + echo "test6 insert" + $CH_CLIENT -q "insert into test with 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))' as type select number, multiIf(number % 6 == 0, CAST(NULL, type), number % 6 == 1, CAST('str_' || toString(number), type), number % 6 == 2, CAST(number, type), number % 6 == 3, CAST(('lc_str_' || toString(number))::LowCardinality(String), type), number % 6 == 4, CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), type), CAST(range(number % 20 + 1)::Array(UInt64), type)) as res from numbers(1200000);" +} + +function test6_select() +{ + echo "test6 select" + $CH_CLIENT -nmq "select v from test format Null; + select count() from test where isNotNull(v); + select v.String from test format Null; + select count() from test where isNotNull(v.String); + select v.UInt64 from test format Null; + select count() from test where isNotNull(v.UInt64); + select v.\`LowCardinality(String)\` from test format Null; + select count() from test where isNotNull(v.\`LowCardinality(String)\`); + select v.\`Tuple(a UInt32, b UInt32)\` from test format Null; + select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null; + select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a); + select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null; + select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b); + select v.\`Array(UInt64)\` from test format Null; + select count() from test where not empty(v.\`Array(UInt64)\`); + select v.\`Array(UInt64)\`.size0 from test format Null; + select count() from test where isNotNull(v.\`Array(UInt64)\`.size0);" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test6_insert + test6_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test6_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git 
a/tests/queries/0_stateless/02941_variant_type_alters.reference b/tests/queries/0_stateless/02941_variant_type_alters.reference new file mode 100644 index 00000000000..52c834e455b --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_alters.reference @@ -0,0 +1,330 @@ +Memory +initial insert +alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +insert after alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +3 3 3 \N 3 +4 4 4 \N 4 +5 5 5 \N 5 +6 6 str_6 str_6 \N +7 7 str_7 str_7 \N +8 8 str_8 str_8 \N +9 9 \N \N \N +10 10 \N \N \N +11 11 \N \N \N +12 12 12 \N 12 +13 13 str_13 str_13 \N +14 14 \N \N \N +alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +insert after alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +15 15 1970-01-16 \N \N 1970-01-16 +16 16 1970-01-17 \N \N 1970-01-17 +17 17 1970-01-18 \N \N 1970-01-18 +18 18 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N +20 20 20 \N 20 \N +21 21 str_21 str_21 \N \N +alter modify column 2 +0 0 \N \N \N \N \N \N +1 1 \N \N \N \N \N \N +2 2 \N \N \N \N \N \N +3 3 \N \N 3 \N 3 \N +4 4 \N \N 4 \N 4 \N +5 5 \N \N 5 \N 5 \N +6 6 \N \N str_6 str_6 \N \N +7 7 \N \N str_7 str_7 \N \N +8 8 \N \N str_8 str_8 \N \N +9 9 \N \N \N \N \N \N +10 10 \N \N \N \N \N \N +11 11 \N \N \N \N \N \N +12 12 \N \N 12 \N 12 \N +13 13 \N \N str_13 str_13 \N \N +14 14 \N \N \N \N \N \N +15 15 \N \N 1970-01-16 \N \N 1970-01-16 +16 16 \N \N 1970-01-17 \N \N 1970-01-17 +17 17 \N \N 1970-01-18 \N \N 1970-01-18 +18 18 \N \N 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N \N \N +20 20 \N \N 20 \N 20 \N +21 21 \N \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 \N \N \N \N \N \N +1 1 \N \N \N \N \N \N +2 2 \N \N \N \N \N \N +3 3 \N \N 3 \N 3 \N +4 4 \N \N 4 \N 4 \N +5 5 \N \N 5 \N 5 \N +6 6 \N \N str_6 str_6 \N \N +7 7 \N \N str_7 str_7 \N \N +8 8 \N \N str_8 str_8 \N \N +9 9 \N \N \N \N \N \N +10 10 \N \N \N \N \N \N +11 11 \N \N \N \N \N \N +12 12 \N \N 12 \N 12 \N +13 13 \N \N str_13 str_13 \N \N +14 14 \N \N \N \N \N \N +15 15 \N \N 1970-01-16 \N \N 1970-01-16 +16 16 \N \N 1970-01-17 \N \N 1970-01-17 +17 17 \N \N 1970-01-18 \N \N 1970-01-18 +18 18 \N \N 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N \N \N +20 20 \N \N 20 \N 20 \N +21 21 \N \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N +MergeTree compact +initial insert +alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +insert after alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +3 3 3 \N 3 +4 4 4 \N 4 +5 5 5 \N 5 +6 6 str_6 str_6 \N +7 7 str_7 str_7 \N +8 8 str_8 str_8 \N +9 9 \N \N \N +10 10 \N \N \N +11 11 \N \N \N +12 12 12 \N 12 +13 13 str_13 str_13 \N +14 14 \N \N \N +alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +insert after 
alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +15 15 1970-01-16 \N \N 1970-01-16 +16 16 1970-01-17 \N \N 1970-01-17 +17 17 1970-01-18 \N \N 1970-01-18 +18 18 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N +20 20 20 \N 20 \N +21 21 str_21 str_21 \N \N +alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N +MergeTree wide +initial insert +alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +insert after alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +3 3 3 \N 3 +4 4 4 \N 4 +5 5 5 \N 5 +6 6 str_6 str_6 \N +7 7 str_7 str_7 \N +8 8 str_8 str_8 \N +9 9 \N \N \N +10 10 \N \N \N +11 11 \N \N \N +12 12 12 \N 12 +13 13 str_13 str_13 \N +14 14 \N \N \N +alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +insert after alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +15 15 1970-01-16 \N \N 1970-01-16 +16 16 1970-01-17 \N \N 1970-01-17 +17 17 1970-01-18 \N \N 1970-01-18 +18 18 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N +20 20 20 \N 20 \N +21 21 str_21 str_21 \N \N +alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 
17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N diff --git a/tests/queries/0_stateless/02941_variant_type_alters.sh b/tests/queries/0_stateless/02941_variant_type_alters.sh new file mode 100755 index 00000000000..7c151d1fe9e --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_alters.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 " + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column 1" + $CH_CLIENT -q "alter table test add column v Variant(UInt64, String) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64 from test order by x" + + echo "insert after alter add column 1" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64 from test order by x" + + echo "alter modify column 1" + $CH_CLIENT -q "alter table test modify column v Variant(UInt64, String, Date) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64, v.Date from test order by x" + + echo "insert after alter modify column 1" + $CH_CLIENT -q "insert into test select number, number, toDate(number) from numbers(15, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, toDate(number), NULL) from numbers(18, 4)" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64, v.Date from test order by x" + + echo "alter modify column 2" + $CH_CLIENT -q "alter table test modify column y Variant(UInt64, String) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, v, v.String, v.UInt64, v.Date from test order by x" + + echo "insert after alter modify column 2" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL), NULL from numbers(22, 3)" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, v, v.String, v.UInt64, v.Date from test order by x" +} + +$CH_CLIENT -q "drop 
table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=Memory" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02942_variant_cast.reference b/tests/queries/0_stateless/02942_variant_cast.reference new file mode 100644 index 00000000000..f3fd7a9ba33 --- /dev/null +++ b/tests/queries/0_stateless/02942_variant_cast.reference @@ -0,0 +1,25 @@ +\N +42 +0 +\N +2 +\N +Hello +Hello +NULL +Hello +Hello +\N +Hello +\N +0 +\N +42 +\N +Hello +2 +\N +Hello +5 +0 +1 diff --git a/tests/queries/0_stateless/02942_variant_cast.sql b/tests/queries/0_stateless/02942_variant_cast.sql new file mode 100644 index 00000000000..fc2d1d63657 --- /dev/null +++ b/tests/queries/0_stateless/02942_variant_cast.sql @@ -0,0 +1,24 @@ +set allow_experimental_variant_type=1; +set allow_experimental_analyzer=0; -- It's currently doesn't work with analyzer because of the way it works with constants, but it will be refactored and fixed in future + +select NULL::Variant(String, UInt64); +select 42::UInt64::Variant(String, UInt64); +select 42::UInt32::Variant(String, UInt64); -- {serverError CANNOT_CONVERT_TYPE} +select now()::Variant(String, UInt64); -- {serverError CANNOT_CONVERT_TYPE} +select CAST(number % 2 ? NULL : number, 'Variant(String, UInt64)') from numbers(4); +select 'Hello'::LowCardinality(String)::Variant(LowCardinality(String), UInt64); +select 'Hello'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select 'NULL'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select 'Hello'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select CAST(CAST(number % 2 ? 
NULL : 'Hello', 'LowCardinality(Nullable(String))'), 'Variant(LowCardinality(String), UInt64)') from numbers(4); + +select NULL::Variant(String, UInt64)::UInt64; +select NULL::Variant(String, UInt64)::Nullable(UInt64); +select '42'::Variant(String, UInt64)::UInt64; +select 'str'::Variant(String, UInt64)::UInt64; -- {serverError CANNOT_PARSE_TEXT} +select CAST(multiIf(number % 3 == 0, NULL::Variant(String, UInt64), number % 3 == 1, 'Hello'::Variant(String, UInt64), number::Variant(String, UInt64)), 'Nullable(String)') from numbers(6); +select CAST(multiIf(number == 1, NULL::Variant(String, UInt64), number == 2, 'Hello'::Variant(String, UInt64), number::Variant(String, UInt64)), 'UInt64') from numbers(6); -- {serverError CANNOT_PARSE_TEXT} + + +select number::Variant(UInt64)::Variant(String, UInt64)::Variant(Array(String), String, UInt64) from numbers(2); +select 'str'::Variant(String, UInt64)::Variant(String, Array(UInt64)); -- {serverError CANNOT_CONVERT_TYPE} diff --git a/tests/queries/0_stateless/02943_order_by_all.reference b/tests/queries/0_stateless/02943_order_by_all.reference index 48d828b6924..6eed33cc68d 100644 --- a/tests/queries/0_stateless/02943_order_by_all.reference +++ b/tests/queries/0_stateless/02943_order_by_all.reference @@ -82,3 +82,12 @@ B 3 10 D 1 20 A 2 30 C \N 40 +-- test SELECT * ORDER BY ALL with no "all" column in the SELECT clause +A 2 30 +B 3 10 +C \N 40 +D 1 20 +A 2 30 +B 3 10 +C \N 40 +D 1 20 diff --git a/tests/queries/0_stateless/02943_order_by_all.sql b/tests/queries/0_stateless/02943_order_by_all.sql index 0756563946c..0960d75ad96 100644 --- a/tests/queries/0_stateless/02943_order_by_all.sql +++ b/tests/queries/0_stateless/02943_order_by_all.sql @@ -87,3 +87,23 @@ SET allow_experimental_analyzer = 1; SELECT a, b, all FROM order_by_all ORDER BY all, a; DROP TABLE order_by_all; + +SELECT '-- test SELECT * ORDER BY ALL with no "all" column in the SELECT clause'; + +CREATE TABLE order_by_all +( + a String, + b Nullable(Int32), + c UInt64, +) + ENGINE = Memory; + +INSERT INTO order_by_all VALUES ('B', 3, 10), ('C', NULL, 40), ('D', 1, 20), ('A', 2, 30); + +SET allow_experimental_analyzer = 0; +SELECT * FROM order_by_all ORDER BY ALL; + +SET allow_experimental_analyzer = 1; +SELECT * FROM order_by_all ORDER BY ALL; + +DROP TABLE order_by_all; diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference index 702e1261186..08310b7cf27 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference @@ -1,2 +1,12 @@ +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 45 1 processed 99 0 diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql index b8cf73da42d..9b1b872ae40 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql @@ -2,18 +2,21 @@ DROP TABLE IF EXISTS t; CREATE TABLE t ( - `n` int + `n` int, + `__unused_group_by_column` int ) - ENGINE = MergeTree - ORDER BY n AS -SELECT * +ENGINE = MergeTree +ORDER BY n AS +SELECT number, number FROM numbers(10); SELECT sum(n), - 1 AS x + __unused_group_by_column FROM t -GROUP BY x; +GROUP BY __unused_group_by_column ORDER BY __unused_group_by_column; + +SELECT sum(n), 1 as x from t group by x; SELECT 'processed' AS type, diff --git 
a/tests/queries/0_stateless/02943_variant_element.reference b/tests/queries/0_stateless/02943_variant_element.reference new file mode 100644 index 00000000000..ab8aaa8fdef --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_element.reference @@ -0,0 +1,44 @@ +\N +\N +\N +\N +0 +1 +2 +3 +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +[] +[[0]] +[[NULL]] +[[2]] +[[NULL]] diff --git a/tests/queries/0_stateless/02943_variant_element.sql b/tests/queries/0_stateless/02943_variant_element.sql new file mode 100644 index 00000000000..556c0147e56 --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_element.sql @@ -0,0 +1,16 @@ +set allow_experimental_variant_type=1; +set use_variant_as_common_type=1; + +select variantElement(NULL::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement(number::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement(number::Variant(String, UInt64), 'String') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(String, UInt64), 'String') from numbers(4); +select variantElement((number % 2 ? NULL : 'str_' || toString(number))::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64), 'LowCardinality(String)') from numbers(4); +select variantElement(NULL::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64), 'LowCardinality(String)') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(Array(UInt64), UInt64), 'Array(UInt64)') from numbers(4); +select variantElement(NULL::Variant(Array(UInt64), UInt64), 'Array(UInt64)') from numbers(4); +select variantElement(number % 2 ? NULL : range(number + 1), 'Array(UInt64)') from numbers(4); + +select variantElement([[(number % 2 ? NULL : number)::Variant(String, UInt64)]], 'UInt64') from numbers(4); + diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns.reference new file mode 100644 index 00000000000..4b93782cddf --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns.reference @@ -0,0 +1,6 @@ +Memory +test +MergeTree compact +test +MergeTree wide +test diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns.sh new file mode 100755 index 00000000000..88be09c2036 --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 " + + +function test() +{ + echo "test" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(1000000) settings min_insert_block_size_rows=100000" + $CH_CLIENT -q "select v, v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64, v.\`Array(Variant(String, UInt64))\`.String from test order by id format Null" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=Memory" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference new file mode 100644 index 00000000000..1736a307c42 --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference @@ -0,0 +1,244 @@ +Memory +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +2500000 +750000 +1750000 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- 
+test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +2500000 +750000 +1750000 +----------------------------------------------------------------------------------------------------------- +test2 select +2500000 +750000 +1750000 +----------------------------------------------------------------------------------------------------------- +MergeTree wide +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +2500000 +750000 +1750000 +----------------------------------------------------------------------------------------------------------- +test2 select +2500000 +750000 +1750000 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh new file mode 100755 index 00000000000..e4c1206263f --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# Tags: long, no-debug + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 " + + +function test1_insert() +{ + echo "test1 insert" + $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(10, 10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(20, 10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number < 35, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(30, 10) settings max_block_size=3" +} + +function test1_select() +{ + echo "test1 select" + $CH_CLIENT -q "select v, v.String, v.UInt64 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test2_insert() +{ + echo "test2 insert" + $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(1000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(2000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number < 3500000, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(3000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" +} + +function test2_select() +{ + echo "test2 select" + $CH_CLIENT -q "select v, v.String, v.UInt64 from test format Null;" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test1_insert + test1_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test1_select + fi + $CH_CLIENT -q "truncate table test;" + test2_insert + test2_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test2_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=Memory;" +run 0 
+$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000, index_granularity = 8192, index_granularity_bytes = '10Mi';" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, index_granularity = 8192, index_granularity_bytes = '10Mi';" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02944_variant_as_common_type.reference b/tests/queries/0_stateless/02944_variant_as_common_type.reference new file mode 100644 index 00000000000..0425a8cfa30 --- /dev/null +++ b/tests/queries/0_stateless/02944_variant_as_common_type.reference @@ -0,0 +1,103 @@ +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +String str_0 +String str_1 +String str_2 +String str_3 +Nullable(String) str_0 +Nullable(String) str_1 +Nullable(String) str_2 +Nullable(String) str_3 +Array(UInt64) [0] +Array(UInt64) [0,1] +Array(UInt64) [0,1,2] +Array(UInt64) [0,1,2,3] +Array(UInt64) [0] +Array(UInt64) [0,1] +Array(UInt64) [0,1,2] +Array(UInt64) [0,1,2,3] +String str_0 +String str_1 +String str_2 +String str_3 +Nullable(String) str_0 +Nullable(String) str_1 +Nullable(String) str_2 +Nullable(String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) str_1 +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) str_1 +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] 
+Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Array(Variant(String, UInt8)) [1,'str_1',2,'str_2'] +Array(Variant(Array(String), Array(UInt8))) [[1,2,3],['str_1','str_2','str_3']] +Array(Variant(Array(UInt8), Array(Variant(Array(String), Array(UInt8))))) [[[1,2,3],['str_1','str_2','str_3']],[1,2,3]] +Array(Variant(Array(Array(UInt8)), Array(UInt8))) [[1,2,3],[[1,2,3]]] +Map(String, Variant(String, UInt8)) {'a':1,'b':'str_1'} +Map(String, Variant(Map(String, Variant(String, UInt8)), UInt8)) {'a':1,'b':{'c':2,'d':'str_1'}} +Map(String, Variant(Array(Array(UInt8)), Array(UInt8), UInt8)) {'a':1,'b':[1,2,3],'c':[[4,5,6]]} diff --git a/tests/queries/0_stateless/02944_variant_as_common_type.sql b/tests/queries/0_stateless/02944_variant_as_common_type.sql new file mode 100644 index 00000000000..e985cf365dd --- /dev/null +++ b/tests/queries/0_stateless/02944_variant_as_common_type.sql @@ -0,0 +1,76 @@ +set allow_experimental_analyzer=0; -- The result type for if function with constant is different with analyzer. It wil be fixed after refactoring around constants in analyzer. + +set allow_experimental_variant_type=1; +set use_variant_as_common_type=1; + +select toTypeName(res), if(1, [1,2,3], 'str_1') as res; +select toTypeName(res), if(1, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(0, [1,2,3], 'str_1') as res; +select toTypeName(res), if(0, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(NULL, [1,2,3], 'str_1') as res; +select toTypeName(res), if(NULL, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], 'str_1') as res; +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(1, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(1, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(0, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(0, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(NULL, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(NULL, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(1, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(1, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(0, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(0, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(NULL, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(NULL, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], materialize('str_1')) as res; +select toTypeName(res), 
if(materialize(NULL::Nullable(UInt8)), [1,2,3], materialize('str_1')::Nullable(String)) as res; + + +select toTypeName(res), if(0, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(0, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(1, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(1, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(NULL, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(NULL, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(number % 2, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::LowCardinality(String)) as res from numbers(4); +select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::LowCardinality(Nullable(String))) as res from numbers(4); + + +select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, 'str_' || toString(number)) as res from numbers(6); +select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::Nullable(String)) as res from numbers(6); +select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::LowCardinality(String)) as res from numbers(6); +select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::LowCardinality(Nullable(String))) as res from numbers(6); + + +select toTypeName(res), array(1, 'str_1', 2, 'str_2') as res; +select toTypeName(res), array([1, 2, 3], ['str_1', 'str_2', 'str_3']) as res; +select toTypeName(res), array(array([1, 2, 3], ['str_1', 'str_2', 'str_3']), [1, 2, 3]) as res; +select toTypeName(res), array([1, 2, 3], [[1, 2, 3]]) as res; + +select toTypeName(res), map('a', 1, 'b', 'str_1') as res; +select toTypeName(res), map('a', 1, 'b', map('c', 2, 'd', 'str_1')) as res; +select toTypeName(res), map('a', 1, 'b', [1, 2, 3], 'c', [[4, 5, 6]]) as res; + diff --git a/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference new file mode 100644 index 00000000000..461075e9607 --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference @@ -0,0 +1,45 @@ +DEFAULT expressions +-- Compact parts +Before materialize +1 1 +2 54321 +After materialize +1 1 +2 54321 +-- Wide parts +Before materialize +1 1 +2 54321 +After materialize +1 1 +2 54321 +-- Nullable column != physically absent +Before materialize +1 1 +2 \N +3 54321 +After materialize +1 1 +2 \N +3 54321 +-- Parts with renamed column +Before materialize +1 1 +2 54321 +After 
rename +1 1 +2 54321 +After materialize +1 1 +2 54321 +MATERIALIZED expressions +-- Compact parts +Before materialize +1 54321 +After materialize +1 65432 +-- Compact parts +Before materialize +1 54321 +After materialize +1 65432 diff --git a/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql new file mode 100644 index 00000000000..cfdde287712 --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql @@ -0,0 +1,85 @@ +SET mutations_sync = 2; + +DROP TABLE IF EXISTS tab; + +-- Tests that existing parts which contain a non-default value in columns with DEFAULT expression remain unchanged by MATERIALIZE COLUMN> +SELECT 'DEFAULT expressions'; + +SELECT '-- Compact parts'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id, dflt) VALUES (1, 1); +INSERT INTO tab (id) VALUES (2); +SELECT 'Before materialize'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Wide parts'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id, dflt) VALUES (1, 1); +INSERT INTO tab (id) VALUES (2); +SELECT 'Before materialize'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Nullable column != physically absent'; + +CREATE TABLE tab (id Int64, dflt Nullable(Int64) DEFAULT 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id, dflt) VALUES (1, 1); +INSERT INTO tab (id, dflt) VALUES (2, NULL); +INSERT INTO tab (id) VALUES (3); +SELECT 'Before materialize'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Parts with renamed column'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id, dflt) VALUES (1, 1); +INSERT INTO tab (id) VALUES (2); +SELECT 'Before materialize'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab RENAME COLUMN dflt TO dflt2; +SELECT 'After rename'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt2; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +-- But for columns with MATERIALIZED expression, all existing parts should be rewritten in case a new expression was set in the meantime. 
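+-- Illustrative sketch of the DEFAULT vs MATERIALIZED distinction described above (commented out, not part
+-- of the test's checks; it assumes mutations_sync = 2 as set at the top of this file):
+--   CREATE TABLE t (id Int64, d Int64 DEFAULT 54321, m Int64 MATERIALIZED 54321) ENGINE = MergeTree ORDER BY id;
+--   INSERT INTO t (id, d) VALUES (1, 1);
+--   ALTER TABLE t MODIFY COLUMN m Int64 MATERIALIZED 65432;
+--   ALTER TABLE t MATERIALIZE COLUMN d;  -- d keeps the explicitly inserted value 1
+--   ALTER TABLE t MATERIALIZE COLUMN m;  -- m is rewritten with the current expression and becomes 65432
+--   SELECT id, d, m FROM t;              -- expected, per the checks in this test: 1  1  65432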
+SELECT 'MATERIALIZED expressions'; + +SELECT '-- Compact parts'; + +CREATE TABLE tab (id Int64, mtrl Int64 MATERIALIZED 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id) VALUES (1); +SELECT 'Before materialize'; +SELECT id, mtrl FROM tab ORDER BY id; +ALTER TABLE tab MODIFY COLUMN mtrl Int64 MATERIALIZED 65432; +ALTER TABLE tab MATERIALIZE COLUMN mtrl; +SELECT 'After materialize'; +SELECT id, mtrl FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Wide parts'; + +CREATE TABLE tab (id Int64, mtrl Int64 MATERIALIZED 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id) VALUES (1); +SELECT 'Before materialize'; +SELECT id, mtrl FROM tab ORDER BY id; +ALTER TABLE tab MODIFY COLUMN mtrl Int64 MATERIALIZED 65432; +ALTER TABLE tab MATERIALIZE COLUMN mtrl; +SELECT 'After materialize'; +SELECT id, mtrl FROM tab ORDER BY id; +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.reference b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference new file mode 100644 index 00000000000..cf10427e9b3 --- /dev/null +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference @@ -0,0 +1,6 @@ +Local situation +Initial Query Difference: 1 +Query Difference: 1 +Distributed situation +Initial Query Difference: 1 +Query Difference: 3 diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh new file mode 100755 index 00000000000..7f690a681c4 --- /dev/null +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Tags: no-parallel, shard + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh +# CREATE TABLE local (x UInt8) Engine=Memory; +# CREATE TABLE distributed ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), x) +$CLICKHOUSE_CLIENT -n -q " +DROP TABLE IF EXISTS local; +DROP TABLE IF EXISTS distributed; +CREATE TABLE local (x UInt8) Engine=Memory; +CREATE TABLE distributed AS local ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), local, x); +INSERT INTO distributed SELECT number FROM numbers(10); +SYSTEM FLUSH DISTRIBUTED distributed; +" +echo "Local situation" +# before SELECT * FROM local +query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Execute SELECT * FROM local +$CLICKHOUSE_CLIENT -q "SELECT * FROM local" > /dev/null + +# Counts after SELECT * FROM local +After_query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +After_query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Calculate the differences +Initial_query_diff=$(($After_query_countI-$query_countI-2)) +query_diff=$(($After_query_countQ-$query_countQ-2)) + +echo "Initial Query Difference: $Initial_query_diff" +echo "Query Difference: $query_diff" +echo "Distributed situation" + +# before SELECT * FROM distributed +query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Execute SELECT * FROM distributed +$CLICKHOUSE_CLIENT -q "SELECT * FROM distributed SETTINGS prefer_localhost_replica = 0" > /dev/null + +# Counts after SELECT * FROM distributed +After_query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +After_query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Calculate the differences +Initial_query_diff=$(($After_query_countI-$query_countI-2)) +query_diff=$(($After_query_countQ-$query_countQ-2)) + +echo "Initial Query Difference: $Initial_query_diff" +echo "Query Difference: $query_diff" diff --git a/tests/queries/0_stateless/02966_s3_access_key_id_restriction.sql b/tests/queries/0_stateless/02966_s3_access_key_id_restriction.sql deleted file mode 100644 index c1ca0b4bcd5..00000000000 --- a/tests/queries/0_stateless/02966_s3_access_key_id_restriction.sql +++ /dev/null @@ -1,6 +0,0 @@ --- Tags: no-fasttest - -select * from s3('http://localhost:11111/test/a.tsv', '\ninjection\n', 'admin'); -- { serverError 36 } -select * from deltaLake('http://localhost:11111/test/a.tsv', '\ninjection\n', 'admin'); -- { serverError 36 } -select * from hudi('http://localhost:11111/test/a.tsv', '\ninjection\n', 'admin'); -- { serverError 36 } -select * from iceberg('http://localhost:11111/test/a.tsv', '\ninjection\n', 'admin'); -- { serverError 36 } diff --git a/tests/queries/0_stateless/02967_mysql_settings_override.reference b/tests/queries/0_stateless/02967_mysql_settings_override.reference new file mode 100644 index 00000000000..96cf7ecc403 --- /dev/null +++ b/tests/queries/0_stateless/02967_mysql_settings_override.reference @@ -0,0 +1,23 @@ +-- Init +s +a +b +c +d +-- Uppercase setting name +s +a +b +name value +send_timeout 22 +name value +receive_timeout 33 +-- Lowercase setting name +s +a +b +c +name value +send_timeout 55 +name value 
+receive_timeout 66 diff --git a/tests/queries/0_stateless/02967_mysql_settings_override.sh b/tests/queries/0_stateless/02967_mysql_settings_override.sh new file mode 100755 index 00000000000..59a2099190a --- /dev/null +++ b/tests/queries/0_stateless/02967_mysql_settings_override.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Tag no-fasttest: requires mysql client + +# Tests that certain MySQL-proprietary settings are mapped to ClickHouse-native settings. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +CHANGED_SETTINGS_QUERY="SELECT name, value FROM system.settings WHERE name IN ('send_timeout', 'receive_timeout') AND changed;" + +TEST_TABLE="mysql_settings_override_test" + +DROP_TABLE="DROP TABLE IF EXISTS $TEST_TABLE;" +CREATE_TABLE="CREATE TABLE $TEST_TABLE (s String) ENGINE MergeTree ORDER BY s;" +INSERT_STMT="INSERT INTO $TEST_TABLE VALUES ('a'), ('b'), ('c'), ('d');" +SELECT_STMT="SELECT * FROM $TEST_TABLE ORDER BY s;" + +echo "-- Init" +${MYSQL_CLIENT} --execute "$DROP_TABLE $CREATE_TABLE $INSERT_STMT $SELECT_STMT" # should fetch all 4 records + +echo "-- Uppercase setting name" +${MYSQL_CLIENT} --execute "SET SQL_SELECT_LIMIT = 2; $SELECT_STMT" # should fetch 2 records out of 4 +${MYSQL_CLIENT} --execute "SET NET_WRITE_TIMEOUT = 22; $CHANGED_SETTINGS_QUERY" +${MYSQL_CLIENT} --execute "SET NET_READ_TIMEOUT = 33; $CHANGED_SETTINGS_QUERY" + +echo "-- Lowercase setting name" +${MYSQL_CLIENT} --execute "set sql_select_limit=3; $SELECT_STMT" # should fetch 3 records out of 4 +${MYSQL_CLIENT} --execute "set net_write_timeout=55; $CHANGED_SETTINGS_QUERY" +${MYSQL_CLIENT} --execute "set net_read_timeout=66; $CHANGED_SETTINGS_QUERY" + +${MYSQL_CLIENT} --execute "$DROP_TABLE" diff --git a/tests/queries/0_stateless/02967_parallel_replicas_join_algo_and_analyzer.reference b/tests/queries/0_stateless/02967_parallel_replicas_join_algo_and_analyzer.reference new file mode 100644 index 00000000000..fa343571ba0 --- /dev/null +++ b/tests/queries/0_stateless/02967_parallel_replicas_join_algo_and_analyzer.reference @@ -0,0 +1,177 @@ + +simple join with analyzer +4200000 4200000 4200000 -1400000 +4200006 4200006 4200006 -1400002 +4200012 4200012 4200012 -1400004 +4200018 4200018 4200018 -1400006 +4200024 4200024 4200024 -1400008 +4200030 4200030 4200030 -1400010 +4200036 4200036 4200036 -1400012 +4200042 4200042 4200042 -1400014 +4200048 4200048 4200048 -1400016 +4200054 4200054 4200054 -1400018 + +simple (global) join with analyzer and parallel replicas +4200000 4200000 4200000 -1400000 +4200006 4200006 4200006 -1400002 +4200012 4200012 4200012 -1400004 +4200018 4200018 4200018 -1400006 +4200024 4200024 4200024 -1400008 +4200030 4200030 4200030 -1400010 +4200036 4200036 4200036 -1400012 +4200042 4200042 4200042 -1400014 +4200048 4200048 4200048 -1400016 +4200054 4200054 4200054 -1400018 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` (stage: WithMergeableState) + DefaultCoordinator: Coordination done +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = 
`__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(700000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, allow_experimental_parallel_reading_from_replicas = 2, send_logs_level = 'trace', max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 0 (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(700000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, allow_experimental_parallel_reading_from_replicas = 2, send_logs_level = 'trace', max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 0 (stage: WithMergeableState) + DefaultCoordinator: Coordination done + +simple (local) join with analyzer and parallel replicas +4200000 4200000 4200000 -1400000 +4200006 4200006 4200006 -1400002 +4200012 4200012 4200012 -1400004 +4200018 4200018 4200018 -1400006 +4200024 4200024 4200024 -1400008 +4200030 4200030 4200030 -1400010 +4200036 4200036 4200036 -1400012 +4200042 4200042 4200042 -1400014 +4200048 4200048 4200048 -1400016 +4200054 4200054 4200054 -1400018 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` ALL INNER JOIN (SELECT `__table4`.`key` AS `key`, `__table4`.`value` AS `value` FROM `default`.`num_2` AS `__table4`) AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(700000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 1 (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` ALL INNER JOIN (SELECT `__table4`.`key` AS `key`, `__table4`.`value` AS `value` FROM `default`.`num_2` AS `__table4`) AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(700000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 1 (stage: WithMergeableState) + DefaultCoordinator: Coordination done + +simple (local) join with analyzer and parallel replicas and full sorting merge join +4200000 4200000 4200000 -1400000 +4200006 4200006 
4200006 -1400002 +4200012 4200012 4200012 -1400004 +4200018 4200018 4200018 -1400006 +4200024 4200024 4200024 -1400008 +4200030 4200030 4200030 -1400010 +4200036 4200036 4200036 -1400012 +4200042 4200042 4200042 -1400014 +4200048 4200048 4200048 -1400016 +4200054 4200054 4200054 -1400018 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` ALL INNER JOIN (SELECT `__table4`.`key` AS `key`, `__table4`.`value` AS `value` FROM `default`.`num_2` AS `__table4`) AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(700000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, join_algorithm = 'full_sorting_merge', send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 1 (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` ALL INNER JOIN (SELECT `__table4`.`key` AS `key`, `__table4`.`value` AS `value` FROM `default`.`num_2` AS `__table4`) AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(700000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, join_algorithm = 'full_sorting_merge', send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 1 (stage: WithMergeableState) + WithOrderCoordinator: Coordination done + +nested join with analyzer +420000 420000 420000 -140000 +420042 420042 420042 -140014 +420084 420084 420084 -140028 +420126 420126 420126 -140042 +420168 420168 420168 -140056 +420210 420210 420210 -140070 +420252 420252 420252 -140084 +420294 420294 420294 -140098 +420336 420336 420336 -140112 +420378 420378 420378 -140126 + +nested join with analyzer and parallel replicas, both local +420000 420000 420000 -140000 +420042 420042 420042 -140014 +420084 420084 420084 -140028 +420126 420126 420126 -140042 +420168 420168 420168 -140056 +420210 420210 420210 -140070 +420252 420252 420252 -140084 +420294 420294 420294 -140098 +420336 420336 420336 -140112 +420378 420378 420378 -140126 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` ALL INNER JOIN (SELECT `__table4`.`key` AS `key`, `__table4`.`value` AS `value` FROM `default`.`num_2` AS `__table4` ALL INNER JOIN (SELECT `__table6`.`number` * 7 AS `key` FROM numbers(100000.) 
AS `__table6`) AS `__table5` ON `__table4`.`key` = `__table5`.`key` SETTINGS parallel_replicas_prefer_local_join = 1) AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, join_algorithm = 'full_sorting_merge', send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 1 (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` ALL INNER JOIN (SELECT `__table4`.`key` AS `key`, `__table4`.`value` AS `value` FROM `default`.`num_2` AS `__table4` ALL INNER JOIN (SELECT `__table6`.`number` * 7 AS `key` FROM numbers(100000.) AS `__table6`) AS `__table5` ON `__table4`.`key` = `__table5`.`key` SETTINGS parallel_replicas_prefer_local_join = 1) AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, join_algorithm = 'full_sorting_merge', send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 1 (stage: WithMergeableState) + WithOrderCoordinator: Coordination done + +nested join with analyzer and parallel replicas, both global +420000 420000 420000 -140000 +420042 420042 420042 -140014 +420084 420084 420084 -140028 +420126 420126 420126 -140042 +420168 420168 420168 -140056 +420210 420210 420210 -140070 +420252 420252 420252 -140084 +420294 420294 420294 -140098 +420336 420336 420336 -140112 +420378 420378 420378 -140126 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS parallel_replicas_prefer_local_join = 0 (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS parallel_replicas_prefer_local_join = 0 (stage: WithMergeableState) + DefaultCoordinator: Coordination done +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 0 (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, 
`__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 0 (stage: WithMergeableState) + DefaultCoordinator: Coordination done + +nested join with analyzer and parallel replicas, global + local +420000 420000 420000 -140000 +420042 420042 420042 -140014 +420084 420084 420084 -140028 +420126 420126 420126 -140042 +420168 420168 420168 -140056 +420210 420210 420210 -140070 +420252 420252 420252 -140084 +420294 420294 420294 -140098 +420336 420336 420336 -140112 +420378 420378 420378 -140126 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` ALL INNER JOIN (SELECT `__table3`.`number` * 7 AS `key` FROM numbers(100000.) AS `__table3`) AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS parallel_replicas_prefer_local_join = 1 (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` ALL INNER JOIN (SELECT `__table3`.`number` * 7 AS `key` FROM numbers(100000.) AS `__table3`) AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS parallel_replicas_prefer_local_join = 1 (stage: WithMergeableState) + DefaultCoordinator: Coordination done +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 0 (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join = 0 (stage: WithMergeableState) + DefaultCoordinator: Coordination done + +nested join with analyzer and parallel replicas, both local, both full sorting merge join +420000 420000 420000 -140000 +420042 420042 420042 -140014 +420084 420084 420084 
-140028 +420126 420126 420126 -140042 +420168 420168 420168 -140056 +420210 420210 420210 -140070 +420252 420252 420252 -140084 +420294 420294 420294 -140098 +420336 420336 420336 -140112 +420378 420378 420378 -140126 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS join_algorithm = 'full_sorting_merge' (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS join_algorithm = 'full_sorting_merge' (stage: WithMergeableState) + WithOrderCoordinator: Coordination done +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, parallel_replicas_prefer_local_join = 0, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm = 'full_sorting_merge' (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, parallel_replicas_prefer_local_join = 0, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm = 'full_sorting_merge' (stage: WithMergeableState) + WithOrderCoordinator: Coordination done + +nested join with analyzer and parallel replicas, both local, both full sorting and hash join +420000 420000 420000 -140000 +420042 420042 420042 -140014 +420084 420084 420084 -140028 +420126 420126 420126 -140042 +420168 420168 420168 -140056 +420210 420210 420210 -140070 +420252 420252 420252 -140084 +420294 420294 420294 -140098 +420336 420336 420336 -140112 +420378 420378 420378 -140126 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS join_algorithm = 'hash' (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS join_algorithm = 'hash' (stage: WithMergeableState) + DefaultCoordinator: Coordination done +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` 
AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, parallel_replicas_prefer_local_join = 0, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm = 'full_sorting_merge' (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, parallel_replicas_prefer_local_join = 0, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm = 'full_sorting_merge' (stage: WithMergeableState) + WithOrderCoordinator: Coordination done + +nested join with analyzer and parallel replicas, both local, both full sorting and hash join +420000 420000 420000 -140000 +420042 420042 420042 -140014 +420084 420084 420084 -140028 +420126 420126 420126 -140042 +420168 420168 420168 -140056 +420210 420210 420210 -140070 +420252 420252 420252 -140084 +420294 420294 420294 -140098 +420336 420336 420336 -140112 +420378 420378 420378 -140126 +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS join_algorithm = 'full_sorting_merge' (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value` FROM `default`.`num_2` AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table2` ON `__table1`.`key` = `__table2`.`key` SETTINGS join_algorithm = 'full_sorting_merge' (stage: WithMergeableState) + WithOrderCoordinator: Coordination done +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = `__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, parallel_replicas_prefer_local_join = 0, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm = 'hash' (stage: WithMergeableState) +SELECT `__table1`.`key` AS `key`, `__table1`.`value` AS `value`, `__table3`.`key` AS `r.key`, `__table3`.`value` AS `r.value` FROM (SELECT `__table2`.`key` AS `key`, `__table2`.`value` AS `value` FROM `default`.`num_1` AS `__table2`) AS `__table1` GLOBAL ALL INNER JOIN `_data_` AS `__table3` ON `__table1`.`key` = 
`__table3`.`key` ORDER BY `__table1`.`key` ASC LIMIT _CAST(10000, 'UInt64'), _CAST(10, 'UInt64') SETTINGS allow_experimental_analyzer = 1, parallel_replicas_prefer_local_join = 0, send_logs_level = 'trace', allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm = 'hash' (stage: WithMergeableState) + DefaultCoordinator: Coordination done diff --git a/tests/queries/0_stateless/02967_parallel_replicas_join_algo_and_analyzer.sh b/tests/queries/0_stateless/02967_parallel_replicas_join_algo_and_analyzer.sh new file mode 100755 index 00000000000..2840482da6d --- /dev/null +++ b/tests/queries/0_stateless/02967_parallel_replicas_join_algo_and_analyzer.sh @@ -0,0 +1,263 @@ +#!/usr/bin/env bash +# Tags: long, no-random-settings, no-random-merge-tree-settings + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -nm -q " +drop table if exists num_1; +drop table if exists num_2; + +create table num_1 (key UInt64, value String) engine = MergeTree order by key; +create table num_2 (key UInt64, value Int64) engine = MergeTree order by key; + +insert into num_1 select number * 2, toString(number * 2) from numbers(1e7); +insert into num_2 select number * 3, -number from numbers(1.5e6); +" + +############## +echo +echo "simple join with analyzer" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2) r on l.key = r.key +order by l.key limit 10 offset 700000 +SETTINGS allow_experimental_analyzer=1" + +############## +echo +echo "simple (global) join with analyzer and parallel replicas" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2) r on l.key = r.key +order by l.key limit 10 offset 700000 +SETTINGS allow_experimental_analyzer=1, allow_experimental_parallel_reading_from_replicas = 2, +max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=0" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2) r on l.key = r.key +order by l.key limit 10 offset 700000 +SETTINGS allow_experimental_analyzer=1, allow_experimental_parallel_reading_from_replicas = 2, send_logs_level='trace', +max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=0" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' + +############## +echo +echo "simple (local) join with analyzer and parallel replicas" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2) r on l.key = r.key +order by l.key limit 10 offset 700000 +SETTINGS allow_experimental_analyzer=1, +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', 
parallel_replicas_prefer_local_join=1" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2) r on l.key = r.key +order by l.key limit 10 offset 700000 +SETTINGS allow_experimental_analyzer=1, send_logs_level='trace', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=1" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' + + +############## +echo +echo "simple (local) join with analyzer and parallel replicas and full sorting merge join" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2) r on l.key = r.key +order by l.key limit 10 offset 700000 +SETTINGS allow_experimental_analyzer=1, join_algorithm='full_sorting_merge', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=1" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2) r on l.key = r.key +order by l.key limit 10 offset 700000 +SETTINGS allow_experimental_analyzer=1, join_algorithm='full_sorting_merge', send_logs_level='trace', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=1" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' + + +############## +echo +echo "nested join with analyzer" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings parallel_replicas_prefer_local_join=1) r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1" + + +############## +echo +echo "nested join with analyzer and parallel replicas, both local" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings parallel_replicas_prefer_local_join=1) r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=1" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings parallel_replicas_prefer_local_join=1) r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, 
join_algorithm='full_sorting_merge', send_logs_level='trace', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=1" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' + + +############## +echo +echo "nested join with analyzer and parallel replicas, both global" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings parallel_replicas_prefer_local_join=0) r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=0" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings parallel_replicas_prefer_local_join=0) r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, send_logs_level='trace', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=0" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' + +############## +echo +echo "nested join with analyzer and parallel replicas, global + local" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings parallel_replicas_prefer_local_join=1) r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=0" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings parallel_replicas_prefer_local_join=1) r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, send_logs_level='trace', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_prefer_local_join=0" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 
's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' + + +############## +echo +echo "nested join with analyzer and parallel replicas, both local, both full sorting merge join" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings join_algorithm='full_sorting_merge') r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, parallel_replicas_prefer_local_join=0, +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm='full_sorting_merge'" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings join_algorithm='full_sorting_merge') r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, parallel_replicas_prefer_local_join=0, send_logs_level='trace', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm='full_sorting_merge'" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' + +############## +echo +echo "nested join with analyzer and parallel replicas, both local, both full sorting and hash join" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings join_algorithm='hash') r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, parallel_replicas_prefer_local_join=0, +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm='full_sorting_merge'" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings join_algorithm='hash') r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, parallel_replicas_prefer_local_join=0, send_logs_level='trace', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm='full_sorting_merge'" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' + +############## +echo +echo "nested join with analyzer and parallel replicas, both local, both full sorting and hash join" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key 
from numbers(1e5)) as nn on num_2.key = nn.key settings join_algorithm='full_sorting_merge') r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, parallel_replicas_prefer_local_join=0, +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm='hash'" + +$CLICKHOUSE_CLIENT -q " +select * from (select key, value from num_1) l +inner join (select key, value from num_2 inner join + (select number * 7 as key from numbers(1e5)) as nn on num_2.key = nn.key settings join_algorithm='full_sorting_merge') r +on l.key = r.key order by l.key limit 10 offset 10000 +SETTINGS allow_experimental_analyzer=1, parallel_replicas_prefer_local_join=0, send_logs_level='trace', +allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, +cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', join_algorithm='hash'" 2>&1 | +grep "executeQuery\|.*Coordinator: Coordination done" | +grep -o "SELECT.*WithMergeableState)\|.*Coordinator: Coordination done" | +sed -re 's/_data_[[:digit:]]+_[[:digit:]]+/_data_/g' diff --git a/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.reference b/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.reference new file mode 100644 index 00000000000..6b1fdfd42a2 --- /dev/null +++ b/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.reference @@ -0,0 +1,502 @@ +-- { echoOn } + +set parallel_replicas_prefer_local_join = 0; +-- A query with only INNER/LEFT joins is fully send to replicas. JOIN is executed in GLOBAL mode. 
+select x, y, r.y, z, rr.z, a from (select l.x, l.y, r.y, r.z as z from (select x, y from tab1 where x != 2) l any left join (select y, z from tab2 where y != 4) r on l.y = r.y) ll any left join (select z, a from tab3 where z != 8) rr on ll.z = rr.z order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +1 1 0 0 0 0 +3 3 0 0 0 0 +4 4 0 0 0 0 +5 5 0 0 0 0 +6 6 6 6 0 0 +7 7 0 0 0 0 +8 8 8 8 0 0 +9 9 0 0 0 0 +10 10 10 10 0 0 +11 11 0 0 0 0 +12 12 12 12 12 12 +13 13 0 0 0 0 +14 14 14 14 0 0 +15 15 0 0 0 0 +explain description=0 select x, y, r.y, z, rr.z, a from (select l.x, l.y, r.y, r.z as z from (select x, y from tab1 where x != 2) l any left join (select y, z from tab2 where y != 4) r on l.y = r.y) ll any left join (select z, a from tab3 where z != 8) rr on ll.z = rr.z SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + ReadFromRemoteParallelReplicas +-- +-- The same query with cte; +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +1 1 0 0 0 0 +3 3 0 0 0 0 +4 4 0 0 0 0 +5 5 0 0 0 0 +6 6 6 6 0 0 +7 7 0 0 0 0 +8 8 8 8 0 0 +9 9 0 0 0 0 +10 10 10 10 0 0 +11 11 0 0 0 0 +12 12 12 12 12 12 +13 13 0 0 0 0 +14 14 14 14 0 0 +15 15 0 0 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + ReadFromRemoteParallelReplicas +-- +-- GROUP BY should work up to WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select sum(x), sum(y), sum(r.y), sum(z), sum(rr.z), sum(a), key from sub3 ll any left join sub4 rr on ll.z = rr.z group by x % 2 as key) +select * from sub5 order by key +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 
2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +54 54 50 50 12 12 0 +64 64 0 0 0 0 1 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select sum(x), sum(y), sum(r.y), sum(z), sum(rr.z), sum(a), key from sub3 ll any left join sub4 rr on ll.z = rr.z group by x % 2 as key) +select * from sub5 order by key +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + MergingAggregated + Expression + ReadFromRemoteParallelReplicas +-- +-- ORDER BY in sub3 : sub1 is fully pushed, sub3 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y order by l.x), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +1 1 0 0 0 0 +3 3 0 0 0 0 +4 4 0 0 0 0 +5 5 0 0 0 0 +6 6 6 6 0 0 +7 7 0 0 0 0 +8 8 8 8 0 0 +9 9 0 0 0 0 +10 10 10 10 0 0 +11 11 0 0 0 0 +12 12 12 12 12 12 +13 13 0 0 0 0 +14 14 14 14 0 0 +15 15 0 0 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y order by l.x), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas +-- +-- ORDER BY in sub1 : sub1 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2 order by y), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +1 
1 0 0 0 0 +3 3 0 0 0 0 +4 4 0 0 0 0 +5 5 0 0 0 0 +6 6 6 6 0 0 +7 7 0 0 0 0 +8 8 8 8 0 0 +9 9 0 0 0 0 +10 10 10 10 0 0 +11 11 0 0 0 0 +12 12 12 12 12 12 +13 13 0 0 0 0 +14 14 14 14 0 0 +15 15 0 0 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2 order by y), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + Join + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas +-- +-- RIGHT JOIN in sub3: sub3 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub2 r any right join sub1 l on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, l.y, y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +6 6 6 6 0 0 +8 8 8 8 0 0 +10 10 10 10 0 0 +12 12 12 12 12 12 +14 14 14 14 0 0 +4 4 0 0 0 0 +3 3 0 0 0 0 +5 5 0 0 0 0 +1 1 0 0 0 0 +7 7 0 0 0 0 +9 9 0 0 0 0 +15 15 0 0 0 0 +11 11 0 0 0 0 +13 13 0 0 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub2 r any right join sub1 l on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, l.y, y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Join + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas +-- +-- RIGHT JOIN in sub5: sub5 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select z, a, x, y, r.y, ll.z from sub4 rr any right join sub3 ll on ll.z = rr.z) +select * from sub5 order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +0 0 1 1 0 0 +0 0 3 3 0 0 
+0 0 4 4 0 0 +0 0 5 5 0 0 +0 0 6 6 6 6 +0 0 7 7 0 0 +0 0 8 8 8 8 +0 0 9 9 0 0 +0 0 10 10 10 10 +0 0 11 11 0 0 +12 12 12 12 12 12 +0 0 13 13 0 0 +0 0 14 14 14 14 +0 0 15 15 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select z, a, x, y, r.y, ll.z from sub4 rr any right join sub3 ll on ll.z = rr.z) +select * from sub5 order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1;-- { echoOn } +Expression + Sorting + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas +set parallel_replicas_prefer_local_join = 1; +-- A query with only INNER/LEFT joins is fully send to replicas. JOIN is executed in GLOBAL mode. +select x, y, r.y, z, rr.z, a from (select l.x, l.y, r.y, r.z as z from (select x, y from tab1 where x != 2) l any left join (select y, z from tab2 where y != 4) r on l.y = r.y) ll any left join (select z, a from tab3 where z != 8) rr on ll.z = rr.z order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +1 1 0 0 0 0 +3 3 0 0 0 0 +4 4 0 0 0 0 +5 5 0 0 0 0 +6 6 6 6 0 0 +7 7 0 0 0 0 +8 8 8 8 0 0 +9 9 0 0 0 0 +10 10 10 10 0 0 +11 11 0 0 0 0 +12 12 12 12 12 12 +13 13 0 0 0 0 +14 14 14 14 0 0 +15 15 0 0 0 0 +explain description=0 select x, y, r.y, z, rr.z, a from (select l.x, l.y, r.y, r.z as z from (select x, y from tab1 where x != 2) l any left join (select y, z from tab2 where y != 4) r on l.y = r.y) ll any left join (select z, a from tab3 where z != 8) rr on ll.z = rr.z SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + ReadFromRemoteParallelReplicas +-- +-- The same query with cte; +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +1 1 0 0 0 0 +3 3 0 0 0 0 +4 4 0 0 0 0 +5 5 0 0 0 0 +6 6 6 6 0 0 +7 7 0 0 0 0 +8 8 8 8 0 0 +9 9 0 0 0 0 +10 10 10 10 0 0 +11 11 0 0 0 0 +12 12 12 12 12 12 +13 13 0 0 0 0 +14 14 14 14 0 0 +15 15 0 0 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as 
(select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + ReadFromRemoteParallelReplicas +-- +-- GROUP BY should work up to WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select sum(x), sum(y), sum(r.y), sum(z), sum(rr.z), sum(a), key from sub3 ll any left join sub4 rr on ll.z = rr.z group by x % 2 as key) +select * from sub5 order by key +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +54 54 50 50 12 12 0 +64 64 0 0 0 0 1 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select sum(x), sum(y), sum(r.y), sum(z), sum(rr.z), sum(a), key from sub3 ll any left join sub4 rr on ll.z = rr.z group by x % 2 as key) +select * from sub5 order by key +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + MergingAggregated + Expression + ReadFromRemoteParallelReplicas +-- +-- ORDER BY in sub3 : sub1 is fully pushed, sub3 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y order by l.x), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +1 1 0 0 0 0 +3 3 0 0 0 0 +4 4 0 0 0 0 +5 5 0 0 0 0 +6 6 6 6 0 0 +7 7 0 0 0 0 +8 8 8 8 0 0 +9 9 0 0 0 0 +10 10 10 10 0 0 +11 11 0 0 0 0 +12 12 12 12 12 12 +13 13 0 0 0 0 +14 14 14 14 0 0 +15 15 0 0 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y order by l.x), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left 
join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas +-- +-- ORDER BY in sub1 : sub1 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2 order by y), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +1 1 0 0 0 0 +3 3 0 0 0 0 +4 4 0 0 0 0 +5 5 0 0 0 0 +6 6 6 6 0 0 +7 7 0 0 0 0 +8 8 8 8 0 0 +9 9 0 0 0 0 +10 10 10 10 0 0 +11 11 0 0 0 0 +12 12 12 12 12 12 +13 13 0 0 0 0 +14 14 14 14 0 0 +15 15 0 0 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2 order by y), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + Join + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas +-- +-- RIGHT JOIN in sub3: sub3 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub2 r any right join sub1 l on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, l.y, y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +6 6 6 6 0 0 +8 8 8 8 0 0 +10 10 10 10 0 0 +12 12 12 12 12 12 +14 14 14 14 0 0 +4 4 0 0 0 0 +3 3 0 0 0 0 +5 5 0 0 0 0 +1 1 0 0 0 0 +7 7 0 0 0 0 +9 9 0 0 0 0 +15 15 0 0 0 0 +11 11 0 0 0 0 +13 13 0 0 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub2 r any right join sub1 l on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, l.y, y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from 
sub5 +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Join + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas +-- +-- RIGHT JOIN in sub5: sub5 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select z, a, x, y, r.y, ll.z from sub4 rr any right join sub3 ll on ll.z = rr.z) +select * from sub5 order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +0 0 0 0 0 0 +0 0 1 1 0 0 +0 0 3 3 0 0 +0 0 4 4 0 0 +0 0 5 5 0 0 +0 0 6 6 6 6 +0 0 7 7 0 0 +0 0 8 8 8 8 +0 0 9 9 0 0 +0 0 10 10 10 10 +0 0 11 11 0 0 +12 12 12 12 12 12 +0 0 13 13 0 0 +0 0 14 14 14 14 +0 0 15 15 0 0 +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select z, a, x, y, r.y, ll.z from sub4 rr any right join sub3 ll on ll.z = rr.z) +select * from sub5 order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +Expression + Sorting + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + Join + Expression + ReadFromRemoteParallelReplicas + Expression + ReadFromRemoteParallelReplicas diff --git a/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.sql.j2 b/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.sql.j2 new file mode 100644 index 00000000000..7d2766d52f8 --- /dev/null +++ b/tests/queries/0_stateless/02967_parallel_replicas_joins_and_analyzer.sql.j2 @@ -0,0 +1,129 @@ +drop table if exists tab1; +drop table if exists tab2; +drop table if exists tab3; + +create table tab1 (x UInt32, y UInt32, shard UInt32) engine = MergeTree order by shard; +create table tab2 (y UInt32, z UInt32) engine = MergeTree order by tuple(); +create table tab3 (z UInt32, a UInt32) engine = MergeTree order by tuple(); + +insert into tab1 select number, number, number from numbers(16); +insert into tab2 select number * 2, number * 2 from numbers(8); +insert into tab3 select number * 4, number * 4 from numbers(4); + +{% for use_global_in in [0, 1] -%} + +-- { echoOn } + +set parallel_replicas_prefer_local_join = {{use_global_in}}; + +-- A query with only INNER/LEFT joins is fully send to replicas. JOIN is executed in GLOBAL mode. 
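-- A rough mental model for the cases below (hypothetical two-table sketch, not one of the queries in this file):
-- when every join is INNER or LEFT, the initiator can ship the whole query to the replicas, so EXPLAIN is
-- expected to collapse to a single ReadFromRemoteParallelReplicas read step, along the lines of
--   explain description=0
--   select * from t_l any left join t_r on t_l.k = t_r.k  -- t_l, t_r: placeholder MergeTree tables
--   settings allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2,
--            parallel_replicas_for_non_replicated_merge_tree = 1, allow_experimental_analyzer = 1,
--            cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost';
-- whereas a RIGHT JOIN, GROUP BY, or ORDER BY inside a subquery stops the pushdown at a WithMergableStage,
-- which is what the later cases in this file assert.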
+select x, y, r.y, z, rr.z, a from (select l.x, l.y, r.y, r.z as z from (select x, y from tab1 where x != 2) l any left join (select y, z from tab2 where y != 4) r on l.y = r.y) ll any left join (select z, a from tab3 where z != 8) rr on ll.z = rr.z order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +explain description=0 select x, y, r.y, z, rr.z, a from (select l.x, l.y, r.y, r.z as z from (select x, y from tab1 where x != 2) l any left join (select y, z from tab2 where y != 4) r on l.y = r.y) ll any left join (select z, a from tab3 where z != 8) rr on ll.z = rr.z SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +-- +-- The same query with cte; +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; + +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +-- +-- GROUP BY should work up to WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select sum(x), sum(y), sum(r.y), sum(z), sum(rr.z), sum(a), key from sub3 ll any left join sub4 rr on ll.z = rr.z group by x % 2 as key) +select * from sub5 order by key +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; + +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select sum(x), sum(y), sum(r.y), sum(z), sum(rr.z), sum(a), key from 
sub3 ll any left join sub4 rr on ll.z = rr.z group by x % 2 as key) +select * from sub5 order by key +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +-- +-- ORDER BY in sub3 : sub1 is fully pushed, sub3 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y order by l.x), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; + +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y order by l.x), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +-- +-- ORDER BY in sub1 : sub1 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2 order by y), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; + +explain description=0 +with sub1 as (select x, y from tab1 where x != 2 order by y), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, y, r.y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 order by x +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +-- +-- RIGHT JOIN in sub3: sub3 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub2 r any right join sub1 l on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, l.y, y, z, rr.z, a from sub3 ll any left join 
sub4 rr on ll.z = rr.z) +select * from sub5 +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; + +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub2 r any right join sub1 l on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select x, l.y, y, z, rr.z, a from sub3 ll any left join sub4 rr on ll.z = rr.z) +select * from sub5 +SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; +-- +-- RIGHT JOIN in sub5: sub5 -> WithMergableStage +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select z, a, x, y, r.y, ll.z from sub4 rr any right join sub3 ll on ll.z = rr.z) +select * from sub5 order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; + +explain description=0 +with sub1 as (select x, y from tab1 where x != 2), +sub2 as (select y, z from tab2 where y != 4), +sub3 as (select l.x, l.y, r.y, r.z as z from sub1 l any left join sub2 r on l.y = r.y), +sub4 as (select z, a from tab3 where z != 8), +sub5 as (select z, a, x, y, r.y, ll.z from sub4 rr any right join sub3 ll on ll.z = rr.z) +select * from sub5 order by x SETTINGS allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, prefer_localhost_replica = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', allow_experimental_analyzer=1; + +{%- endfor %} diff --git a/tests/queries/0_stateless/02968_adaptive_async_insert_timeout.reference b/tests/queries/0_stateless/02968_adaptive_async_insert_timeout.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02968_adaptive_async_insert_timeout.sql b/tests/queries/0_stateless/02968_adaptive_async_insert_timeout.sql new file mode 100644 index 00000000000..f9606cace6e --- /dev/null +++ b/tests/queries/0_stateless/02968_adaptive_async_insert_timeout.sql @@ -0,0 +1,51 @@ +DROP TABLE IF EXISTS async_insert_mt_test; +CREATE TABLE async_insert_mt_test (a UInt64, b Array(UInt64)) ENGINE=MergeTree() ORDER BY a; + +SET async_insert_use_adaptive_busy_timeout = 1; + +INSERT INTO async_insert_mt_test + SETTINGS + async_insert=1, + wait_for_async_insert=1, + async_insert_busy_timeout_min_ms=10, + async_insert_busy_timeout_max_ms=500, + async_insert_busy_timeout_increase_rate=1.0, + async_insert_busy_timeout_decrease_rate=1.0 + VALUES (3, []), (1, [1, 3]), (2, [7, 8]), (4, [5, 9]), (5, [2, 6]); + + +INSERT INTO async_insert_mt_test + SETTINGS + async_insert=1, + wait_for_async_insert=1, + async_insert_busy_timeout_ms=500, + 
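        -- (illustrative note: async_insert_busy_timeout_ms is assumed here to be the legacy name for the
        --  maximum busy timeout, so this statement pins the minimum and maximum to the same 500 ms, and the
        --  next statement pairs a smaller value with a larger minimum to check that the combination is accepted)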
async_insert_busy_timeout_min_ms=500 + VALUES (3, []), (1, [1, 3]), (2, [7, 8]), (4, [5, 9]), (5, [2, 6]); + + +INSERT INTO async_insert_mt_test + SETTINGS + async_insert=1, + wait_for_async_insert=1, + async_insert_busy_timeout_ms=100, + async_insert_busy_timeout_min_ms=500 + VALUES (3, []), (1, [1, 3]), (2, [7, 8]), (4, [5, 9]), (5, [2, 6]); + + +INSERT INTO async_insert_mt_test + SETTINGS + async_insert=1, + wait_for_async_insert=1, + async_insert_busy_timeout_increase_rate=-1.0 + VALUES (3, []), (1, [1, 3]), (2, [7, 8]), (4, [5, 9]), (5, [2, 6]); -- { serverError INVALID_SETTING_VALUE } + + +INSERT INTO async_insert_mt_test + SETTINGS + async_insert=1, + wait_for_async_insert=1, + async_insert_busy_timeout_decrease_rate=-1.0 + VALUES (3, []), (1, [1, 3]), (2, [7, 8]), (4, [5, 9]), (5, [2, 6]); -- { serverError INVALID_SETTING_VALUE } + + +DROP TABLE IF EXISTS async_insert_mt_test; diff --git a/tests/queries/0_stateless/02968_url_args.reference b/tests/queries/0_stateless/02968_url_args.reference index aa19e45301c..1c3693e4a66 100644 --- a/tests/queries/0_stateless/02968_url_args.reference +++ b/tests/queries/0_stateless/02968_url_args.reference @@ -1 +1,8 @@ -CREATE TABLE default.a\n(\n `x` Int64\n)\nENGINE = URL(\'https://example.com/\', \'CSV\', headers(\'foo\' = \'bar\')) +CREATE TABLE default.a\n(\n `x` Int64\n)\nENGINE = URL(\'https://example.com/\', \'CSV\', headers(\'foo\' = \'[HIDDEN]\', \'a\' = \'[HIDDEN]\')) +CREATE TABLE default.b\n(\n `x` Int64\n)\nENGINE = URL(\'https://example.com/\', \'CSV\', headers()) +CREATE TABLE default.c\n(\n `x` Int64\n)\nENGINE = S3(\'https://example.s3.amazonaws.com/a.csv\', \'NOSIGN\', \'CSV\', headers(\'foo\' = \'[HIDDEN]\')) +CREATE TABLE default.d\n(\n `x` Int64\n)\nENGINE = S3(\'https://example.s3.amazonaws.com/a.csv\', \'NOSIGN\', headers(\'foo\' = \'[HIDDEN]\')) +CREATE VIEW default.e\n(\n `x` Int64\n) AS\nSELECT count()\nFROM url(\'https://example.com/\', CSV, headers(\'foo\' = \'[HIDDEN]\', \'a\' = \'[HIDDEN]\')) +CREATE VIEW default.f\n(\n `x` Int64\n) AS\nSELECT count()\nFROM url(\'https://example.com/\', CSV, headers()) +CREATE VIEW default.g\n(\n `x` Int64\n) AS\nSELECT count()\nFROM s3(\'https://example.s3.amazonaws.com/a.csv\', CSV, headers(\'foo\' = \'[HIDDEN]\')) +CREATE VIEW default.h\n(\n `x` Int64\n) AS\nSELECT count()\nFROM s3(\'https://example.s3.amazonaws.com/a.csv\', headers(\'foo\' = \'[HIDDEN]\')) diff --git a/tests/queries/0_stateless/02968_url_args.sql b/tests/queries/0_stateless/02968_url_args.sql index 8bee9fec0ac..a9ac96970e0 100644 --- a/tests/queries/0_stateless/02968_url_args.sql +++ b/tests/queries/0_stateless/02968_url_args.sql @@ -1,2 +1,19 @@ -create table a (x Int64) engine URL('https://example.com/', CSV, headers('foo' = 'bar')); +-- Tags: no-fasttest + +create table a (x Int64) engine URL('https://example.com/', CSV, headers('foo' = 'bar', 'a' = '13')); show create a; +create table b (x Int64) engine URL('https://example.com/', CSV, headers()); +show create b; +create table c (x Int64) engine S3('https://example.s3.amazonaws.com/a.csv', NOSIGN, CSV, headers('foo' = 'bar')); +show create c; +create table d (x Int64) engine S3('https://example.s3.amazonaws.com/a.csv', NOSIGN, headers('foo' = 'bar')); +show create d; + +create view e (x Int64) as select count() from url('https://example.com/', CSV, headers('foo' = 'bar', 'a' = '13')); +show create e; +create view f (x Int64) as select count() from url('https://example.com/', CSV, headers()); +show create f; +create view g (x Int64) as select count() from 
s3('https://example.s3.amazonaws.com/a.csv', CSV, headers('foo' = 'bar')); +show create g; +create view h (x Int64) as select count() from s3('https://example.s3.amazonaws.com/a.csv', headers('foo' = 'bar')); +show create h; diff --git a/tests/queries/0_stateless/02969_auto_format_detection.reference b/tests/queries/0_stateless/02969_auto_format_detection.reference new file mode 100644 index 00000000000..4b86be04996 --- /dev/null +++ b/tests/queries/0_stateless/02969_auto_format_detection.reference @@ -0,0 +1,123 @@ +Parquet +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +ORC +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +Arrow +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +ArrowStream +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +Avro +a Int64 +b String +c Array(Int64) +d Tuple(\n a Int64,\n b String) +Native +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +BSONEachRow +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +JSONCompact +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +Values +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) +TSKV +a Nullable(String) +b Nullable(String) +c Array(Nullable(UInt64)) +d Nullable(String) +JSONObjectEachRow +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONColumns +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONCompactColumns +c1 Nullable(String) +c2 Nullable(String) +c3 Array(Nullable(String)) +c4 Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONCompact +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +JSON +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +TSV +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) +CSV +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Nullable(UInt64) +c5 Nullable(String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +1 +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) diff --git a/tests/queries/0_stateless/02969_auto_format_detection.sh 
b/tests/queries/0_stateless/02969_auto_format_detection.sh new file mode 100755 index 00000000000..88d6575e499 --- /dev/null +++ b/tests/queries/0_stateless/02969_auto_format_detection.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.data + +for format in Parquet ORC Arrow ArrowStream Avro Native BSONEachRow JSONCompact Values TSKV JSONObjectEachRow JSONColumns JSONCompactColumns JSONCompact JSON TSV CSV +do + echo $format + $CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format $format" > $DATA_FILE + $CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE')" +done + +rm $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE.jsonl +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE*')" + + +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE', auto, 'a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)')" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE'); +desc file('$DATA_FILE'); +" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE', JSONEachRow); +desc file('$DATA_FILE'); +" + +touch $DATA_FILE.1 +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE.2 +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE.{1,2}')" +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE.{1,2}') settings schema_inference_mode='union'" 2>&1 | grep -c "CANNOT_DETECT_FORMAT" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE.2'); +desc file('$DATA_FILE.{1,2}'); +" + +rm $DATA_FILE* diff --git a/tests/queries/0_stateless/02971_analyzer_remote_id.reference b/tests/queries/0_stateless/02971_analyzer_remote_id.reference new file mode 100644 index 00000000000..b8626c4cff2 --- /dev/null +++ b/tests/queries/0_stateless/02971_analyzer_remote_id.reference @@ -0,0 +1 @@ +4 diff --git a/tests/queries/0_stateless/02971_analyzer_remote_id.sh b/tests/queries/0_stateless/02971_analyzer_remote_id.sh new file mode 100755 index 00000000000..463e4cc1f0c --- /dev/null +++ b/tests/queries/0_stateless/02971_analyzer_remote_id.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP DATABASE IF EXISTS test_02971" +${CLICKHOUSE_CLIENT} --query="CREATE DATABASE test_02971" + +${CLICKHOUSE_CLIENT} --query="CREATE TABLE test_02971.x ENGINE = MergeTree() ORDER BY number AS SELECT * FROM numbers(2)" +${CLICKHOUSE_LOCAL} --query="SELECT count() FROM remote('127.0.0.{2,3}', 'test_02971.x') SETTINGS allow_experimental_analyzer = 1" 2>&1 \ + | grep -av "ASan doesn't fully support makecontext/swapcontext functions" + +${CLICKHOUSE_CLIENT} --query="DROP DATABASE IF EXISTS test_02971" diff --git a/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts.reference b/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts.reference new file mode 100644 index 00000000000..71c9053d644 --- /dev/null +++ b/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts.reference @@ -0,0 +1,9 @@ +0 +ds_1_1 all_1_1_0 0 +ds_1_2 all_1_1_0 0 +ds_2_1 all_1_1_0 0 +ds_2_1 all_2_2_0 0 +ds_3_1 all_1_1_0 0 +ds_3_1 all_2_2_0 0 +landing all_1_1_0 0 +10 diff --git a/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts.sql b/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts.sql new file mode 100644 index 00000000000..242133e9122 --- /dev/null +++ b/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts.sql @@ -0,0 +1,103 @@ +SET insert_deduplicate = 1; +SET deduplicate_blocks_in_dependent_materialized_views = 1; +SET update_insert_deduplication_token_in_dependent_materialized_views = 1; +SET insert_deduplication_token = 'test'; + +DROP TABLE IF EXISTS landing; +CREATE TABLE landing +( + timestamp UInt64, + value UInt64 +) +ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window = 1000; + +DROP TABLE IF EXISTS ds_1_1; +CREATE TABLE ds_1_1 +( + t UInt64, + v UInt64 +) +ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window = 1000; + +DROP VIEW IF EXISTS mv_1_1; +CREATE MATERIALIZED VIEW mv_1_1 TO ds_1_1 as +SELECT + timestamp t, sum(value) v +FROM landing +GROUP BY t; + +DROP TABLE IF EXISTS ds_1_2; +CREATE TABLE ds_1_2 +( + t UInt64, + v UInt64 +) +ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window = 1000; + +DROP VIEW IF EXISTS mv_1_2; +CREATE MATERIALIZED VIEW mv_1_2 TO ds_1_2 as +SELECT + timestamp t, sum(value) v +FROM landing +GROUP BY t; + +DROP TABLE IF EXISTS ds_2_1; +CREATE TABLE ds_2_1 +( + l String, + t DateTime, + v UInt64 +) +ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window = 1000; + +DROP VIEW IF EXISTS mv_2_1; +CREATE MATERIALIZED VIEW mv_2_1 TO ds_2_1 as +SELECT '2_1' l, t, v +FROM ds_1_1; + +DROP VIEW IF EXISTS mv_2_2; +CREATE MATERIALIZED VIEW mv_2_2 TO ds_2_1 as +SELECT '2_2' l, t, v +FROM ds_1_2; + +DROP TABLE IF EXISTS ds_3_1; +CREATE TABLE ds_3_1 +( + l String, + t DateTime, + v UInt64 +) +ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window = 1000; + +DROP VIEW IF EXISTS mv_3_1; +CREATE MATERIALIZED VIEW mv_3_1 TO ds_3_1 as +SELECT '3_1' l, t, v +FROM ds_2_1; + +INSERT INTO landing SELECT 1 as timestamp, 1 AS value FROM numbers(10); + +SELECT sleep(3); + +INSERT INTO landing SELECT 1 as timestamp, 1 AS value FROM numbers(10); + +SYSTEM FLUSH LOGS; +SELECT table, name, error FROM system.part_log +WHERE database = currentDatabase() +ORDER BY table, name; + +SELECT count() FROM landing; + +DROP TABLE landing; + +DROP TABLE ds_1_1; +DROP 
VIEW mv_1_1; + +DROP TABLE ds_1_2; +DROP VIEW mv_1_2; + +DROP TABLE ds_2_1; +DROP VIEW mv_2_1; +DROP VIEW mv_2_2; + +DROP TABLE ds_3_1; +DROP VIEW mv_3_1; diff --git a/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts_views.reference b/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts_views.reference new file mode 100644 index 00000000000..e1bcc64aaeb --- /dev/null +++ b/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts_views.reference @@ -0,0 +1,5 @@ +0 +ds_1_1 all_1_1_0 0 +ds_1_1 all_2_2_0 0 +landing all_1_1_0 0 +10 diff --git a/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts_views.sql b/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts_views.sql new file mode 100644 index 00000000000..d82ff4afd93 --- /dev/null +++ b/tests/queries/0_stateless/02972_insert_deduplication_token_hierarchical_inserts_views.sql @@ -0,0 +1,53 @@ +SET insert_deduplicate = 1; +SET deduplicate_blocks_in_dependent_materialized_views = 1; +SET update_insert_deduplication_token_in_dependent_materialized_views = 1; +SET insert_deduplication_token = 'test'; + +DROP TABLE IF EXISTS landing; +CREATE TABLE landing +( + timestamp UInt64, + value UInt64 +) +ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window = 1000; + +DROP TABLE IF EXISTS ds_1_1; +CREATE TABLE ds_1_1 +( + t UInt64, + v UInt64 +) +ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window = 1000; + +DROP VIEW IF EXISTS mv_1_1; +CREATE MATERIALIZED VIEW mv_1_1 TO ds_1_1 as +SELECT + timestamp t, sum(value) v +FROM landing +GROUP BY t; + +DROP VIEW IF EXISTS mv_1_2; +CREATE MATERIALIZED VIEW mv_1_2 TO ds_1_1 as +SELECT + timestamp t, sum(value) v +FROM landing +GROUP BY t; + +INSERT INTO landing SELECT 1 as timestamp, 1 AS value FROM numbers(10); + +SELECT sleep(3); + +INSERT INTO landing SELECT 1 as timestamp, 1 AS value FROM numbers(10); + +SYSTEM FLUSH LOGS; +SELECT table, name, error FROM system.part_log +WHERE database = currentDatabase() +ORDER BY table, name; + +SELECT count() FROM landing; + +DROP TABLE landing; + +DROP TABLE ds_1_1; +DROP VIEW mv_1_1; +DROP VIEW mv_1_2; diff --git a/tests/queries/0_stateless/02972_parallel_replicas_cte.reference b/tests/queries/0_stateless/02972_parallel_replicas_cte.reference new file mode 100644 index 00000000000..449fe3d34e3 --- /dev/null +++ b/tests/queries/0_stateless/02972_parallel_replicas_cte.reference @@ -0,0 +1,3 @@ +990000 +990000 +10 diff --git a/tests/queries/0_stateless/02972_parallel_replicas_cte.sql b/tests/queries/0_stateless/02972_parallel_replicas_cte.sql new file mode 100644 index 00000000000..3702184e336 --- /dev/null +++ b/tests/queries/0_stateless/02972_parallel_replicas_cte.sql @@ -0,0 +1,27 @@ +DROP TABLE IF EXISTS pr_1; +DROP TABLE IF EXISTS pr_2; + +CREATE TABLE pr_1 (`a` UInt32) ENGINE = MergeTree ORDER BY a PARTITION BY a % 10 AS +SELECT 10 * intDiv(number, 10) + 1 FROM numbers(1_000_000); + +CREATE TABLE pr_2 (`a` UInt32) ENGINE = MergeTree ORDER BY a AS +SELECT * FROM numbers(1_000_000); + +WITH filtered_groups AS (SELECT a FROM pr_1 WHERE a >= 10000) +SELECT count() FROM pr_2 INNER JOIN filtered_groups ON pr_2.a = filtered_groups.a; + +WITH filtered_groups AS (SELECT a FROM pr_1 WHERE a >= 10000) +SELECT count() FROM pr_2 INNER JOIN filtered_groups ON pr_2.a = filtered_groups.a +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, 
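-- (setting note: allow_experimental_parallel_reading_from_replicas = 1 is the best-effort mode that silently
--  falls back to normal reading when parallel replicas cannot be used; the stricter value 2, used in the
--  allow_experimental_analyzer = 0 query below, is what turns that case into the expected SUPPORT_IS_DISABLED error)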
parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; + +-- Testing that it is disabled for allow_experimental_analyzer=0. With analyzer it will be supported (with correct result) +WITH filtered_groups AS (SELECT a FROM pr_1 WHERE a >= 10000) +SELECT count() FROM pr_2 INNER JOIN filtered_groups ON pr_2.a = filtered_groups.a +SETTINGS allow_experimental_analyzer = 0, allow_experimental_parallel_reading_from_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; -- { serverError SUPPORT_IS_DISABLED } + +-- Sanitizer +SELECT count() FROM pr_2 JOIN numbers(10) as pr_1 ON pr_2.a = pr_1.number +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; + +DROP TABLE IF EXISTS pr_1; +DROP TABLE IF EXISTS pr_2; diff --git a/tests/queries/0_stateless/02973_backup_of_in_memory_compressed.reference b/tests/queries/0_stateless/02973_backup_of_in_memory_compressed.reference new file mode 100644 index 00000000000..00479541d22 --- /dev/null +++ b/tests/queries/0_stateless/02973_backup_of_in_memory_compressed.reference @@ -0,0 +1,2 @@ +0 +1000000 Hello, world Hello, world diff --git a/tests/queries/0_stateless/02973_backup_of_in_memory_compressed.sh b/tests/queries/0_stateless/02973_backup_of_in_memory_compressed.sh new file mode 100755 index 00000000000..b212e42061f --- /dev/null +++ b/tests/queries/0_stateless/02973_backup_of_in_memory_compressed.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest +# Because we are creating a backup with fixed path. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --multiquery " +DROP TABLE IF EXISTS test; +CREATE TABLE test (x String) ENGINE = Memory SETTINGS compress = 1; +INSERT INTO test SELECT 'Hello, world' FROM numbers(1000000); +" + +$CLICKHOUSE_CLIENT --multiquery " +BACKUP TABLE test TO File('test.zip'); +" --format Null + +$CLICKHOUSE_CLIENT --multiquery " +TRUNCATE TABLE test; +SELECT count() FROM test; +" + +$CLICKHOUSE_CLIENT --multiquery " +RESTORE TABLE test FROM File('test.zip'); +" --format Null + +$CLICKHOUSE_CLIENT --multiquery " +SELECT count(), min(x), max(x) FROM test; +DROP TABLE test; +" diff --git a/tests/queries/0_stateless/02973_block_number_sparse_serialization_and_mutation.reference b/tests/queries/0_stateless/02973_block_number_sparse_serialization_and_mutation.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02973_block_number_sparse_serialization_and_mutation.sql b/tests/queries/0_stateless/02973_block_number_sparse_serialization_and_mutation.sql new file mode 100644 index 00000000000..7a1de2897fb --- /dev/null +++ b/tests/queries/0_stateless/02973_block_number_sparse_serialization_and_mutation.sql @@ -0,0 +1,39 @@ +-- Tags: zookeeper + +-- we need exact block-numbers +SET insert_keeper_fault_injection_probability=0; + +DROP TABLE IF EXISTS table_with_some_columns; + +CREATE TABLE table_with_some_columns( + key UInt64, + value0 UInt8 +) +ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/table_with_some_columns', '1') +ORDER BY key +SETTINGS allow_experimental_block_number_column=1, +ratio_of_defaults_for_sparse_serialization=0.0001, +min_bytes_for_wide_part = 0, +replace_long_file_name_to_hash=0; -- simpler to debug + +INSERT INTO table_with_some_columns SELECT rand(), number + 10 from numbers(100000); + +INSERT INTO table_with_some_columns SELECT rand(), number + 10 from numbers(1); + +OPTIMIZE TABLE table_with_some_columns FINAL; + +INSERT INTO table_with_some_columns SELECT rand(), number+222222222 from numbers(1); + +OPTIMIZE TABLE table_with_some_columns FINAL; + +set alter_sync = 2; + +ALTER TABLE table_with_some_columns DROP COLUMN value0; + +INSERT INTO table_with_some_columns SELECT rand() from numbers(1); + +OPTIMIZE TABLE table_with_some_columns FINAL; + +SELECT *, _block_number FROM table_with_some_columns where not ignore(*) Format Null; + +DROP TABLE IF EXISTS table_with_some_columns; diff --git a/tests/queries/0_stateless/02973_dictionary_table_exception_fix.reference b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql new file mode 100644 index 00000000000..f8061b42670 --- /dev/null +++ b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql @@ -0,0 +1,6 @@ +CREATE TABLE test_table (i Int64) engine=MergeTree order by i; +CREATE DICTIONARY test_dict (y String, value UInt64 DEFAULT 0) PRIMARY KEY y SOURCE(CLICKHOUSE(TABLE 'test_table')) LAYOUT(DIRECT()); +CREATE TABLE test_dict (y Int64) engine=MergeTree order by y; -- { serverError DICTIONARY_ALREADY_EXISTS } +CREATE DICTIONARY test_table (y String, value UInt64 DEFAULT 0) PRIMARY KEY y SOURCE(CLICKHOUSE(TABLE 'test_table')) LAYOUT(DIRECT()); -- { serverError TABLE_ALREADY_EXISTS } +CREATE DICTIONARY test_dict (y String, value UInt64 DEFAULT 0) PRIMARY KEY y SOURCE(CLICKHOUSE(TABLE 'test_table')) 
LAYOUT(DIRECT()); -- { serverError DICTIONARY_ALREADY_EXISTS } +CREATE TABLE test_table (y Int64) engine=MergeTree order by y; -- { serverError TABLE_ALREADY_EXISTS } diff --git a/tests/queries/0_stateless/02973_s3_compressed_file_in_error_message.reference b/tests/queries/0_stateless/02973_s3_compressed_file_in_error_message.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02973_s3_compressed_file_in_error_message.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02973_s3_compressed_file_in_error_message.sh b/tests/queries/0_stateless/02973_s3_compressed_file_in_error_message.sh new file mode 100755 index 00000000000..a4984583637 --- /dev/null +++ b/tests/queries/0_stateless/02973_s3_compressed_file_in_error_message.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --allow_repeated_settings --send_logs_level=none -q "select * from s3('http://localhost:11111/test/a.tsv', TSV, 'x String', 'gzip')" 2>&1 | grep -c "(in file/uri.*a\.tsv)" diff --git a/tests/queries/0_stateless/02974_analyzer_array_join_subcolumn.reference b/tests/queries/0_stateless/02974_analyzer_array_join_subcolumn.reference new file mode 100644 index 00000000000..827c710ef1a --- /dev/null +++ b/tests/queries/0_stateless/02974_analyzer_array_join_subcolumn.reference @@ -0,0 +1,16 @@ +('a',(1,2)) 1 +('b',(2,3)) 2 +('a',(1,2)) 1 +('b',(2,3)) 2 +('a',(1,2)) 1 +('b',(2,3)) 2 +('a',(1,2)) 1 +('b',(2,3)) 2 +('a',(1,2)) 1 +('b',(2,3)) 2 +('a',(1,2)) 1 +('b',(2,3)) 2 +('a',(1,2)) 1 +('b',(2,3)) 2 +('a',(1,2)) 1 +('b',(2,3)) 2 diff --git a/tests/queries/0_stateless/02974_analyzer_array_join_subcolumn.sql b/tests/queries/0_stateless/02974_analyzer_array_join_subcolumn.sql new file mode 100644 index 00000000000..14823644b96 --- /dev/null +++ b/tests/queries/0_stateless/02974_analyzer_array_join_subcolumn.sql @@ -0,0 +1,24 @@ +DROP TABLE IF EXISTS t2; +DROP TABLE IF EXISTS t3; + +CREATE TABLE t2 (id Int32, pe Map(String, Tuple(a UInt64, b UInt64))) ENGINE = MergeTree ORDER BY id; +INSERT INTO t2 VALUES (1, {'a': (1, 2), 'b': (2, 3)}), + +CREATE TABLE t3 (id Int32, c Tuple(v String, pe Map(String, Tuple(a UInt64, b UInt64)))) ENGINE = MergeTree ORDER BY id; +INSERT INTO t3 VALUES (1, ('A', {'a':(1, 2),'b':(2, 3)})); + +SELECT pe, pe.values.a FROM (SELECT * FROM t2) ARRAY JOIN pe SETTINGS allow_experimental_analyzer = 1; +SELECT p, p.values.a FROM (SELECT * FROM t2) ARRAY JOIN pe AS p SETTINGS allow_experimental_analyzer = 1; + +SELECT pe, pe.values.a FROM t2 ARRAY JOIN pe; +SELECT p, p.values.a FROM t2 ARRAY JOIN pe AS p; + +SELECT c.pe, c.pe.values.a FROM (SELECT * FROM t3) ARRAY JOIN c.pe SETTINGS allow_experimental_analyzer = 1; +SELECT p, p.values.a FROM (SELECT * FROM t3) ARRAY JOIN c.pe as p SETTINGS allow_experimental_analyzer = 1; + +SELECT c.pe, c.pe.values.a FROM t3 ARRAY JOIN c.pe SETTINGS allow_experimental_analyzer = 1; +SELECT p, p.values.a FROM t3 ARRAY JOIN c.pe as p; + + +DROP TABLE IF EXISTS t2; +DROP TABLE IF EXISTS t3; diff --git a/tests/queries/0_stateless/02974_backup_query_format_null.reference b/tests/queries/0_stateless/02974_backup_query_format_null.reference new file mode 100644 index 00000000000..67bfe658c1f --- /dev/null +++ b/tests/queries/0_stateless/02974_backup_query_format_null.reference @@ -0,0 +1,3 @@ +2 +80 +-12345 diff --git 
a/tests/queries/0_stateless/02974_backup_query_format_null.sh b/tests/queries/0_stateless/02974_backup_query_format_null.sh new file mode 100755 index 00000000000..ddba2f6de16 --- /dev/null +++ b/tests/queries/0_stateless/02974_backup_query_format_null.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -nm --query " +DROP TABLE IF EXISTS tbl; +CREATE TABLE tbl (a Int32) ENGINE = MergeTree() ORDER BY tuple(); +INSERT INTO tbl VALUES (2), (80), (-12345); +" + +backup_name="Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}')" + +${CLICKHOUSE_CLIENT} --query "BACKUP TABLE tbl TO ${backup_name} FORMAT Null" + +${CLICKHOUSE_CLIENT} -nm --query " +DROP TABLE tbl; +RESTORE ALL FROM ${backup_name} FORMAT Null +" + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM tbl" diff --git a/tests/queries/0_stateless/02974_if_with_map.reference b/tests/queries/0_stateless/02974_if_with_map.reference new file mode 100644 index 00000000000..a4ebf6a9700 --- /dev/null +++ b/tests/queries/0_stateless/02974_if_with_map.reference @@ -0,0 +1,40 @@ +{1:2,3:4} +{3:4,5:6} +{1:2,3:4} +{3:4,5:6} +{3:4,5:6} +{1:2,3:4} +{3:4,5:6} +{1:2,3:4} +{1:2,3:4} +{3:4} +{1:2,3:4} +{3:4} +{3:4,5:6} +{1:2} +{3:4,5:6} +{1:2} +{1:2,3:4} +{1:2,3:4} +{3:4,5:6} +{3:4,5:6} +{3:4,5:6} +{3:4,5:6} +{1:2,3:4} +{1:2,3:4} +{3:4,5:6} +{3:4,5:6} +{3:4,5:6} +{3:4,5:6} +{3:4,5:6} +{3:4,5:6} +{1:2,3:4} +{1:2,3:4} +{1:2,3:4} +{1:2,3:4} +{3:4,5:6} +{3:4,5:6} +{1:2,3:4} +{1:2,3:4} +{1:2,3:4} +{1:2,3:4} diff --git a/tests/queries/0_stateless/02974_if_with_map.sql b/tests/queries/0_stateless/02974_if_with_map.sql new file mode 100644 index 00000000000..2387cffd4bf --- /dev/null +++ b/tests/queries/0_stateless/02974_if_with_map.sql @@ -0,0 +1,20 @@ +select if(number % 2 = 0, map(1,2,3,4), map(3,4,5,6)) from numbers(2); +select if(number % 2 = 0, materialize(map(1,2,3,4)), map(3,4,5,6)) from numbers(2); +select if(number % 2 = 0, map(3,4,5,6), materialize(map(1,2,3,4))) from numbers(2); +select if(number % 2 = 0, materialize(map(3,4,5,6)), materialize(map(1,2,3,4))) from numbers(2); +select if(number % 2 = 0, map(1,2,3,4), map(3,4)) from numbers(2); +select if(number % 2 = 0, materialize(map(1,2,3,4)), map(3,4)) from numbers(2); +select if(number % 2 = 0, map(3,4,5,6), materialize(map(1,2))) from numbers(2); +select if(number % 2 = 0, materialize(map(3,4,5,6)), materialize(map(1,2))) from numbers(2); +select if(1, map(1,2,3,4), map(3,4,5,6)) from numbers(2); +select if(0, map(1,2,3,4), map(3,4,5,6)) from numbers(2); +select if(null, map(1,2,3,4), map(3,4,5,6)) from numbers(2); +select if(1, materialize(map(1,2,3,4)), map(3,4,5,6)) from numbers(2); +select if(0, materialize(map(1,2,3,4)), map(3,4,5,6)) from numbers(2); +select if(null, materialize(map(1,2,3,4)), map(3,4,5,6)) from numbers(2); +select if(1, map(3,4,5,6), materialize(map(1,2,3,4))) from numbers(2); +select if(0, map(3,4,5,6), materialize(map(1,2,3,4))) from numbers(2); +select if(null, map(3,4,5,6), materialize(map(1,2,3,4))) from numbers(2); +select if(1, materialize(map(3,4,5,6)), materialize(map(1,2,3,4))) from numbers(2); +select if(0, materialize(map(3,4,5,6)), materialize(map(1,2,3,4))) from numbers(2); +select if(null, materialize(map(3,4,5,6)), materialize(map(1,2,3,4))) from numbers(2); diff --git a/tests/queries/0_stateless/02975_intdiv_with_decimal.reference b/tests/queries/0_stateless/02975_intdiv_with_decimal.reference new file mode 100644 index 
00000000000..5540734ae4c --- /dev/null +++ b/tests/queries/0_stateless/02975_intdiv_with_decimal.reference @@ -0,0 +1,68 @@ +2 +2 +1 +2 +2 +2 +2 +2 +2 +2 +2 +1 +2 +1 +2 +1 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +1 +1 +1 +1 +2 +2 +2 +2 +2 +2 +1 +2 +2 +2 +2 +2 +2 +2 +2 +1 +2 +1 +2 +1 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +1 +1 +1 +1 +2 +2 +2 +2 diff --git a/tests/queries/0_stateless/02975_intdiv_with_decimal.sql b/tests/queries/0_stateless/02975_intdiv_with_decimal.sql new file mode 100644 index 00000000000..0911a481251 --- /dev/null +++ b/tests/queries/0_stateless/02975_intdiv_with_decimal.sql @@ -0,0 +1,70 @@ +--intDiv-- +SELECT intDiv(4,2); +SELECT intDiv(toDecimal32(4.4, 2), 2); +SELECT intDiv(4, toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), 2); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 3), 2); +SELECT intDiv(toDecimal64(4.4, 3), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 4), 2); +SELECT intDiv(toDecimal128(4.4, 4), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 5), 2); +SELECT intDiv(toDecimal256(4.4, 5), toDecimal32(2.2, 2)); +SELECT intDiv(4, toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(4, toDecimal128(2.2, 3)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(4, toDecimal256(2.2, 4)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(4.2, toDecimal32(2.2, 2)); +SELECT intDiv(4.2, toDecimal64(2.2, 2)); +SELECT intDiv(4.2, toDecimal128(2.2, 2)); +SELECT intDiv(4.2, toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), 2.2); +SELECT intDiv(toDecimal64(4.4, 2), 2.2); +SELECT intDiv(toDecimal128(4.4, 2), 2.2); +SELECT intDiv(toDecimal256(4.4, 2), 2.2); +--intDivOrZero-- +SELECT intDivOrZero(4,2); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2); +SELECT intDivOrZero(4, toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 3), 2); +SELECT intDivOrZero(toDecimal64(4.4, 3), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 4), 2); +SELECT intDivOrZero(toDecimal128(4.4, 4), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 5), 2); +SELECT intDivOrZero(toDecimal256(4.4, 5), toDecimal32(2.2, 2)); +SELECT intDivOrZero(4, toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(4, toDecimal128(2.2, 3)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(4, toDecimal256(2.2, 4)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal128(2.2, 2)); +SELECT 
intDivOrZero(toDecimal256(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal32(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal64(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal128(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal64(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal128(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal256(4.4, 2), 2.2); diff --git a/tests/queries/0_stateless/02975_system_zookeeper_retries.reference b/tests/queries/0_stateless/02975_system_zookeeper_retries.reference new file mode 100644 index 00000000000..9a636ba56d0 --- /dev/null +++ b/tests/queries/0_stateless/02975_system_zookeeper_retries.reference @@ -0,0 +1,3 @@ +/keeper api_version +/keeper feature_flags +1 diff --git a/tests/queries/0_stateless/02975_system_zookeeper_retries.sql b/tests/queries/0_stateless/02975_system_zookeeper_retries.sql new file mode 100644 index 00000000000..8b402ec6d65 --- /dev/null +++ b/tests/queries/0_stateless/02975_system_zookeeper_retries.sql @@ -0,0 +1,22 @@ +-- Tags: zookeeper, no-parallel, no-fasttest + +SELECT path, name +FROM system.zookeeper +WHERE path = '/keeper' +ORDER BY path, name +SETTINGS + insert_keeper_retry_initial_backoff_ms = 1, + insert_keeper_retry_max_backoff_ms = 20, + insert_keeper_fault_injection_probability=0.3, + insert_keeper_fault_injection_seed=4, + log_comment='02975_system_zookeeper_retries'; + + +SYSTEM FLUSH LOGS; + +-- Check that there where zk session failures +SELECT ProfileEvents['ZooKeeperHardwareExceptions'] > 0 +FROM system.query_log +WHERE current_database = currentDatabase() AND type = 'QueryFinish' AND log_comment='02975_system_zookeeper_retries' +ORDER BY event_time_microseconds DESC +LIMIT 1; diff --git a/tests/queries/0_stateless/02976_system_zookeeper_filters.reference b/tests/queries/0_stateless/02976_system_zookeeper_filters.reference new file mode 100644 index 00000000000..a9e2f17562a --- /dev/null +++ b/tests/queries/0_stateless/02976_system_zookeeper_filters.reference @@ -0,0 +1,6 @@ +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02976_system_zookeeper_filters.sql b/tests/queries/0_stateless/02976_system_zookeeper_filters.sql new file mode 100644 index 00000000000..3664c817da7 --- /dev/null +++ b/tests/queries/0_stateless/02976_system_zookeeper_filters.sql @@ -0,0 +1,17 @@ +-- Tags: zookeeper, no-parallel, no-fasttest, long + +SET allow_unrestricted_reads_from_keeper = 'false'; + +SELECT count() > 0 FROM system.zookeeper; -- { serverError BAD_ARGUMENTS } +SELECT count() > 0 FROM system.zookeeper WHERE name LIKE '%_%'; -- { serverError BAD_ARGUMENTS } +SELECT count() > 0 FROM system.zookeeper WHERE value LIKE '%'; -- { serverError BAD_ARGUMENTS } +SELECT count() > 0 FROM system.zookeeper WHERE path LIKE '/%'; -- { serverError BAD_ARGUMENTS } +SELECT count() > 0 FROM system.zookeeper WHERE path = '/'; + +SET allow_unrestricted_reads_from_keeper = 'true'; + +SELECT count() > 0 FROM system.zookeeper; +SELECT count() > 0 FROM system.zookeeper WHERE name LIKE '%_%'; +SELECT count() > 0 FROM system.zookeeper WHERE value LIKE '%'; +SELECT count() > 0 FROM system.zookeeper WHERE path LIKE '/%'; +SELECT count() > 0 FROM system.zookeeper WHERE path = '/'; diff --git a/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.reference 
b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.reference new file mode 100644 index 00000000000..531163e1d84 --- /dev/null +++ b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.reference @@ -0,0 +1,30 @@ +data after INSERT 1 +data after ATTACH 1 +Files before DETACH TABLE +all_1_1_0 + +backups/ordinary_default/data/ordinary_default/data/all_1_1_0: +primary.cidx +serialization.json +metadata_version.txt +default_compression_codec.txt +data.bin +data.cmrk3 +count.txt +columns.txt +checksums.txt + +Files after DETACH TABLE +all_1_1_0 + +backups/ordinary_default/data/ordinary_default/data/all_1_1_0: +primary.cidx +serialization.json +metadata_version.txt +default_compression_codec.txt +data.bin +data.cmrk3 +count.txt +columns.txt +checksums.txt + diff --git a/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.sh b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.sh new file mode 100755 index 00000000000..386c29704b6 --- /dev/null +++ b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-random-settings, no-random-merge-tree-settings +# Tag no-fasttest: requires S3 +# Tag no-random-settings, no-random-merge-tree-settings: to avoid creating extra files like serialization.json, this test is too exotic anyway + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# config for clickhouse-disks (to check leftovers) +config="${BASH_SOURCE[0]/.sh/.yml}" + +# only in Atomic ATTACH from s3_plain works +new_database="ordinary_$CLICKHOUSE_DATABASE" +$CLICKHOUSE_CLIENT --allow_deprecated_database_ordinary=1 -q "create database $new_database engine=Ordinary" +CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT/--database=$CLICKHOUSE_DATABASE/--database=$new_database} +CLICKHOUSE_DATABASE="$new_database" + +$CLICKHOUSE_CLIENT -nm -q " + drop table if exists data; + create table data (key Int) engine=MergeTree() order by key; + insert into data values (1); + select 'data after INSERT', count() from data; +" + +# suppress output +$CLICKHOUSE_CLIENT -q "backup table data to S3('http://localhost:11111/test/s3_plain/backups/$CLICKHOUSE_DATABASE', 'test', 'testtest')" > /dev/null + +$CLICKHOUSE_CLIENT -nm -q " + drop table data; + attach table data (key Int) engine=MergeTree() order by key + settings + max_suspicious_broken_parts=0, + disk=disk(type=s3_plain, + endpoint='http://localhost:11111/test/s3_plain/backups/$CLICKHOUSE_DATABASE', + access_key_id='test', + secret_access_key='testtest'); + select 'data after ATTACH', count() from data; + + insert into data values (1); -- { serverError TABLE_IS_READ_ONLY } + optimize table data final; -- { serverError TABLE_IS_READ_ONLY } +" + +path=$($CLICKHOUSE_CLIENT -q "SELECT replace(data_paths[1], 's3_plain', '') FROM system.tables WHERE database = '$CLICKHOUSE_DATABASE' AND table = 'data'") +# trim / to fix "Unable to parse ExceptionName: XMinioInvalidObjectName Message: Object name contains unsupported characters." 
+path=${path%/} + +echo "Files before DETACH TABLE" +clickhouse-disks -C "$config" --disk s3_plain_disk list --recursive "${path:?}" | tail -n+2 + +$CLICKHOUSE_CLIENT -q "detach table data" +echo "Files after DETACH TABLE" +clickhouse-disks -C "$config" --disk s3_plain_disk list --recursive "$path" | tail -n+2 + +# metadata file is left +$CLICKHOUSE_CLIENT --force_remove_data_recursively_on_drop=1 -q "drop database if exists $CLICKHOUSE_DATABASE" diff --git a/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.yml b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.yml new file mode 100644 index 00000000000..ca5036736d8 --- /dev/null +++ b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_MergeTree.yml @@ -0,0 +1,7 @@ +storage_configuration: + disks: + s3_plain_disk: + type: s3_plain + endpoint: http://localhost:11111/test/s3_plain/ + access_key_id: clickhouse + secret_access_key: clickhouse diff --git a/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.reference b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.reference new file mode 100644 index 00000000000..1e191b719a5 --- /dev/null +++ b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.reference @@ -0,0 +1,30 @@ +data after INSERT 1 +data after ATTACH 1 +Files before DETACH TABLE +all_X_X_X + +backups/ordinary_default/data/ordinary_default/data_read/all_X_X_X: +primary.cidx +serialization.json +metadata_version.txt +default_compression_codec.txt +data.bin +data.cmrk3 +count.txt +columns.txt +checksums.txt + +Files after DETACH TABLE +all_X_X_X + +backups/ordinary_default/data/ordinary_default/data_read/all_X_X_X: +primary.cidx +serialization.json +metadata_version.txt +default_compression_codec.txt +data.bin +data.cmrk3 +count.txt +columns.txt +checksums.txt + diff --git a/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.sh b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.sh new file mode 100755 index 00000000000..bf20247c7aa --- /dev/null +++ b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-random-settings, no-random-merge-tree-settings +# Tag no-fasttest: requires S3 +# Tag no-random-settings, no-random-merge-tree-settings: to avoid creating extra files like serialization.json, this test is too exotic anyway + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +config="${BASH_SOURCE[0]/.sh/.yml}" + +# only in Atomic ATTACH from s3_plain works +new_database="ordinary_$CLICKHOUSE_DATABASE" +$CLICKHOUSE_CLIENT --allow_deprecated_database_ordinary=1 -q "create database $new_database engine=Ordinary" +CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT/--database=$CLICKHOUSE_DATABASE/--database=$new_database} +CLICKHOUSE_DATABASE="$new_database" + +$CLICKHOUSE_CLIENT -nm -q " + drop table if exists data_read; + drop table if exists data_write; + + create table data_write (key Int) engine=ReplicatedMergeTree('/tables/{database}/data', 'write') order by key; + create table data_read (key Int) engine=ReplicatedMergeTree('/tables/{database}/data', 'read') order by key; + + insert into data_write values (1); + system sync replica data_read; + select 'data after INSERT', count() from data_read; +" + +# suppress output +$CLICKHOUSE_CLIENT -q "backup table data_read to S3('http://localhost:11111/test/s3_plain/backups/$CLICKHOUSE_DATABASE', 'test', 'testtest')" > /dev/null + +$CLICKHOUSE_CLIENT -nm -q " + drop table data_read; + attach table data_read (key Int) engine=ReplicatedMergeTree('/tables/{database}/data', 'read') order by key + settings + max_suspicious_broken_parts=0, + disk=disk(type=s3_plain, + endpoint='http://localhost:11111/test/s3_plain/backups/$CLICKHOUSE_DATABASE', + access_key_id='test', + secret_access_key='testtest'); + select 'data after ATTACH', count() from data_read; + + insert into data_read values (1); -- { serverError TABLE_IS_READ_ONLY } + optimize table data_read final; -- { serverError TABLE_IS_READ_ONLY } + system sync replica data_read; -- { serverError TABLE_IS_READ_ONLY } +" + +path=$($CLICKHOUSE_CLIENT -q "SELECT replace(data_paths[1], 's3_plain', '') FROM system.tables WHERE database = '$CLICKHOUSE_DATABASE' AND table = 'data_read'") +# trim / to fix "Unable to parse ExceptionName: XMinioInvalidObjectName Message: Object name contains unsupported characters." 
+path=${path%/} + +echo "Files before DETACH TABLE" +# sed to match any part, since in case of fault injection part name may not be all_0_0_0 but all_1_1_0 +clickhouse-disks -C "$config" --disk s3_plain_disk list --recursive "${path:?}" | tail -n+2 | sed 's/all_[^_]*_[^_]*_0/all_X_X_X/g' + +$CLICKHOUSE_CLIENT -nm -q " + detach table data_read; + detach table data_write; +" +echo "Files after DETACH TABLE" +clickhouse-disks -C "$config" --disk s3_plain_disk list --recursive "$path" | tail -n+2 | sed 's/all_[^_]*_[^_]*_0/all_X_X_X/g' + +# metadata file is left +$CLICKHOUSE_CLIENT --force_remove_data_recursively_on_drop=1 -q "drop database if exists $CLICKHOUSE_DATABASE" diff --git a/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.yml b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.yml new file mode 100644 index 00000000000..ca5036736d8 --- /dev/null +++ b/tests/queries/0_stateless/02980_s3_plain_DROP_TABLE_ReplicatedMergeTree.yml @@ -0,0 +1,7 @@ +storage_configuration: + disks: + s3_plain_disk: + type: s3_plain + endpoint: http://localhost:11111/test/s3_plain/ + access_key_id: clickhouse + secret_access_key: clickhouse diff --git a/tests/queries/0_stateless/02981_translate_fixedstring.reference b/tests/queries/0_stateless/02981_translate_fixedstring.reference new file mode 100644 index 00000000000..e506d4a22f7 --- /dev/null +++ b/tests/queries/0_stateless/02981_translate_fixedstring.reference @@ -0,0 +1,5 @@ +AAA\0\0\0\0\0\0\0 +A +1 +2 +3 diff --git a/tests/queries/0_stateless/02981_translate_fixedstring.sql b/tests/queries/0_stateless/02981_translate_fixedstring.sql new file mode 100644 index 00000000000..209efa4ba4a --- /dev/null +++ b/tests/queries/0_stateless/02981_translate_fixedstring.sql @@ -0,0 +1,2 @@ +SELECT translate('aaa'::FixedString(10), 'a','A'); +SELECT translate(number::String::FixedString(1), '0','A') from numbers(4); diff --git a/tests/queries/0_stateless/02981_variant_type_function.reference b/tests/queries/0_stateless/02981_variant_type_function.reference new file mode 100644 index 00000000000..4fae89810ef --- /dev/null +++ b/tests/queries/0_stateless/02981_variant_type_function.reference @@ -0,0 +1,10 @@ +None +UInt64 +String +Array(UInt64) +Enum8(\'None\' = -1, \'Array(UInt64)\' = 0, \'String\' = 1, \'UInt64\' = 2) +None +UInt64 +String +Array(UInt64) +Enum8(\'None\' = -1, \'Array(UInt64)\' = 0, \'Date\' = 1, \'String\' = 2, \'UInt64\' = 3) diff --git a/tests/queries/0_stateless/02981_variant_type_function.sql b/tests/queries/0_stateless/02981_variant_type_function.sql new file mode 100644 index 00000000000..cba653d7374 --- /dev/null +++ b/tests/queries/0_stateless/02981_variant_type_function.sql @@ -0,0 +1,13 @@ +SET allow_experimental_variant_type = 1; +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT variantType(v) as type FROM test; +SELECT toTypeName(variantType(v)) from test limit 1; + +SELECT variantType() FROM test; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT variantType(v, v) FROM test; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT variantType(v.String) FROM test; -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} + +SELECT variantType(v::Variant(UInt64, String, Array(UInt64), Date)) as type FROM test; +SELECT toTypeName(variantType(v::Variant(UInt64, String, Array(UInt64), Date))) from test limit 1; + diff --git a/tests/queries/0_stateless/02981_vertical_merges_memory_usage.reference 
b/tests/queries/0_stateless/02981_vertical_merges_memory_usage.reference new file mode 100644 index 00000000000..60c254e152b --- /dev/null +++ b/tests/queries/0_stateless/02981_vertical_merges_memory_usage.reference @@ -0,0 +1 @@ +Vertical OK diff --git a/tests/queries/0_stateless/02981_vertical_merges_memory_usage.sql b/tests/queries/0_stateless/02981_vertical_merges_memory_usage.sql new file mode 100644 index 00000000000..b784e734457 --- /dev/null +++ b/tests/queries/0_stateless/02981_vertical_merges_memory_usage.sql @@ -0,0 +1,37 @@ +-- Tags: long, no-random-merge-tree-settings + +DROP TABLE IF EXISTS t_vertical_merge_memory; + +CREATE TABLE t_vertical_merge_memory (id UInt64, arr Array(String)) +ENGINE = MergeTree ORDER BY id +SETTINGS + min_bytes_for_wide_part = 0, + vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + index_granularity = 8192, + index_granularity_bytes = '10M', + merge_max_block_size = 8192, + merge_max_block_size_bytes = '10M'; + +INSERT INTO t_vertical_merge_memory SELECT number, arrayMap(x -> repeat('a', 50), range(1000)) FROM numbers(3000); +-- Why 3001? - Deduplication, which is off with normal MergeTree by default but on for ReplicatedMergeTree and SharedMergeTree. +-- We automatically replace MergeTree with SharedMergeTree in ClickHouse Cloud. +INSERT INTO t_vertical_merge_memory SELECT number, arrayMap(x -> repeat('a', 50), range(1000)) FROM numbers(3001); + +OPTIMIZE TABLE t_vertical_merge_memory FINAL; + +SYSTEM FLUSH LOGS; + +SELECT + merge_algorithm, + peak_memory_usage < 500 * 1024 * 1024 + ? 'OK' + : format('FAIL: memory usage: {}', formatReadableSize(peak_memory_usage)) +FROM system.part_log +WHERE + database = currentDatabase() + AND table = 't_vertical_merge_memory' + AND event_type = 'MergeParts' + AND length(merged_from) = 2; + +DROP TABLE IF EXISTS t_vertical_merge_memory; diff --git a/tests/queries/0_stateless/02982_aggregation_states_destruction.reference b/tests/queries/0_stateless/02982_aggregation_states_destruction.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02982_aggregation_states_destruction.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02982_aggregation_states_destruction.sh b/tests/queries/0_stateless/02982_aggregation_states_destruction.sh new file mode 100755 index 00000000000..1c72cf2b8c1 --- /dev/null +++ b/tests/queries/0_stateless/02982_aggregation_states_destruction.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# Tags: no-random-settings + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +query_id="02982_$RANDOM" +$CLICKHOUSE_CLIENT --query_id $query_id --log_query_threads 1 --query="select number, uniq(number) from numbers_mt(1e7) group by number limit 100 format Null;" + +$CLICKHOUSE_CLIENT -q "system flush logs;" + +$CLICKHOUSE_CLIENT -q "select count() > 1 from system.query_thread_log where query_id = '$query_id' and current_database = currentDatabase() and thread_name = 'AggregDestruct';" diff --git a/tests/queries/0_stateless/02982_comments_in_system_tables.reference b/tests/queries/0_stateless/02982_comments_in_system_tables.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02982_comments_in_system_tables.sh b/tests/queries/0_stateless/02982_comments_in_system_tables.sh new file mode 100755 index 00000000000..2d7fbf4d35a --- /dev/null +++ b/tests/queries/0_stateless/02982_comments_in_system_tables.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_LOCAL} --query "SELECT 'Table ' || database || '.' || name || ' does not have a comment' FROM system.tables WHERE name NOT LIKE '%\_log\_%' AND database='system' AND comment==''" +${CLICKHOUSE_CLIENT} --query "SELECT 'Table ' || database || '.' || name || ' does not have a comment' FROM system.tables WHERE name NOT LIKE '%\_log\_%' AND database='system' AND comment==''" diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference new file mode 100644 index 00000000000..b6d1ff865e5 --- /dev/null +++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference @@ -0,0 +1,2 @@ +c1 Nullable(String) +c1 Nullable(Float64) diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql new file mode 100644 index 00000000000..2a281e898f1 --- /dev/null +++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql @@ -0,0 +1,2 @@ +DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 0; +DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 1; diff --git a/tests/queries/0_stateless/02982_json_columns_with_metadata_http.reference b/tests/queries/0_stateless/02982_json_columns_with_metadata_http.reference new file mode 100644 index 00000000000..d03bf4df6cb --- /dev/null +++ b/tests/queries/0_stateless/02982_json_columns_with_metadata_http.reference @@ -0,0 +1,3 @@ +1 4 +2 5 +3 6 diff --git a/tests/queries/0_stateless/02982_json_columns_with_metadata_http.sh b/tests/queries/0_stateless/02982_json_columns_with_metadata_http.sh new file mode 100755 index 00000000000..41c06ceae61 --- /dev/null +++ b/tests/queries/0_stateless/02982_json_columns_with_metadata_http.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test" +$CLICKHOUSE_CLIENT -q "create table test(x UInt32, y UInt32) engine=Memory" + +echo -ne '{"meta":[{"name":"x","type":"UInt32"}, {"name":"y", "type":"UInt32"}],"data":{"x":[1,2,3],"y":[4,5,6]}}\n' | ${CLICKHOUSE_CURL} -sS "{$CLICKHOUSE_URL}&query=INSERT%20INTO%20test%20FORMAT%20JSONColumnsWithMetadata" --data-binary @- + +$CLICKHOUSE_CLIENT -q "select * from test" +$CLICKHOUSE_CLIENT -q "drop table test" + diff --git a/tests/queries/0_stateless/02982_minmax_nan_null_order.reference b/tests/queries/0_stateless/02982_minmax_nan_null_order.reference new file mode 100644 index 00000000000..2b3ef0b7733 --- /dev/null +++ b/tests/queries/0_stateless/02982_minmax_nan_null_order.reference @@ -0,0 +1,44 @@ +-- { echoOn } +-- Tuples with NaN +SELECT min((c1, c2)), max((c1, c2)) FROM values((nan, 0.), (0., 0.), (5., 5.)); +(0,0) (5,5) +SELECT minIf((c1, c2), c2 >= 0.0), maxIf((c1, c2), c2 >= 0.0) FROM values((nan, 0.), (0., 0.), (5., 5.)); +(0,0) (5,5) +SELECT (c1, c2) as t FROM values((nan, 0.), (0., 0.), (5., 5.)) ORDER BY t ASC LIMIT 1; +(0,0) +SELECT (c1, c2) as t FROM values((nan, 0.), (0., 0.), (5., 5.)) ORDER BY t DESC LIMIT 1; +(5,5) +SELECT min((c1, c2)), max((c1, c2)) FROM values((-5, 0), (nan, 0.), (0., 0.), (5., 5.)); +(-5,0) (5,5) +SELECT minIf((c1, c2), c2 >= 0.0), maxIf((c1, c2), c2 >= 0.0) FROM values((-5, 0), (nan, 0.), (0., 0.), (5., 5.)); +(-5,0) (5,5) +SELECT (c1, c2) as t FROM values((-5, 0), (nan, 0.), (0., 0.), (5., 5.)) ORDER BY t ASC LIMIT 1; +(-5,0) +SELECT (c1, c2) as t FROM values((-5, 0), (nan, 0.), (0., 0.), (5., 5.)) ORDER BY t DESC LIMIT 1; +(5,5) +-- Tuples with NULL +SELECT min((c1, c2)), max((c1, c2)) FROM values((NULL, 0.), (0., 0.), (5., 5.)); +(0,0) (5,5) +SELECT minIf((c1, c2), c2 >= 0), maxIf((c1, c2), c2 >= 0) FROM values((NULL, 0.), (0., 0.), (5., 5.)); +(0,0) (5,5) +SELECT (c1, c2) as t FROM values((NULL, 0.), (0., 0.), (5., 5.)) ORDER BY t ASC LIMIT 1; +(0,0) +SELECT (c1, c2) as t FROM values((NULL, 0.), (0., 0.), (5., 5.)) ORDER BY t DESC LIMIT 1; +(5,5) +SELECT min((c1, c2)), max((c1, c2)) FROM values((0., 0.), (5., 5.), (NULL, 0.)); +(0,0) (5,5) +SELECT minIf((c1, c2), c2 >= 0), maxIf((c1, c2), c2 >= 0) FROM values((0., 0.), (5., 5.), (NULL, 0.)); +(0,0) (5,5) +SELECT (c1, c2) as t FROM values((NULL, 0.), (0., 0.), (5., 5.), (NULL, 0.)) ORDER BY t ASC LIMIT 1; +(0,0) +SELECT (c1, c2) as t FROM values((NULL, 0.), (0., 0.), (5., 5.), (NULL, 0.)) ORDER BY t DESC LIMIT 1; +(5,5) +-- Map with NULL +SELECT min(map(0, c1)), max(map(0, c1)) FROM values(NULL, 0, 5., 5.); +{0:0} {0:5} +SELECT minIf(map(0, c1), assumeNotNull(c1) >= 0), maxIf(map(0, c1), assumeNotNull(c1) >= 0) FROM values(NULL, 0, 5., 5.); +{0:0} {0:5} +SELECT map(0, c1) as t FROM values(NULL, 0, 5., 5.) ORDER BY t ASC LIMIT 1; +{0:0} +SELECT map(0, c1) as t FROM values(NULL, 0, 5., 5.) 
ORDER BY t DESC LIMIT 1; +{0:5} diff --git a/tests/queries/0_stateless/02982_minmax_nan_null_order.sql b/tests/queries/0_stateless/02982_minmax_nan_null_order.sql new file mode 100644 index 00000000000..ad9e40874a7 --- /dev/null +++ b/tests/queries/0_stateless/02982_minmax_nan_null_order.sql @@ -0,0 +1,28 @@ +-- { echoOn } +-- Tuples with NaN +SELECT min((c1, c2)), max((c1, c2)) FROM values((nan, 0.), (0., 0.), (5., 5.)); +SELECT minIf((c1, c2), c2 >= 0.0), maxIf((c1, c2), c2 >= 0.0) FROM values((nan, 0.), (0., 0.), (5., 5.)); +SELECT (c1, c2) as t FROM values((nan, 0.), (0., 0.), (5., 5.)) ORDER BY t ASC LIMIT 1; +SELECT (c1, c2) as t FROM values((nan, 0.), (0., 0.), (5., 5.)) ORDER BY t DESC LIMIT 1; + +SELECT min((c1, c2)), max((c1, c2)) FROM values((-5, 0), (nan, 0.), (0., 0.), (5., 5.)); +SELECT minIf((c1, c2), c2 >= 0.0), maxIf((c1, c2), c2 >= 0.0) FROM values((-5, 0), (nan, 0.), (0., 0.), (5., 5.)); +SELECT (c1, c2) as t FROM values((-5, 0), (nan, 0.), (0., 0.), (5., 5.)) ORDER BY t ASC LIMIT 1; +SELECT (c1, c2) as t FROM values((-5, 0), (nan, 0.), (0., 0.), (5., 5.)) ORDER BY t DESC LIMIT 1; + +-- Tuples with NULL +SELECT min((c1, c2)), max((c1, c2)) FROM values((NULL, 0.), (0., 0.), (5., 5.)); +SELECT minIf((c1, c2), c2 >= 0), maxIf((c1, c2), c2 >= 0) FROM values((NULL, 0.), (0., 0.), (5., 5.)); +SELECT (c1, c2) as t FROM values((NULL, 0.), (0., 0.), (5., 5.)) ORDER BY t ASC LIMIT 1; +SELECT (c1, c2) as t FROM values((NULL, 0.), (0., 0.), (5., 5.)) ORDER BY t DESC LIMIT 1; + +SELECT min((c1, c2)), max((c1, c2)) FROM values((0., 0.), (5., 5.), (NULL, 0.)); +SELECT minIf((c1, c2), c2 >= 0), maxIf((c1, c2), c2 >= 0) FROM values((0., 0.), (5., 5.), (NULL, 0.)); +SELECT (c1, c2) as t FROM values((NULL, 0.), (0., 0.), (5., 5.), (NULL, 0.)) ORDER BY t ASC LIMIT 1; +SELECT (c1, c2) as t FROM values((NULL, 0.), (0., 0.), (5., 5.), (NULL, 0.)) ORDER BY t DESC LIMIT 1; + +-- Map with NULL +SELECT min(map(0, c1)), max(map(0, c1)) FROM values(NULL, 0, 5., 5.); +SELECT minIf(map(0, c1), assumeNotNull(c1) >= 0), maxIf(map(0, c1), assumeNotNull(c1) >= 0) FROM values(NULL, 0, 5., 5.); +SELECT map(0, c1) as t FROM values(NULL, 0, 5., 5.) ORDER BY t ASC LIMIT 1; +SELECT map(0, c1) as t FROM values(NULL, 0, 5., 5.) 
ORDER BY t DESC LIMIT 1; diff --git a/tests/queries/0_stateless/02982_parallel_replicas_unexpected_cluster.reference b/tests/queries/0_stateless/02982_parallel_replicas_unexpected_cluster.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02982_parallel_replicas_unexpected_cluster.sql b/tests/queries/0_stateless/02982_parallel_replicas_unexpected_cluster.sql new file mode 100644 index 00000000000..210b7d2a18a --- /dev/null +++ b/tests/queries/0_stateless/02982_parallel_replicas_unexpected_cluster.sql @@ -0,0 +1,8 @@ +DROP TABLE IF EXISTS test_unexpected_cluster; +CREATE TABLE test_unexpected_cluster (n UInt64) ENGINE=MergeTree() ORDER BY tuple(); +INSERT INTO test_unexpected_cluster SELECT * FROM numbers(10); + +SET allow_experimental_parallel_reading_from_replicas=2, max_parallel_replicas=2, cluster_for_parallel_replicas='test_cluster_two_shards', parallel_replicas_for_non_replicated_merge_tree=1; +SELECT count() FROM test_unexpected_cluster WHERE NOT ignore(*); -- { serverError UNEXPECTED_CLUSTER } + +DROP TABLE test_unexpected_cluster; diff --git a/tests/queries/0_stateless/02982_perf_introspection_for_inserts.reference b/tests/queries/0_stateless/02982_perf_introspection_for_inserts.reference new file mode 100644 index 00000000000..50d4d226b46 --- /dev/null +++ b/tests/queries/0_stateless/02982_perf_introspection_for_inserts.reference @@ -0,0 +1 @@ +1 1 1 1 1 diff --git a/tests/queries/0_stateless/02982_perf_introspection_for_inserts.sh b/tests/queries/0_stateless/02982_perf_introspection_for_inserts.sh new file mode 100755 index 00000000000..f5fb54b54d3 --- /dev/null +++ b/tests/queries/0_stateless/02982_perf_introspection_for_inserts.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -q """ +CREATE TABLE t02982 +( + n UInt64, + s Nullable(String), + INDEX idx1 n TYPE minmax GRANULARITY 2, + INDEX idx2 n * length(s) TYPE set(1000) GRANULARITY 2, + PROJECTION pr_sort + ( + SELECT + n, + sum(length(s)) + GROUP BY n + ) +) +ENGINE = MergeTree +ORDER BY n; +""" + +query_id=$RANDOM + +$CLICKHOUSE_CLIENT --query_id $query_id -q """ +INSERT INTO t02982 SELECT + number, + 'a' +FROM numbers_mt(1000000); +""" + +$CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS" +$CLICKHOUSE_CLIENT -q """ +SELECT + ProfileEvents['MergeTreeDataProjectionWriterMergingBlocksMicroseconds'] > 0, + ProfileEvents['MergeTreeDataProjectionWriterSortingBlocksMicroseconds'] > 0, + ProfileEvents['MergeTreeDataWriterSortingBlocksMicroseconds'] > 0, + ProfileEvents['MergeTreeDataWriterProjectionsCalculationMicroseconds'] > 0, + ProfileEvents['MergeTreeDataWriterSkipIndicesCalculationMicroseconds'] > 0 +FROM system.query_log +WHERE current_database = currentDatabase() AND query_id='$query_id' AND type = 'QueryFinish'; +""" diff --git a/tests/queries/0_stateless/02983_const_sharding_key.reference b/tests/queries/0_stateless/02983_const_sharding_key.reference new file mode 100644 index 00000000000..06e567b11df --- /dev/null +++ b/tests/queries/0_stateless/02983_const_sharding_key.reference @@ -0,0 +1,7 @@ +1 +2 +3 +4 +5 +6 +7 diff --git a/tests/queries/0_stateless/02983_const_sharding_key.sql b/tests/queries/0_stateless/02983_const_sharding_key.sql new file mode 100644 index 00000000000..339293b8b81 --- /dev/null +++ b/tests/queries/0_stateless/02983_const_sharding_key.sql @@ -0,0 +1,26 @@ +-- Tags: distributed, no-parallel + +DROP DATABASE IF EXISTS shard_0; +DROP DATABASE IF EXISTS shard_1; +DROP TABLE IF EXISTS t_distr; + +CREATE DATABASE IF NOT EXISTS shard_0; +CREATE DATABASE IF NOT EXISTS shard_1; + +CREATE TABLE shard_0.t_local (a Int) ENGINE = Memory; +CREATE TABLE shard_1.t_local (a Int) ENGINE = Memory; +CREATE TABLE t_distr (a Int) ENGINE = Distributed(test_cluster_two_shards_different_databases, '', 't_local', 1000); + +SET distributed_foreground_insert=0; +INSERT INTO t_distr VALUES (1), (2), (3); + +SET distributed_foreground_insert=1; +INSERT INTO t_distr VALUES (4), (5), (6), (7); + +SYSTEM FLUSH DISTRIBUTED t_distr; + +SELECT * FROM t_distr ORDER BY a; + +DROP TABLE t_distr; +DROP DATABASE shard_0; +DROP DATABASE shard_1; diff --git a/tests/queries/0_stateless/02983_empty_map.reference b/tests/queries/0_stateless/02983_empty_map.reference new file mode 100644 index 00000000000..fadedaf23ae --- /dev/null +++ b/tests/queries/0_stateless/02983_empty_map.reference @@ -0,0 +1,7 @@ +-- { echoOn } +SELECT f1, f2['2'], count() FROM t1 GROUP BY 1,2 order by 1,2; +1 1000111 +SELECT f1, f3['2'], count() FROM t1 GROUP BY 1,2 order by 1,2; +1 1000111 +SELECT f1, f4[2], count() FROM t1 GROUP BY 1,2 order by 1,2; +1 0 1000111 diff --git a/tests/queries/0_stateless/02983_empty_map.sql b/tests/queries/0_stateless/02983_empty_map.sql new file mode 100644 index 00000000000..78bc5d8736f --- /dev/null +++ b/tests/queries/0_stateless/02983_empty_map.sql @@ -0,0 +1,21 @@ +--https://github.com/ClickHouse/ClickHouse/issues/59402 +CREATE TABLE t1 +( + f1 Int32, + f2 Map(LowCardinality(String),LowCardinality(String)), + f3 Map(String,String), + f4 Map(Int32,Int32) +) +ENGINE=Memory AS +SELECT 1 as f1, + map(number%2,number%10) as f2, + f2 as f3, + f2 as f4 +from numbers(1000111); + +SET max_block_size=10; + +-- { echoOn } +SELECT f1, f2['2'], count() FROM t1 GROUP BY 1,2 order by 
1,2; +SELECT f1, f3['2'], count() FROM t1 GROUP BY 1,2 order by 1,2; +SELECT f1, f4[2], count() FROM t1 GROUP BY 1,2 order by 1,2; diff --git a/tests/queries/0_stateless/02984_topk_empty_merge.reference b/tests/queries/0_stateless/02984_topk_empty_merge.reference new file mode 100644 index 00000000000..fe51488c706 --- /dev/null +++ b/tests/queries/0_stateless/02984_topk_empty_merge.reference @@ -0,0 +1 @@ +[] diff --git a/tests/queries/0_stateless/02984_topk_empty_merge.sql b/tests/queries/0_stateless/02984_topk_empty_merge.sql new file mode 100644 index 00000000000..754b0cb26a2 --- /dev/null +++ b/tests/queries/0_stateless/02984_topk_empty_merge.sql @@ -0,0 +1,2 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/59107 +SELECT topK('102.4') FROM remote('127.0.0.{1,2}', view(SELECT NULL FROM system.one WHERE dummy = 1)); diff --git a/tests/queries/0_stateless/02985_if_over_big_int_decimal.reference b/tests/queries/0_stateless/02985_if_over_big_int_decimal.reference new file mode 100644 index 00000000000..1dfad945ee2 --- /dev/null +++ b/tests/queries/0_stateless/02985_if_over_big_int_decimal.reference @@ -0,0 +1,12 @@ +49500 +49500 +49500 +49500 +49500 +49500 +450000 +450000 +450000 +450000 +450000 +450000 diff --git a/tests/queries/0_stateless/02985_if_over_big_int_decimal.sql b/tests/queries/0_stateless/02985_if_over_big_int_decimal.sql new file mode 100644 index 00000000000..0295a64a092 --- /dev/null +++ b/tests/queries/0_stateless/02985_if_over_big_int_decimal.sql @@ -0,0 +1,14 @@ +select sumIf(number::Int128, number % 10 == 0) from numbers(1000); +select sumIf(number::UInt128, number % 10 == 0) from numbers(1000); +select sumIf(number::Int256, number % 10 == 0) from numbers(1000); +select sumIf(number::UInt256, number % 10 == 0) from numbers(1000); +select sumIf(number::Decimal128(3), number % 10 == 0) from numbers(1000); +select sumIf(number::Decimal256(3), number % 10 == 0) from numbers(1000); + +-- Test when the condition is neither 0 nor 1 +select sumIf(number::Int128, number % 10) from numbers(1000); +select sumIf(number::UInt128, number % 10) from numbers(1000); +select sumIf(number::Int256, number % 10) from numbers(1000); +select sumIf(number::UInt256, number % 10) from numbers(1000); +select sumIf(number::Decimal128(3), number % 10) from numbers(1000); +select sumIf(number::Decimal256(3), number % 10) from numbers(1000); diff --git a/tests/queries/0_stateless/02985_minmax_index_aggregate_function.reference b/tests/queries/0_stateless/02985_minmax_index_aggregate_function.reference new file mode 100644 index 00000000000..e71eb4f0d57 --- /dev/null +++ b/tests/queries/0_stateless/02985_minmax_index_aggregate_function.reference @@ -0,0 +1,6 @@ +1 +5 10 +6 11 +7 12 +8 13 +9 14 diff --git a/tests/queries/0_stateless/02985_minmax_index_aggregate_function.sql b/tests/queries/0_stateless/02985_minmax_index_aggregate_function.sql new file mode 100644 index 00000000000..7d35c1b310b --- /dev/null +++ b/tests/queries/0_stateless/02985_minmax_index_aggregate_function.sql @@ -0,0 +1,36 @@ +DROP TABLE IF EXISTS t_index_agg_func; + +CREATE TABLE t_index_agg_func +( + id UInt64, + v AggregateFunction(avg, UInt64), + INDEX idx_v v TYPE minmax GRANULARITY 1 +) +ENGINE = AggregatingMergeTree ORDER BY id +SETTINGS index_granularity = 4; -- { serverError BAD_ARGUMENTS } + +CREATE TABLE t_index_agg_func +( + id UInt64, + v AggregateFunction(avg, UInt64), +) +ENGINE = AggregatingMergeTree ORDER BY id +SETTINGS index_granularity = 4; + +ALTER TABLE t_index_agg_func ADD INDEX idx_v v TYPE minmax 
GRANULARITY 1; -- { serverError BAD_ARGUMENTS } + +ALTER TABLE t_index_agg_func ADD INDEX idx_v finalizeAggregation(v) TYPE minmax GRANULARITY 1; + +INSERT INTO t_index_agg_func SELECT number % 10, initializeAggregation('avgState', toUInt64(number % 20)) FROM numbers(1000); +INSERT INTO t_index_agg_func SELECT number % 10, initializeAggregation('avgState', toUInt64(number % 20)) FROM numbers(1000, 1000); + +OPTIMIZE TABLE t_index_agg_func FINAL; + +SELECT count() FROM system.parts WHERE table = 't_index_agg_func' AND database = currentDatabase() AND active; + +SET force_data_skipping_indices = 'idx_v'; +SET use_skip_indexes_if_final = 1; + +SELECT id, finalizeAggregation(v) AS vv FROM t_index_agg_func FINAL WHERE vv >= 10 ORDER BY id; + +DROP TABLE t_index_agg_func; diff --git a/tests/queries/0_stateless/02985_parser_check_stack_size.reference b/tests/queries/0_stateless/02985_parser_check_stack_size.reference new file mode 100644 index 00000000000..f83e0818db2 --- /dev/null +++ b/tests/queries/0_stateless/02985_parser_check_stack_size.reference @@ -0,0 +1 @@ +TOO_DEEP diff --git a/tests/queries/0_stateless/02985_parser_check_stack_size.sh b/tests/queries/0_stateless/02985_parser_check_stack_size.sh new file mode 100755 index 00000000000..c91a0a3eacc --- /dev/null +++ b/tests/queries/0_stateless/02985_parser_check_stack_size.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --query "select 'create table test (x ' || repeat('Array(', 10000) || 'UInt64' || repeat(')', 10000) || ') engine=Memory' format TSVRaw" | $CLICKHOUSE_CURL "${CLICKHOUSE_URL}&max_parser_depth=100000" --data-binary @- | grep -o -F 'TOO_DEEP' diff --git a/tests/queries/0_stateless/02985_shard_query_start_time.reference b/tests/queries/0_stateless/02985_shard_query_start_time.reference new file mode 100644 index 00000000000..ff563ea1d53 --- /dev/null +++ b/tests/queries/0_stateless/02985_shard_query_start_time.reference @@ -0,0 +1,2 @@ +QueryStart 2 2 2 2 +QueryFinish 2 2 2 2 diff --git a/tests/queries/0_stateless/02985_shard_query_start_time.sql b/tests/queries/0_stateless/02985_shard_query_start_time.sql new file mode 100644 index 00000000000..c31d81e58ae --- /dev/null +++ b/tests/queries/0_stateless/02985_shard_query_start_time.sql @@ -0,0 +1,34 @@ +DROP TABLE IF EXISTS sharded_table; +CREATE TABLE sharded_table (dummy UInt8) ENGINE = Distributed('test_cluster_two_shards', 'system', 'one'); + +SET prefer_localhost_replica=0; +SELECT * FROM sharded_table FORMAT Null SETTINGS log_comment='02985_shard_query_start_time_query_1'; + +SYSTEM FLUSH LOGS; + +-- Check that there are 2 queries to shards and for each one query_start_time_microseconds is more recent +-- than initial_query_start_time_microseconds, and initial_query_start_time_microseconds matches the original query +-- query_start_time_microseconds +WITH +( + SELECT + (query_id, query_start_time, query_start_time_microseconds) + FROM + system.query_log + WHERE + event_date >= yesterday() + AND current_database = currentDatabase() + AND log_comment = '02985_shard_query_start_time_query_1' + AND type = 'QueryFinish' +) AS id_and_start_tuple +SELECT + type, + countIf(query_start_time >= initial_query_start_time), -- Using >= because it's comparing seconds + countIf(query_start_time_microseconds > initial_query_start_time_microseconds), + countIf(initial_query_start_time = id_and_start_tuple.2), + 
countIf(initial_query_start_time_microseconds = id_and_start_tuple.3) +FROM + system.query_log +WHERE + NOT is_initial_query AND initial_query_id = id_and_start_tuple.1 +GROUP BY type; diff --git a/tests/queries/0_stateless/02986_leftpad_fixedstring.reference b/tests/queries/0_stateless/02986_leftpad_fixedstring.reference new file mode 100644 index 00000000000..8e51d03d0a8 --- /dev/null +++ b/tests/queries/0_stateless/02986_leftpad_fixedstring.reference @@ -0,0 +1,124 @@ + +a String + +a String + 1 1 1 +61 1 1 1 +6162 1 1 1 +616263 1 1 1 +61626333 1 1 1 +6162633334 1 1 1 +616263333433 1 1 1 +61626333343332 1 1 1 +6162633334333234 1 1 1 +206162633334333234 1 1 1 +20206162633334333234 1 1 1 +2020206162633334333234 1 1 1 +202020206162633334333234 1 1 1 +20202020206162633334333234 1 1 1 +2020202020206162633334333234 1 1 1 +202020202020206162633334333234 1 1 1 +20202020202020206162633334333234 1 1 1 +2020202020202020206162633334333234 1 1 1 +202020202020202020206162633334333234 1 1 1 +20202020202020202020206162633334333234 1 1 1 + 1 1 1 +61 1 1 1 +6162 1 1 1 +616263 1 1 1 +61626333 1 1 1 +6162633334 1 1 1 +616263333433 1 1 1 +61626333343332 1 1 1 +6162633334333234 1 1 1 +616263333433323420 1 1 1 +61626333343332342020 1 1 1 +6162633334333234202020 1 1 1 +616263333433323420202020 1 1 1 +61626333343332342020202020 1 1 1 +6162633334333234202020202020 1 1 1 +616263333433323420202020202020 1 1 1 +61626333343332342020202020202020 1 1 1 +6162633334333234202020202020202020 1 1 1 +616263333433323420202020202020202020 1 1 1 +61626333343332342020202020202020202020 1 1 1 + 1 +61 1 +6162 1 +616263 1 +61626333 1 +6162633334 1 +616263333433 1 +61626333343332 1 +6162633334333234 1 +F09F87AA6162633334333234 1 +F09F87AAF09F87B86162633334333234 1 +F09F87AAF09F87B8F09F87AA6162633334333234 1 +F09F87AAF09F87B8F09F87AAF09F87B86162633334333234 1 +F09F87AAF09F87B8F09F87AAF09F87B8F09F87AA6162633334333234 1 +F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B86162633334333234 1 +F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AA6162633334333234 1 +F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B86162633334333234 1 +F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AA6162633334333234 1 +F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B86162633334333234 1 +F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AA6162633334333234 1 + 1 +61 1 +6162 1 +616263 1 +61626333 1 +6162633334 1 +616263333433 1 +61626333343332 1 +6162633334333234 1 +6162633334333234F09F87AA 1 +6162633334333234F09F87AAF09F87B8 1 +6162633334333234F09F87AAF09F87B8F09F87AA 1 +6162633334333234F09F87AAF09F87B8F09F87AAF09F87B8 1 +6162633334333234F09F87AAF09F87B8F09F87AAF09F87B8F09F87AA 1 +6162633334333234F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8 1 +6162633334333234F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AA 1 +6162633334333234F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8 1 +6162633334333234F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AA 1 +6162633334333234F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8 1 +6162633334333234F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AAF09F87B8F09F87AA 1 + 1 +F09F87AA 1 +F09F87AAF09F87B8 1 +C391F09F87AAF09F87B8 1 +C391C391F09F87AAF09F87B8 1 +C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391F09F87AAF09F87B8 1 
+C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 +C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391F09F87AAF09F87B8 1 + 1 +F09F87AA 1 +F09F87AAF09F87B8 1 +F09F87AAF09F87B8C391 1 +F09F87AAF09F87B8C391C391 1 +F09F87AAF09F87B8C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391 1 +F09F87AAF09F87B8C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391C391 1 diff --git a/tests/queries/0_stateless/02986_leftpad_fixedstring.sql b/tests/queries/0_stateless/02986_leftpad_fixedstring.sql new file mode 100644 index 00000000000..eaed9b3adc6 --- /dev/null +++ b/tests/queries/0_stateless/02986_leftpad_fixedstring.sql @@ -0,0 +1,41 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/59604 +SELECT leftPad(toFixedString('abc', 3), 0), leftPad('abc', CAST('0', 'Int32')); +SELECT leftPad(toFixedString('abc343243424324', 15), 1) as a, toTypeName(a); + +SELECT rightPad(toFixedString('abc', 3), 0), rightPad('abc', CAST('0', 'Int32')); +SELECT rightPad(toFixedString('abc343243424324', 15), 1) as a, toTypeName(a); + +SELECT + hex(leftPad(toFixedString('abc34324' as s, 8), number)) as result, + hex(leftPad(s, number)) = result, + hex(leftPadUTF8(toFixedString(s, 8), number)) = result, + hex(leftPadUTF8(s, number)) = result +FROM numbers(20); + +SELECT + hex(rightPad(toFixedString('abc34324' as s, 8), number)) as result, + hex(rightPad(s, number)) = result, + hex(rightPadUTF8(toFixedString(s, 8), number)) = result, + hex(rightPadUTF8(s, number)) = result +FROM numbers(20); + +-- I'm not confident the behaviour should be like this. 
I'm only testing memory problems +SELECT + hex(leftPadUTF8(toFixedString('abc34324' as s, 8), number, '🇪🇸')) as result, + hex(leftPadUTF8(s, number, '🇪🇸')) = result +FROM numbers(20); + +SELECT + hex(rightPadUTF8(toFixedString('abc34324' as s, 8), number, '🇪🇸')) as result, + hex(rightPadUTF8(s, number, '🇪🇸')) = result +FROM numbers(20); + +SELECT + hex(leftPadUTF8(toFixedString('🇪🇸' as s, 8), number, 'Ñ')) as result, + hex(leftPadUTF8(s, number, 'Ñ')) = result +FROM numbers(20); + +SELECT + hex(rightPadUTF8(toFixedString('🇪🇸' as s, 8), number, 'Ñ')) as result, + hex(rightPadUTF8(s, number, 'Ñ')) = result +FROM numbers(20); diff --git a/tests/queries/0_stateless/02987_logical_optimizer_pass_lowcardinality.reference b/tests/queries/0_stateless/02987_logical_optimizer_pass_lowcardinality.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02987_logical_optimizer_pass_lowcardinality.sql b/tests/queries/0_stateless/02987_logical_optimizer_pass_lowcardinality.sql new file mode 100644 index 00000000000..be7689025b2 --- /dev/null +++ b/tests/queries/0_stateless/02987_logical_optimizer_pass_lowcardinality.sql @@ -0,0 +1,5 @@ +CREATE TABLE 02987_logical_optimizer_table (key Int, value Int) ENGINE=Memory(); +CREATE VIEW v1 AS SELECT * FROM 02987_logical_optimizer_table; +CREATE TABLE 02987_logical_optimizer_merge AS v1 ENGINE=Merge(currentDatabase(), 'v1'); + +SELECT _table, key FROM 02987_logical_optimizer_merge WHERE (_table = toFixedString(toFixedString(toFixedString('v1', toNullable(2)), 2), 2)) OR ((value = toLowCardinality(toNullable(10))) AND (_table = toFixedString(toNullable('v3'), 2))) OR ((value = 20) AND (_table = toFixedString(toFixedString(toFixedString('v1', 2), 2), 2)) AND (_table = toFixedString(toLowCardinality(toFixedString('v3', 2)), 2))) SETTINGS allow_experimental_analyzer = true, join_use_nulls = true, convert_query_to_cnf = true; diff --git a/tests/queries/0_stateless/02988_join_using_prewhere_pushdown.reference b/tests/queries/0_stateless/02988_join_using_prewhere_pushdown.reference new file mode 100644 index 00000000000..c9bf491872a --- /dev/null +++ b/tests/queries/0_stateless/02988_join_using_prewhere_pushdown.reference @@ -0,0 +1,2 @@ +1 a +2 b Int64 diff --git a/tests/queries/0_stateless/02988_join_using_prewhere_pushdown.sql b/tests/queries/0_stateless/02988_join_using_prewhere_pushdown.sql new file mode 100644 index 00000000000..db49f155d3f --- /dev/null +++ b/tests/queries/0_stateless/02988_join_using_prewhere_pushdown.sql @@ -0,0 +1,24 @@ +DROP TABLE IF EXISTS t; + +SET allow_suspicious_low_cardinality_types = 1; + + +CREATE TABLE t (`id` UInt16, `u` LowCardinality(Int32), `s` LowCardinality(String)) +ENGINE = MergeTree ORDER BY id; + +INSERT INTO t VALUES (1,1,'a'),(2,2,'b'); + +SELECT u, s FROM t +INNER JOIN ( SELECT number :: Int32 AS u FROM numbers(10) ) AS t1 +USING (u) +WHERE u != 2 +; + +SELECT u, s, toTypeName(u) FROM t +FULL JOIN ( SELECT number :: UInt32 AS u FROM numbers(10) ) AS t1 +USING (u) +WHERE u == 2 +ORDER BY 1 +; + +DROP TABLE IF EXISTS t; diff --git a/tests/queries/0_stateless/02989_group_by_tuple.reference b/tests/queries/0_stateless/02989_group_by_tuple.reference new file mode 100644 index 00000000000..4539bbf2d22 --- /dev/null +++ b/tests/queries/0_stateless/02989_group_by_tuple.reference @@ -0,0 +1,3 @@ +0 +1 +2 diff --git a/tests/queries/0_stateless/02989_group_by_tuple.sql b/tests/queries/0_stateless/02989_group_by_tuple.sql new file mode 100644 index 00000000000..d0a205f5edc --- /dev/null 
+++ b/tests/queries/0_stateless/02989_group_by_tuple.sql @@ -0,0 +1 @@ +SELECT number FROM numbers(3) GROUP BY (number, number % 2) ORDER BY number; diff --git a/tests/queries/0_stateless/02989_system_tables_metadata_version.reference b/tests/queries/0_stateless/02989_system_tables_metadata_version.reference new file mode 100644 index 00000000000..73f6a1ad346 --- /dev/null +++ b/tests/queries/0_stateless/02989_system_tables_metadata_version.reference @@ -0,0 +1,9 @@ +test_temporary_table_02989 0 +-- +test_table 0 +-- +test_table_replicated 0 +-- +test_table_replicated 1 +-- +test_table_replicated 2 diff --git a/tests/queries/0_stateless/02989_system_tables_metadata_version.sql b/tests/queries/0_stateless/02989_system_tables_metadata_version.sql new file mode 100644 index 00000000000..9534b1f2e82 --- /dev/null +++ b/tests/queries/0_stateless/02989_system_tables_metadata_version.sql @@ -0,0 +1,50 @@ +-- Tags: zookeeper, no-parallel + +DROP TABLE IF EXISTS test_temporary_table_02989; +CREATE TEMPORARY TABLE test_temporary_table_02989 +( + id UInt64, + value String +) ENGINE=MergeTree ORDER BY id; + +SELECT name, metadata_version FROM system.tables WHERE name = 'test_temporary_table_02989' AND is_temporary; + +DROP TABLE test_temporary_table_02989; + +DROP TABLE IF EXISTS test_table; +CREATE TABLE test_table +( + id UInt64, + value String +) ENGINE=MergeTree ORDER BY id; + +SELECT '--'; + +SELECT name, metadata_version FROM system.tables WHERE database = currentDatabase() AND name = 'test_table'; + +DROP TABLE test_table; + +DROP TABLE IF EXISTS test_table_replicated; +CREATE TABLE test_table_replicated +( + id UInt64, + value String +) ENGINE=ReplicatedMergeTree('/clickhouse/tables/{database}/test_table_replicated', '1_replica') ORDER BY id; + +SELECT '--'; + +SELECT name, metadata_version FROM system.tables WHERE database = currentDatabase() AND name = 'test_table_replicated'; + +ALTER TABLE test_table_replicated ADD COLUMN insert_time DateTime; + +SELECT '--'; + +SELECT name, metadata_version FROM system.tables WHERE database = currentDatabase() AND name = 'test_table_replicated'; + +ALTER TABLE test_table_replicated ADD COLUMN insert_time_updated DateTime; + +SELECT '--'; + +SELECT name, metadata_version FROM system.tables WHERE database = currentDatabase() AND name = 'test_table_replicated'; + +DROP TABLE test_table_replicated; diff --git a/tests/queries/0_stateless/02990_arrayFold_nullable_lc.reference b/tests/queries/0_stateless/02990_arrayFold_nullable_lc.reference new file mode 100644 index 00000000000..5bd5d7bbd90 --- /dev/null +++ b/tests/queries/0_stateless/02990_arrayFold_nullable_lc.reference @@ -0,0 +1,16 @@ +23 +23 +23 +23 +3 +3 +\N +1 +\N +\N +\N +23 +23 +23 +\N +\N diff --git a/tests/queries/0_stateless/02990_arrayFold_nullable_lc.sql b/tests/queries/0_stateless/02990_arrayFold_nullable_lc.sql new file mode 100644 index 00000000000..280defdfbb4 --- /dev/null +++ b/tests/queries/0_stateless/02990_arrayFold_nullable_lc.sql @@ -0,0 +1,35 @@ +SET allow_suspicious_low_cardinality_types=1; + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], toInt64(3)); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], toInt64(toNullable(3))); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], materialize(toInt64(toNullable(3)))); + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(3)); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4]::Array(Nullable(Int64)), 
toInt64(toNullable(3))); + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), []::Array(Int64), toInt64(3)); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), []::Array(Nullable(Int64)), toInt64(toNullable(3))); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), []::Array(Nullable(Int64)), toInt64(NULL)); + +SELECT arrayFold((acc, x) -> x, materialize(CAST('[0, 1]', 'Array(Nullable(UInt8))')), toUInt8(toNullable(0))); +SELECT arrayFold((acc, x) -> x, materialize(CAST([NULL], 'Array(Nullable(UInt8))')), toUInt8(toNullable(0))); +SELECT arrayFold((acc, x) -> acc + x, materialize(CAST([NULL], 'Array(Nullable(UInt8))')), toUInt64(toNullable(0))); +SELECT arrayFold((acc, x) -> acc + x, materialize(CAST([1, 2, NULL], 'Array(Nullable(UInt8))')), toUInt64(toNullable(0))); + +SELECT arrayFold((acc, x) -> toNullable(acc + (x * 2)), [1, 2, 3, 4], toInt64(3)); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toNullable(acc + (x * 2)), [1, 2, 3, 4], toNullable(toInt64(3))); + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], toLowCardinality(toInt64(3))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toLowCardinality(acc + (x * 2)), [1, 2, 3, 4], toLowCardinality(toInt64(3))); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4]::Array(LowCardinality(Int64)), toInt64(toLowCardinality(3))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toLowCardinality(acc + (x * 2)), [1, 2, 3, 4]::Array(LowCardinality(Int64)), toInt64(toLowCardinality(3))); + +SELECT arrayFold((acc, x) -> acc + (x * 2), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(toLowCardinality(3))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toLowCardinality(acc + (x * 2)), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(toLowCardinality(3))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toLowCardinality(acc + (x * 2)), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(toNullable(3))); -- { serverError TYPE_MISMATCH } + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], NULL); +-- It's debatable which one of the following 2 queries should work, but considering the return type must match the +-- accumulator type it makes sense to be the second one +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], NULL::LowCardinality(Nullable(Int64))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> (acc + (x * 2))::LowCardinality(Nullable(Int64)), [1, 2, 3, 4], NULL::LowCardinality(Nullable(Int64))); diff --git a/tests/queries/0_stateless/02990_format_lambdas.reference b/tests/queries/0_stateless/02990_format_lambdas.reference new file mode 100644 index 00000000000..f898d6ffa0e --- /dev/null +++ b/tests/queries/0_stateless/02990_format_lambdas.reference @@ -0,0 +1,10 @@ +SELECT lambda(1, 1) +SELECT lambda(1, 1) +SELECT x -> 1 +SELECT x -> 1 +SELECT (x, y) -> 1 +SELECT (x, y) -> 1 +SELECT lambda(f(1), 1) +SELECT lambda(f(1), 1) +SELECT lambda(f(x), 1) +SELECT lambda(f(x), 1) diff --git a/tests/queries/0_stateless/02990_format_lambdas.sh b/tests/queries/0_stateless/02990_format_lambdas.sh new file mode 100755 index 00000000000..9dc5e0f0461 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_lambdas.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +QUERY="SELECT lambda(1, 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; +QUERY="SELECT lambda(x, 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; +QUERY="SELECT lambda((x, y), 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; +QUERY="SELECT lambda(f(1), 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; +QUERY="SELECT lambda(f(x), 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; diff --git a/tests/queries/0_stateless/02990_format_not_precedence.reference b/tests/queries/0_stateless/02990_format_not_precedence.reference new file mode 100644 index 00000000000..f44cf2fdb52 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_not_precedence.reference @@ -0,0 +1,13 @@ +-- { echoOn } +SELECT NOT 0 + NOT 0; +0 +SELECT NOT (0 + (NOT 0)); +0 +SELECT (NOT 0) + (NOT 0); +2 +SELECT formatQuery('SELECT NOT 0 + NOT 0'); +SELECT NOT (0 + (NOT 0)) +SELECT formatQuery('SELECT NOT (0 + (NOT 0))'); +SELECT NOT (0 + (NOT 0)) +SELECT formatQuery('SELECT (NOT 0) + (NOT 0)'); +SELECT (NOT 0) + (NOT 0) diff --git a/tests/queries/0_stateless/02990_format_not_precedence.sql b/tests/queries/0_stateless/02990_format_not_precedence.sql new file mode 100644 index 00000000000..98ef2c9e781 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_not_precedence.sql @@ -0,0 +1,7 @@ +-- { echoOn } +SELECT NOT 0 + NOT 0; +SELECT NOT (0 + (NOT 0)); +SELECT (NOT 0) + (NOT 0); +SELECT formatQuery('SELECT NOT 0 + NOT 0'); +SELECT formatQuery('SELECT NOT (0 + (NOT 0))'); +SELECT formatQuery('SELECT (NOT 0) + (NOT 0)'); diff --git a/tests/queries/0_stateless/02990_format_select_from_explain.reference b/tests/queries/0_stateless/02990_format_select_from_explain.reference new file mode 100644 index 00000000000..7c8dcef3824 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_select_from_explain.reference @@ -0,0 +1,9 @@ +SELECT explain +FROM +( + SELECT * + FROM viewExplain('EXPLAIN AST', '', ( + SELECT * + FROM system.numbers + )) +) diff --git a/tests/queries/0_stateless/02990_format_select_from_explain.sh b/tests/queries/0_stateless/02990_format_select_from_explain.sh new file mode 100755 index 00000000000..4955b733788 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_select_from_explain.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +${CLICKHOUSE_FORMAT} --query "SELECT explain FROM (EXPLAIN AST SELECT * FROM system.numbers)" diff --git a/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.reference b/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.sql b/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.sql new file mode 100644 index 00000000000..5ba0be39991 --- /dev/null +++ b/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.sql @@ -0,0 +1,37 @@ +--https://github.com/ClickHouse/ClickHouse/issues/59999 +DROP TABLE IF EXISTS tags; +CREATE TABLE tags (dev_tag String) ENGINE = Memory AS SELECT '1'; + +SELECT * +FROM +( + SELECT countDistinct(dev_tag) AS total_devtags + FROM + ( + SELECT dev_tag + FROM + ( + SELECT * + FROM tags + ) AS t + GROUP BY dev_tag + ) AS t +) SETTINGS optimize_uniq_to_count=0; + +SELECT * +FROM +( + SELECT countDistinct(dev_tag) AS total_devtags + FROM + ( + SELECT dev_tag + FROM + ( + SELECT * + FROM tags + ) AS t + GROUP BY dev_tag + ) AS t +) SETTINGS optimize_uniq_to_count=1; + +DROP TABLE IF EXISTS tags; diff --git a/tests/queries/0_stateless/02990_rmt_replica_path_uuid.reference b/tests/queries/0_stateless/02990_rmt_replica_path_uuid.reference new file mode 100644 index 00000000000..5521c015fcf --- /dev/null +++ b/tests/queries/0_stateless/02990_rmt_replica_path_uuid.reference @@ -0,0 +1,4 @@ +aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa +/tables/default/aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa/replicas/r1 +aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa +/tables/default/aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa/replicas/r1 diff --git a/tests/queries/0_stateless/02990_rmt_replica_path_uuid.sql b/tests/queries/0_stateless/02990_rmt_replica_path_uuid.sql new file mode 100644 index 00000000000..4fcdff2910f --- /dev/null +++ b/tests/queries/0_stateless/02990_rmt_replica_path_uuid.sql @@ -0,0 +1,23 @@ +-- Tags: no-parallel, no-ordinary-database, no-replicated-database +-- Tag no-parallel: static UUID +-- Tag no-ordinary-database: requires UUID +-- Tag no-replicated-database: executes with ON CLUSTER anyway + +-- Ignore "ATTACH TABLE query with full table definition is not recommended" +-- Ignore BAD_ARGUMENTS +SET send_logs_level='fatal'; + +DROP TABLE IF EXISTS x; + +ATTACH TABLE x UUID 'aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa' (key Int) ENGINE = ReplicatedMergeTree('/tables/{database}/{uuid}', 'r1') ORDER BY tuple(); +SELECT uuid FROM system.tables WHERE database = currentDatabase() and table = 'x'; +SELECT replica_path FROM system.replicas WHERE database = currentDatabase() and table = 'x'; +DROP TABLE x; + +-- {uuid} macro forbidden for CREATE TABLE without explicit UUID +CREATE TABLE x (key Int) ENGINE = ReplicatedMergeTree('/tables/{database}/{uuid}', 'r1') ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } + +CREATE TABLE x UUID 'aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa' (key Int) ENGINE = ReplicatedMergeTree('/tables/{database}/{uuid}', 'r1') ORDER BY tuple(); +SELECT uuid FROM system.tables WHERE database = currentDatabase() and table = 'x'; +SELECT replica_path FROM system.replicas WHERE database = currentDatabase() and table = 'x'; +DROP TABLE x; diff --git a/tests/queries/0_stateless/02991_count_rewrite_analyzer.reference b/tests/queries/0_stateless/02991_count_rewrite_analyzer.reference new file 
mode 100644 index 00000000000..ccb266fc2b5 --- /dev/null +++ b/tests/queries/0_stateless/02991_count_rewrite_analyzer.reference @@ -0,0 +1,4 @@ +Nullable(UInt64) +UInt64 +Nullable(UInt64) +UInt64 diff --git a/tests/queries/0_stateless/02991_count_rewrite_analyzer.sql b/tests/queries/0_stateless/02991_count_rewrite_analyzer.sql new file mode 100644 index 00000000000..b11aeedd225 --- /dev/null +++ b/tests/queries/0_stateless/02991_count_rewrite_analyzer.sql @@ -0,0 +1,7 @@ +-- Regression test for https://github.com/ClickHouse/ClickHouse/issues/59919 +SET allow_experimental_analyzer=1; + +SELECT toTypeName(sum(toNullable('a') IN toNullable('a'))) AS x; +SELECT toTypeName(count(toNullable('a') IN toNullable('a'))) AS x; +SELECT toTypeName(sum(toFixedString('a', toLowCardinality(toNullable(1))) IN toFixedString('a', 1))) AS x; +SELECT toTypeName(count(toFixedString('a', toLowCardinality(toNullable(1))) IN toFixedString('a', 1))) AS x; diff --git a/tests/queries/0_stateless/02992_settings_overflow.reference b/tests/queries/0_stateless/02992_settings_overflow.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02992_settings_overflow.sql b/tests/queries/0_stateless/02992_settings_overflow.sql new file mode 100644 index 00000000000..d120c3400e5 --- /dev/null +++ b/tests/queries/0_stateless/02992_settings_overflow.sql @@ -0,0 +1 @@ +SET max_threads = -1; -- { serverError CANNOT_CONVERT_TYPE } diff --git a/tests/queries/0_stateless/02993_lazy_index_loading.reference b/tests/queries/0_stateless/02993_lazy_index_loading.reference new file mode 100644 index 00000000000..5bc329ae4eb --- /dev/null +++ b/tests/queries/0_stateless/02993_lazy_index_loading.reference @@ -0,0 +1,4 @@ +100000000 140000000 +0 0 +1 +100000000 100000000 diff --git a/tests/queries/0_stateless/02993_lazy_index_loading.sql b/tests/queries/0_stateless/02993_lazy_index_loading.sql new file mode 100644 index 00000000000..7de4af9ef0e --- /dev/null +++ b/tests/queries/0_stateless/02993_lazy_index_loading.sql @@ -0,0 +1,18 @@ +DROP TABLE IF EXISTS test; +CREATE TABLE test (s String) ENGINE = MergeTree ORDER BY s SETTINGS index_granularity = 1; + +INSERT INTO test SELECT randomString(1000) FROM numbers(100000); +SELECT round(primary_key_bytes_in_memory, -7), round(primary_key_bytes_in_memory_allocated, -7) FROM system.parts WHERE database = currentDatabase() AND table = 'test'; + +DETACH TABLE test; +SET max_memory_usage = '50M'; +ATTACH TABLE test; + +SELECT primary_key_bytes_in_memory, primary_key_bytes_in_memory_allocated FROM system.parts WHERE database = currentDatabase() AND table = 'test'; + +SET max_memory_usage = '200M'; +SELECT s != '' FROM test LIMIT 1; + +SELECT round(primary_key_bytes_in_memory, -7), round(primary_key_bytes_in_memory_allocated, -7) FROM system.parts WHERE database = currentDatabase() AND table = 'test'; + +DROP TABLE test; diff --git a/tests/queries/0_stateless/02993_values_escape_quote.reference b/tests/queries/0_stateless/02993_values_escape_quote.reference new file mode 100644 index 00000000000..29d6a133fec --- /dev/null +++ b/tests/queries/0_stateless/02993_values_escape_quote.reference @@ -0,0 +1,3 @@ +('foo')('foo\'bar')('foo\'\'bar') +output_format_values_escape_quote_with_quote=1 +('foo')('foo''bar')('foo''''bar') diff --git a/tests/queries/0_stateless/02993_values_escape_quote.sql b/tests/queries/0_stateless/02993_values_escape_quote.sql new file mode 100644 index 00000000000..e6fc5f1b280 --- /dev/null +++ 
b/tests/queries/0_stateless/02993_values_escape_quote.sql @@ -0,0 +1,12 @@ +select 'foo' format Values; +select 'foo\'bar' format Values; +select 'foo\'\'bar' format Values; + +select '\noutput_format_values_escape_quote_with_quote=1' format LineAsString; +set output_format_values_escape_quote_with_quote=1; + +select 'foo' format Values; +select 'foo\'bar' format Values; +select 'foo\'\'bar' format Values; +-- fix no newline at end of file +select '' format LineAsString; diff --git a/tests/queries/0_stateless/02994_cosineDistanceNullable.reference b/tests/queries/0_stateless/02994_cosineDistanceNullable.reference new file mode 100644 index 00000000000..e4fe1f97e7e --- /dev/null +++ b/tests/queries/0_stateless/02994_cosineDistanceNullable.reference @@ -0,0 +1,11 @@ +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N diff --git a/tests/queries/0_stateless/02994_cosineDistanceNullable.sql b/tests/queries/0_stateless/02994_cosineDistanceNullable.sql new file mode 100644 index 00000000000..a62216982f3 --- /dev/null +++ b/tests/queries/0_stateless/02994_cosineDistanceNullable.sql @@ -0,0 +1,3 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/59596 +SELECT cosineDistance((1, 1), (toNullable(0.5), 0.1)); +SELECT cosineDistance((1, 1), (toNullable(0.5), 0.1)) from numbers(10); diff --git a/tests/queries/1_stateful/00037_uniq_state_merge1.sql b/tests/queries/1_stateful/00037_uniq_state_merge1.sql index c941a14b571..6abaad7297f 100644 --- a/tests/queries/1_stateful/00037_uniq_state_merge1.sql +++ b/tests/queries/1_stateful/00037_uniq_state_merge1.sql @@ -1 +1,2 @@ +SET max_bytes_before_external_group_by = '1G'; SELECT k, any(u) AS u, uniqMerge(us) AS us FROM (SELECT domain(URL) AS k, uniq(UserID) AS u, uniqState(UserID) AS us FROM test.hits GROUP BY k) GROUP BY k ORDER BY u DESC, k ASC LIMIT 100 diff --git a/tests/queries/1_stateful/00038_uniq_state_merge2.sql b/tests/queries/1_stateful/00038_uniq_state_merge2.sql index 677458daeda..f97395943a1 100644 --- a/tests/queries/1_stateful/00038_uniq_state_merge2.sql +++ b/tests/queries/1_stateful/00038_uniq_state_merge2.sql @@ -1 +1,2 @@ +SET max_bytes_before_external_group_by = '1G'; SELECT topLevelDomain(concat('http://', k)) AS tld, sum(u) AS u, uniqMerge(us) AS us FROM (SELECT domain(URL) AS k, uniq(UserID) AS u, uniqState(UserID) AS us FROM test.hits GROUP BY k) GROUP BY tld ORDER BY u DESC, tld ASC LIMIT 100 diff --git a/tests/queries/1_stateful/00098_primary_key_memory_allocated.reference b/tests/queries/1_stateful/00098_primary_key_memory_allocated.reference new file mode 100644 index 00000000000..72749c905a3 --- /dev/null +++ b/tests/queries/1_stateful/00098_primary_key_memory_allocated.reference @@ -0,0 +1 @@ +1 1 1 diff --git a/tests/queries/1_stateful/00098_primary_key_memory_allocated.sql b/tests/queries/1_stateful/00098_primary_key_memory_allocated.sql new file mode 100644 index 00000000000..7371678a0f6 --- /dev/null +++ b/tests/queries/1_stateful/00098_primary_key_memory_allocated.sql @@ -0,0 +1 @@ +SELECT primary_key_bytes_in_memory < 16000, primary_key_bytes_in_memory_allocated < 16000, primary_key_bytes_in_memory_allocated / primary_key_bytes_in_memory < 1.1 FROM system.parts WHERE database = 'test' AND table = 'hits'; diff --git a/tests/queries/1_stateful/00157_cache_dictionary.sql b/tests/queries/1_stateful/00157_cache_dictionary.sql index 3621ff82126..9699843af8f 100644 --- a/tests/queries/1_stateful/00157_cache_dictionary.sql +++ b/tests/queries/1_stateful/00157_cache_dictionary.sql @@ -1,5 +1,8 @@ -- Tags: no-tsan, no-parallel +-- 
Suppress "ReadWriteBufferFromHTTP: HTTP request to `{}` failed at try 1/10 with bytes read: 311149/378695. Error: DB::HTTPException: Received error from remote server {}. (Current backoff wait is 100/10000 ms)" errors +SET send_logs_level='error'; + DROP TABLE IF EXISTS test.hits_1m; CREATE TABLE test.hits_1m AS test.hits diff --git a/tests/queries/1_stateful/00165_jit_aggregate_functions.reference b/tests/queries/1_stateful/00165_jit_aggregate_functions.reference index fa084170f53..62baba2af8b 100644 --- a/tests/queries/1_stateful/00165_jit_aggregate_functions.reference +++ b/tests/queries/1_stateful/00165_jit_aggregate_functions.reference @@ -68,73 +68,3 @@ Simple functions with non compilable function without key 4611686725751467379 9223371678237104442 3626326766789368100 61384643584599682996279588 408650940859.2896 104735.01095549858 8873898 9223372036854775807 4611686018427387904 3818489297630359920 Simple functions if combinator without key 4611687533683519016 9223371678237104442 4124667747700004330 930178817930.5122 321189.2280948817 4434274 9223372036854775806 4611686018427387904 2265422677606390266 -Aggregation without JIT compilation -Simple functions -1704509 4611700827100483880 9223360787015464643 10441337359398154812 19954243669348.844 9648741.579254271 523264 9223372036854775807 4611686018427387904 4544239379628300646 -732797 4611701940806302259 9223355550934604746 977192643464016658 2054229034942.3723 51998323.94457991 475698 9223372036854775807 4611686018427387904 4091184823334377716 -598875 4611701407242345792 9223362250391155632 9312163881623734456 27615161624211.875 12261797.824844675 337212 9223372036854775807 4611686018427387904 3725992504798702670 -792887 4611699550286611812 9223290551912005343 6930300520201292824 27479710385933.586 53095331.60360441 252197 9223372036854775807 4611686018427387904 6536441508464694614 -3807842 4611710821592843606 9223326163906184987 16710274896338005145 85240848090850.69 22373416.533275086 196036 9223372036854775807 4611686018427387904 1797862753609257231 -25703952 4611709443519524003 9223353913449113943 9946868158853570839 67568783303242.086 3154349.826950714 147211 9223372036854775807 4611686018427387904 8737124378202300429 -716829 4611852156092872082 9223361623076951140 15381015774917924786 170693446547158.72 201431892.4773785 90109 9223372036854775807 4611686018427387904 8209915323001116338 -59183 4611730685242027332 9223354909338698162 8078812522502896568 94622946187035.42 1425270865.0901496 85379 9223372036854775807 4611686018427387904 8909082036598843562 -33010362 4611704682869732882 9223268545373999677 2064452191838585926 26532987929602.555 3695122.4062526934 77807 9223372036854775807 4611686018427387904 5411365383789552292 -800784 4611752907938305166 9223340418389788041 18082918611792817587 233352070043266.62 36535786.81446395 77492 9223372036854775807 4611686018427387904 2059255810151375435 -20810645 4611712185532639162 9223218900001937412 4996531385439292694 68246505203164.63 6316535.831023813 73213 9223372036854775807 4611686018427387904 8852740550386113674 -25843850 4611690025407720929 9223346023778617822 12755881190906812868 185015319325648.16 9962165.34831339 68945 9223372036854775807 4611686018427387904 7849665866595760148 -23447120 4611796031755620254 9223329309291309758 17231649548755339966 255019232629204.38 7937191.271698021 67570 9223372036854775807 4611686018427387904 3435410911925610424 -14739804 4611692230555590277 9223313509005166531 2458378896777063244 38308020331864.36 14590240.469105456 64174 
9223372036854775807 4611686018427387904 511910855240035342 -32077710 4611884228437061959 9223352444952988904 12965822147651192908 214467085941034.7 7257521.096258734 60456 9223372036854775807 4611686018427387904 2256071920672551964 -22446879 4611846229717089436 9223124373140579096 13530160492087688838 231724477077663.4 4737362.521046629 58389 9223372036854775807 4611686018427387904 6236276364886386410 -170282 4611833225706935900 9223371583739401906 8076893424988479310 141657635880324.8 1613795518.1065989 57017 9223372036854775807 4611686018427387904 4755775861151848768 -11482817 4611708000353743073 9223337838355779113 14841435427430843458 283531099960470.8 9938452.835998287 52345 9223372036854775807 4611686018427387904 5371586112642152558 -63469 4611695097019173921 9223353530156141191 6296784708578574520 120762239817777.88 579655378.4603049 52142 9223372036854775807 4611686018427387904 4150567963952988110 -29103473 4611744585914335132 9223333530281362537 5908285283932344933 123712996438970.34 867841.595541967 47758 9223372036854775807 4611686018427387904 3238284030821087319 -Simple functions with non compilable function -1704509 4611700827100483880 9223360787015464643 10441337359398154812 3620921835565807284859452 19954243669348.844 9648741.579254271 523264 9223372036854775807 4611686018427387904 4544239379628300646 -732797 4611701940806302259 9223355550934604746 977192643464016658 3289442827160604417733394 2054229034942.3723 51998323.94457991 475698 9223372036854775807 4611686018427387904 4091184823334377716 -598875 4611701407242345792 9223362250391155632 9312163881623734456 2330921446573746856380600 27615161624211.875 12261797.824844675 337212 9223372036854775807 4611686018427387904 3725992504798702670 -792887 4611699550286611812 9223290551912005343 6930300520201292824 1745179600137886041476120 27479710385933.586 53095331.60360441 252197 9223372036854775807 4611686018427387904 6536441508464694614 -3807842 4611710821592843606 9223326163906184987 16710274896338005145 1356295121550317411019929 85240848090850.69 22373416.533275086 196036 9223372036854775807 4611686018427387904 1797862753609257231 -25703952 4611709443519524003 9223353913449113943 9946868158853570839 1018731388338768841564439 67568783303242.086 3154349.826950714 147211 9223372036854775807 4611686018427387904 8737124378202300429 -716829 4611852156092872082 9223361623076951140 15381015774917924786 623810478612337115371442 170693446547158.72 201431892.4773785 90109 9223372036854775807 4611686018427387904 8209915323001116338 -59183 4611730685242027332 9223354909338698162 8078812522502896568 589916507545680254024632 94622946187035.42 1425270865.0901496 85379 9223372036854775807 4611686018427387904 8909082036598843562 -33010362 4611704682869732882 9223268545373999677 2064452191838585926 538517864195994778911814 26532987929602.555 3695122.4062526934 77807 9223372036854775807 4611686018427387904 5411365383789552292 -800784 4611752907938305166 9223340418389788041 18082918611792817587 535545510122473785781683 233352070043266.62 36535786.81446395 77492 9223372036854775807 4611686018427387904 2059255810151375435 -20810645 4611712185532639162 9223218900001937412 4996531385439292694 506405014842860050255126 68246505203164.63 6316535.831023813 73213 9223372036854775807 4611686018427387904 8852740550386113674 -25843850 4611690025407720929 9223346023778617822 12755881190906812868 476547495537329753708996 185015319325648.16 9962165.34831339 68945 9223372036854775807 4611686018427387904 7849665866595760148 -23447120 4611796031755620254 
9223329309291309758 17231649548755339966 467236365548464278670014 255019232629204.38 7937191.271698021 67570 9223372036854775807 4611686018427387904 3435410911925610424 -14739804 4611692230555590277 9223313509005166531 2458378896777063244 444126268697527941770060 38308020331864.36 14590240.469105456 64174 9223372036854775807 4611686018427387904 511910855240035342 -32077710 4611884228437061959 9223352444952988904 12965822147651192908 417407443977973675608140 214467085941034.7 7257521.096258734 60456 9223372036854775807 4611686018427387904 2256071920672551964 -22446879 4611846229717089436 9223124373140579096 13530160492087688838 403462269796593691082374 231724477077663.4 4737362.521046629 58389 9223372036854775807 4611686018427387904 6236276364886386410 -170282 4611833225706935900 9223371583739401906 8076893424988479310 394417911933408911581006 141657635880324.8 1613795518.1065989 57017 9223372036854775807 4611686018427387904 4755775861151848768 -11482817 4611708000353743073 9223337838355779113 14841435427430843458 361995300393829962204226 283531099960470.8 9938452.835998287 52345 9223372036854775807 4611686018427387904 5371586112642152558 -63469 4611695097019173921 9223353530156141191 6296784708578574520 360843057610541117735096 120762239817777.88 579655378.4603049 52142 9223372036854775807 4611686018427387904 4150567963952988110 -29103473 4611744585914335132 9223333530281362537 5908285283932344933 330534668598011678200421 123712996438970.34 867841.595541967 47758 9223372036854775807 4611686018427387904 3238284030821087319 -Simple functions if combinator -1704509 4611700827100483880 9223310246721229500 16398241567152875142 62618822667209.71 2224726.7626273884 261874 9223372036854775806 4611686018427387904 4518874482384062894 -732797 4611721382223060002 9223355550934604746 16281585268876620522 68472164943295.68 5898616.931652982 237784 9223372036854775806 4611686018427387904 3641900047478154650 -598875 4611701407242345792 9223362250391155632 3577699408183553052 21300140553347.42 53771550.26565126 167966 9223372036854775806 4611686018427387904 1688477495230210408 -792887 4611699550286611812 9223164887726235740 7088177025760385824 56461952267903.89 92835869.96920013 125539 9223372036854775806 4611686018427387904 4850868151095058072 -3807842 4611710821592843606 9223283397553859544 5756765290752687660 58835559208469.4 39794091.419183925 97845 9223372036854775806 4611686018427387904 6845214684357194564 -25703952 4611784761593342388 9223241341744449690 4782279928971192568 65182094768443.91 9276773.708181158 73368 9223372036854775806 4611686018427387904 1384302533387727316 -716829 4611852156092872082 9223361623076951140 8613712481895484190 191445613359755.62 291083243.75407773 44993 9223372036854775806 4611686018427387904 6344483471397203854 -59183 4611730685242027332 9223354909338698162 18369075291092794110 429013599530392 5925109959.715378 42817 9223372036854775806 4611686018427387904 5909305558020042898 -33010362 4611704682869732882 9223092117352620518 9991152681891671022 257099731913529.5 12412830.045471078 38861 9223372036854775806 4611686018427387904 4672855013852508626 -800784 4611752907938305166 9223309994342931384 5251877538869750510 135472890315726.03 53535427.52018088 38767 9223372036854775806 4611686018427387904 7801864489649220514 -20810645 4611712185532639162 9223218900001937412 11803718472901310700 323593455407553 10496765.20741332 36477 9223372036854775806 4611686018427387904 5941995311893397960 -25843850 4611744529689964352 9223346023778617822 127137885677350808 3700925266420.715 
18966925.191309396 34353 9223372036854775806 4611686018427387904 6700111718676827412 -23447120 4611796031755620254 9223329309291309758 1841522159325376278 54534534450526.42 6271211.193812284 33768 9223372036854775806 4611686018427387904 2325654077031843898 -14739804 4611762063154116632 9223007205463222212 16302703534054321116 506987919332451.8 6885575.861759452 32156 9223372036854775806 4611686018427387904 2114922310535979832 -32077710 4612033458080771112 9223352444952988904 421072759851674408 13955745719596.793 12220152.393889504 30172 9223372036854775806 4611686018427387904 4399934528735249092 -22446879 4611846229717089436 9223124373140579096 6577134317587565298 224866980668999.47 2482202.163802278 29249 9223372036854775806 4611686018427387904 8763910740678180498 -170282 4611833225706935900 9223371583739401906 15764226366913732386 551447384017691 2515144222.953728 28587 9223372036854775806 4611686018427387904 8217388408377809010 -11482817 4611990575414646848 9223302669582414438 9828522700609834800 378121905921203.2 34845264.2080656 25993 9223372036854775806 4611686018427387904 4689180182672571856 -63469 4612175339998036670 9222961628400798084 17239621485933250238 663164390134376.5 7825349797.6059 25996 9223372036854775806 4611686018427387904 2067736879306995526 -29103473 4611744585914335132 9223035551850347954 12590190375872647672 525927999326314.7 26049107.15514301 23939 9223372036854775806 4611686018427387904 8318055464870862444 -Simple functions without key -4611686725751467379 9223371678237104442 3626326766789368100 408650940859.2896 104735.01095549858 8873898 9223372036854775807 4611686018427387904 3818489297630359920 -Simple functions with non compilable function without key -4611686725751467379 9223371678237104442 3626326766789368100 61384643584599682996279588 408650940859.2896 104735.01095549858 8873898 9223372036854775807 4611686018427387904 3818489297630359920 -Simple functions if combinator without key -4611687533683519016 9223371678237104442 4124667747700004330 930178817930.5122 321189.2280948817 4434274 9223372036854775806 4611686018427387904 2265422677606390266 diff --git a/tests/queries/1_stateful/00165_jit_aggregate_functions.sql b/tests/queries/1_stateful/00165_jit_aggregate_functions.sql index 6017fc57c52..03d29601804 100644 --- a/tests/queries/1_stateful/00165_jit_aggregate_functions.sql +++ b/tests/queries/1_stateful/00165_jit_aggregate_functions.sql @@ -1,4 +1,3 @@ -SET compile_aggregate_expressions = 1; SET min_count_to_compile_aggregate_expression = 0; -- The test uses many aggregations. 
A low max_bytes_before_external_group_by value will lead to high disk usage -- which in CI leads to timeouts @@ -103,104 +102,3 @@ SELECT FROM test.hits ORDER BY min_watch_id DESC LIMIT 20; - -SET compile_aggregate_expressions = 0; - -SELECT 'Aggregation without JIT compilation'; - -SELECT 'Simple functions'; - -SELECT - CounterID, - min(WatchID), - max(WatchID), - sum(WatchID), - avg(WatchID), - avgWeighted(WatchID, CounterID), - count(WatchID), - groupBitOr(WatchID), - groupBitAnd(WatchID), - groupBitXor(WatchID) -FROM test.hits -GROUP BY CounterID ORDER BY count() DESC LIMIT 20; - -SELECT 'Simple functions with non compilable function'; -SELECT - CounterID, - min(WatchID), - max(WatchID), - sum(WatchID), - sum(toUInt128(WatchID)), - avg(WatchID), - avgWeighted(WatchID, CounterID), - count(WatchID), - groupBitOr(WatchID), - groupBitAnd(WatchID), - groupBitXor(WatchID) -FROM test.hits -GROUP BY CounterID ORDER BY count() DESC LIMIT 20; - -SELECT 'Simple functions if combinator'; - -WITH (WatchID % 2 == 0) AS predicate -SELECT - CounterID, - minIf(WatchID,predicate), - maxIf(WatchID, predicate), - sumIf(WatchID, predicate), - avgIf(WatchID, predicate), - avgWeightedIf(WatchID, CounterID, predicate), - countIf(WatchID, predicate), - groupBitOrIf(WatchID, predicate), - groupBitAndIf(WatchID, predicate), - groupBitXorIf(WatchID, predicate) -FROM test.hits -GROUP BY CounterID ORDER BY count() DESC LIMIT 20; - -SELECT 'Simple functions without key'; - -SELECT - min(WatchID) AS min_watch_id, - max(WatchID), - sum(WatchID), - avg(WatchID), - avgWeighted(WatchID, CounterID), - count(WatchID), - groupBitOr(WatchID), - groupBitAnd(WatchID), - groupBitXor(WatchID) -FROM test.hits -ORDER BY min_watch_id DESC LIMIT 20; - -SELECT 'Simple functions with non compilable function without key'; - -SELECT - min(WatchID) AS min_watch_id, - max(WatchID), - sum(WatchID), - sum(toUInt128(WatchID)), - avg(WatchID), - avgWeighted(WatchID, CounterID), - count(WatchID), - groupBitOr(WatchID), - groupBitAnd(WatchID), - groupBitXor(WatchID) -FROM test.hits -ORDER BY min_watch_id DESC LIMIT 20; - -SELECT 'Simple functions if combinator without key'; - -WITH (WatchID % 2 == 0) AS predicate -SELECT - minIf(WatchID, predicate) as min_watch_id, - maxIf(WatchID, predicate), - sumIf(WatchID, predicate), - avgIf(WatchID, predicate), - avgWeightedIf(WatchID, CounterID, predicate), - countIf(WatchID, predicate), - groupBitOrIf(WatchID, predicate), - groupBitAndIf(WatchID, predicate), - groupBitXorIf(WatchID, predicate) -FROM test.hits -ORDER BY min_watch_id -DESC LIMIT 20; diff --git a/tests/queries/1_stateful/00178_quantile_ddsketch.sql b/tests/queries/1_stateful/00178_quantile_ddsketch.sql index 6844dc05cf9..c1ef4b9f4f2 100644 --- a/tests/queries/1_stateful/00178_quantile_ddsketch.sql +++ b/tests/queries/1_stateful/00178_quantile_ddsketch.sql @@ -1,5 +1,5 @@ -SELECT CounterID AS k, round(quantileDDSketch(0.01, 0.5)(ResolutionWidth), 2) FROM test.hits GROUP BY k ORDER BY count() DESC, CounterID LIMIT 10; -SELECT CounterID AS k, arrayMap(a -> round(a, 2), quantilesDDSketch(0.01, 0.1, 0.5, 0.9, 0.99, 0.999)(ResolutionWidth)) FROM test.hits GROUP BY k ORDER BY count() DESC, CounterID LIMIT 10; +SELECT CounterID AS k, round(quantileDD(0.01, 0.5)(ResolutionWidth), 2) FROM test.hits GROUP BY k ORDER BY count() DESC, CounterID LIMIT 10; +SELECT CounterID AS k, arrayMap(a -> round(a, 2), quantilesDD(0.01, 0.1, 0.5, 0.9, 0.99, 0.999)(ResolutionWidth)) FROM test.hits GROUP BY k ORDER BY count() DESC, CounterID LIMIT 10; -SELECT CounterID 
AS k, round(quantileDDSketch(0.01, 0.5)(ResolutionWidth), 2) FROM remote('127.0.0.{1,2}', test.hits) GROUP BY k ORDER BY count() DESC, CounterID LIMIT 10; -SELECT CounterID AS k, arrayMap(a -> round(a, 2), quantilesDDSketch(0.01, 0.1, 0.5, 0.9, 0.99, 0.999)(ResolutionWidth)) FROM remote('127.0.0.{1,2}', test.hits) GROUP BY k ORDER BY count() DESC, CounterID LIMIT 10; +SELECT CounterID AS k, round(quantileDD(0.01, 0.5)(ResolutionWidth), 2) FROM remote('127.0.0.{1,2}', test.hits) GROUP BY k ORDER BY count() DESC, CounterID LIMIT 10; +SELECT CounterID AS k, arrayMap(a -> round(a, 2), quantilesDD(0.01, 0.1, 0.5, 0.9, 0.99, 0.999)(ResolutionWidth)) FROM remote('127.0.0.{1,2}', test.hits) GROUP BY k ORDER BY count() DESC, CounterID LIMIT 10; diff --git a/tests/queries/shell_config.sh b/tests/queries/shell_config.sh index c687a63623f..614bfcece8f 100644 --- a/tests/queries/shell_config.sh +++ b/tests/queries/shell_config.sh @@ -4,6 +4,10 @@ # Don't check for ODR violation, since we may test shared build with ASAN export ASAN_OPTIONS=detect_odr_violation=0 +# If ClickHouse was built with coverage - dump the coverage information at exit +# (in other cases this environment variable has no effect) +export CLICKHOUSE_WRITE_COVERAGE="coverage" + export CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE:="test"} export CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL:="warning"} diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index f4be6ebcf09..f2e5a744a21 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2657 +personal_ws-1.1 en 2724 AArch ACLs ALTERs @@ -12,6 +12,7 @@ ARMv ASLR ASOF ASan +AWND AWST Actian ActionsMenu @@ -189,6 +190,7 @@ CustomSeparatedWithNamesAndTypes DBAs DBMSs DBeaver +DD DDLWORKER DDLWorker DDLWorkerThreads @@ -214,7 +216,6 @@ DatabaseOrdinaryThreadsActive DateTime DateTimes DbCL -DDSketch Decrypted Deduplicate Deduplication @@ -238,6 +239,7 @@ DistributedSend DockerHub DoubleDelta Doxygen +Durre ECMA Ecto EdgeAngle @@ -289,6 +291,7 @@ ForEach FreeBSD Fuzzer Fuzzers +GHCN GTID GTest Gb @@ -444,6 +447,7 @@ Khanna KittenHouse Klickhouse Kolmogorov +Korzeniewski Kubernetes LDAP LGPL @@ -503,6 +507,7 @@ MaxMind MaxPartCountForPartition MaxPushedDDLEntryID Mbps +McNeal Memcheck MemoryCode MemoryDataAndStack @@ -512,6 +517,7 @@ MemorySanitizer MemoryShared MemoryTracking MemoryVirtual +Menne MergeJoin MergeState MergeTree @@ -556,6 +562,7 @@ NEWDATE NEWDECIMAL NFKC NFKD +NOAA NULLIF NVME NVMe @@ -576,6 +583,7 @@ NetworkSendBytes NetworkSendDrop NetworkSendErrors NetworkSendPackets +Noaa NodeJs NuRaft NumHexagons @@ -656,8 +664,10 @@ OrZero OvercommitTracker PAAMAYIM PCRE +PRCP PREWHERE PROCESSLIST +PSUN PagerDuty ParallelFormattingOutputFormatThreads ParallelFormattingOutputFormatThreadsActive @@ -802,6 +812,7 @@ SIMD SLES SLRU SMALLINT +SNWD SPNEGO SQEs SQLAlchemy @@ -829,6 +840,7 @@ Sematext SendExternalTables SendScalars ShareAlike +Shortkeys SimHash Simhash SimpleAggregateFunction @@ -874,11 +886,14 @@ SupersetDocker SystemReplicasThreads SystemReplicasThreadsActive TABLUM +TAVG TCPConnection TCPThreads TDigest TINYINT TLSv +TMAX +TMIN TPCH TSDB TSVRaw @@ -938,6 +953,7 @@ TotalRowsOfMergeTreeTables TotalTemporaryFiles Tradeoff Transactional +Tukey TwoColumnList UBSan UDFs @@ -975,12 +991,15 @@ VIEWs Vadim Valgrind Vectorized +VersionBadge VersionInteger VersionedCollapsingMergeTree 
VideoContainer ViewAllLink VirtualBox +Vose WALs +WSFG Welch's Werror Wether @@ -999,6 +1018,7 @@ Xeon YAML YAMLRegExpTree YYYY +YYYYMMDD YYYYMMDDToDate YYYYMMDDhhmmssToDateTime Yandex @@ -1307,6 +1327,7 @@ cosineDistance countDigits countEqual countMatches +countMatchesCaseInsensitive countSubstrings covarPop covarSamp @@ -1331,6 +1352,7 @@ cryptographic csv csvwithnames csvwithnamesandtypes +ctukey curdate currentDatabase currentProfiles @@ -1570,6 +1592,7 @@ getSetting getSizeOfEnumType getblockinfo getevents +ghcnd github glibc globalIn @@ -1593,6 +1616,7 @@ groupArrayLast groupArrayMovingAvg groupArrayMovingSum groupArraySample +groupArraySorted groupBitAnd groupBitOr groupBitXor @@ -1607,6 +1631,7 @@ grouparraylast grouparraymovingavg grouparraymovingsum grouparraysample +grouparraysorted groupbitand groupbitmap groupbitmapand @@ -1666,6 +1691,7 @@ hudi hyperscan hypot hyvor +iTerm icosahedron icudata idempotency @@ -1952,6 +1978,7 @@ ngramSimHashCaseInsensitiveUTF ngramSimHashUTF ngrambf ngrams +noaa nonNegativeDerivative noop normalizeQuery @@ -2060,7 +2087,6 @@ prebuild prebuilt preemptable preferServerCiphers -prefertch prefetch prefetchsize preloaded @@ -2110,6 +2136,7 @@ py qryn quantile quantileBFloat +quantileDD quantileDeterministic quantileExact quantileExactExclusive @@ -2124,6 +2151,7 @@ quantileTDigestWeighted quantileTiming quantileTimingWeighted quantilebfloat +quantileddsketch quantiledeterministic quantileexact quantileexactweighted @@ -2136,8 +2164,6 @@ quantiletdigest quantiletdigestweighted quantiletiming quantiletimingweighted -quantileddsketch -quantileDDSketch quartile queryID queryString @@ -2209,6 +2235,7 @@ reinterpretAsString reinterpretAsUInt reinterpretAsUUID remoteSecure +repivot replaceAll replaceOne replaceRegexpAll @@ -2270,6 +2297,7 @@ sequenceCount sequenceMatch sequenceNextNode seriesDecomposeSTL +seriesOutliersDetectTukey seriesPeriodDetectFFT serverTimeZone serverTimezone @@ -2283,6 +2311,7 @@ shardNum sharded sharding shortcircuit +shortkeys shoutout simdjson simpleLinearRegression @@ -2562,6 +2591,7 @@ tryPunycodeDecode tskv tsv tui +tukey tumbleEnd tumbleStart tupleConcat @@ -2637,6 +2667,8 @@ uuid varPop varSamp variadic +variantElement +variantType varint varpop varsamp diff --git a/utils/check-style/check-style b/utils/check-style/check-style index daee2e7fb00..a71dac91683 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -76,6 +76,7 @@ EXTERN_TYPES_EXCLUDES=( ProfileEvents::getProfileEvents ProfileEvents::ThreadIdToCountersSnapshot ProfileEvents::LOCAL_NAME + ProfileEvents::keeper_profile_events ProfileEvents::CountersIncrement CurrentMetrics::add @@ -87,6 +88,7 @@ EXTERN_TYPES_EXCLUDES=( CurrentMetrics::Metric CurrentMetrics::values CurrentMetrics::Value + CurrentMetrics::keeper_metrics ErrorCodes::ErrorCode ErrorCodes::getName @@ -106,7 +108,7 @@ for extern_type in ${!EXTERN_TYPES[@]}; do find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | { # NOTE: the check is pretty dumb and distinguish only by the type_of_extern, # and this matches with zkutil::CreateMode - grep -v 'src/Common/ZooKeeper/Types.h' + grep -v -e 'src/Common/ZooKeeper/Types.h' -e 'src/Coordination/KeeperConstants.cpp' } | { grep -vP $EXCLUDE_DIRS | xargs grep -l -P "extern const $type_of_extern $allowed_chars" } | while read file; do @@ -274,6 +276,11 @@ find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | grep -vP $EXCLUDE_DIRS | xargs grep -F '!!!' | grep -P '.' 
&& echo "Too many exclamation marks (looks dirty, unconfident)." +# Exclamation mark in a message +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -F '!",' | grep -P '.' && echo "No need for an exclamation mark (looks dirty, unconfident)." + # Trailing whitespaces find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | grep -vP $EXCLUDE_DIRS | @@ -435,3 +442,14 @@ ls -1d $ROOT_PATH/contrib/*-cmake | xargs -I@ find @ -name 'CMakeLists.txt' -or # DOS/Windows newlines find $ROOT_PATH/{base,src,programs,utils,docs} -name '*.md' -or -name '*.h' -or -name '*.cpp' -or -name '*.js' -or -name '*.py' -or -name '*.html' | xargs grep -l -P '\r$' && echo "^ Files contain DOS/Windows newlines (\r\n instead of \n)." + +# Wrong spelling of abbreviations, e.g. SQL is right, Sql is wrong. XMLHttpRequest is very wrong. +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -P 'Sql|Html|Xml|Cpu|Tcp|Udp|Http|Db|Json|Yaml' | grep -v -P 'RabbitMQ|Azure|Aws|aws|Avro|IO/S3' && + echo "Abbreviations such as SQL, XML, HTTP, should be in all caps. For example, SQL is right, Sql is wrong. XMLHttpRequest is very wrong." + +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -F -i 'ErrorCodes::LOGICAL_ERROR, "Logical error:' && + echo "If an exception has LOGICAL_ERROR code, there is no need to include the text 'Logical error' in the exception message, because then the phrase 'Logical error' will be printed twice." diff --git a/utils/keeper-data-dumper/main.cpp b/utils/keeper-data-dumper/main.cpp index e06b301edbf..351a4ab90bc 100644 --- a/utils/keeper-data-dumper/main.cpp +++ b/utils/keeper-data-dumper/main.cpp @@ -25,13 +25,13 @@ void dumpMachine(std::shared_ptr machine) keys.pop(); std::cout << key << "\n"; auto value = storage.container.getValue(key); - std::cout << "\tStat: {version: " << value.stat.version << - ", mtime: " << value.stat.mtime << - ", emphemeralOwner: " << value.stat.ephemeralOwner << - ", czxid: " << value.stat.czxid << - ", mzxid: " << value.stat.mzxid << - ", numChildren: " << value.stat.numChildren << - ", dataLength: " << value.getData().size() << + std::cout << "\tStat: {version: " << value.version << + ", mtime: " << value.mtime << + ", emphemeralOwner: " << value.ephemeralOwner() << + ", czxid: " << value.czxid << + ", mzxid: " << value.mzxid << + ", numChildren: " << value.numChildren() << + ", dataLength: " << value.data_size << "}" << std::endl; std::cout << "\tData: " << storage.container.getValue(key).getData() << std::endl; @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) Poco::Logger::root().setChannel(channel); Poco::Logger::root().setLevel("trace"); } - auto * logger = &Poco::Logger::get("keeper-dumper"); + auto logger = getLogger("keeper-dumper"); ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; CoordinationSettingsPtr settings = std::make_shared(); diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index b2983033e44..23fc0032056 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,5 +1,13 @@ +v24.1.5.6-stable 2024-02-14 +v24.1.4.20-stable 2024-02-14 +v24.1.3.31-stable 2024-02-09 +v24.1.2.5-stable 2024-02-02 +v24.1.1.2048-stable 2024-01-30 +v23.12.4.15-stable 2024-02-09 +v23.12.3.40-stable 2024-02-02 v23.12.2.59-stable 2024-01-05 v23.12.1.1368-stable 
2023-12-28 +v23.11.5.29-stable 2024-02-02 v23.11.4.24-stable 2024-01-05 v23.11.3.23-stable 2023-12-21 v23.11.2.11-stable 2023-12-13 diff --git a/utils/self-extracting-executable/decompressor.cpp b/utils/self-extracting-executable/decompressor.cpp index 1f19a349d65..071ecb066cb 100644 --- a/utils/self-extracting-executable/decompressor.cpp +++ b/utils/self-extracting-executable/decompressor.cpp @@ -322,6 +322,8 @@ int decompressFiles(int input_fd, char * path, char * name, bool & have_compress return 1; } + if (0 != munmap(output, le64toh(file_info.uncompressed_size))) + perror("munmap"); if (0 != fsync(output_fd)) perror("fsync"); if (0 != close(output_fd)) @@ -527,6 +529,8 @@ int main(int/* argc*/, char* argv[]) char decompressed_name[decompressed_name_len + 1]; (void)snprintf(decompressed_name, decompressed_name_len + 1, decompressed_name_fmt, self, decompressed_suffix); +#if defined(OS_DARWIN) + // We can't just rename it on Mac due to security issues, so we copy it... std::error_code ec; std::filesystem::copy_file(static_cast(decompressed_name), static_cast(self), ec); if (ec) @@ -534,7 +538,13 @@ int main(int/* argc*/, char* argv[]) std::cerr << ec.message() << std::endl; return 1; } - +#else + if (link(decompressed_name, self)) + { + perror("link"); + return 1; + } +#endif if (chmod(self, static_cast(decompressed_umask))) { perror("chmod");
[Residual fragment of the CI build-report HTML table (tags stripped): header cells "Build type", "Version", "Sanitizer", "Coverage", "Status", "Build log", "Build time"; row cells include "none", "{build_result.coverage}", and "{build_result.status}".]
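The utils/self-extracting-executable/decompressor.cpp hunk above replaces the unconditional std::filesystem::copy_file with a hard link() on non-macOS platforms, keeping the copy only under OS_DARWIN because replacing the running binary that way trips macOS security checks (as the added comment notes). A minimal standalone sketch of that link-or-copy pattern follows; it is not the ClickHouse implementation, the OS_DARWIN macro is assumed to come from the build, and the unlink step and paths are illustrative assumptions:

#include <cerrno>
#include <cstdio>
#include <filesystem>
#include <unistd.h>

// Sketch only: replace `self` with the already-written `decompressed` file.
static bool replace_binary(const char * decompressed, const char * self)
{
#if defined(OS_DARWIN)
    // Copy the bytes: hard-linking/renaming over a running binary is not
    // reliable on macOS due to its security checks.
    std::error_code ec;
    std::filesystem::copy_file(decompressed, self,
        std::filesystem::copy_options::overwrite_existing, ec);
    if (ec)
    {
        std::fprintf(stderr, "%s\n", ec.message().c_str());
        return false;
    }
    return true;
#else
    // Hard-link the data that is already on disk instead of writing it again.
    // link() fails with EEXIST if the target exists, so remove it first.
    if (0 != unlink(self) && errno != ENOENT)
    {
        perror("unlink");
        return false;
    }
    if (0 != link(decompressed, self))
    {
        perror("link");
        return false;
    }
    return true;
#endif
}

The hard-link path avoids duplicating a large binary on disk while leaving the observable result the same as the copy path, which is why the patch can switch per platform without changing the surrounding code.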