diff --git a/.github/actions/clean/action.yml b/.github/actions/clean/action.yml
new file mode 100644
index 00000000000..547738b17cc
--- /dev/null
+++ b/.github/actions/clean/action.yml
@@ -0,0 +1,11 @@
+name: Clean runner
+description: Clean the runner's temp path when the job ends
+runs:
+  using: "composite"
+  steps:
+    - name: Clean
+      shell: bash
+      run: |
+        docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+        docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+        sudo rm -fr "${{runner.temp}}"
diff --git a/.github/actions/common_setup/action.yml b/.github/actions/common_setup/action.yml
new file mode 100644
index 00000000000..0d31945087d
--- /dev/null
+++ b/.github/actions/common_setup/action.yml
@@ -0,0 +1,33 @@
+name: Common setup
+description: Set up the necessary environment
+inputs:
+  job_type:
+    description: the name to use in the TEMP_PATH and REPO_COPY
+    default: common
+    type: string
+  nested_job:
+    description: a fuse against unintended use inside the reusable (callable) jobs
+    default: true
+    type: boolean
+runs:
+  using: "composite"
+  steps:
+    - name: Setup and check ENV
+      shell: bash
+      run: |
+        echo "Setup the common ENV variables"
+        cat >> "$GITHUB_ENV" << 'EOF'
+        TEMP_PATH=${{runner.temp}}/${{inputs.job_type}}
+        REPO_COPY=${{runner.temp}}/${{inputs.job_type}}/git-repo-copy
+        EOF
+        if [ -z "${{env.GITHUB_JOB_OVERRIDDEN}}" ] && [ "true" == "${{inputs.nested_job}}" ]; then
+          echo "The GITHUB_JOB_OVERRIDDEN ENV is unset, and must be set for nested jobs"
+          exit 1
+        fi
+    - name: Setup $TEMP_PATH
+      shell: bash
+      run: |
+        # remove any leftovers
+        sudo rm -fr "$TEMP_PATH"
+        mkdir -p "$REPO_COPY"
+        cp -a "$GITHUB_WORKSPACE"/. "$REPO_COPY"/
diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml
index eb4c29130c4..7611c5429c5 100644
--- a/.github/workflows/backport_branches.yml
+++ b/.github/workflows/backport_branches.yml
@@ -1,3 +1,4 @@
+# yamllint disable rule:comments-indentation
 name: BackportPR
 
 env:
@@ -33,7 +34,12 @@ jobs:
       - name: Python unit tests
         run: |
           cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 -m unittest discover -s . -p '*_test.py'
+          echo "Testing the main ci directory"
+          python3 -m unittest discover -s . 
-p 'test_*.py' + for dir in *_lambda/; do + echo "Testing $dir" + python3 -m unittest discover -s "$dir" -p 'test_*.py' + done DockerHubPushAarch64: runs-on: [self-hosted, style-checker-aarch64] needs: CheckLabels @@ -69,7 +75,7 @@ jobs: name: changed_images_amd64 path: ${{ runner.temp }}/docker_images_check/changed_images_amd64.json DockerHubPush: - needs: [DockerHubPushAmd64, DockerHubPushAarch64] + needs: [DockerHubPushAmd64, DockerHubPushAarch64, PythonUnitTests] runs-on: [self-hosted, style-checker] steps: - name: Check out repository code @@ -164,320 +170,43 @@ jobs: ######################################################################################### BuilderDebRelease: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_release - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # For a proper version and performance artifacts - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release + checkout_depth: 0 BuilderDebAarch64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # For a proper version and performance artifacts - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_aarch64 + checkout_depth: 0 BuilderDebAsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - 
steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_asan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_asan BuilderDebTsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_tsan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_tsan BuilderDebDebug: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_debug - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . 
&& echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_debug BuilderBinDarwin: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_darwin - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_darwin + checkout_depth: 0 BuilderBinDarwinAarch64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_darwin_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . 
&& echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_darwin_aarch64 + checkout_depth: 0 ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index a9aa7717add..5a0fc2fabcb 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -1,3 +1,4 @@ +# yamllint disable rule:comments-indentation name: MasterCI env: @@ -19,7 +20,12 @@ jobs: - name: Python unit tests run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 -m unittest discover -s . -p '*_test.py' + echo "Testing the main ci directory" + python3 -m unittest discover -s . -p 'test_*.py' + for dir in *_lambda/; do + echo "Testing $dir" + python3 -m unittest discover -s "$dir" -p 'test_*.py' + done DockerHubPushAarch64: runs-on: [self-hosted, style-checker-aarch64] steps: @@ -179,789 +185,109 @@ jobs: ######################################################################################### BuilderDebRelease: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_release - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # For a proper version and performance artifacts - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + checkout_depth: 0 + build_name: package_release BuilderDebAarch64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - 
IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ runner.temp }}/images_path - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # For a proper version and performance artifacts - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + checkout_depth: 0 + build_name: package_aarch64 BuilderBinRelease: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_release - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + checkout_depth: 0 + build_name: binary_release BuilderDebAsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_asan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ 
env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_asan BuilderDebUBsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_ubsan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_ubsan BuilderDebTsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_tsan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_tsan BuilderDebMsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_msan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | 
- sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_msan BuilderDebDebug: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_debug - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_debug ########################################################################################## ##################################### SPECIAL BUILDS ##################################### ########################################################################################## BuilderBinClangTidy: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_tidy - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: 
${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_tidy BuilderBinDarwin: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_darwin - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_darwin + checkout_depth: 0 BuilderBinAarch64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_aarch64 + checkout_depth: 0 BuilderBinFreeBSD: needs: 
[DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_freebsd - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_freebsd + checkout_depth: 0 BuilderBinDarwinAarch64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_darwin_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . 
&& echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_darwin_aarch64 + checkout_depth: 0 BuilderBinPPC64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_ppc64le - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_ppc64le + checkout_depth: 0 BuilderBinAmd64Compat: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_amd64_compat - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker 
rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_amd64_compat + checkout_depth: 0 BuilderBinAarch64V80Compat: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_aarch64_v80compat - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_aarch64_v80compat + checkout_depth: 0 BuilderBinRISCV64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_riscv64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_riscv64 + checkout_depth: 0 BuilderBinS390X: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_s390x - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - 
with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_s390x + checkout_depth: 0 ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 5937f434135..66a0b186743 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,3 +1,4 @@ +# yamllint disable rule:comments-indentation name: PullRequestCI env: @@ -47,10 +48,10 @@ jobs: run: | cd "$GITHUB_WORKSPACE/tests/ci" echo "Testing the main ci directory" - python3 -m unittest discover -s . -p '*_test.py' + python3 -m unittest discover -s . -p 'test_*.py' for dir in *_lambda/; do echo "Testing $dir" - python3 -m unittest discover -s "$dir" -p '*_test.py' + python3 -m unittest discover -s "$dir" -p 'test_*.py' done DockerHubPushAarch64: needs: CheckLabels @@ -246,771 +247,100 @@ jobs: #################################### ORDINARY BUILDS #################################### ######################################################################################### BuilderDebRelease: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_release - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - fetch-depth: 0 # for performance artifact - filter: tree:0 - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" - BuilderBinRelease: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - 
IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_release - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release + checkout_depth: 0 BuilderDebAarch64: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ runner.temp }}/images_path - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # for performance artifact - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_aarch64 + checkout_depth: 0 + BuilderBinRelease: + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_release BuilderDebAsan: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_asan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py 
"$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_asan BuilderDebUBsan: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_ubsan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_ubsan BuilderDebTsan: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_tsan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_tsan BuilderDebMsan: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - 
CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_msan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_msan BuilderDebDebug: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_debug - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . 
&& echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_debug ########################################################################################## ##################################### SPECIAL BUILDS ##################################### ########################################################################################## BuilderBinClangTidy: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_tidy - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_tidy BuilderBinDarwin: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_darwin - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . 
&& echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_darwin BuilderBinAarch64: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_aarch64 BuilderBinFreeBSD: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_freebsd - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: 
./.github/workflows/reusable_build.yml + with: + build_name: binary_freebsd BuilderBinDarwinAarch64: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_darwin_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_darwin_aarch64 BuilderBinPPC64: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_ppc64le - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_ppc64le BuilderBinAmd64Compat: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - 
BUILD_NAME=binary_amd64_compat - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_amd64_compat BuilderBinAarch64V80Compat: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_aarch64_v80compat - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_aarch64_v80compat BuilderBinRISCV64: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_riscv64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | 
xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_riscv64 BuilderBinS390X: - needs: [DockerHubPush, FastTest, StyleCheck] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_s390x - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + needs: [FastTest, StyleCheck] + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_s390x ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 5dd837c6456..29776d0aa5c 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -1,3 +1,4 @@ +# yamllint disable rule:comments-indentation name: ReleaseBranchCI env: @@ -140,401 +141,53 @@ jobs: ######################################################################################### BuilderDebRelease: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_release - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" 
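The per-job Cleanup step removed here (the reusable workflow introduced later in this diff delegates it to `./.github/actions/clean`) relies on two shell idioms that are easy to misread. A commented sketch, using the exact commands from the step above; only the comments are new:

```bash
# Kill any containers that are still running. "||:" is shorthand for "|| :",
# i.e. fall back to the no-op ":" builtin so a failure here never fails the job.
docker ps --quiet | xargs --no-run-if-empty docker kill ||:

# Remove all containers, running or stopped. --no-run-if-empty keeps GNU xargs
# from invoking "docker rm -f" with no arguments when the list is empty.
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:

# Finally drop the job's temporary and cache directories.
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
```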
+ uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release + checkout_depth: 0 BuilderDebAarch64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ runner.temp }}/images_path - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # For a proper version and performance artifacts - filter: tree:0 - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ runner.temp }}/build_check/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_aarch64 + checkout_depth: 0 BuilderDebAsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_asan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_asan BuilderDebUBsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_ubsan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 
build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_ubsan BuilderDebTsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_tsan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_tsan BuilderDebMsan: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_msan - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_msan BuilderDebDebug: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=package_debug - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - 
name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . && echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_debug BuilderBinDarwin: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_darwin - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . 
&& echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_darwin + checkout_depth: 0 BuilderBinDarwinAarch64: needs: [DockerHubPush] - runs-on: [self-hosted, builder] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/build_check - IMAGES_PATH=${{runner.temp}}/images_path - REPO_COPY=${{runner.temp}}/build_check/ClickHouse - CACHES_PATH=${{runner.temp}}/../ccaches - BUILD_NAME=binary_darwin_aarch64 - EOF - - name: Download changed images - uses: actions/download-artifact@v3 - with: - name: changed_images - path: ${{ env.IMAGES_PATH }} - - name: Check out repository code - uses: ClickHouse/checkout@v1 - with: - clear-repository: true - submodules: true - fetch-depth: 0 # otherwise we will have no info about contributors - filter: tree:0 - - name: Apply sparse checkout for contrib # in order to check that it doesn't break build - run: | - rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' - git -C "$GITHUB_WORKSPACE" checkout . 
&& echo 'restored' - "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' - du -hs "$GITHUB_WORKSPACE/contrib" ||: - find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: - - name: Build - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - - name: Upload build URLs to artifacts - if: ${{ success() || failure() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ env.BUILD_URLS }} - path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - - name: Cleanup - if: always() - run: | - docker ps --quiet | xargs --no-run-if-empty docker kill ||: - docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: - sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + uses: ./.github/workflows/reusable_build.yml + with: + build_name: binary_darwin_aarch64 + checkout_depth: 0 ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ diff --git a/.github/workflows/reusable_build.yml b/.github/workflows/reusable_build.yml new file mode 100644 index 00000000000..1eb25307f0c --- /dev/null +++ b/.github/workflows/reusable_build.yml @@ -0,0 +1,74 @@ +### For the pure soul wishes to move it to another place +# https://github.com/orgs/community/discussions/9050 + +name: Build ClickHouse +'on': + workflow_call: + inputs: + build_name: + description: the value of build type from tests/ci/ci_config.py + required: true + type: string + checkout_depth: + description: the value of the git shallow checkout + required: false + type: number + default: 1 + runner_type: + description: the label of runner to use + default: builder + type: string + additional_envs: + description: additional ENV variables to setup the job + type: string + +jobs: + Build: + name: Build-${{inputs.build_name}} + runs-on: [self-hosted, '${{inputs.runner_type}}'] + steps: + - name: Check out repository code + uses: ClickHouse/checkout@v1 + with: + clear-repository: true + submodules: true + fetch-depth: ${{inputs.checkout_depth}} + filter: tree:0 + - name: Set build envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + IMAGES_PATH=${{runner.temp}}/images_path + GITHUB_JOB_OVERRIDDEN=Build-${{inputs.build_name}} + ${{inputs.additional_envs}} + EOF + python3 "$GITHUB_WORKSPACE"/tests/ci/ci_config.py --build-name "${{inputs.build_name}}" >> "$GITHUB_ENV" + - name: Apply sparse checkout for contrib # in order to check that it doesn't break build + # This step is done in GITHUB_WORKSPACE, + # because it's broken in REPO_COPY for some reason + if: ${{ env.BUILD_SPARSE_CHECKOUT == 'true' }} + run: | + rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed' + git -C "$GITHUB_WORKSPACE" checkout . 
&& echo 'restored' + "$GITHUB_WORKSPACE/contrib/update-submodules.sh" && echo 'OK' + du -hs "$GITHUB_WORKSPACE/contrib" ||: + find "$GITHUB_WORKSPACE/contrib" -type f | wc -l ||: + - name: Common setup + uses: ./.github/actions/common_setup + with: + job_type: build_check + - name: Download changed images + uses: actions/download-artifact@v3 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Build + run: | + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Clean + uses: ./.github/actions/clean diff --git a/.gitmodules b/.gitmodules index 904d2cec249..1a464ee1170 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ +# Please do not use 'branch = ...' tags with submodule entries. Such tags make updating submodules a +# little bit more convenient but they do *not* specify the tracked submodule branch. Thus, they are +# more confusing than useful. [submodule "contrib/zstd"] path = contrib/zstd url = https://github.com/facebook/zstd diff --git a/README.md b/README.md index 5951430a418..d0fd19c0b73 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,17 @@ -[ClickHouse — open source distributed column-oriented DBMS](https://clickhouse.com?utm_source=github) +
-ClickHouse® is an open-source column-oriented database management system that allows generating analytical data reports in real-time. +[![Website](https://img.shields.io/website?up_message=AVAILABLE&down_message=DOWN&url=https%3A%2F%2Fclickhouse.com&style=for-the-badge)](https://clickhouse.com) +[![Apache 2.0 License](https://img.shields.io/badge/license-Apache%202.0-blueviolet?style=for-the-badge)](https://www.apache.org/licenses/LICENSE-2.0) + +[HTML picture block: the ClickHouse company logo, with light and dark theme variants] + +ClickHouse® is an open-source column-oriented database management system that allows generating analytical data reports in real-time. +
## How To Install (Linux, macOS, FreeBSD) ``` @@ -22,8 +33,7 @@ curl https://clickhouse.com/ | sh ## Upcoming Events -* [**ClickHouse Meetup in Beijing**](https://www.meetup.com/clickhouse-beijing-user-group/events/296334856/) - Nov 4 -* [**ClickHouse Meetup in San Francisco**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/296334923/) - Nov 8 +* [**ClickHouse Meetup in San Francisco**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/296334923/) - Nov 14 * [**ClickHouse Meetup in Singapore**](https://www.meetup.com/clickhouse-singapore-meetup-group/events/296334976/) - Nov 15 * [**ClickHouse Meetup in Berlin**](https://www.meetup.com/clickhouse-berlin-user-group/events/296488501/) - Nov 30 * [**ClickHouse Meetup in NYC**](https://www.meetup.com/clickhouse-new-york-user-group/events/296488779/) - Dec 11 @@ -33,7 +43,7 @@ Also, keep an eye out for upcoming meetups around the world. Somewhere else you ## Recent Recordings * **Recent Meetup Videos**: [Meetup Playlist](https://www.youtube.com/playlist?list=PL0Z2YDlm0b3iNDUzpY1S3L_iV4nARda_U) Whenever possible recordings of the ClickHouse Community Meetups are edited and presented as individual talks. Currently featuring "Modern SQL in 2023", "Fast, Concurrent, and Consistent Asynchronous INSERTS in ClickHouse", and "Full-Text Indices: Design and Experiments" -* **Recording available**: [**v23.6 Release Webinar**](https://www.youtube.com/watch?v=cuf_hYn7dqU) All the features of 23.6, one convenient video! Watch it now! +* **Recording available**: [**v23.10 Release Webinar**](https://www.youtube.com/watch?v=PGQS6uPb970) All the features of 23.10, one convenient video! Watch it now! * **All release webinar recordings**: [YouTube playlist](https://www.youtube.com/playlist?list=PL0Z2YDlm0b3jAlSy1JxyP8zluvXaN3nxU) diff --git a/base/base/defines.h b/base/base/defines.h index d852f6b9f63..02058a29096 100644 --- a/base/base/defines.h +++ b/base/base/defines.h @@ -119,17 +119,16 @@ #include namespace DB { - void abortOnFailedAssertion(const String & description); + [[noreturn]] void abortOnFailedAssertion(const String & description); } - #define chassert(x) static_cast<bool>(x) ? void(0) : ::DB::abortOnFailedAssertion(#x) + #define chassert(x) do { static_cast<bool>(x) ? void(0) : ::DB::abortOnFailedAssertion(#x); } while (0) #define UNREACHABLE() abort() // clang-format off #else /// Here sizeof() trick is used to suppress unused warning for result, /// since simple "(void)x" will evaluate the expression, while /// "sizeof(!(x))" will not. - #define NIL_EXPRESSION(x) (void)sizeof(!(x)) - #define chassert(x) NIL_EXPRESSION(x) + #define chassert(x) (void)sizeof(!(x)) #define UNREACHABLE() __builtin_unreachable() #endif #endif diff --git a/cmake/split_debug_symbols.cmake b/cmake/split_debug_symbols.cmake index d6821eb6c48..67c2c386f20 100644 --- a/cmake/split_debug_symbols.cmake +++ b/cmake/split_debug_symbols.cmake @@ -1,3 +1,5 @@ +# Generates a separate file with debug symbols while stripping it from the main binary. +# This is needed for Debian packages.
macro(clickhouse_split_debug_symbols) set(oneValueArgs TARGET DESTINATION_DIR BINARY_PATH) diff --git a/contrib/NuRaft b/contrib/NuRaft index eb1572129c7..b7ea89b817a 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit eb1572129c71beb2156dcdaadc3fb136954aed96 +Subproject commit b7ea89b817a18dc0eafc1f909d568869f02d2d04 diff --git a/contrib/grpc b/contrib/grpc index bef8212d1e0..b723ecae099 160000 --- a/contrib/grpc +++ b/contrib/grpc @@ -1 +1 @@ -Subproject commit bef8212d1e01f99e406c282ceab3d42da08e09ce +Subproject commit b723ecae0991bb873fe87a595dfb187178733fde diff --git a/contrib/libssh-cmake/CMakeLists.txt b/contrib/libssh-cmake/CMakeLists.txt index 58db81cf352..1e58a856119 100644 --- a/contrib/libssh-cmake/CMakeLists.txt +++ b/contrib/libssh-cmake/CMakeLists.txt @@ -1,3 +1,10 @@ +option (ENABLE_SSH "Enable support for SSH keys and protocol" ON) + +if (NOT ENABLE_SSH) + message(STATUS "Not using SSH") + return() +endif() + set(LIB_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libssh") set(LIB_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/libssh") # Specify search path for CMake modules to be loaded by include() diff --git a/contrib/sparse-checkout/update-grpc.sh b/contrib/sparse-checkout/update-grpc.sh index 38934fdbc1b..21628ce8dd1 100755 --- a/contrib/sparse-checkout/update-grpc.sh +++ b/contrib/sparse-checkout/update-grpc.sh @@ -6,12 +6,13 @@ FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout echo '/*' > $FILES_TO_CHECKOUT echo '!/test/*' >> $FILES_TO_CHECKOUT echo '/test/build/*' >> $FILES_TO_CHECKOUT +echo '/test/core/tsi/alts/fake_handshaker/*' >> $FILES_TO_CHECKOUT +echo '/test/core/event_engine/fuzzing_event_engine/*' >> $FILES_TO_CHECKOUT echo '!/tools/*' >> $FILES_TO_CHECKOUT echo '/tools/codegen/*' >> $FILES_TO_CHECKOUT echo '!/examples/*' >> $FILES_TO_CHECKOUT echo '!/doc/*' >> $FILES_TO_CHECKOUT -# FIXME why do we need csharp? -#echo '!/src/csharp/*' >> $FILES_TO_CHECKOUT +echo '!/src/csharp/*' >> $FILES_TO_CHECKOUT echo '!/src/python/*' >> $FILES_TO_CHECKOUT echo '!/src/objective-c/*' >> $FILES_TO_CHECKOUT echo '!/src/php/*' >> $FILES_TO_CHECKOUT diff --git a/contrib/update-submodules.sh b/contrib/update-submodules.sh index c94681e6240..b612d25352b 100755 --- a/contrib/update-submodules.sh +++ b/contrib/update-submodules.sh @@ -1,11 +1,12 @@ #!/bin/sh - set -e -WORKDIR=$(dirname "$0") -WORKDIR=$(readlink -f "${WORKDIR}") +SCRIPT_PATH=$(realpath "$0") +SCRIPT_DIR=$(dirname "${SCRIPT_PATH}") +GIT_DIR=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel) +cd $GIT_DIR -"$WORKDIR/sparse-checkout/setup-sparse-checkout.sh" +contrib/sparse-checkout/setup-sparse-checkout.sh git submodule init git submodule sync -git submodule update --depth=1 +git config --file .gitmodules --get-regexp .*path | sed 's/[^ ]* //' | xargs -I _ --max-procs 64 git submodule update --depth=1 --single-branch _ diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 54ce22838d9..1f4fd39bc26 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.10.1.1976" +ARG VERSION="23.10.3.5" ARG PACKAGES="clickhouse-keeper" # user/group precreated explicitly with fixed uid/gid on purpose. 
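The rewritten `contrib/update-submodules.sh` above replaces a plain `git submodule update` with a parallel pipeline. A commented restatement of that one-liner; the command itself is taken verbatim from the diff, only the line breaks and comments are added:

```bash
# Print the "submodule.<name>.path <path>" pairs recorded in .gitmodules,
# strip the config key with sed so that only the submodule paths remain,
# then let xargs run up to 64 shallow, single-branch updates in parallel,
# substituting each path for the "_" placeholder.
git config --file .gitmodules --get-regexp .*path \
    | sed 's/[^ ]* //' \
    | xargs -I _ --max-procs 64 git submodule update --depth=1 --single-branch _
```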
diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index cc2613cbaf5..150ce1ab385 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -126,6 +126,7 @@ fi mv ./programs/clickhouse* /output || mv ./programs/*_fuzzer /output [ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output +[ -x ./programs/self-extracting/clickhouse-stripped ] && mv ./programs/self-extracting/clickhouse-stripped /output mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds mv ./programs/*.dict ./programs/*.options ./programs/*_seed_corpus.zip /output ||: # libFuzzer oss-fuzz compatible infrastructure diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 3effa0fd50d..41be7e611a3 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.10.1.1976" +ARG VERSION="23.10.3.5" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index d1a1c6ee71f..0ff6ae2e227 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -30,7 +30,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="23.10.1.1976" +ARG VERSION="23.10.3.5" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 0ff79e24bf8..ec24b237752 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -15,10 +15,15 @@ CLICKHOUSE_CI_LOGS_USER=${CLICKHOUSE_CI_LOGS_USER:-ci} # Pre-configured destination cluster, where to export the data CLICKHOUSE_CI_LOGS_CLUSTER=${CLICKHOUSE_CI_LOGS_CLUSTER:-system_logs_export} -EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime('UTC'), check_name String, instance_type String, instance_id String, "} -EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"CAST(0 AS UInt32) AS pull_request_number, '' AS commit_sha, now() AS check_start_time, '' AS check_name, '' AS instance_type, '' AS instance_id"} +EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime('UTC'), check_name LowCardinality(String), instance_type LowCardinality(String), instance_id String, INDEX ix_pr (pull_request_number) TYPE set(100), INDEX ix_commit (commit_sha) TYPE set(100), INDEX ix_check_time (check_start_time) TYPE minmax, "} +EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"CAST(0 AS UInt32) AS pull_request_number, '' AS commit_sha, now() AS check_start_time, toLowCardinality('') AS check_name, toLowCardinality('') AS instance_type, '' AS instance_id"} EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "} +# trace_log needs more columns for symbolization +EXTRA_COLUMNS_TRACE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), lines Array(LowCardinality(String)), " 
+EXTRA_COLUMNS_EXPRESSION_TRACE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> toLowCardinality(demangle(addressToSymbol(x))), trace) AS symbols, arrayMap(x -> toLowCardinality(addressToLine(x)), trace) AS lines" + + function __set_connection_args { # It's impossible to use generous $CONNECTION_ARGS string, it's unsafe from word splitting perspective. @@ -125,9 +130,18 @@ function setup_logs_replication echo 'Create %_log tables' clickhouse-client --query "SHOW TABLES FROM system LIKE '%\\_log'" | while read -r table do + if [[ "$table" = "trace_log" ]] + then + EXTRA_COLUMNS_FOR_TABLE="${EXTRA_COLUMNS_TRACE_LOG}" + EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_TRACE_LOG}" + else + EXTRA_COLUMNS_FOR_TABLE="${EXTRA_COLUMNS}" + EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION}" + fi + # Calculate hash of its structure. Note: 4 is the version of extra columns - increment it if extra columns are changed: hash=$(clickhouse-client --query " - SELECT sipHash64(4, groupArray((name, type))) + SELECT sipHash64(9, groupArray((name, type))) FROM (SELECT name, type FROM system.columns WHERE database = 'system' AND table = '$table' ORDER BY position) @@ -135,7 +149,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' - s/^\($/('"$EXTRA_COLUMNS"'/; + s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; s/ORDER BY \(/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"'/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d @@ -155,7 +169,7 @@ function setup_logs_replication ENGINE = Distributed(${CLICKHOUSE_CI_LOGS_CLUSTER}, default, ${table}_${hash}) SETTINGS flush_on_detach=0 EMPTY AS - SELECT ${EXTRA_COLUMNS_EXPRESSION}, * + SELECT ${EXTRA_COLUMNS_EXPRESSION_FOR_TABLE}, * FROM system.${table} " || continue @@ -163,7 +177,7 @@ function setup_logs_replication clickhouse-client --query " CREATE MATERIALIZED VIEW system.${table}_watcher TO system.${table}_sender AS - SELECT ${EXTRA_COLUMNS_EXPRESSION}, * + SELECT ${EXTRA_COLUMNS_EXPRESSION_FOR_TABLE}, * FROM system.${table} " || continue done diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 34fc12d1a72..9951d79d6ac 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -19,6 +19,11 @@ dpkg -i package_folder/clickhouse-common-static-dbg_*.deb dpkg -i package_folder/clickhouse-server_*.deb dpkg -i package_folder/clickhouse-client_*.deb +# Check that the tools are available under short names +ch --query "SELECT 1" || exit 1 +chl --query "SELECT 1" || exit 1 +chc --version || exit 1 + ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test # shellcheck disable=SC1091 @@ -62,7 +67,7 @@ if [ "$NUM_TRIES" -gt "1" ]; then export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 mkdir -p /var/run/clickhouse-server - # simpliest way to forward env variables to server + # simplest way to forward env variables to server sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon --pid-file /var/run/clickhouse-server/clickhouse-server.pid else sudo clickhouse start diff --git a/docker/test/stateless/stress_tests.lib b/docker/test/stateless/stress_tests.lib index ea79089175e..2309e307324 100644 --- a/docker/test/stateless/stress_tests.lib +++ b/docker/test/stateless/stress_tests.lib @@ -53,31 +53,28 @@ function configure() > 
/etc/clickhouse-server/config.d/keeper_port.xml.tmp sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml - function randomize_config_boolean_value { + function randomize_keeper_config_boolean_value { value=$(($RANDOM % 2)) - sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \ + sudo cat /etc/clickhouse-server/config.d/$2.xml \ | sed "s|<$1>[01]|<$1>$value|" \ - > /etc/clickhouse-server/config.d/keeper_port.xml.tmp - sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml + > /etc/clickhouse-server/config.d/$2.xml.tmp + sudo mv /etc/clickhouse-server/config.d/$2.xml.tmp /etc/clickhouse-server/config.d/$2.xml } if [[ -n "$RANDOMIZE_KEEPER_FEATURE_FLAGS" ]] && [[ "$RANDOMIZE_KEEPER_FEATURE_FLAGS" -eq 1 ]]; then # Randomize all Keeper feature flags - randomize_config_boolean_value filtered_list - randomize_config_boolean_value multi_read - randomize_config_boolean_value check_not_exists - randomize_config_boolean_value create_if_not_exists + randomize_config_boolean_value filtered_list keeper_port + randomize_config_boolean_value multi_read keeper_port + randomize_config_boolean_value check_not_exists keeper_port + randomize_config_boolean_value create_if_not_exists keeper_port fi sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml - #Randomize merge tree setting allow_experimental_block_number_column - value=$(($RANDOM % 2)) - sudo cat /etc/clickhouse-server/config.d/merge_tree_settings.xml \ - | sed "s|[01]|$value|" \ - > /etc/clickhouse-server/config.d/merge_tree_settings.xml.tmp - sudo mv /etc/clickhouse-server/config.d/merge_tree_settings.xml.tmp /etc/clickhouse-server/config.d/merge_tree_settings.xml + randomize_config_boolean_value use_compression zookeeper + + randomize_config_boolean_value allow_experimental_block_number_column merge_tree_settings # for clickhouse-server (via service) echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment diff --git a/docs/changelogs/v23.10.2.13-stable.md b/docs/changelogs/v23.10.2.13-stable.md new file mode 100644 index 00000000000..4961c991047 --- /dev/null +++ b/docs/changelogs/v23.10.2.13-stable.md @@ -0,0 +1,18 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.10.2.13-stable (65d8522bb1d) FIXME as compared to v23.10.1.1976-stable (13adae0e42f) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix a crash during table loading on startup [#56232](https://github.com/ClickHouse/ClickHouse/pull/56232) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix segfault in signal handler for Keeper [#56266](https://github.com/ClickHouse/ClickHouse/pull/56266) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix incomplete query result for UNION in view() function. [#56274](https://github.com/ClickHouse/ClickHouse/pull/56274) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix inconsistency of "cast('0' as DateTime64(3))" and "cast('0' as Nullable(DateTime64(3)))" [#56286](https://github.com/ClickHouse/ClickHouse/pull/56286) ([李扬](https://github.com/taiyang-li)). +* Fix crash in case of adding a column with type Object(JSON) [#56307](https://github.com/ClickHouse/ClickHouse/pull/56307) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). 
+* Fix buffer overflow in T64 [#56434](https://github.com/ClickHouse/ClickHouse/pull/56434) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + diff --git a/docs/changelogs/v23.10.3.5-stable.md b/docs/changelogs/v23.10.3.5-stable.md new file mode 100644 index 00000000000..2357b069cdb --- /dev/null +++ b/docs/changelogs/v23.10.3.5-stable.md @@ -0,0 +1,16 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.10.3.5-stable (b2ba7637a41) FIXME as compared to v23.10.2.13-stable (65d8522bb1d) + +#### Improvement +* Backported in [#56513](https://github.com/ClickHouse/ClickHouse/issues/56513): Allow backup of materialized view with dropped inner table instead of failing the backup. [#56387](https://github.com/ClickHouse/ClickHouse/pull/56387) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### NO CL CATEGORY + +* Backported in [#56605](https://github.com/ClickHouse/ClickHouse/issues/56605):. [#56598](https://github.com/ClickHouse/ClickHouse/pull/56598) ([Maksim Kita](https://github.com/kitaisreal)). + diff --git a/docs/changelogs/v23.3.16.7-lts.md b/docs/changelogs/v23.3.16.7-lts.md new file mode 100644 index 00000000000..7f5aee06e0e --- /dev/null +++ b/docs/changelogs/v23.3.16.7-lts.md @@ -0,0 +1,14 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.3.16.7-lts (fb4125cc92a) FIXME as compared to v23.3.15.29-lts (218336662e4) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix: avoid using regex match, possibly containing alternation, as a key condition. [#54696](https://github.com/ClickHouse/ClickHouse/pull/54696) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix buffer overflow in T64 [#56434](https://github.com/ClickHouse/ClickHouse/pull/56434) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + diff --git a/docs/changelogs/v23.8.6.16-lts.md b/docs/changelogs/v23.8.6.16-lts.md new file mode 100644 index 00000000000..6eb752e987c --- /dev/null +++ b/docs/changelogs/v23.8.6.16-lts.md @@ -0,0 +1,21 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.8.6.16-lts (077df679bed) FIXME as compared to v23.8.5.16-lts (e8a1af5fe2f) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix rare case of CHECKSUM_DOESNT_MATCH error [#54549](https://github.com/ClickHouse/ClickHouse/pull/54549) ([alesapin](https://github.com/alesapin)). +* Fix: avoid using regex match, possibly containing alternation, as a key condition. [#54696](https://github.com/ClickHouse/ClickHouse/pull/54696) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix a crash during table loading on startup [#56232](https://github.com/ClickHouse/ClickHouse/pull/56232) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix segfault in signal handler for Keeper [#56266](https://github.com/ClickHouse/ClickHouse/pull/56266) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix buffer overflow in T64 [#56434](https://github.com/ClickHouse/ClickHouse/pull/56434) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Improve enrich image [#55793](https://github.com/ClickHouse/ClickHouse/pull/55793) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). 
+ diff --git a/docs/changelogs/v23.9.4.11-stable.md b/docs/changelogs/v23.9.4.11-stable.md new file mode 100644 index 00000000000..a5d100ea606 --- /dev/null +++ b/docs/changelogs/v23.9.4.11-stable.md @@ -0,0 +1,17 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.9.4.11-stable (74c1f49dd6a) FIXME as compared to v23.9.3.12-stable (b7230b06563) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix wrong query result when http_write_exception_in_output_format=1 [#56135](https://github.com/ClickHouse/ClickHouse/pull/56135) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix schema cache for fallback JSON->JSONEachRow with changed settings [#56172](https://github.com/ClickHouse/ClickHouse/pull/56172) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix a crash during table loading on startup [#56232](https://github.com/ClickHouse/ClickHouse/pull/56232) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix segfault in signal handler for Keeper [#56266](https://github.com/ClickHouse/ClickHouse/pull/56266) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix buffer overflow in T64 [#56434](https://github.com/ClickHouse/ClickHouse/pull/56434) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md index ba81b31b8ef..cfdd2bbcc41 100644 --- a/docs/en/development/architecture.md +++ b/docs/en/development/architecture.md @@ -67,22 +67,30 @@ Implementations of `ReadBuffer`/`WriteBuffer` are used for working with files an Read/WriteBuffers only deal with bytes. There are functions from `ReadHelpers` and `WriteHelpers` header files to help with formatting input/output. For example, there are helpers to write a number in decimal format. -Let’s look at what happens when you want to write a result set in `JSON` format to stdout. You have a result set ready to be fetched from `IBlockInputStream`. You create `WriteBufferFromFileDescriptor(STDOUT_FILENO)` to write bytes to stdout. You create `JSONRowOutputStream`, initialized with that `WriteBuffer`, to write rows in `JSON` to stdout. You create `BlockOutputStreamFromRowOutputStream` on top of it, to represent it as `IBlockOutputStream`. Then you call `copyData` to transfer data from `IBlockInputStream` to `IBlockOutputStream`, and everything works. Internally, `JSONRowOutputStream` will write various JSON delimiters and call the `IDataType::serializeTextJSON` method with a reference to `IColumn` and the row number as arguments. Consequently, `IDataType::serializeTextJSON` will call a method from `WriteHelpers.h`: for example, `writeText` for numeric types and `writeJSONString` for `DataTypeString`. +Let's examine what happens when you want to write a result set in `JSON` format to stdout. +You have a result set ready to be fetched from a pulling `QueryPipeline`. +First, you create a `WriteBufferFromFileDescriptor(STDOUT_FILENO)` to write bytes to stdout. +Next, you connect the result from the query pipeline to `JSONRowOutputFormat`, which is initialized with that `WriteBuffer`, to write rows in `JSON` format to stdout. +This can be done via the `complete` method, which turns a pulling `QueryPipeline` into a completed `QueryPipeline`. +Internally, `JSONRowOutputFormat` will write various JSON delimiters and call the `IDataType::serializeTextJSON` method with a reference to `IColumn` and the row number as arguments. 
Consequently, `IDataType::serializeTextJSON` will call a method from `WriteHelpers.h`: for example, `writeText` for numeric types and `writeJSONString` for `DataTypeString`. ## Tables {#tables} The `IStorage` interface represents tables. Different implementations of that interface are different table engines. Examples are `StorageMergeTree`, `StorageMemory`, and so on. Instances of these classes are just tables. -The key `IStorage` methods are `read` and `write`. There are also `alter`, `rename`, `drop`, and so on. The `read` method accepts the following arguments: the set of columns to read from a table, the `AST` query to consider, and the desired number of streams to return. It returns one or multiple `IBlockInputStream` objects and information about the stage of data processing that was completed inside a table engine during query execution. +The key methods in `IStorage` are `read` and `write`, along with others such as `alter`, `rename`, and `drop`. The `read` method accepts the following arguments: a set of columns to read from a table, the `AST` query to consider, and the desired number of streams. It returns a `Pipe`. -In most cases, the read method is only responsible for reading the specified columns from a table, not for any further data processing. All further data processing is done by the query interpreter and is outside the responsibility of `IStorage`. +In most cases, the read method is responsible only for reading the specified columns from a table, not for any further data processing. +All subsequent data processing is handled by another part of the pipeline, which falls outside the responsibility of `IStorage`. But there are notable exceptions: - The AST query is passed to the `read` method, and the table engine can use it to derive index usage and to read fewer data from a table. - Sometimes the table engine can process data itself to a specific stage. For example, `StorageDistributed` can send a query to remote servers, ask them to process data to a stage where data from different remote servers can be merged, and return that preprocessed data. The query interpreter then finishes processing the data. -The table’s `read` method can return multiple `IBlockInputStream` objects to allow parallel data processing. These multiple block input streams can read from a table in parallel. Then you can wrap these streams with various transformations (such as expression evaluation or filtering) that can be calculated independently and create a `UnionBlockInputStream` on top of them, to read from multiple streams in parallel. +The table’s `read` method can return a `Pipe` consisting of multiple `Processors`. These `Processors` can read from a table in parallel. +Then, you can connect these processors with various other transformations (such as expression evaluation or filtering), which can be calculated independently. +And then, create a `QueryPipeline` on top of them, and execute it via `PipelineExecutor`. There are also `TableFunction`s. These are functions that return a temporary `IStorage` object to use in the `FROM` clause of a query. @@ -98,9 +106,19 @@ A hand-written recursive descent parser parses a query. For example, `ParserSele ## Interpreters {#interpreters} -Interpreters are responsible for creating the query execution pipeline from an `AST`. There are simple interpreters, such as `InterpreterExistsQuery` and `InterpreterDropQuery`, or the more sophisticated `InterpreterSelectQuery`. The query execution pipeline is a combination of block input or output streams. 
For example, the result of interpreting the `SELECT` query is the `IBlockInputStream` to read the result set from; the result of the `INSERT` query is the `IBlockOutputStream` to write data for insertion to, and the result of interpreting the `INSERT SELECT` query is the `IBlockInputStream` that returns an empty result set on the first read, but that copies data from `SELECT` to `INSERT` at the same time. +Interpreters are responsible for creating the query execution pipeline from an AST. There are simple interpreters, such as `InterpreterExistsQuery` and `InterpreterDropQuery`, as well as the more sophisticated `InterpreterSelectQuery`. -`InterpreterSelectQuery` uses `ExpressionAnalyzer` and `ExpressionActions` machinery for query analysis and transformations. This is where most rule-based query optimizations are done. `ExpressionAnalyzer` is quite messy and should be rewritten: various query transformations and optimizations should be extracted to separate classes to allow modular transformations of query. +The query execution pipeline is a combination of processors that can consume and produce chunks (sets of columns with specific types). +A processor communicates via ports and can have multiple input ports and multiple output ports. +A more detailed description can be found in [src/Processors/IProcessor.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/IProcessor.h). + +For example, the result of interpreting the `SELECT` query is a "pulling" `QueryPipeline` which has a special output port to read the result set from. +The result of the `INSERT` query is a "pushing" `QueryPipeline` with an input port to write data for insertion. +And the result of interpreting the `INSERT SELECT` query is a "completed" `QueryPipeline` that has no inputs or outputs but copies data from `SELECT` to `INSERT` simultaneously. + +`InterpreterSelectQuery` uses `ExpressionAnalyzer` and `ExpressionActions` machinery for query analysis and transformations. This is where most rule-based query optimizations are performed. `ExpressionAnalyzer` is quite messy and should be rewritten: various query transformations and optimizations should be extracted into separate classes to allow for modular transformations of the query. + +To address current problems that exist in interpreters, a new `InterpreterSelectQueryAnalyzer` is being developed. It is a new version of `InterpreterSelectQuery` that does not use `ExpressionAnalyzer` and introduces an additional abstraction level between `AST` and `QueryPipeline` called `QueryTree`. It is not production-ready yet, but it can be tested with the `allow_experimental_analyzer` flag. ## Functions {#functions} diff --git a/docs/en/development/developer-instruction.md b/docs/en/development/developer-instruction.md index bdb7275b826..9d6a80de904 100644 --- a/docs/en/development/developer-instruction.md +++ b/docs/en/development/developer-instruction.md @@ -23,43 +23,34 @@ Create a fork of ClickHouse repository. To do that please click on the “fork The development process consists of first committing the intended changes into your fork of ClickHouse and then creating a “pull request” for these changes to be accepted into the main repository (ClickHouse/ClickHouse). -To work with git repositories, please install `git`. - -To do that in Ubuntu you would run in the command line terminal: +To work with Git repositories, please install `git`. 
To do that in Ubuntu you would run in the command line terminal: sudo apt update sudo apt install git -A brief manual on using Git can be found here: https://education.github.com/git-cheat-sheet-education.pdf. -For a detailed manual on Git see https://git-scm.com/book/en/v2. +A brief manual on using Git can be found [here](https://education.github.com/git-cheat-sheet-education.pdf). +For a detailed manual on Git see [here](https://git-scm.com/book/en/v2). ## Cloning a Repository to Your Development Machine {#cloning-a-repository-to-your-development-machine} Next, you need to download the source files onto your working machine. This is called “to clone a repository” because it creates a local copy of the repository on your working machine. -In the command line terminal run: +Run in your terminal: - git clone --shallow-submodules git@github.com:your_github_username/ClickHouse.git + git clone git@github.com:your_github_username/ClickHouse.git # replace placeholder with your GitHub user name cd ClickHouse -Or (if you'd like to use sparse checkout for submodules and avoid checking out unneeded files): +This command will create a directory `ClickHouse/` containing the source code of ClickHouse. If you specify a custom checkout directory (after the URL), it is important that this path does not contain whitespace as it may lead to problems with the build system. - git clone git@github.com:your_github_username/ClickHouse.git - cd ClickHouse - ./contrib/update-submodules.sh +To make library dependencies available for the build, the ClickHouse repository uses Git submodules, i.e. references to external repositories. These are not checked out by default. To do so, you can either -Note: please, substitute *your_github_username* with what is appropriate! +- run `git clone` with option `--recurse-submodules`, -This command will create a directory `ClickHouse` containing the working copy of the project. +- if `git clone` did not check out submodules, run `git submodule update --init --jobs <N>` (e.g. `<N> = 12` to parallelize the checkout) to achieve the same as the previous alternative, or -It is important that the path to the working directory contains no whitespaces as it may lead to problems with running the build system. +- if `git clone` did not check out submodules and you'd like to use [sparse](https://github.blog/2020-01-17-bring-your-monorepo-down-to-size-with-sparse-checkout/) and [shallow](https://github.blog/2020-12-21-get-up-to-speed-with-partial-clone-and-shallow-clone/) submodule checkout to omit unneeded files and history in submodules to save space (ca. 5 GB instead of ca. 15 GB), run `./contrib/update-submodules.sh`. This is not really recommended as it generally makes working with submodules less convenient and slower. -Please note that ClickHouse repository uses `submodules`. That is what the references to additional repositories are called (i.e. external libraries on which the project depends). It means that when cloning the repository you need to specify the `--recursive` flag as in the example above. If the repository has been cloned without submodules, to download them you need to run the following: - - git submodule init - git submodule update - -You can check the status with the command: `git submodule status`. +You can check the Git status with the command: `git submodule status`.
If you get the following error message: @@ -83,36 +74,6 @@ You can also add original ClickHouse repo address to your local repository to pu After successfully running this command you will be able to pull updates from the main ClickHouse repo by running `git pull upstream master`. -### Working with Submodules {#working-with-submodules} - -Working with submodules in git could be painful. Next commands will help to manage it: - - # ! each command accepts - # Update remote URLs for submodules. Barely rare case - git submodule sync - # Add new submodules - git submodule init - # Update existing submodules to the current state - git submodule update - # Two last commands could be merged together - git submodule update --init - -The next commands would help you to reset all submodules to the initial state (!WARNING! - any changes inside will be deleted): - - # Synchronizes submodules' remote URL with .gitmodules - git submodule sync - # Update the registered submodules with initialize not yet initialized - git submodule update --init - # Reset all changes done after HEAD - git submodule foreach git reset --hard - # Clean files from .gitignore - git submodule foreach git clean -xfd - # Repeat last 4 commands for all submodule - git submodule foreach git submodule sync - git submodule foreach git submodule update --init - git submodule foreach git submodule foreach git reset --hard - git submodule foreach git submodule foreach git clean -xfd - ## Build System {#build-system} ClickHouse uses CMake and Ninja for building. diff --git a/docs/en/development/style.md b/docs/en/development/style.md index 5b03468623d..0b71a669638 100644 --- a/docs/en/development/style.md +++ b/docs/en/development/style.md @@ -345,7 +345,7 @@ struct ExtractDomain **7.** For abstract classes (interfaces) you can add the `I` prefix. ``` cpp -class IBlockInputStream +class IProcessor ``` **8.** If you use a variable locally, you can use the short name. diff --git a/docs/en/engines/table-engines/integrations/materialized-postgresql.md b/docs/en/engines/table-engines/integrations/materialized-postgresql.md index 47dae2ed494..02afec5cfd6 100644 --- a/docs/en/engines/table-engines/integrations/materialized-postgresql.md +++ b/docs/en/engines/table-engines/integrations/materialized-postgresql.md @@ -2,9 +2,10 @@ slug: /en/engines/table-engines/integrations/materialized-postgresql sidebar_position: 130 sidebar_label: MaterializedPostgreSQL -title: MaterializedPostgreSQL --- +# [experimental] MaterializedPostgreSQL + Creates ClickHouse table with an initial data dump of PostgreSQL table and starts replication process, i.e. executes background job to apply new changes as they happen on PostgreSQL table in the remote PostgreSQL database. If more than one table is required, it is highly recommended to use the [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md) database engine instead of the table engine and use the `materialized_postgresql_tables_list` setting, which specifies the tables to be replicated (will also be possible to add database `schema`). It will be much better in terms of CPU, fewer connections and fewer replication slots inside the remote PostgreSQL database. 
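For reviewers of the `MaterializedPostgreSQL` hunk above, a minimal usage sketch of the table engine; the host, database, table, column and credential values are placeholders rather than anything taken from this PR, and, depending on the server version, an experimental-feature setting may need to be enabled first:

``` sql
-- Hypothetical PostgreSQL endpoint and credentials; replace with your own.
CREATE TABLE postgresql_replica
(
    key UInt64,
    value UInt64
)
ENGINE = MaterializedPostgreSQL('postgres-host:5432', 'postgres_db', 'postgres_table', 'postgres_user', 'postgres_password')
PRIMARY KEY key;
```

Once created, the table is filled with the initial dump and then kept up to date by the background replication job described above.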
diff --git a/docs/en/engines/table-engines/special/distributed.md b/docs/en/engines/table-engines/special/distributed.md index eb2979d1283..14431c4c43b 100644 --- a/docs/en/engines/table-engines/special/distributed.md +++ b/docs/en/engines/table-engines/special/distributed.md @@ -46,6 +46,11 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] AS [db2.]name2 `sharding_key` - (optionally) sharding key +Specifying the `sharding_key` is necessary for the following: + +- For `INSERTs` into a distributed table (as the table engine needs the `sharding_key` to determine how to split the data). However, if `insert_distributed_one_random_shard` setting is enabled, then `INSERTs` do not need the sharding key +- For use with `optimize_skip_unused_shards` as the `sharding_key` is necessary to determine what shards should be queried + #### policy_name `policy_name` - (optionally) policy name, it will be used to store temporary files for background send diff --git a/docs/en/getting-started/example-datasets/wikistat.md b/docs/en/getting-started/example-datasets/wikistat.md index 9d0760efe94..d913ccd9b31 100644 --- a/docs/en/getting-started/example-datasets/wikistat.md +++ b/docs/en/getting-started/example-datasets/wikistat.md @@ -1,5 +1,4 @@ --- -slug: /en/getting-started/example-datasets/wikistat sidebar_label: WikiStat --- @@ -41,7 +40,8 @@ CREATE TABLE wikistat project LowCardinality(String), subproject LowCardinality(String), path String CODEC(ZSTD(3)), - hits UInt64 CODEC(ZSTD(3)) + hits UInt64 CODEC(ZSTD(3)), + size UInt64 CODEC(ZSTD(3)) ) ENGINE = MergeTree ORDER BY (path, time); diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index eb963de0c35..155ae316890 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -2156,7 +2156,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t - [input_format_parquet_local_file_min_bytes_for_seek](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_local_file_min_bytes_for_seek) - min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format. Default value - `8192`. - [output_format_parquet_fixed_string_as_fixed_byte_array](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_fixed_string_as_fixed_byte_array) - use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary/String for FixedString columns. Default value - `true`. - [output_format_parquet_version](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_version) - The version of Parquet format used in output format. Default value - `2.latest`. -- [output_format_parquet_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_compression_method) - compression method used in output Parquet format. Default value - `snappy`. +- [output_format_parquet_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_compression_method) - compression method used in output Parquet format. Default value - `lz4`. 
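To make the `sharding_key` requirements from the `Distributed` engine section above concrete, here is a small sketch; the cluster, database, table and column names are hypothetical:

``` sql
-- 'my_cluster', 'mydb', 'hits_local' and 'user_id' are made-up names.
CREATE TABLE mydb.hits_all AS mydb.hits_local
ENGINE = Distributed(my_cluster, mydb, hits_local, intHash64(user_id));

-- The sharding key tells the engine how to split inserted rows between shards.
INSERT INTO mydb.hits_all SELECT * FROM mydb.hits_local;

-- The same key allows optimize_skip_unused_shards to prune shards that cannot
-- contain matching rows.
SELECT count()
FROM mydb.hits_all
WHERE user_id = 42
SETTINGS optimize_skip_unused_shards = 1;
```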
## ParquetMetadata {data-format-parquet-metadata} diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 0e2c0c00e4c..63f75fb7830 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -438,7 +438,7 @@ $ curl -v 'http://localhost:8123/predefined_query' < X-ClickHouse-Query-Id: 96fe0052-01e6-43ce-b12a-6b7370de6e8a < X-ClickHouse-Format: Template < X-ClickHouse-Timezone: Asia/Shanghai -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < # HELP "Query" "Number of executing queries" @@ -603,7 +603,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/hi' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < * Connection #0 to host localhost left intact @@ -643,7 +643,7 @@ $ curl -v -H 'XXX:xxx' 'http://localhost:8123/get_config_static_handler' < Connection: Keep-Alive < Content-Type: text/plain; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < * Connection #0 to host localhost left intact @@ -695,7 +695,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_absolute_path_static_handler' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < Absolute Path File @@ -714,7 +714,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < Relative Path File diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md index 7b5c4f27a2a..5aa634785aa 100644 --- a/docs/en/interfaces/third-party/client-libraries.md +++ b/docs/en/interfaces/third-party/client-libraries.md @@ -74,6 +74,7 @@ ClickHouse Inc does **not** maintain the libraries listed below and hasn’t don ### Elixir - [clickhousex](https://github.com/appodeal/clickhousex/) - [pillar](https://github.com/sofakingworld/pillar) + - [ecto_ch](https://github.com/plausible/ecto_ch) ### Nim - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) ### Haskell diff --git a/docs/en/operations/optimizing-performance/profile-guided-optimization.md b/docs/en/operations/optimizing-performance/profile-guided-optimization.md index cda21e3c604..3d36bb2cc14 100644 --- a/docs/en/operations/optimizing-performance/profile-guided-optimization.md +++ b/docs/en/operations/optimizing-performance/profile-guided-optimization.md @@ -1,5 +1,4 @@ --- -slug: /en/operations/optimizing-performance/profile-guided-optimization sidebar_position: 54 sidebar_label: Profile Guided Optimization (PGO) --- diff --git 
a/docs/en/operations/optimizing-performance/sampling-query-profiler.md b/docs/en/operations/optimizing-performance/sampling-query-profiler.md index 9988bfc44bc..206f710734e 100644 --- a/docs/en/operations/optimizing-performance/sampling-query-profiler.md +++ b/docs/en/operations/optimizing-performance/sampling-query-profiler.md @@ -11,7 +11,8 @@ ClickHouse runs sampling profiler that allows analyzing query execution. Using p Query profiler is automatically enabled in ClickHouse Cloud and you can run a sample query as follows -:::note If you are running the following query in ClickHouse Cloud, make sure to change `FROM system.trace_log` to `FROM clusterAllReplicas(default, system.trace_log)` to select from all nodes of the cluster ::: +:::note If you are running the following query in ClickHouse Cloud, make sure to change `FROM system.trace_log` to `FROM clusterAllReplicas(default, system.trace_log)` to select from all nodes of the cluster +::: ``` sql SELECT diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 2c3f8be79b3..98636a653fb 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -214,7 +214,7 @@ Max consecutive resolving failures before dropping a host from ClickHouse DNS ca Type: UInt32 -Default: 1024 +Default: 10 ## index_mark_cache_policy @@ -2427,6 +2427,8 @@ This section contains the following parameters: * hostname_levenshtein_distance - just like nearest_hostname, but it compares hostname in a levenshtein distance manner. * first_or_random - selects the first ZooKeeper node, if it's not available then randomly selects one of remaining ZooKeeper nodes. * round_robin - selects the first ZooKeeper node, if reconnection happens selects the next. +- `use_compression` — If set to true, enables compression in the Keeper protocol. + **Example configuration** diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index bb59402079e..344e6dda680 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -897,6 +897,12 @@ Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF). Disabled by default. +### input_format_csv_allow_cr_end_of_line {#input_format_csv_allow_cr_end_of_line} + +If set to true, CR (\\r) is allowed at the end of a line even when it is not followed by LF (\\n). + +Disabled by default. + ### input_format_csv_enum_as_number {#input_format_csv_enum_as_number} When enabled, always treat enum values as enum ids for CSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 8c138b7ea0a..2f3805e8e55 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3310,22 +3310,11 @@ Possible values: Default value: `0`. -## use_mysql_types_in_show_columns {#use_mysql_types_in_show_columns} - -Show the names of MySQL data types corresponding to ClickHouse data types in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). - -Possible values: - -- 0 - Show names of native ClickHouse data types. -- 1 - Show names of MySQL data types corresponding to ClickHouse data types. - -Default value: `0`.
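As an illustration of the `input_format_csv_allow_cr_end_of_line` setting documented above, a sketch using the `format` table function with made-up inline data (assuming the CSV input format settings apply to that function as they do to other inputs):

``` sql
-- The second row ends with a bare CR (\r) that is not followed by LF.
SELECT *
FROM format(CSV, '1,2\n3,4\r')
SETTINGS input_format_csv_allow_cr_end_of_line = 1;
```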
- ## mysql_map_string_to_text_in_show_columns {#mysql_map_string_to_text_in_show_columns} When enabled, [String](../../sql-reference/data-types/string.md) ClickHouse data type will be displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). -Has effect only when [use_mysql_types_in_show_columns](#use_mysql_types_in_show_columns) is enabled. +Has an effect only when the connection is made through the MySQL wire protocol. - 0 - Use `BLOB`. - 1 - Use `TEXT`. @@ -3336,7 +3325,7 @@ Default value: `0`. When enabled, [FixedString](../../sql-reference/data-types/fixedstring.md) ClickHouse data type will be displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns). -Has effect only when [use_mysql_types_in_show_columns](#use_mysql_types_in_show_columns) is enabled. +Has an effect only when the connection is made through the MySQL wire protocol. - 0 - Use `BLOB`. - 1 - Use `TEXT`. @@ -3954,6 +3943,17 @@ Possible values: Default value: `''`. +## preferred_optimize_projection_name {#preferred_optimize_projection_name} + +If set to a non-empty string, ClickHouse will try to apply the specified projection in the query. + + +Possible values: + +- string: name of the preferred projection + +Default value: `''`. + ## alter_sync {#alter-sync} Allows to set up waiting for actions to be executed on replicas by [ALTER](../../sql-reference/statements/alter/index.md), [OPTIMIZE](../../sql-reference/statements/optimize.md) or [TRUNCATE](../../sql-reference/statements/truncate.md) queries. @@ -4812,3 +4812,10 @@ LIFETIME(MIN 0 MAX 3600) LAYOUT(COMPLEX_KEY_HASHED_ARRAY()) SETTINGS(dictionary_use_async_executor=1, max_threads=8); ``` + +## storage_metadata_write_full_object_key {#storage_metadata_write_full_object_key} + +When set to `true`, the metadata files are written with the `VERSION_FULL_OBJECT_KEY` format version. With that format, full object storage key names are written to the metadata files. +When set to `false`, the metadata files are written with the previous format version, `VERSION_INLINE_DATA`. With that format, only suffixes of object storage key names are written to the metadata files. The prefix for all object storage key names is set in the configuration files in the `storage_configuration.disks` section. + +Default value: `false`. diff --git a/docs/en/operations/system-tables/information_schema.md b/docs/en/operations/system-tables/information_schema.md index 8470ac838a4..d3f06f6e719 100644 --- a/docs/en/operations/system-tables/information_schema.md +++ b/docs/en/operations/system-tables/information_schema.md @@ -18,12 +18,14 @@ SHOW TABLES FROM information_schema; │ KEY_COLUMN_USAGE │ │ REFERENTIAL_CONSTRAINTS │ │ SCHEMATA │ +│ STATISTICS │ │ TABLES │ │ VIEWS │ │ columns │ │ key_column_usage │ │ referential_constraints │ │ schemata │ +│ statistics │ │ tables │ │ views │ └─────────────────────────┘ @@ -32,11 +34,12 @@ SHOW TABLES FROM information_schema; `INFORMATION_SCHEMA` contains the following views: - [COLUMNS](#columns) -- [SCHEMATA](#schemata) -- [TABLES](#tables) -- [VIEWS](#views) - [KEY_COLUMN_USAGE](#key_column_usage) - [REFERENTIAL_CONSTRAINTS](#referential_constraints) +- [SCHEMATA](#schemata) +- [STATISTICS](#statistics) +- [TABLES](#tables) +- [VIEWS](#views) Case-insensitive equivalent views, e.g. `INFORMATION_SCHEMA.columns` are provided for reasons of compatibility with other databases.
The same applies to all the columns in these views - both lowercase (for example, `table_name`) and uppercase (`TABLE_NAME`) variants are provided. @@ -372,3 +375,28 @@ Columns: - `delete_rule` ([String](../../sql-reference/data-types/string.md)) — Currently unused. - `table_name` ([String](../../sql-reference/data-types/string.md)) — Currently unused. - `referenced_table_name` ([String](../../sql-reference/data-types/string.md)) — Currently unused. + +## STATISTICS {#statistics} + +Provides information about table indexes. Currently returns an empty result (no rows) which is just enough to provide compatibility with 3rd party tools like Tableau Online. + +Columns: + +- `table_catalog` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `table_schema` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `table_name` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `non_unique` ([Int32](../../sql-reference/data-types/int-uint.md)) — Currently unused. +- `index_schema` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `index_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Currently unused. +- `seq_in_index` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Currently unused. +- `column_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Currently unused. +- `collation` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Currently unused. +- `cardinality` ([Nullable](../../sql-reference/data-types/nullable.md)([Int64](../../sql-reference/data-types/int-uint.md))) — Currently unused. +- `sub_part` ([Nullable](../../sql-reference/data-types/nullable.md)([Int64](../../sql-reference/data-types/int-uint.md))) — Currently unused. +- `packed` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Currently unused. +- `nullable` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `index_type` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `comment` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `index_comment` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `is_visible` ([String](../../sql-reference/data-types/string.md)) — Currently unused. +- `expression` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Currently unused. 
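A quick way to probe the compatibility view described above; the table name is a placeholder, and the result is always an empty set:

``` sql
-- Tools such as Tableau Online issue queries like this; ClickHouse returns zero rows.
SELECT * FROM INFORMATION_SCHEMA.STATISTICS WHERE table_name = 'hits';
```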
diff --git a/docs/en/operations/system-tables/stack_trace.md b/docs/en/operations/system-tables/stack_trace.md index 52ee7088597..90f1f47e52f 100644 --- a/docs/en/operations/system-tables/stack_trace.md +++ b/docs/en/operations/system-tables/stack_trace.md @@ -35,27 +35,25 @@ WITH arrayMap(x -> demangle(addressToSymbol(x)), trace) AS all SELECT thread_nam ``` text Row 1: ────── -thread_name: clickhouse-serv - -thread_id: 686 -query_id: 1a11f70b-626d-47c1-b948-f9c7b206395d -res: sigqueue -DB::StorageSystemStackTrace::fillData(std::__1::vector::mutable_ptr, std::__1::allocator::mutable_ptr > >&, DB::Context const&, DB::SelectQueryInfo const&) const -DB::IStorageSystemOneBlock::read(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&, DB::SelectQueryInfo const&, DB::Context const&, DB::QueryProcessingStage::Enum, unsigned long, unsigned int) -DB::InterpreterSelectQuery::executeFetchColumns(DB::QueryProcessingStage::Enum, DB::QueryPipeline&, std::__1::shared_ptr const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) -DB::InterpreterSelectQuery::executeImpl(DB::QueryPipeline&, std::__1::shared_ptr const&, std::__1::optional) -DB::InterpreterSelectQuery::execute() -DB::InterpreterSelectWithUnionQuery::execute() -DB::executeQueryImpl(char const*, char const*, DB::Context&, bool, DB::QueryProcessingStage::Enum, bool, DB::ReadBuffer*) -DB::executeQuery(std::__1::basic_string, std::__1::allocator > const&, DB::Context&, bool, DB::QueryProcessingStage::Enum, bool) -DB::TCPHandler::runImpl() -DB::TCPHandler::run() -Poco::Net::TCPServerConnection::start() -Poco::Net::TCPServerDispatcher::run() -Poco::PooledThread::run() -Poco::ThreadImpl::runnableEntry(void*) -start_thread -__clone +thread_name: QueryPipelineEx +thread_id: 743490 +query_id: dc55a564-febb-4e37-95bb-090ef182c6f1 +res: memcpy +large_ralloc +arena_ralloc +do_rallocx +Allocator::realloc(void*, unsigned long, unsigned long, unsigned long) +HashTable, HashTableNoState, PairNoInit>, HashCRC32, HashTableGrowerWithPrecalculation<8ul>, Allocator>::resize(unsigned long, unsigned long) +void DB::Aggregator::executeImplBatch, HashTableNoState, PairNoInit>, HashCRC32, HashTableGrowerWithPrecalculation<8ul>, Allocator>, true, false>>(DB::AggregationMethodOneNumber, HashTableNoState, PairNoInit>, HashCRC32, HashTableGrowerWithPrecalculation<8ul>, Allocator>, true, false>&, DB::AggregationMethodOneNumber, HashTableNoState, PairNoInit>, HashCRC32, HashTableGrowerWithPrecalculation<8ul>, Allocator>, true, false>::State&, DB::Arena*, unsigned long, unsigned long, DB::Aggregator::AggregateFunctionInstruction*, bool, char*) const +DB::Aggregator::executeImpl(DB::AggregatedDataVariants&, unsigned long, unsigned long, std::__1::vector>&, DB::Aggregator::AggregateFunctionInstruction*, bool, bool, char*) const +DB::Aggregator::executeOnBlock(std::__1::vector::immutable_ptr, std::__1::allocator::immutable_ptr>>, unsigned long, unsigned long, DB::AggregatedDataVariants&, std::__1::vector>&, std::__1::vector>, std::__1::allocator>>>&, bool&) const +DB::AggregatingTransform::work() +DB::ExecutionThreadContext::executeTask() +DB::PipelineExecutor::executeStepImpl(unsigned long, std::__1::atomic*) +void std::__1::__function::__policy_invoker::__call_impl>(std::__1::__function::__policy_storage const*) +ThreadPoolImpl>::worker(std::__1::__list_iterator, void*>) +void std::__1::__function::__policy_invoker::__call_impl::ThreadFromGlobalPoolImpl>::scheduleImpl(std::__1::function, 
Priority, std::__1::optional, bool)::'lambda0'()>(void&&)::'lambda'(), void ()>>(std::__1::__function::__policy_storage const*) +void* std::__1::__thread_proxy[abi:v15000]>, void ThreadPoolImpl::scheduleImpl(std::__1::function, Priority, std::__1::optional, bool)::'lambda0'()>>(void*) ``` Getting filenames and line numbers in ClickHouse source code: diff --git a/docs/en/operations/system-tables/symbols.md b/docs/en/operations/system-tables/symbols.md new file mode 100644 index 00000000000..5acd3ad51c7 --- /dev/null +++ b/docs/en/operations/system-tables/symbols.md @@ -0,0 +1,35 @@ +--- +slug: /en/operations/system-tables/symbols +--- +# symbols + +Contains information for introspection of `clickhouse` binary. It requires the introspection privilege to access. +This table is only useful for C++ experts and ClickHouse engineers. + +Columns: + +- `symbol` ([String](../../sql-reference/data-types/string.md)) — Symbol name in the binary. It is mangled. You can apply `demangle(symbol)` to obtain a readable name. +- `address_begin` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Start address of the symbol in the binary. +- `address_end` ([UInt64](../../sql-reference/data-types/int-uint.md)) — End address of the symbol in the binary. +- `name` ([String](../../sql-reference/data-types/string.md)) — Alias for `event`. + +**Example** + +``` sql +SELECT address_begin, address_end - address_begin AS size, demangle(symbol) FROM system.symbols ORDER BY size DESC LIMIT 10 +``` + +``` text +┌─address_begin─┬─────size─┬─demangle(symbol)──────────────────────────────────────────────────────────────────┐ +│ 25000976 │ 29466000 │ icudt70_dat │ +│ 400605288 │ 2097272 │ arena_emap_global │ +│ 18760592 │ 1048576 │ CLD2::kQuadChrome1015_2 │ +│ 9807152 │ 884808 │ TopLevelDomainLookupHash::isValid(char const*, unsigned long)::wordlist │ +│ 57442432 │ 850608 │ llvm::X86Insts │ +│ 55682944 │ 681360 │ (anonymous namespace)::X86DAGToDAGISel::SelectCode(llvm::SDNode*)::MatcherTable │ +│ 55130368 │ 502840 │ (anonymous namespace)::X86InstructionSelector::getMatchTable() const::MatchTable0 │ +│ 402930616 │ 404032 │ qpl::ml::dispatcher::hw_dispatcher::get_instance()::instance │ +│ 274131872 │ 356795 │ DB::SettingsTraits::Accessor::instance()::$_0::operator()() const │ +│ 58293040 │ 249424 │ llvm::X86InstrNameData │ +└───────────────┴──────────┴───────────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/operations/utilities/clickhouse-keeper-client.md b/docs/en/operations/utilities/clickhouse-keeper-client.md index d06d88598a7..d6e11fb9613 100644 --- a/docs/en/operations/utilities/clickhouse-keeper-client.md +++ b/docs/en/operations/utilities/clickhouse-keeper-client.md @@ -12,6 +12,7 @@ A client application to interact with clickhouse-keeper by its native protocol. - `-q QUERY`, `--query=QUERY` — Query to execute. If this parameter is not passed, `clickhouse-keeper-client` will start in interactive mode. - `-h HOST`, `--host=HOST` — Server host. Default value: `localhost`. - `-p N`, `--port=N` — Server port. Default value: 9181 +- `-c FILE_PATH`, `--config-file=FILE_PATH` — Set path of config file to get the connection string. Default value: `config.xml`. - `--connection-timeout=TIMEOUT` — Set connection timeout in seconds. Default value: 10s. - `--session-timeout=TIMEOUT` — Set session timeout in seconds. Default value: 10s. - `--operation-timeout=TIMEOUT` — Set operation timeout in seconds. Default value: 10s. 
diff --git a/docs/en/sql-reference/data-types/float.md b/docs/en/sql-reference/data-types/float.md index 3b55271f707..f1b99153b41 100644 --- a/docs/en/sql-reference/data-types/float.md +++ b/docs/en/sql-reference/data-types/float.md @@ -16,7 +16,7 @@ CREATE TABLE IF NOT EXISTS float_vs_decimal my_decimal Decimal64(3) )Engine=MergeTree ORDER BY tuple() -INSERT INTO float_vs_decimal SELECT round(canonicalRand(), 3) AS res, res FROM system.numbers LIMIT 1000000; # Generate 1 000 000 random number with 2 decimal places and store them as a float and as a decimal +INSERT INTO float_vs_decimal SELECT round(randCanonical(), 3) AS res, res FROM system.numbers LIMIT 1000000; # Generate 1 000 000 random number with 2 decimal places and store them as a float and as a decimal SELECT sum(my_float), sum(my_decimal) FROM float_vs_decimal; > 500279.56300000014 500279.563 diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 40bfb65e4e8..75fcbab6401 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -2175,7 +2175,7 @@ Result: ## arrayRandomSample -Function `arrayRandomSample` returns a subset with `samples`-many random elements of an input array. If `samples` exceeds the size of the input array, the sample size is limited to the size of the array. In this case, all elements of the input array are returned, but the order is not guaranteed. The function can handle both flat arrays and nested arrays. +Function `arrayRandomSample` returns a subset with `samples`-many random elements of an input array. If `samples` exceeds the size of the input array, the sample size is limited to the size of the array, i.e. all array elements are returned but their order is not guaranteed. The function can handle both flat arrays and nested arrays. **Syntax** @@ -2185,13 +2185,15 @@ arrayRandomSample(arr, samples) **Arguments** -- `arr` — The input array from which to sample elements. This may be flat or nested arrays. -- `samples` — An unsigned integer specifying the number of elements to include in the random sample. +- `arr` — The input array from which to sample elements. ([Array(T)](../data-types/array.md)) +- `samples` — The number of elements to include in the random sample ([UInt*](../data-types/int-uint.md)) **Returned Value** - An array containing a random sample of elements from the input array. +Type: [Array](../data-types/array.md). + **Examples** Query: @@ -2201,9 +2203,10 @@ SELECT arrayRandomSample(['apple', 'banana', 'cherry', 'date'], 2) as res; ``` Result: + ``` ┌─res────────────────┐ -│ ['banana','apple'] │ +│ ['cherry','apple'] │ └────────────────────┘ ``` @@ -2214,6 +2217,7 @@ SELECT arrayRandomSample([[1, 2], [3, 4], [5, 6]], 2) as res; ``` Result: + ``` ┌─res───────────┐ │ [[3,4],[5,6]] │ @@ -2222,24 +2226,12 @@ Result: Query: -```sql -SELECT arrayRandomSample([1, 2, 3, 4, 5], 0) as res; -``` - -Result: -``` -┌─res─┐ -│ [] │ -└─────┘ -``` - -Query: - ```sql SELECT arrayRandomSample([1, 2, 3], 5) as res; ``` Result: + ``` ┌─res─────┐ │ [3,1,2] │ diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 4df5e2afcbf..55d09be7847 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -2766,9 +2766,11 @@ Result: ## fromUnixTimestamp -Function converts Unix timestamp to a calendar date and a time of a day. 
When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. +This function converts a Unix timestamp to a calendar date and a time of a day. -fromUnixTimestamp uses MySQL datetime format style, refer to https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format. +It can be called in two ways: + +When given a single argument of type [Integer](../../sql-reference/data-types/int-uint.md), it returns a value of type [DateTime](../../sql-reference/data-types/datetime.md), i.e. behaves like [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime). Alias: `FROM_UNIXTIME`. @@ -2786,14 +2788,16 @@ Result: └──────────────────────────────┘ ``` -When there are two or three arguments, the first an [Integer](../../sql-reference/data-types/int-uint.md), [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md), the second a constant format string and the third an optional constant time zone string — it acts in the same way as [formatDateTime](#formatdatetime) and return [String](../../sql-reference/data-types/string.md#string) type. +When given two or three arguments where the first argument is a value of type [Integer](../../sql-reference/data-types/int-uint.md), [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md), the second argument is a constant format string and the third argument is an optional constant time zone string, the function returns a value of type [String](../../sql-reference/data-types/string.md#string), i.e. it behaves like [formatDateTime](#formatdatetime). In this case, [MySQL's datetime format style](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format) is used. -For example: +**Example:** ```sql SELECT fromUnixTimestamp(1234334543, '%Y-%m-%d %R:%S') AS DateTime; ``` +Result: + ```text ┌─DateTime────────────┐ │ 2009-02-11 14:42:23 │ @@ -2806,19 +2810,20 @@ SELECT fromUnixTimestamp(1234334543, '%Y-%m-%d %R:%S') AS DateTime; ## fromUnixTimestampInJodaSyntax -Similar to fromUnixTimestamp, except that it formats time in Joda style instead of MySQL style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html. +Same as [fromUnixTimestamp](#fromUnixTimestamp) but when called in the second way (two or three arguments), the formatting is performed using [Joda style](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL style. 
**Example:** ``` sql -SELECT fromUnixTimestampInJodaSyntax(1669804872, 'yyyy-MM-dd HH:mm:ss', 'UTC'); +SELECT fromUnixTimestampInJodaSyntax(1234334543, 'yyyy-MM-dd HH:mm:ss', 'UTC') AS DateTime; ``` Result: + ``` -┌─fromUnixTimestampInJodaSyntax(1669804872, 'yyyy-MM-dd HH:mm:ss', 'UTC')────┐ -│ 2022-11-30 10:41:12 │ -└────────────────────────────────────────────────────────────────────────────┘ +┌─DateTime────────────┐ +│ 2009-02-11 06:42:23 │ +└─────────────────────┘ ``` ## toModifiedJulianDay diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 6b092cf384d..35fd5089bf0 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2760,10 +2760,13 @@ message Root Returns a formatted, possibly multi-line, version of the given SQL query. +Throws an exception if the query is not well-formed. To return `NULL` instead, function `formatQueryOrNull()` may be used. + **Syntax** ```sql formatQuery(query) +formatQueryOrNull(query) ``` **Arguments** @@ -2796,10 +2799,13 @@ WHERE (a > 3) AND (b < 3) │ Like formatQuery() but the returned formatted string contains no line breaks. +Throws an exception if the query is not well-formed. To return `NULL` instead, function `formatQuerySingleLineOrNull()` may be used. + **Syntax** ```sql formatQuerySingleLine(query) +formatQuerySingleLineOrNull(query) ``` **Arguments** diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index e9a0ed72466..84839c2489c 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -107,11 +107,7 @@ round(3.65, 1) = 3.6 Rounds a number to a specified decimal position. -- If the rounding number is halfway between two numbers, the function uses banker’s rounding. - - Banker's rounding is a method of rounding fractional numbers. When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. - - It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. +- If the rounding number is halfway between two numbers, the function uses banker’s rounding. Banker's rounding is a method of rounding fractional numbers. When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. - In other cases, the function rounds numbers to the nearest integer. 
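The banker's rounding rules spelled out above can be verified directly; the expected results follow from the examples given in the text:

``` sql
SELECT
    roundBankers(2.5),     -- 2: halfway values go to the nearest even digit
    roundBankers(3.5),     -- 4
    roundBankers(45, -1);  -- 40: integers are rounded the same way
```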
diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 47e16b67643..4df987b5e2a 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1371,6 +1371,86 @@ Result: └──────────────────┘ ``` +## byteHammingDistance + +Calculates the [hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) between two byte strings. + +**Syntax** + +```sql +byteHammingDistance(string1, string2) +``` + +**Examples** + +``` sql +SELECT byteHammingDistance('karolin', 'kathrin'); +``` + +Result: + +``` text +┌─byteHammingDistance('karolin', 'kathrin')─┐ +│ 3 │ +└───────────────────────────────────────────┘ +``` + +Alias: mismatches + +## stringJaccardIndex + +Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings. + +**Syntax** + +```sql +stringJaccardIndex(string1, string2) +``` + +**Examples** + +``` sql +SELECT stringJaccardIndex('clickhouse', 'mouse'); +``` + +Result: + +``` text +┌─stringJaccardIndex('clickhouse', 'mouse')─┐ +│ 0.4 │ +└───────────────────────────────────────────┘ +``` + +## stringJaccardIndexUTF8 + +Like [stringJaccardIndex](#stringJaccardIndex) but for UTF8-encoded strings. + +## editDistance + +Calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two byte strings. + +**Syntax** + +```sql +editDistance(string1, string2) +``` + +**Examples** + +``` sql +SELECT editDistance('clickhouse', 'mouse'); +``` + +Result: + +``` text +┌─editDistance('clickhouse', 'mouse')─┐ +│ 6 │ +└─────────────────────────────────────┘ +``` + +Alias: levenshteinDistance + ## initcap Convert the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 264708513fa..1cb71e6f35d 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -681,79 +681,3 @@ Like [hasSubsequence](#hasSubsequence) but assumes `haystack` and `needle` are U ## hasSubsequenceCaseInsensitiveUTF8 Like [hasSubsequenceUTF8](#hasSubsequenceUTF8) but searches case-insensitively. - -## byteHammingDistance - -Calculates the [hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) between two byte strings. - -**Syntax** - -```sql -byteHammingDistance(string2, string2) -``` - -**Examples** - -``` sql -SELECT byteHammingDistance('abc', 'ab') ; -``` - -Result: - -``` text -┌─byteHammingDistance('abc', 'ab')─┐ -│ 1 │ -└──────────────────────────────────┘ -``` - -- Alias: mismatches - -## jaccardIndex - -Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings. - -**Syntax** - -```sql -byteJaccardIndex(string1, string2) -``` - -**Examples** - -``` sql -SELECT jaccardIndex('clickhouse', 'mouse'); -``` - -Result: - -``` text -┌─jaccardIndex('clickhouse', 'mouse')─┐ -│ 0.4 │ -└─────────────────────────────────────────┘ -``` - -## editDistance - -Calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two byte strings. 
- -**Syntax** - -```sql -editDistance(string1, string2) -``` - -**Examples** - -``` sql -SELECT editDistance('clickhouse', 'mouse'); -``` - -Result: - -``` text -┌─editDistance('clickhouse', 'mouse')─┐ -│ 6 │ -└─────────────────────────────────────────┘ -``` - -- Alias: levenshteinDistance diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 2f7c6377ee1..5930239dc56 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -171,7 +171,8 @@ Result: Can be used with [MinHash](../../sql-reference/functions/hash-functions.md#ngramminhash) functions for detection of semi-duplicate strings: ``` sql -SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) as HammingDistance FROM (SELECT 'ClickHouse is a column-oriented database management system for online analytical processing of queries.' AS string); +SELECT tupleHammingDistance(wordShingleMinHash(string), wordShingleMinHashCaseInsensitive(string)) AS HammingDistance +FROM (SELECT 'ClickHouse is a column-oriented database management system for online analytical processing of queries.' AS string); ``` Result: diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index f9c829209c5..37d4ac30648 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1840,9 +1840,9 @@ Converts an `Int64` to a `DateTime64` value with fixed sub-second precision and **Syntax** ``` sql -fromUnixTimestamp64Milli(value [, timezone]) -fromUnixTimestamp64Micro(value [, timezone]) -fromUnixTimestamp64Nano(value [, timezone]) +fromUnixTimestamp64Milli(value[, timezone]) +fromUnixTimestamp64Micro(value[, timezone]) +fromUnixTimestamp64Nano(value[, timezone]) ``` **Arguments** diff --git a/docs/en/sql-reference/statements/select/except.md b/docs/en/sql-reference/statements/select/except.md index cc4bb9d1c24..8ba7544d21f 100644 --- a/docs/en/sql-reference/statements/select/except.md +++ b/docs/en/sql-reference/statements/select/except.md @@ -103,7 +103,7 @@ INSERT INTO holdings VALUES ('Bitcoin', 200), ('Ethereum', 250), ('Ethereum', 5000), - ('DOGEFI', 10); + ('DOGEFI', 10), ('Bitcoin Diamond', 5000); ``` diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 6ad9c247d02..029ca4adf3b 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -207,7 +207,7 @@ The optional keyword `FULL` causes the output to include the collation, comment The statement produces a result table with the following structure: - `field` - The name of the column (String) -- `type` - The column data type. If setting `[use_mysql_types_in_show_columns](../../operations/settings/settings.md#use_mysql_types_in_show_columns) = 1` (default: 0), then the equivalent type name in MySQL is shown. (String) +- `type` - The column data type. If the query was made through the MySQL wire protocol, then the equivalent type name in MySQL is shown. (String) - `null` - `YES` if the column data type is Nullable, `NO` otherwise (String) - `key` - `PRI` if the column is part of the primary key, `SOR` if the column is part of the sorting key, empty otherwise (String) - `default` - Default expression of the column if it is of type `ALIAS`, `DEFAULT`, or `MATERIALIZED`, otherwise `NULL`. 
(Nullable(String)) diff --git a/docs/en/sql-reference/table-functions/gcs.md b/docs/en/sql-reference/table-functions/gcs.md index 48c2381696e..c49ae6a8501 100644 --- a/docs/en/sql-reference/table-functions/gcs.md +++ b/docs/en/sql-reference/table-functions/gcs.md @@ -7,7 +7,7 @@ keywords: [gcs, bucket] # gcs Table Function -Provides a table-like interface to select/insert files in [Google Cloud Storage](https://cloud.google.com/storage/). +Provides a table-like interface to `SELECT` and `INSERT` data from [Google Cloud Storage](https://cloud.google.com/storage/). Requires the [`Storage Object User` IAM role](https://cloud.google.com/storage/docs/access-control/iam-roles). **Syntax** diff --git a/docs/ru/development/architecture.md b/docs/ru/development/architecture.md index 35741570702..b2e851a78cd 100644 --- a/docs/ru/development/architecture.md +++ b/docs/ru/development/architecture.md @@ -49,21 +49,9 @@ ClickHouse — полноценная столбцовая СУБД. Данны Блоки создаются для всех обработанных фрагментов данных. Напоминаем, что одни и те же типы вычислений, имена столбцов и типы переиспользуются в разных блоках и только данные колонок изменяются. Лучше разделить данные и заголовок блока потому, что в блоках маленького размера мы имеем большой оверхэд по временным строкам при копировании умных указателей (`shared_ptrs`) и имен столбцов. -## Потоки блоков (Block Streams) {#block-streams} +## Процессоры -Потоки блоков обрабатывают данные. Мы используем потоки блоков для чтения данных, трансформации или записи данных куда-либо. `IBlockInputStream` предоставляет метод `read` для получения следующего блока, пока это возможно, и метод `write`, чтобы продвигать (push) блок куда-либо. - -Потоки отвечают за: - -1. Чтение и запись в таблицу. Таблица лишь возвращает поток для чтения или записи блоков. -2. Реализацию форматов данных. Например, при выводе данных в терминал в формате `Pretty`, вы создаете выходной поток блоков, который форматирует поступающие в него блоки. -3. Трансформацию данных. Допустим, у вас есть `IBlockInputStream` и вы хотите создать отфильтрованный поток. Вы создаете `FilterBlockInputStream` и инициализируете его вашим потоком. Затем вы тянете (pull) блоки из `FilterBlockInputStream`, а он тянет блоки исходного потока, фильтрует их и возвращает отфильтрованные блоки вам. Таким образом построены конвейеры выполнения запросов. - -Имеются и более сложные трансформации. Например, когда вы тянете блоки из `AggregatingBlockInputStream`, он считывает все данные из своего источника, агрегирует их, и возвращает поток агрегированных данных вам. Другой пример: конструктор `UnionBlockInputStream` принимает множество источников входных данных и число потоков. Такой `Stream` работает в несколько потоков и читает данные источников параллельно. - -> Потоки блоков используют «втягивающий» (pull) подход к управлению потоком выполнения: когда вы вытягиваете блок из первого потока, он, следовательно, вытягивает необходимые блоки из вложенных потоков, так и работает весь конвейер выполнения. Ни «pull» ни «push» не имеют явного преимущества, потому что поток управления неявный, и это ограничивает в реализации различных функций, таких как одновременное выполнение нескольких запросов (слияние нескольких конвейеров вместе). Это ограничение можно преодолеть с помощью сопрограмм (coroutines) или просто запуском дополнительных потоков, которые ждут друг друга. 
У нас может быть больше возможностей, если мы сделаем поток управления явным: если мы локализуем логику для передачи данных из одной расчетной единицы в другую вне этих расчетных единиц. Читайте эту [статью](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/) для углубленного изучения. - -Следует отметить, что конвейер выполнения запроса создает временные данные на каждом шаге. Мы стараемся сохранить размер блока достаточно маленьким, чтобы временные данные помещались в кэш процессора. При таком допущении запись и чтение временных данных практически бесплатны по сравнению с другими расчетами. Мы могли бы рассмотреть альтернативу, которая заключается в том, чтобы объединить многие операции в конвейере вместе. Это может сделать конвейер как можно короче и удалить большую часть временных данных, что может быть преимуществом, но у такого подхода также есть недостатки. Например, разделенный конвейер позволяет легко реализовать кэширование промежуточных данных, использование промежуточных данных из аналогичных запросов, выполняемых одновременно, и объединение конвейеров для аналогичных запросов. +Смотрите описание в файле [src/Processors/IProcessor.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/IProcessor.h) исходного кода. ## Форматы {#formats} @@ -81,13 +69,16 @@ ClickHouse — полноценная столбцовая СУБД. Данны Буферы чтения-записи имеют дело только с байтами. В заголовочных файлах `ReadHelpers` и `WriteHelpers` объявлены некоторые функции, чтобы помочь с форматированием ввода-вывода. Например, есть помощники для записи числа в десятичном формате. -Давайте посмотрим, что происходит, когда вы хотите вывести результат в `JSON` формате в стандартный вывод (stdout). У вас есть результирующий набор данных, готовый к извлечению из `IBlockInputStream`. Вы создаете `WriteBufferFromFileDescriptor(STDOUT_FILENO)` чтобы записать байты в stdout. Вы создаете `JSONRowOutputStream`, инициализируете с этим `WriteBuffer`'ом, чтобы записать строки `JSON` в stdout. Кроме того вы создаете `BlockOutputStreamFromRowOutputStream`, реализуя `IBlockOutputStream`. Затем вызывается `copyData` для передачи данных из `IBlockInputStream` в `IBlockOutputStream` и все работает. Внутренний `JSONRowOutputStream` будет писать в формате `JSON` различные разделители и вызвать `IDataType::serializeTextJSON` метод со ссылкой на `IColumn` и номер строки в качестве аргументов. Следовательно, `IDataType::serializeTextJSON` вызовет метод из `WriteHelpers.h`: например, `writeText` для числовых типов и `writeJSONString` для `DataTypeString`. +Давайте посмотрим, что происходит, когда вы хотите вывести результат в `JSON` формате в стандартный вывод (stdout). У вас есть результирующий набор данных, готовый к извлечению из `QueryPipeline`. Вы создаете `WriteBufferFromFileDescriptor(STDOUT_FILENO)` чтобы записать байты в stdout. Вы создаете `JSONRowOutputFormat`, инициализируете с этим `WriteBuffer`'ом, чтобы записать строки `JSON` в stdout. +Чтобы соеденить выход `QueryPipeline` с форматом, можно использовать метод `complete`, который превращает `QueryPipeline` в завершенный `QueryPipeline`. +Внутренний `JSONRowOutputStream` будет писать в формате `JSON` различные разделители и вызвать `IDataType::serializeTextJSON` метод со ссылкой на `IColumn` и номер строки в качестве аргументов. Следовательно, `IDataType::serializeTextJSON` вызовет метод из `WriteHelpers.h`: например, `writeText` для числовых типов и `writeJSONString` для `DataTypeString`. 
## Таблицы {#tables} Интерфейс `IStorage` служит для отображения таблицы. Различные движки таблиц являются реализациями этого интерфейса. Примеры `StorageMergeTree`, `StorageMemory` и так далее. Экземпляры этих классов являются просто таблицами. -Ключевые методы `IStorage` это `read` и `write`. Есть и другие варианты — `alter`, `rename`, `drop` и так далее. Метод `read` принимает следующие аргументы: набор столбцов для чтения из таблицы, `AST` запрос и желаемое количество потоков для вывода. Он возвращает один или несколько объектов `IBlockInputStream` и информацию о стадии обработки данных, которая была завершена внутри табличного движка во время выполнения запроса. +Ключевые методы `IStorage` это `read` и `write`. Есть и другие варианты — `alter`, `rename`, `drop` и так далее. +Метод `read` принимает следующие аргументы: набор столбцов для чтения из таблицы, `AST` запрос и желаемое количество потоков для вывода и возвращает `Pipe`. В большинстве случаев метод read отвечает только за чтение указанных столбцов из таблицы, а не за дальнейшую обработку данных. Вся дальнейшая обработка данных осуществляется интерпретатором запросов и не входит в сферу ответственности `IStorage`. @@ -96,7 +87,9 @@ ClickHouse — полноценная столбцовая СУБД. Данны - AST-запрос, передающийся в метод `read`, может использоваться движком таблицы для получения информации о возможности использования индекса и считывания меньшего количества данных из таблицы. - Иногда движок таблиц может сам обрабатывать данные до определенного этапа. Например, `StorageDistributed` можно отправить запрос на удаленные серверы, попросить их обработать данные до этапа, когда данные с разных удаленных серверов могут быть объединены, и вернуть эти предварительно обработанные данные. Затем интерпретатор запросов завершает обработку данных. -Метод `read` может возвращать несколько объектов `IBlockInputStream`, позволяя осуществлять параллельную обработку данных. Эти несколько блочных входных потоков могут считываться из таблицы параллельно. Затем вы можете обернуть эти потоки различными преобразованиями (такими как вычисление выражений или фильтрация), которые могут быть вычислены независимо, и создать `UnionBlockInputStream` поверх них, чтобы читать из нескольких потоков параллельно. +Метод `read` может возвращать `Pipe`, состоящий из нескольких процессоров. Каждый из этих процессоров может читать данные параллельно. +Затем вы можете соединить эти процессоры с другими преобразованиями (такими как вычисление выражений или фильтрация), которые могут быть вычислены независимо. +Далее, создав `QueryPipeline` поверх них, можно выполнить пайплайн с помощью `PipelineExecutor`. Есть и другие варианты. Например, `TableFunction` возвращает временный объект `IStorage`, который можно подставить во `FROM`. @@ -112,10 +105,18 @@ ClickHouse — полноценная столбцовая СУБД. Данны ## Интерпретаторы {#interpreters} -Интерпретаторы отвечают за создание конвейера выполнения запроса из `AST`. Есть простые интерпретаторы, такие как `InterpreterExistsQuery` и `InterpreterDropQuery` или более сложный `InterpreterSelectQuery`. Конвейер выполнения запроса представляет собой комбинацию входных и выходных потоков блоков.
Например, результатом интерпретации `SELECT` запроса является `IBlockInputStream` для чтения результирующего набора данных; результат интерпретации `INSERT` запроса — это `IBlockOutputStream`, для записи данных, предназначенных для вставки; результат интерпретации `INSERT SELECT` запроса — это `IBlockInputStream`, который возвращает пустой результирующий набор при первом чтении, но копирует данные из `SELECT` к `INSERT`. +Интерпретаторы отвечают за создание конвейера выполнения запроса из `AST`. Есть простые интерпретаторы, такие как `InterpreterExistsQuery` и `InterpreterDropQuery` или более сложный `InterpreterSelectQuery`. + +Конвейер выполнения запроса представляет собой комбинацию процессоров, которые могут принимать на вход и также возвращать чанки (набор колонок с их типами) +Процессоры обмениваются данными через порты и могут иметь несколько входных и выходных портов. +Более подробное описание можно найти в файле [src/Processors/IProcessor.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/IProcessor.h). + +Например, результатом интерпретации `SELECT` запроса является `QueryPipeline`, который имеет специальный выходной порт для чтения результирующего набора данных. Результатом интерпретации `INSERT` запроса является `QueryPipeline` с входным портом для записи данных для вставки. Результатом интерпретации `INSERT SELECT` запроса является завершенный `QueryPipeline`, который не имеет входов или выходов, но копирует данные из `SELECT` в `INSERT` одновременно. `InterpreterSelectQuery` использует `ExpressionAnalyzer` и `ExpressionActions` механизмы для анализа запросов и преобразований. Именно здесь выполняется большинство оптимизаций запросов на основе правил. `ExpressionAnalyzer` написан довольно грязно и должен быть переписан: различные преобразования запросов и оптимизации должны быть извлечены в отдельные классы, чтобы позволить модульные преобразования или запросы. +Для решения текущих проблем, существующих в интерпретаторах, разрабатывается новый `InterpreterSelectQueryAnalyzer`. Это новая версия `InterpreterSelectQuery`, которая не использует `ExpressionAnalyzer` и вводит дополнительный уровень абстракции между `AST` и `QueryPipeline`, называемый `QueryTree`. Он еще не готов к использованию в продакшене, но его можно протестировать с помощью флага `allow_experimental_analyzer`. + ## Функции {#functions} Существуют обычные функции и агрегатные функции. Агрегатные функции смотрите в следующем разделе. diff --git a/docs/ru/development/style.md b/docs/ru/development/style.md index 6098dc9c13b..49c4aade4e9 100644 --- a/docs/ru/development/style.md +++ b/docs/ru/development/style.md @@ -345,7 +345,7 @@ struct ExtractDomain **7.** Для абстрактных классов (интерфейсов) можно добавить в начало имени букву `I`. ``` cpp -class IBlockInputStream +class IProcessor ``` **8.** Если переменная используется достаточно локально, то можно использовать короткое имя. 
diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md index 16927408bc4..be8cfbdda6c 100644 --- a/docs/ru/interfaces/http.md +++ b/docs/ru/interfaces/http.md @@ -366,7 +366,7 @@ $ curl -v 'http://localhost:8123/predefined_query' < X-ClickHouse-Query-Id: 96fe0052-01e6-43ce-b12a-6b7370de6e8a < X-ClickHouse-Format: Template < X-ClickHouse-Timezone: Asia/Shanghai -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} < # HELP "Query" "Number of executing queries" @@ -529,7 +529,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/hi' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < * Connection #0 to host localhost left intact @@ -569,7 +569,7 @@ $ curl -v -H 'XXX:xxx' 'http://localhost:8123/get_config_static_handler' < Connection: Keep-Alive < Content-Type: text/plain; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < * Connection #0 to host localhost left intact @@ -621,7 +621,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_absolute_path_static_handler' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < Absolute Path File @@ -640,7 +640,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < Relative Path File diff --git a/docs/ru/operations/system-tables/stack_trace.md b/docs/ru/operations/system-tables/stack_trace.md index 817f66d1af0..bf9dbd55f80 100644 --- a/docs/ru/operations/system-tables/stack_trace.md +++ b/docs/ru/operations/system-tables/stack_trace.md @@ -31,27 +31,25 @@ WITH arrayMap(x -> demangle(addressToSymbol(x)), trace) AS all SELECT thread_nam ``` text Row 1: ────── -thread_name: clickhouse-serv - -thread_id: 686 -query_id: 1a11f70b-626d-47c1-b948-f9c7b206395d -res: sigqueue -DB::StorageSystemStackTrace::fillData(std::__1::vector::mutable_ptr, std::__1::allocator::mutable_ptr > >&, DB::Context const&, DB::SelectQueryInfo const&) const -DB::IStorageSystemOneBlock::read(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&, DB::SelectQueryInfo const&, DB::Context const&, DB::QueryProcessingStage::Enum, unsigned long, unsigned int) -DB::InterpreterSelectQuery::executeFetchColumns(DB::QueryProcessingStage::Enum, DB::QueryPipeline&, std::__1::shared_ptr const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) -DB::InterpreterSelectQuery::executeImpl(DB::QueryPipeline&, std::__1::shared_ptr const&, std::__1::optional) -DB::InterpreterSelectQuery::execute() 
-DB::InterpreterSelectWithUnionQuery::execute() -DB::executeQueryImpl(char const*, char const*, DB::Context&, bool, DB::QueryProcessingStage::Enum, bool, DB::ReadBuffer*) -DB::executeQuery(std::__1::basic_string, std::__1::allocator > const&, DB::Context&, bool, DB::QueryProcessingStage::Enum, bool) -DB::TCPHandler::runImpl() -DB::TCPHandler::run() -Poco::Net::TCPServerConnection::start() -Poco::Net::TCPServerDispatcher::run() -Poco::PooledThread::run() -Poco::ThreadImpl::runnableEntry(void*) -start_thread -__clone +thread_name: QueryPipelineEx +thread_id: 743490 +query_id: dc55a564-febb-4e37-95bb-090ef182c6f1 +res: memcpy +large_ralloc +arena_ralloc +do_rallocx +Allocator::realloc(void*, unsigned long, unsigned long, unsigned long) +HashTable, HashTableNoState, PairNoInit>, HashCRC32, HashTableGrowerWithPrecalculation<8ul>, Allocator>::resize(unsigned long, unsigned long) +void DB::Aggregator::executeImplBatch, HashTableNoState, PairNoInit>, HashCRC32, HashTableGrowerWithPrecalculation<8ul>, Allocator>, true, false>>(DB::AggregationMethodOneNumber, HashTableNoState, PairNoInit>, HashCRC32, HashTableGrowerWithPrecalculation<8ul>, Allocator>, true, false>&, DB::AggregationMethodOneNumber, HashTableNoState, PairNoInit>, HashCRC32, HashTableGrowerWithPrecalculation<8ul>, Allocator>, true, false>::State&, DB::Arena*, unsigned long, unsigned long, DB::Aggregator::AggregateFunctionInstruction*, bool, char*) const +DB::Aggregator::executeImpl(DB::AggregatedDataVariants&, unsigned long, unsigned long, std::__1::vector>&, DB::Aggregator::AggregateFunctionInstruction*, bool, bool, char*) const +DB::Aggregator::executeOnBlock(std::__1::vector::immutable_ptr, std::__1::allocator::immutable_ptr>>, unsigned long, unsigned long, DB::AggregatedDataVariants&, std::__1::vector>&, std::__1::vector>, std::__1::allocator>>>&, bool&) const +DB::AggregatingTransform::work() +DB::ExecutionThreadContext::executeTask() +DB::PipelineExecutor::executeStepImpl(unsigned long, std::__1::atomic*) +void std::__1::__function::__policy_invoker::__call_impl>(std::__1::__function::__policy_storage const*) +ThreadPoolImpl>::worker(std::__1::__list_iterator, void*>) +void std::__1::__function::__policy_invoker::__call_impl::ThreadFromGlobalPoolImpl>::scheduleImpl(std::__1::function, Priority, std::__1::optional, bool)::'lambda0'()>(void&&)::'lambda'(), void ()>>(std::__1::__function::__policy_storage const*) +void* std::__1::__thread_proxy[abi:v15000]>, void ThreadPoolImpl::scheduleImpl(std::__1::function, Priority, std::__1::optional, bool)::'lambda0'()>>(void*) ``` Получение имен файлов и номеров строк в исходном коде ClickHouse: diff --git a/docs/zh/interfaces/http.md b/docs/zh/interfaces/http.md index dfdcf53bd3f..84ca5ed0c47 100644 --- a/docs/zh/interfaces/http.md +++ b/docs/zh/interfaces/http.md @@ -362,7 +362,7 @@ $ curl -v 'http://localhost:8123/predefined_query' < X-ClickHouse-Query-Id: 96fe0052-01e6-43ce-b12a-6b7370de6e8a < X-ClickHouse-Format: Template < X-ClickHouse-Timezone: Asia/Shanghai -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < # HELP "Query" "Number of executing queries" @@ -520,7 +520,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/hi' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: 
{"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < * Connection #0 to host localhost left intact @@ -560,7 +560,7 @@ $ curl -v -H 'XXX:xxx' 'http://localhost:8123/get_config_static_handler' < Connection: Keep-Alive < Content-Type: text/plain; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < * Connection #0 to host localhost left intact @@ -612,7 +612,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_absolute_path_static_handler' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < Absolute Path File @@ -631,7 +631,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler' < Connection: Keep-Alive < Content-Type: text/html; charset=UTF-8 < Transfer-Encoding: chunked -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"} < Relative Path File diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index eb4a898d472..eb117e74f6b 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -439,6 +439,13 @@ else() install (TARGETS clickhouse RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) endif() +# A target to get stripped binary. +# Note: this is different to the above (extract debug symbols to a separate place) +add_custom_target(clickhouse-stripped ALL + COMMAND "${STRIP_PATH}" -o "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-stripped" --strip-debug --remove-section=.comment --remove-section=.note "${CMAKE_CURRENT_BINARY_DIR}/clickhouse" + DEPENDS clickhouse + COMMENT "Stripping clickhouse binary" VERBATIM) + if (ENABLE_TESTS) set (CLICKHOUSE_UNIT_TESTS_TARGETS unit_tests_dbms) add_custom_target (clickhouse-tests ALL DEPENDS ${CLICKHOUSE_UNIT_TESTS_TARGETS}) diff --git a/programs/copier/ClusterCopier.cpp b/programs/copier/ClusterCopier.cpp index 2fe7808d304..b2b4970d04f 100644 --- a/programs/copier/ClusterCopier.cpp +++ b/programs/copier/ClusterCopier.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -59,7 +60,7 @@ void ClusterCopier::init() getContext()->setClustersConfig(task_cluster_current_config, false, task_cluster->clusters_prefix); /// Set up shards and their priority - task_cluster->random_engine.seed(task_cluster->random_device()); + task_cluster->random_engine.seed(randomSeed()); for (auto & task_table : task_cluster->table_tasks) { task_table.cluster_pull = getContext()->getCluster(task_table.cluster_pull_name); diff --git a/programs/copier/TaskCluster.h b/programs/copier/TaskCluster.h index fc1c8a663ec..a7f8bc3baca 100644 --- a/programs/copier/TaskCluster.h +++ b/programs/copier/TaskCluster.h @@ -7,7 +7,7 @@ #include -#include +#include namespace DB { @@ -45,7 +45,6 @@ struct TaskCluster /// Subtasks TasksTable table_tasks; - std::random_device random_device; pcg64 random_engine; }; diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index e10a9fea86b..9d4d791263b 100644 --- a/programs/install/Install.cpp +++ 
b/programs/install/Install.cpp @@ -420,7 +420,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) /// Create symlinks. - std::initializer_list tools + std::initializer_list tools { "clickhouse-server", "clickhouse-client", @@ -435,6 +435,9 @@ int mainEntryClickHouseInstall(int argc, char ** argv) "clickhouse-keeper", "clickhouse-keeper-converter", "clickhouse-disks", + "ch", + "chl", + "chc", }; for (const auto & tool : tools) @@ -444,29 +447,39 @@ int mainEntryClickHouseInstall(int argc, char ** argv) if (fs::exists(symlink_path)) { - bool is_symlink = FS::isSymlink(symlink_path); - fs::path points_to; - if (is_symlink) - points_to = fs::weakly_canonical(FS::readSymlink(symlink_path)); - - if (is_symlink && (points_to == main_bin_path || (options.count("link") && points_to == binary_self_canonical_path))) + /// Do not replace short named symlinks if they are already present in the system + /// to avoid collision with other tools. + if (!tool.starts_with("clickhouse")) { + fmt::print("Symlink {} already exists. Will keep it.\n", symlink_path.string()); need_to_create = false; } else { - if (!is_symlink) + bool is_symlink = FS::isSymlink(symlink_path); + fs::path points_to; + if (is_symlink) + points_to = fs::weakly_canonical(FS::readSymlink(symlink_path)); + + if (is_symlink && (points_to == main_bin_path || (options.count("link") && points_to == binary_self_canonical_path))) { - fs::path rename_path = symlink_path.replace_extension(".old"); - fmt::print("File {} already exists but it's not a symlink. Will rename to {}.\n", - symlink_path.string(), rename_path.string()); - fs::rename(symlink_path, rename_path); + need_to_create = false; } - else if (points_to != main_bin_path) + else { - fmt::print("Symlink {} already exists but it points to {}. Will replace the old symlink to {}.\n", - symlink_path.string(), points_to.string(), main_bin_path.string()); - fs::remove(symlink_path); + if (!is_symlink) + { + fs::path rename_path = symlink_path.replace_extension(".old"); + fmt::print("File {} already exists but it's not a symlink. Will rename to {}.\n", + symlink_path.string(), rename_path.string()); + fs::rename(symlink_path, rename_path); + } + else if (points_to != main_bin_path) + { + fmt::print("Symlink {} already exists but it points to {}. 
Will replace the old symlink to {}.\n", + symlink_path.string(), points_to.string(), main_bin_path.string()); + fs::remove(symlink_path); + } } } } diff --git a/programs/main.cpp b/programs/main.cpp index 5857e8d5ee4..959984d565d 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -2,15 +2,12 @@ #include #include -#ifdef OS_LINUX -#include -#endif - #include #include #include #include #include +#include #include /// pair #include @@ -22,7 +19,6 @@ #include #include -#include /// Universal executable for various clickhouse applications @@ -98,7 +94,7 @@ using MainFunc = int (*)(int, char**); #if !defined(FUZZING_MODE) /// Add an item here to register new application -std::pair clickhouse_applications[] = +std::pair clickhouse_applications[] = { #if ENABLE_CLICKHOUSE_LOCAL {"local", mainEntryClickHouseLocal}, @@ -158,6 +154,18 @@ std::pair clickhouse_applications[] = #endif }; +/// Add an item here to register a new short name +std::pair clickhouse_short_names[] = +{ +#if ENABLE_CLICKHOUSE_LOCAL + {"ch", "local"}, + {"chl", "local"}, +#endif +#if ENABLE_CLICKHOUSE_CLIENT + {"chc", "client"}, +#endif +}; + int printHelp(int, char **) { std::cerr << "Use one of the following commands:" << std::endl; @@ -387,15 +395,21 @@ void checkHarmfulEnvironmentVariables(char ** argv) } -bool isClickhouseApp(const std::string & app_suffix, std::vector & argv) +bool isClickhouseApp(std::string_view app_suffix, std::vector & argv) { + for (const auto & [alias, name] : clickhouse_short_names) + if (app_suffix == name + && !argv.empty() && (alias == argv[0] || endsWith(argv[0], "/" + std::string(alias)))) + return true; + /// Use app if the first arg 'app' is passed (the arg should be quietly removed) if (argv.size() >= 2) { auto first_arg = argv.begin() + 1; /// 'clickhouse --client ...' and 'clickhouse client ...' 
are Ok - if (*first_arg == "--" + app_suffix || *first_arg == app_suffix) + if (*first_arg == app_suffix + || (std::string_view(*first_arg).starts_with("--") && std::string_view(*first_arg).substr(2) == app_suffix)) { argv.erase(first_arg); return true; @@ -403,7 +417,7 @@ bool isClickhouseApp(const std::string & app_suffix, std::vector & argv) } /// Use app if clickhouse binary is run through symbolic link with name clickhouse-app - std::string app_name = "clickhouse-" + app_suffix; + std::string app_name = "clickhouse-" + std::string(app_suffix); return !argv.empty() && (app_name == argv[0] || endsWith(argv[0], "/" + app_name)); } diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 15997ec986e..2cb5250cdf2 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -1106,7 +1106,7 @@ public: { if (isInteger(data_type)) { - if (isUnsignedInteger(data_type)) + if (isUInt(data_type)) return std::make_unique(seed); else return std::make_unique(seed); diff --git a/programs/self-extracting/CMakeLists.txt b/programs/self-extracting/CMakeLists.txt index f3ff0bbcd78..4b6dd07f618 100644 --- a/programs/self-extracting/CMakeLists.txt +++ b/programs/self-extracting/CMakeLists.txt @@ -11,8 +11,8 @@ else () endif () add_custom_target (self-extracting ALL - ${CMAKE_COMMAND} -E remove clickhouse + ${CMAKE_COMMAND} -E remove clickhouse clickhouse-stripped COMMAND ${COMPRESSOR} ${DECOMPRESSOR} clickhouse ../clickhouse - DEPENDS clickhouse compressor + COMMAND ${COMPRESSOR} ${DECOMPRESSOR} clickhouse-stripped ../clickhouse-stripped + DEPENDS clickhouse clickhouse-stripped compressor ) - diff --git a/programs/server/config.xml b/programs/server/config.xml index d0bf1c7d66a..d7ad1545201 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -104,15 +104,14 @@ - - + - 3 + 10 diff --git a/programs/server/users.d/allow_introspection_functions.xml b/programs/server/users.d/allow_introspection_functions.xml deleted file mode 100644 index ec3057c82d7..00000000000 --- a/programs/server/users.d/allow_introspection_functions.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - 1 - - - diff --git a/programs/server/users.d/allow_introspection_functions.yaml b/programs/server/users.d/allow_introspection_functions.yaml new file mode 120000 index 00000000000..bac14df302b --- /dev/null +++ b/programs/server/users.d/allow_introspection_functions.yaml @@ -0,0 +1 @@ +../../../tests/config/users.d/allow_introspection_functions.yaml \ No newline at end of file diff --git a/programs/server/users.xml b/programs/server/users.xml index 5e2ff51bf4d..fbb5a2c228f 100644 --- a/programs/server/users.xml +++ b/programs/server/users.xml @@ -86,6 +86,13 @@ + + + diff --git a/programs/server/users.yaml.example b/programs/server/users.yaml.example index afae8f2b1ff..27bdf791e35 100644 --- a/programs/server/users.yaml.example +++ b/programs/server/users.yaml.example @@ -91,6 +91,10 @@ users: # User can create other users and grant rights to them. # access_management: 1 + # SQL expressions for grants available for that user - https://clickhouse.com/docs/en/sql-reference/statements/grant + # grants: + # - query: GRANT ALL ON *.* + # Quotas. quotas: # Name of quota. 
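The `programs/main.cpp` hunk above registers short names (`ch`, `chl`, `chc`) and resolves the application either from the binary name (a symlink) or from the first argument, while the `Install.cpp` change deliberately keeps any pre-existing short symlinks to avoid colliding with other tools. The standalone sketch below shows the same busybox-style dispatch pattern under simplified assumptions: the alias table mirrors the one added in the diff, but `runLocal`, `runClient` and `basenameOf` are illustrative stubs, not the actual ClickHouse entry points.

```cpp
#include <iostream>
#include <string_view>
#include <utility>
#include <vector>

/// Stub "applications"; in the real binary these are mainEntryClickHouseLocal and friends.
int runLocal(std::vector<std::string_view>)  { std::cout << "local mode\n";  return 0; }
int runClient(std::vector<std::string_view>) { std::cout << "client mode\n"; return 0; }

using MainFunc = int (*)(std::vector<std::string_view>);

/// alias -> application name, mirroring the short-name table added in main.cpp.
constexpr std::pair<std::string_view, std::string_view> short_names[] = {
    {"ch", "local"}, {"chl", "local"}, {"chc", "client"},
};

constexpr std::pair<std::string_view, MainFunc> applications[] = {
    {"local", runLocal}, {"client", runClient},
};

std::string_view basenameOf(std::string_view path)
{
    auto pos = path.rfind('/');
    return pos == std::string_view::npos ? path : path.substr(pos + 1);
}

int main(int argc, char ** argv)
{
    std::vector<std::string_view> args(argv, argv + argc);
    std::string_view app;

    /// 1. The binary may be invoked through a short symlink such as `ch` or `chc`.
    const std::string_view invoked = basenameOf(args[0]);
    for (const auto & [alias, name] : short_names)
        if (invoked == alias)
            app = name;

    /// 2. Otherwise `prog local ...` or `prog --local ...` selects the application.
    if (app.empty() && args.size() >= 2)
    {
        std::string_view first = args[1];
        if (first.starts_with("--"))
            first.remove_prefix(2);
        for (const auto & [name, func] : applications)
            if (first == name)
            {
                app = name;
                args.erase(args.begin() + 1); /// the selector argument is consumed
                break;
            }
    }

    for (const auto & [name, func] : applications)
        if (name == app)
            return func(std::move(args));

    std::cerr << "Use one of: local, client (or the ch/chl/chc symlinks)\n";
    return 1;
}
```

The installer refuses to overwrite existing `ch`/`chl`/`chc` files precisely because such short names are likely to belong to other tools already installed on the host; only the `clickhouse-*` symlinks are considered safe to replace.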
diff --git a/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h b/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h index 439bb613337..023e237ef96 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h @@ -84,7 +84,7 @@ public: } } - if (!isUnsignedInteger(arguments[1])) + if (!isUInt(arguments[1])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument of aggregate function {} must be unsigned integer.", getName()); if (default_value.isNull()) diff --git a/src/AggregateFunctions/AggregateFunctionQuantile.h b/src/AggregateFunctions/AggregateFunctionQuantile.h index 13320ad90b6..07db655025d 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantile.h +++ b/src/AggregateFunctions/AggregateFunctionQuantile.h @@ -238,7 +238,7 @@ public: if constexpr (has_second_arg) { assertBinary(Name::name, types); - if (!isUnsignedInteger(types[1])) + if (!isUInt(types[1])) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument (weight) for function {} must be unsigned integer, but it has type {}", diff --git a/src/AggregateFunctions/AggregateFunctionUniq.h b/src/AggregateFunctions/AggregateFunctionUniq.h index 1752d5751d5..f20fb8cb933 100644 --- a/src/AggregateFunctions/AggregateFunctionUniq.h +++ b/src/AggregateFunctions/AggregateFunctionUniq.h @@ -466,7 +466,7 @@ public: std::vector data_vec; data_vec.resize(places.size()); - for (unsigned long i = 0; i < data_vec.size(); i++) + for (size_t i = 0; i < data_vec.size(); ++i) data_vec[i] = &this->data(places[i]).set; DataSet::parallelizeMergePrepare(data_vec, thread_pool); diff --git a/src/AggregateFunctions/AggregateFunctionUniqCombined.cpp b/src/AggregateFunctions/AggregateFunctionUniqCombined.cpp index 3c1c916e377..ff3b463e906 100644 --- a/src/AggregateFunctions/AggregateFunctionUniqCombined.cpp +++ b/src/AggregateFunctions/AggregateFunctionUniqCombined.cpp @@ -143,7 +143,6 @@ namespace void registerAggregateFunctionUniqCombined(AggregateFunctionFactory & factory) { - using namespace std::placeholders; factory.registerFunction("uniqCombined", [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) { diff --git a/src/Analyzer/Passes/AnyFunctionPass.cpp b/src/Analyzer/Passes/MoveFunctionsOutOfAnyPass.cpp similarity index 73% rename from src/Analyzer/Passes/AnyFunctionPass.cpp rename to src/Analyzer/Passes/MoveFunctionsOutOfAnyPass.cpp index 75f12bc7d46..51edbcc6bd0 100644 --- a/src/Analyzer/Passes/AnyFunctionPass.cpp +++ b/src/Analyzer/Passes/MoveFunctionsOutOfAnyPass.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -14,8 +14,80 @@ namespace DB namespace { -class AnyFunctionVisitor : public InDepthQueryTreeVisitorWithContext +class AnyFunctionViMoveFunctionsOutOfAnyVisitor : public InDepthQueryTreeVisitorWithContext { +public: + using Base = InDepthQueryTreeVisitorWithContext; + using Base::Base; + + void enterImpl(QueryTreeNodePtr & node) + { + if (!getSettings().optimize_move_functions_out_of_any) + return; + + auto * function_node = node->as(); + if (!function_node) + return; + + /// check function is any + const auto & function_name = function_node->getFunctionName(); + if (function_name != "any" && function_name != "anyLast") + return; + + auto & arguments = function_node->getArguments().getNodes(); + if (arguments.size() != 1) + return; + + auto * inside_function_node = arguments[0]->as(); + + /// check argument is a function + if 
(!inside_function_node) + return; + + /// check arguments can not contain arrayJoin or lambda + if (!canRewrite(inside_function_node)) + return; + + auto & inside_function_node_arguments = inside_function_node->getArguments().getNodes(); + + /// case any(f()) + if (inside_function_node_arguments.empty()) + return; + + auto it = node_to_rewritten_node.find(node.get()); + if (it != node_to_rewritten_node.end()) + { + node = it->second; + return; + } + + /// checking done, rewrite function + bool changed_argument = false; + for (auto & inside_argument : inside_function_node_arguments) + { + if (inside_argument->as()) /// skip constant node + break; + + AggregateFunctionProperties properties; + auto aggregate_function = AggregateFunctionFactory::instance().get(function_name, {inside_argument->getResultType()}, {}, properties); + + auto any_function = std::make_shared(function_name); + any_function->resolveAsAggregateFunction(std::move(aggregate_function)); + + auto & any_function_arguments = any_function->getArguments().getNodes(); + any_function_arguments.push_back(std::move(inside_argument)); + + inside_argument = std::move(any_function); + changed_argument = true; + } + + if (changed_argument) + { + node_to_rewritten_node.emplace(node.get(), arguments[0]); + node = arguments[0]; + } + } + private: bool canRewrite(const FunctionNode * function_node) { @@ -45,90 +117,17 @@ private: return true; } -public: - using Base = InDepthQueryTreeVisitorWithContext; - using Base::Base; - - void enterImpl(QueryTreeNodePtr & node) - { - if (!getSettings().optimize_move_functions_out_of_any) - return; - - auto * function_node = node->as(); - if (!function_node) - return; - - /// check function is any - const auto & function_name = function_node->getFunctionName(); - if (!(function_name == "any" || function_name == "anyLast")) - return; - - auto & arguments = function_node->getArguments().getNodes(); - if (arguments.size() != 1) - return; - - auto * inside_function_node = arguments[0]->as(); - - /// check argument is a function - if (!inside_function_node) - return; - - /// check arguments can not contain arrayJoin or lambda - if (!canRewrite(inside_function_node)) - return; - - auto & inside_arguments = inside_function_node->getArguments().getNodes(); - - /// case any(f()) - if (inside_arguments.empty()) - return; - - if (rewritten.contains(node.get())) - { - node = rewritten.at(node.get()); - return; - } - - /// checking done, rewrite function - bool pushed = false; - for (auto & inside_argument : inside_arguments) - { - if (inside_argument->as()) /// skip constant node - break; - - AggregateFunctionProperties properties; - auto aggregate_function = AggregateFunctionFactory::instance().get(function_name, {inside_argument->getResultType()}, {}, properties); - - auto any_function = std::make_shared(function_name); - any_function->resolveAsAggregateFunction(std::move(aggregate_function)); - - auto & any_function_arguments = any_function->getArguments().getNodes(); - any_function_arguments.push_back(std::move(inside_argument)); - - inside_argument = std::move(any_function); - pushed = true; - } - - if (pushed) - { - rewritten.insert({node.get(), arguments[0]}); - node = arguments[0]; - } - } - -private: - /// After query analysis alias will be rewritten to QueryTreeNode - /// whose memory address is same with the original one. - /// So we can reuse the rewritten one. 
- std::unordered_map rewritten; + /// After query analysis, alias identifier will be resolved to node whose memory address is same with the original one. + /// So we can reuse the rewritten function. + std::unordered_map node_to_rewritten_node; }; } -void AnyFunctionPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void MoveFunctionsOutOfAnyPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) { - AnyFunctionVisitor visitor(context); + AnyFunctionViMoveFunctionsOutOfAnyVisitor visitor(context); visitor.visit(query_tree_node); } diff --git a/src/Analyzer/Passes/AnyFunctionPass.h b/src/Analyzer/Passes/MoveFunctionsOutOfAnyPass.h similarity index 64% rename from src/Analyzer/Passes/AnyFunctionPass.h rename to src/Analyzer/Passes/MoveFunctionsOutOfAnyPass.h index 0cc65d238dd..09a53f2b9e0 100644 --- a/src/Analyzer/Passes/AnyFunctionPass.h +++ b/src/Analyzer/Passes/MoveFunctionsOutOfAnyPass.h @@ -7,13 +7,13 @@ namespace DB /** Rewrite 'any' and 'anyLast' functions pushing them inside original function. * - * Example: any(f(x, y, g(z))) - * Result: f(any(x), any(y), g(any(z))) + * Example: SELECT any(f(x, y, g(z))); + * Result: SELECT f(any(x), any(y), g(any(z))); */ -class AnyFunctionPass final : public IQueryTreePass +class MoveFunctionsOutOfAnyPass final : public IQueryTreePass { public: - String getName() override { return "AnyFunction"; } + String getName() override { return "MoveFunctionsOutOfAnyPass"; } String getDescription() override { diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index c6fbd728b8f..7855c4f34a8 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -1467,9 +1467,15 @@ ProjectionName QueryAnalyzer::calculateFunctionProjectionName(const QueryTreeNod const ProjectionNames & arguments_projection_names) { const auto & function_node_typed = function_node->as(); + const auto & function_node_name = function_node_typed.getFunctionName(); + + bool is_array_function = function_node_name == "array"; + bool is_tuple_function = function_node_name == "tuple"; WriteBufferFromOwnString buffer; - buffer << function_node_typed.getFunctionName(); + + if (!is_array_function && !is_tuple_function) + buffer << function_node_name; if (!parameters_projection_names.empty()) { @@ -1487,7 +1493,16 @@ ProjectionName QueryAnalyzer::calculateFunctionProjectionName(const QueryTreeNod buffer << ')'; } - buffer << '('; + char open_bracket = '('; + char close_bracket = ')'; + + if (is_array_function) + { + open_bracket = '['; + close_bracket = ']'; + } + + buffer << open_bracket; size_t function_arguments_projection_names_size = arguments_projection_names.size(); for (size_t i = 0; i < function_arguments_projection_names_size; ++i) @@ -1498,7 +1513,7 @@ ProjectionName QueryAnalyzer::calculateFunctionProjectionName(const QueryTreeNod buffer << ", "; } - buffer << ')'; + buffer << close_bracket; return buffer.str(); } diff --git a/src/Analyzer/QueryTreePassManager.cpp b/src/Analyzer/QueryTreePassManager.cpp index 22ef9adec8b..08474c4100a 100644 --- a/src/Analyzer/QueryTreePassManager.cpp +++ b/src/Analyzer/QueryTreePassManager.cpp @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include @@ -164,9 +164,7 @@ private: * * TODO: Support setting optimize_substitute_columns. * TODO: Support GROUP BY injective function elimination. - * TODO: Support setting optimize_move_functions_out_of_any. * TODO: Support setting optimize_aggregators_of_group_by_keys. 
- * TODO: Support setting optimize_duplicate_order_by_and_distinct. * TODO: Support setting optimize_monotonous_functions_in_order_by. * TODO: Add optimizations based on function semantics. Example: SELECT * FROM test_table WHERE id != id. (id is not nullable column). */ @@ -284,7 +282,7 @@ void addQueryTreePasses(QueryTreePassManager & manager) manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); - manager.addPass(std::make_unique()); + manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); } diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp index c216ae02ce2..a0c3bb5f40e 100644 --- a/src/Backups/BackupEntriesCollector.cpp +++ b/src/Backups/BackupEntriesCollector.cpp @@ -20,6 +20,12 @@ namespace fs = std::filesystem; +namespace ProfileEvents +{ + extern const Event BackupEntriesCollectorMicroseconds; + extern const Event BackupEntriesCollectorForTablesDataMicroseconds; + extern const Event BackupEntriesCollectorRunPostTasksMicroseconds; +} namespace DB { @@ -82,7 +88,8 @@ BackupEntriesCollector::BackupEntriesCollector( const BackupSettings & backup_settings_, std::shared_ptr backup_coordination_, const ReadSettings & read_settings_, - const ContextPtr & context_) + const ContextPtr & context_, + ThreadPool & threadpool_) : backup_query_elements(backup_query_elements_) , backup_settings(backup_settings_) , backup_coordination(backup_coordination_) @@ -101,6 +108,7 @@ BackupEntriesCollector::BackupEntriesCollector( context->getSettingsRef().backup_restore_keeper_max_retries, context->getSettingsRef().backup_restore_keeper_retry_initial_backoff_ms, context->getSettingsRef().backup_restore_keeper_retry_max_backoff_ms) + , threadpool(threadpool_) { } @@ -108,6 +116,8 @@ BackupEntriesCollector::~BackupEntriesCollector() = default; BackupEntries BackupEntriesCollector::run() { + auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::BackupEntriesCollectorMicroseconds); + /// run() can be called onle once. if (!current_stage.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Already making backup entries"); @@ -133,11 +143,19 @@ BackupEntries BackupEntriesCollector::run() /// Make backup entries for the data of the found tables. setStage(Stage::EXTRACTING_DATA_FROM_TABLES); - makeBackupEntriesForTablesData(); + + { + auto timer2 = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::BackupEntriesCollectorForTablesDataMicroseconds); + makeBackupEntriesForTablesData(); + } /// Run all the tasks added with addPostCollectingTask(). setStage(Stage::RUNNING_POST_TASKS); - runPostTasks(); + + { + auto timer2 = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::BackupEntriesCollectorRunPostTasksMicroseconds); + runPostTasks(); + } /// No more backup entries or tasks are allowed after this point. @@ -738,8 +756,20 @@ void BackupEntriesCollector::makeBackupEntriesForTablesData() if (backup_settings.structure_only) return; + std::vector> futures; for (const auto & table_name : table_infos | boost::adaptors::map_keys) - makeBackupEntriesForTableData(table_name); + { + futures.push_back(scheduleFromThreadPool([&]() + { + makeBackupEntriesForTableData(table_name); + }, threadpool, "BackupCollect")); + } + /// Wait for all tasks. + for (auto & future : futures) + future.wait(); + /// Make sure there is no exception. 
+ for (auto & future : futures) + future.get(); } void BackupEntriesCollector::makeBackupEntriesForTableData(const QualifiedTableName & table_name) @@ -775,20 +805,28 @@ void BackupEntriesCollector::makeBackupEntriesForTableData(const QualifiedTableN } } -void BackupEntriesCollector::addBackupEntry(const String & file_name, BackupEntryPtr backup_entry) +void BackupEntriesCollector::addBackupEntryUnlocked(const String & file_name, BackupEntryPtr backup_entry) { if (current_stage == Stage::WRITING_BACKUP) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed"); backup_entries.emplace_back(file_name, backup_entry); } +void BackupEntriesCollector::addBackupEntry(const String & file_name, BackupEntryPtr backup_entry) +{ + std::lock_guard lock(mutex); + addBackupEntryUnlocked(file_name, backup_entry); +} + void BackupEntriesCollector::addBackupEntry(const std::pair & backup_entry) { - addBackupEntry(backup_entry.first, backup_entry.second); + std::lock_guard lock(mutex); + addBackupEntryUnlocked(backup_entry.first, backup_entry.second); } void BackupEntriesCollector::addBackupEntries(const BackupEntries & backup_entries_) { + std::lock_guard lock(mutex); if (current_stage == Stage::WRITING_BACKUP) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of backup entries is not allowed"); insertAtEnd(backup_entries, backup_entries_); @@ -796,6 +834,7 @@ void BackupEntriesCollector::addBackupEntries(const BackupEntries & backup_entri void BackupEntriesCollector::addBackupEntries(BackupEntries && backup_entries_) { + std::lock_guard lock(mutex); if (current_stage == Stage::WRITING_BACKUP) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of backup entries is not allowed"); insertAtEnd(backup_entries, std::move(backup_entries_)); @@ -803,6 +842,7 @@ void BackupEntriesCollector::addBackupEntries(BackupEntries && backup_entries_) void BackupEntriesCollector::addPostTask(std::function task) { + std::lock_guard lock(mutex); if (current_stage == Stage::WRITING_BACKUP) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of post tasks is not allowed"); post_tasks.push(std::move(task)); @@ -824,6 +864,7 @@ void BackupEntriesCollector::runPostTasks() size_t BackupEntriesCollector::getAccessCounter(AccessEntityType type) { + std::lock_guard lock(mutex); access_counters.resize(static_cast(AccessEntityType::MAX)); return access_counters[static_cast(type)]++; } diff --git a/src/Backups/BackupEntriesCollector.h b/src/Backups/BackupEntriesCollector.h index dea38b54f5b..45d1ba1652a 100644 --- a/src/Backups/BackupEntriesCollector.h +++ b/src/Backups/BackupEntriesCollector.h @@ -31,7 +31,8 @@ public: const BackupSettings & backup_settings_, std::shared_ptr backup_coordination_, const ReadSettings & read_settings_, - const ContextPtr & context_); + const ContextPtr & context_, + ThreadPool & threadpool_); ~BackupEntriesCollector(); /// Collects backup entries and returns the result. 
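The `BackupEntriesCollector` changes above parallelize per-table work on a thread pool, wait for every future, and only then call `get()` so that the first exception is rethrown after all tasks have stopped; the entry list becomes mutex-protected because tasks now append to it concurrently. A minimal sketch of that pattern using only the standard library (with `std::async` standing in for ClickHouse's `ThreadPool`/`scheduleFromThreadPool`, and `Collector`, `addEntry`, `collectOne` as illustrative names) could look like this:

```cpp
#include <future>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

class Collector
{
public:
    void addEntry(std::string entry)
    {
        std::lock_guard lock(mutex); /// tasks run concurrently, so appends must be serialized
        entries.push_back(std::move(entry));
    }

    void collectAll(const std::vector<std::string> & tables)
    {
        std::vector<std::future<void>> futures;
        for (const auto & table : tables)
            futures.push_back(std::async(std::launch::async, [this, &table] { collectOne(table); }));

        /// Wait for all tasks first so none of them is still touching shared state...
        for (auto & future : futures)
            future.wait();
        /// ...and only then surface the first failure, if any.
        for (auto & future : futures)
            future.get();
    }

    size_t size() const { return entries.size(); } /// only called after collectAll() has joined

private:
    void collectOne(const std::string & table)
    {
        if (table.empty())
            throw std::runtime_error("cannot collect entries for a table without a name");
        addEntry("data for " + table);
    }

    std::vector<std::string> entries;
    std::mutex mutex;
};

int main()
{
    Collector collector;
    try
    {
        collector.collectAll({"events", "metrics", "logs"});
        std::cout << "collected " << collector.size() << " entries\n";
    }
    catch (const std::exception & e)
    {
        std::cout << "backup failed: " << e.what() << '\n';
    }
}
```

Calling `wait()` on every future before the first `get()` mirrors the intent of the diff: even if one table fails, the collector does not start unwinding while other tasks may still be appending entries.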
@@ -90,6 +91,8 @@ private: void makeBackupEntriesForTablesData(); void makeBackupEntriesForTableData(const QualifiedTableName & table_name); + void addBackupEntryUnlocked(const String & file_name, BackupEntryPtr backup_entry); + void runPostTasks(); Strings setStage(const String & new_stage, const String & message = ""); @@ -170,6 +173,9 @@ private: BackupEntries backup_entries; std::queue> post_tasks; std::vector access_counters; + + ThreadPool & threadpool; + std::mutex mutex; }; } diff --git a/src/Backups/BackupFileInfo.cpp b/src/Backups/BackupFileInfo.cpp index f595c02ddc5..63427de328e 100644 --- a/src/Backups/BackupFileInfo.cpp +++ b/src/Backups/BackupFileInfo.cpp @@ -215,14 +215,13 @@ BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entr ++num_active_jobs; } - auto job = [&mutex, &num_active_jobs, &event, &exception, &infos, &backup_entries, &read_settings, &base_backup, &thread_group, i, log](bool async) + auto job = [&mutex, &num_active_jobs, &event, &exception, &infos, &backup_entries, &read_settings, &base_backup, &thread_group, i, log]() { SCOPE_EXIT_SAFE({ std::lock_guard lock{mutex}; if (!--num_active_jobs) event.notify_all(); - if (async) - CurrentThread::detachFromGroupIfNotDetached(); + CurrentThread::detachFromGroupIfNotDetached(); }); try @@ -230,11 +229,10 @@ BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entr const auto & name = backup_entries[i].first; const auto & entry = backup_entries[i].second; - if (async && thread_group) + if (thread_group) CurrentThread::attachToGroup(thread_group); - if (async) - setThreadName("BackupWorker"); + setThreadName("BackupWorker"); { std::lock_guard lock{mutex}; @@ -252,8 +250,7 @@ BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entr } }; - if (!thread_pool.trySchedule([job] { job(true); })) - job(false); + thread_pool.scheduleOrThrowOnError(job); } { diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index bb97335d8fb..165cb63456a 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,14 @@ #include +namespace ProfileEvents +{ + extern const Event BackupsOpenedForRead; + extern const Event BackupsOpenedForWrite; + extern const Event BackupReadMetadataMicroseconds; + extern const Event BackupWriteMetadataMicroseconds; +} + namespace DB { namespace ErrorCodes @@ -89,12 +98,14 @@ BackupImpl::BackupImpl( , archive_params(archive_params_) , open_mode(OpenMode::READ) , reader(std::move(reader_)) + , context(context_) , is_internal_backup(false) , version(INITIAL_BACKUP_VERSION) , base_backup_info(base_backup_info_) , use_same_s3_credentials_for_base_backup(use_same_s3_credentials_for_base_backup_) + , log(&Poco::Logger::get("BackupImpl")) { - open(context_); + open(); } @@ -115,6 +126,7 @@ BackupImpl::BackupImpl( , archive_params(archive_params_) , open_mode(OpenMode::WRITE) , writer(std::move(writer_)) + , context(context_) , is_internal_backup(is_internal_backup_) , coordination(coordination_) , uuid(backup_uuid_) @@ -124,7 +136,7 @@ BackupImpl::BackupImpl( , use_same_s3_credentials_for_base_backup(use_same_s3_credentials_for_base_backup_) , log(&Poco::Logger::get("BackupImpl")) { - open(context_); + open(); } @@ -140,9 +152,11 @@ BackupImpl::~BackupImpl() } } -void BackupImpl::open(const ContextPtr & context) +void BackupImpl::open() { std::lock_guard lock{mutex}; + LOG_INFO(log, "{} backup: {}", ((open_mode == 
OpenMode::WRITE) ? "Writing" : "Reading"), backup_name_for_logging); + ProfileEvents::increment((open_mode == OpenMode::WRITE) ? ProfileEvents::BackupsOpenedForWrite : ProfileEvents::BackupsOpenedForRead); if (open_mode == OpenMode::WRITE) { @@ -166,35 +180,8 @@ void BackupImpl::open(const ContextPtr & context) if (open_mode == OpenMode::READ) readBackupMetadata(); - if (base_backup_info) - { - if (use_same_s3_credentials_for_base_backup) - backup_info.copyS3CredentialsTo(*base_backup_info); - - BackupFactory::CreateParams params; - params.backup_info = *base_backup_info; - params.open_mode = OpenMode::READ; - params.context = context; - /// use_same_s3_credentials_for_base_backup should be inherited for base backups - params.use_same_s3_credentials_for_base_backup = use_same_s3_credentials_for_base_backup; - - base_backup = BackupFactory::instance().createBackup(params); - - if (open_mode == OpenMode::WRITE) - { - base_backup_uuid = base_backup->getUUID(); - } - else if (base_backup_uuid != base_backup->getUUID()) - { - throw Exception( - ErrorCodes::WRONG_BASE_BACKUP, - "Backup {}: The base backup {} has different UUID ({} != {})", - backup_name_for_logging, - base_backup->getNameForLogging(), - toString(base_backup->getUUID()), - (base_backup_uuid ? toString(*base_backup_uuid) : "")); - } - } + if ((open_mode == OpenMode::WRITE) && base_backup_info) + base_backup_uuid = getBaseBackupUnlocked()->getUUID(); } void BackupImpl::close() @@ -239,6 +226,42 @@ void BackupImpl::closeArchive() archive_writer.reset(); } +std::shared_ptr BackupImpl::getBaseBackup() const +{ + std::lock_guard lock{mutex}; + return getBaseBackupUnlocked(); +} + +std::shared_ptr BackupImpl::getBaseBackupUnlocked() const +{ + if (!base_backup && base_backup_info) + { + if (use_same_s3_credentials_for_base_backup) + backup_info.copyS3CredentialsTo(*base_backup_info); + + BackupFactory::CreateParams params; + params.backup_info = *base_backup_info; + params.open_mode = OpenMode::READ; + params.context = context; + /// use_same_s3_credentials_for_base_backup should be inherited for base backups + params.use_same_s3_credentials_for_base_backup = use_same_s3_credentials_for_base_backup; + + base_backup = BackupFactory::instance().createBackup(params); + + if ((open_mode == OpenMode::READ) && (base_backup_uuid != base_backup->getUUID())) + { + throw Exception( + ErrorCodes::WRONG_BASE_BACKUP, + "Backup {}: The base backup {} has different UUID ({} != {})", + backup_name_for_logging, + base_backup->getNameForLogging(), + toString(base_backup->getUUID()), + (base_backup_uuid ? 
toString(*base_backup_uuid) : "")); + } + } + return base_backup; +} + size_t BackupImpl::getNumFiles() const { std::lock_guard lock{mutex}; @@ -289,8 +312,10 @@ UInt64 BackupImpl::getNumReadBytes() const void BackupImpl::writeBackupMetadata() { - assert(!is_internal_backup); + LOG_TRACE(log, "Backup {}: Writing metadata", backup_name_for_logging); + auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::BackupWriteMetadataMicroseconds); + assert(!is_internal_backup); checkLockFile(true); std::unique_ptr out; @@ -374,11 +399,16 @@ void BackupImpl::writeBackupMetadata() out->finalize(); uncompressed_size = size_of_entries + out->count(); + + LOG_TRACE(log, "Backup {}: Metadata was written", backup_name_for_logging); } void BackupImpl::readBackupMetadata() { + LOG_TRACE(log, "Backup {}: Reading metadata", backup_name_for_logging); + auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::BackupReadMetadataMicroseconds); + using namespace XMLUtils; std::unique_ptr in; @@ -482,6 +512,8 @@ void BackupImpl::readBackupMetadata() compressed_size = uncompressed_size; if (!use_archive) setCompressedSize(); + + LOG_TRACE(log, "Backup {}: Metadata was read", backup_name_for_logging); } void BackupImpl::checkBackupDoesntExist() const @@ -705,7 +737,8 @@ std::unique_ptr BackupImpl::readFileImpl(const SizeAndChecks if (info.base_size) { /// Make `base_read_buffer` if there is data for this backup entry in the base backup. - if (!base_backup) + auto base = getBaseBackup(); + if (!base) { throw Exception( ErrorCodes::NO_BASE_BACKUP, @@ -713,7 +746,7 @@ std::unique_ptr BackupImpl::readFileImpl(const SizeAndChecks backup_name_for_logging, formatSizeAndChecksum(size_and_checksum)); } - if (!base_backup->fileExists(std::pair(info.base_size, info.base_checksum))) + if (!base->fileExists(std::pair(info.base_size, info.base_checksum))) { throw Exception( ErrorCodes::WRONG_BASE_BACKUP, @@ -721,7 +754,7 @@ std::unique_ptr BackupImpl::readFileImpl(const SizeAndChecks backup_name_for_logging, formatSizeAndChecksum(size_and_checksum)); } - base_read_buffer = base_backup->readFile(std::pair{info.base_size, info.base_checksum}); + base_read_buffer = base->readFile(std::pair{info.base_size, info.base_checksum}); } { @@ -809,7 +842,7 @@ size_t BackupImpl::copyFileToDisk(const SizeAndChecksum & size_and_checksum, else if (info.size && (info.size == info.base_size)) { /// Data comes completely from the base backup (nothing comes from this backup). 
- base_backup->copyFileToDisk(std::pair{info.base_size, info.base_checksum}, destination_disk, destination_path, write_mode); + getBaseBackup()->copyFileToDisk(std::pair{info.base_size, info.base_checksum}, destination_disk, destination_path, write_mode); file_copied = true; } diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index 5b0254c22bf..6070db79aa6 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -60,7 +60,7 @@ public: OpenMode getOpenMode() const override { return open_mode; } time_t getTimestamp() const override { return timestamp; } UUID getUUID() const override { return *uuid; } - BackupPtr getBaseBackup() const override { return base_backup; } + BackupPtr getBaseBackup() const override; size_t getNumFiles() const override; UInt64 getTotalSize() const override; size_t getNumEntries() const override; @@ -85,7 +85,7 @@ public: bool supportsWritingInMultipleThreads() const override { return !use_archive; } private: - void open(const ContextPtr & context); + void open(); void close(); void openArchive(); @@ -95,6 +95,9 @@ private: void writeBackupMetadata() TSA_REQUIRES(mutex); void readBackupMetadata() TSA_REQUIRES(mutex); + /// Returns the base backup or null if there is no base backup. + std::shared_ptr getBaseBackupUnlocked() const TSA_REQUIRES(mutex); + /// Checks that a new backup doesn't exist yet. void checkBackupDoesntExist() const; @@ -118,6 +121,7 @@ private: const OpenMode open_mode; std::shared_ptr writer; std::shared_ptr reader; + const ContextPtr context; const bool is_internal_backup; std::shared_ptr coordination; @@ -138,8 +142,8 @@ private: mutable size_t num_read_files = 0; mutable UInt64 num_read_bytes = 0; int version; - std::optional base_backup_info; - std::shared_ptr base_backup; + mutable std::optional base_backup_info; + mutable std::shared_ptr base_backup; std::optional base_backup_uuid; std::shared_ptr archive_reader; std::shared_ptr archive_writer; diff --git a/src/Backups/BackupOperationInfo.h b/src/Backups/BackupOperationInfo.h index d8342e7a8c9..54f5e5e9965 100644 --- a/src/Backups/BackupOperationInfo.h +++ b/src/Backups/BackupOperationInfo.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB { @@ -47,6 +48,9 @@ struct BackupOperationInfo std::exception_ptr exception; String error_message; + /// Profile events collected during the backup. 
+ std::shared_ptr profile_counters = nullptr; + std::chrono::system_clock::time_point start_time; std::chrono::system_clock::time_point end_time; }; diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index da814dcbc08..f6020deabec 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -218,42 +218,145 @@ namespace } -BackupsWorker::BackupsWorker( - ContextPtr global_context, - size_t num_backup_threads, - size_t num_restore_threads, - bool allow_concurrent_backups_, - bool allow_concurrent_restores_) - : backups_thread_pool(std::make_unique( - CurrentMetrics::BackupsThreads, - CurrentMetrics::BackupsThreadsActive, - num_backup_threads, - /* max_free_threads = */ 0, - num_backup_threads)) - , restores_thread_pool(std::make_unique( - CurrentMetrics::RestoreThreads, - CurrentMetrics::RestoreThreadsActive, - num_restore_threads, - /* max_free_threads = */ 0, - num_restore_threads)) - , backup_async_executor_pool(std::make_unique( - CurrentMetrics::BackupsThreads, - CurrentMetrics::BackupsThreadsActive, - num_backup_threads, - num_backup_threads, - num_backup_threads)) - , restore_async_executor_pool(std::make_unique( - CurrentMetrics::RestoreThreads, - CurrentMetrics::RestoreThreadsActive, - num_restore_threads, - num_restore_threads, - num_restore_threads)) - , log(&Poco::Logger::get("BackupsWorker")) +/// We have to use multiple thread pools because +/// 1) there should be separate thread pools for BACKUP and RESTORE; +/// 2) a task from a thread pool can't wait another task from the same thread pool. (Because if it schedules and waits +/// while the thread pool is still occupied with the waiting task then a scheduled task can be never executed). +enum class BackupsWorker::ThreadPoolId +{ + /// "BACKUP ON CLUSTER ASYNC" waits in background while "BACKUP ASYNC" is finished on the nodes of the cluster, then finalizes the backup. + BACKUP_ASYNC_ON_CLUSTER, + + /// "BACKUP ASYNC" waits in background while all file infos are built and then it copies the backup's files. + BACKUP_ASYNC, + + /// Making a list of files to copy and copying of those files is always sequential, so those operations can share one thread pool. + BACKUP_MAKE_FILES_LIST, + BACKUP_COPY_FILES = BACKUP_MAKE_FILES_LIST, + + /// "RESTORE ON CLUSTER ASYNC" waits in background while "BACKUP ASYNC" is finished on the nodes of the cluster, then finalizes the backup. + RESTORE_ASYNC_ON_CLUSTER, + + /// "RESTORE ASYNC" waits in background while the data of all tables are restored. + RESTORE_ASYNC, + + /// Restores the data of tables. + RESTORE_TABLES_DATA, +}; + + +/// Keeps thread pools for BackupsWorker. +class BackupsWorker::ThreadPools +{ +public: + ThreadPools(size_t num_backup_threads_, size_t num_restore_threads_) + : num_backup_threads(num_backup_threads_), num_restore_threads(num_restore_threads_) + { + } + + /// Returns a thread pool, creates it if it's not created yet. + ThreadPool & getThreadPool(ThreadPoolId thread_pool_id) + { + std::lock_guard lock{mutex}; + auto it = thread_pools.find(thread_pool_id); + if (it != thread_pools.end()) + return *it->second; + + CurrentMetrics::Metric metric_threads; + CurrentMetrics::Metric metric_active_threads; + size_t max_threads = 0; + + /// What to do with a new job if a corresponding thread pool is already running `max_threads` jobs: + /// `use_queue == true` - put into the thread pool's queue, + /// `use_queue == false` - schedule() should wait until some of the jobs finish. 
+ bool use_queue = false; + + switch (thread_pool_id) + { + case ThreadPoolId::BACKUP_ASYNC: + case ThreadPoolId::BACKUP_ASYNC_ON_CLUSTER: + case ThreadPoolId::BACKUP_COPY_FILES: + { + metric_threads = CurrentMetrics::BackupsThreads; + metric_active_threads = CurrentMetrics::BackupsThreadsActive; + max_threads = num_backup_threads; + /// We don't use thread pool queues for thread pools with a lot of tasks otherwise that queue could be memory-wasting. + use_queue = (thread_pool_id != ThreadPoolId::BACKUP_COPY_FILES); + break; + } + + case ThreadPoolId::RESTORE_ASYNC: + case ThreadPoolId::RESTORE_ASYNC_ON_CLUSTER: + case ThreadPoolId::RESTORE_TABLES_DATA: + { + metric_threads = CurrentMetrics::RestoreThreads; + metric_active_threads = CurrentMetrics::RestoreThreadsActive; + max_threads = num_restore_threads; + use_queue = (thread_pool_id != ThreadPoolId::RESTORE_TABLES_DATA); + break; + } + } + + /// We set max_free_threads = 0 because we don't want to keep any threads if there is no BACKUP or RESTORE query running right now. + chassert(max_threads != 0); + size_t max_free_threads = 0; + size_t queue_size = use_queue ? 0 : max_threads; + auto thread_pool = std::make_unique(metric_threads, metric_active_threads, max_threads, max_free_threads, queue_size); + auto * thread_pool_ptr = thread_pool.get(); + thread_pools.emplace(thread_pool_id, std::move(thread_pool)); + return *thread_pool_ptr; + } + + /// Waits for all threads to finish. + void wait() + { + auto wait_sequence = { + ThreadPoolId::RESTORE_ASYNC_ON_CLUSTER, + ThreadPoolId::RESTORE_ASYNC, + ThreadPoolId::RESTORE_TABLES_DATA, + ThreadPoolId::BACKUP_ASYNC_ON_CLUSTER, + ThreadPoolId::BACKUP_ASYNC, + ThreadPoolId::BACKUP_COPY_FILES, + }; + + for (auto thread_pool_id : wait_sequence) + { + ThreadPool * thread_pool = nullptr; + { + std::lock_guard lock{mutex}; + auto it = thread_pools.find(thread_pool_id); + if (it != thread_pools.end()) + thread_pool = it->second.get(); + } + if (thread_pool) + thread_pool->wait(); + } + } + +private: + const size_t num_backup_threads; + const size_t num_restore_threads; + std::map> thread_pools TSA_GUARDED_BY(mutex); + std::mutex mutex; +}; + + +BackupsWorker::BackupsWorker(ContextPtr global_context, size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_) + : thread_pools(std::make_unique(num_backup_threads, num_restore_threads)) , allow_concurrent_backups(allow_concurrent_backups_) , allow_concurrent_restores(allow_concurrent_restores_) + , log(&Poco::Logger::get("BackupsWorker")) { backup_log = global_context->getBackupLog(); - /// We set max_free_threads = 0 because we don't want to keep any threads if there is no BACKUP or RESTORE query running right now. +} + + +BackupsWorker::~BackupsWorker() = default; + + +ThreadPool & BackupsWorker::getThreadPool(ThreadPoolId thread_pool_id) +{ + return thread_pools->getThreadPool(thread_pool_id); } @@ -313,16 +416,9 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context if (backup_settings.async) { - backup_async_executor_pool->scheduleOrThrowOnError( - [this, - backup_query, - backup_id, - backup_name_for_logging, - backup_info, - backup_settings, - backup_coordination, - context_in_use, - mutable_context] + auto & thread_pool = getThreadPool(on_cluster ? 
ThreadPoolId::BACKUP_ASYNC_ON_CLUSTER : ThreadPoolId::BACKUP_ASYNC); + thread_pool.scheduleOrThrowOnError( + [this, backup_query, backup_id, backup_name_for_logging, backup_info, backup_settings, backup_coordination, context_in_use, mutable_context] { doBackup( backup_query, @@ -454,7 +550,9 @@ void BackupsWorker::doBackup( /// Prepare backup entries. BackupEntries backup_entries; { - BackupEntriesCollector backup_entries_collector{backup_query->elements, backup_settings, backup_coordination, backup_create_params.read_settings, context}; + BackupEntriesCollector backup_entries_collector( + backup_query->elements, backup_settings, backup_coordination, + backup_create_params.read_settings, context, getThreadPool(ThreadPoolId::BACKUP_MAKE_FILES_LIST)); backup_entries = backup_entries_collector.run(); } @@ -515,7 +613,7 @@ void BackupsWorker::buildFileInfosForBackupEntries(const BackupPtr & backup, con LOG_TRACE(log, "{}", Stage::BUILDING_FILE_INFOS); backup_coordination->setStage(Stage::BUILDING_FILE_INFOS, ""); backup_coordination->waitForStage(Stage::BUILDING_FILE_INFOS); - backup_coordination->addFileInfos(::DB::buildFileInfosForBackupEntries(backup_entries, backup->getBaseBackup(), read_settings, *backups_thread_pool)); + backup_coordination->addFileInfos(::DB::buildFileInfosForBackupEntries(backup_entries, backup->getBaseBackup(), read_settings, getThreadPool(ThreadPoolId::BACKUP_MAKE_FILES_LIST))); } @@ -541,6 +639,7 @@ void BackupsWorker::writeBackupEntries(BackupMutablePtr backup, BackupEntries && std::exception_ptr exception; bool always_single_threaded = !backup->supportsWritingInMultipleThreads(); + auto & thread_pool = getThreadPool(ThreadPoolId::BACKUP_COPY_FILES); auto thread_group = CurrentThread::getGroup(); for (size_t i = 0; i != backup_entries.size(); ++i) @@ -608,7 +707,7 @@ void BackupsWorker::writeBackupEntries(BackupMutablePtr backup, BackupEntries && continue; } - backups_thread_pool->scheduleOrThrowOnError([job] { job(true); }); + thread_pool.scheduleOrThrowOnError([job] { job(true); }); } { @@ -666,25 +765,19 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt if (restore_settings.async) { - restore_async_executor_pool->scheduleOrThrowOnError( - [this, - restore_query, - restore_id, - backup_name_for_logging, - backup_info, - restore_settings, - restore_coordination, - context_in_use] - { - doRestore( - restore_query, - restore_id, - backup_name_for_logging, - backup_info, - restore_settings, - restore_coordination, - context_in_use, - /* called_async= */ true); + auto & thread_pool = getThreadPool(on_cluster ? ThreadPoolId::RESTORE_ASYNC_ON_CLUSTER : ThreadPoolId::RESTORE_ASYNC); + thread_pool.scheduleOrThrowOnError( + [this, restore_query, restore_id, backup_name_for_logging, backup_info, restore_settings, restore_coordination, context_in_use] + { + doRestore( + restore_query, + restore_id, + backup_name_for_logging, + backup_info, + restore_settings, + restore_coordination, + context_in_use, + /* called_async= */ true); }); } else @@ -818,7 +911,7 @@ void BackupsWorker::doRestore( } /// Execute the data restoring tasks. - restoreTablesData(restore_id, backup, std::move(data_restore_tasks), *restores_thread_pool); + restoreTablesData(restore_id, backup, std::move(data_restore_tasks), getThreadPool(ThreadPoolId::RESTORE_TABLES_DATA)); /// We have restored everything, we need to tell other hosts (they could be waiting for it). 
restore_coordination->setStage(Stage::COMPLETED, ""); @@ -863,23 +956,21 @@ void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr ++num_active_jobs; } - auto job = [&](bool async) + auto job = [&]() { SCOPE_EXIT_SAFE( std::lock_guard lock{mutex}; if (!--num_active_jobs) event.notify_all(); - if (async) - CurrentThread::detachFromGroupIfNotDetached(); + CurrentThread::detachFromGroupIfNotDetached(); ); try { - if (async && thread_group) + if (thread_group) CurrentThread::attachToGroup(thread_group); - if (async) - setThreadName("RestoreWorker"); + setThreadName("RestoreWorker"); { std::lock_guard lock{mutex}; @@ -906,7 +997,7 @@ void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr } }; - thread_pool.scheduleOrThrowOnError([job] { job(true); }); + thread_pool.scheduleOrThrowOnError(job); } { @@ -967,6 +1058,7 @@ void BackupsWorker::setStatus(const String & id, BackupStatus status, bool throw auto old_status = info.status; info.status = status; + info.profile_counters = std::make_shared(CurrentThread::getProfileEvents().getPartiallyAtomicSnapshot()); if (isFinalStatus(status)) info.end_time = std::chrono::system_clock::now(); @@ -1049,10 +1141,7 @@ void BackupsWorker::shutdown() if (has_active_backups_and_restores) LOG_INFO(log, "Waiting for {} backups and {} restores to be finished", num_active_backups, num_active_restores); - backups_thread_pool->wait(); - restores_thread_pool->wait(); - backup_async_executor_pool->wait(); - restore_async_executor_pool->wait(); + thread_pools->wait(); if (has_active_backups_and_restores) LOG_INFO(log, "All backup and restore tasks have finished"); diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index bf7f2e8f1e4..b0a76eb0fa8 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -33,6 +33,7 @@ class BackupsWorker { public: BackupsWorker(ContextPtr global_context, size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_); + ~BackupsWorker(); /// Waits until all tasks have been completed. void shutdown(); @@ -88,11 +89,15 @@ private: void setNumFilesAndSize(const BackupOperationID & id, size_t num_files, UInt64 total_size, size_t num_entries, UInt64 uncompressed_size, UInt64 compressed_size, size_t num_read_files, UInt64 num_read_bytes); - std::unique_ptr backups_thread_pool; - std::unique_ptr restores_thread_pool; + enum class ThreadPoolId; + ThreadPool & getThreadPool(ThreadPoolId thread_pool_id); - std::unique_ptr backup_async_executor_pool; - std::unique_ptr restore_async_executor_pool; + class ThreadPools; + std::unique_ptr thread_pools; + + const bool allow_concurrent_backups; + const bool allow_concurrent_restores; + Poco::Logger * log; std::unordered_map infos; std::shared_ptr backup_log; @@ -100,9 +105,6 @@ private: std::atomic num_active_backups = 0; std::atomic num_active_restores = 0; mutable std::mutex infos_mutex; - Poco::Logger * log; - const bool allow_concurrent_backups; - const bool allow_concurrent_restores; }; } diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h index 660f7d5da22..783cad29b63 100644 --- a/src/Backups/IBackup.h +++ b/src/Backups/IBackup.h @@ -43,7 +43,7 @@ public: /// Returns UUID of the backup. virtual UUID getUUID() const = 0; - /// Returns the base backup (can be null). + /// Returns the base backup or null if there is no base backup. virtual std::shared_ptr getBaseBackup() const = 0; /// Returns the number of files stored in the backup. 
Compare with getNumEntries(). diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f491407cda5..0257b7d329b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,6 +89,17 @@ add_headers_and_sources(clickhouse_common_io IO/Resource) add_headers_and_sources(clickhouse_common_io IO/S3) list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp) + +add_headers_and_sources(clickhouse_compression Compression) +add_headers_and_sources(clickhouse_compression Parsers) +add_headers_and_sources(clickhouse_compression Core) +#Included these specific files to avoid linking grpc +add_glob(clickhouse_compression_headers Server/ServerType.h) +add_glob(clickhouse_compression_sources Server/ServerType.cpp) +add_headers_and_sources(clickhouse_compression Common/SSH) +add_library(clickhouse_compression ${clickhouse_compression_headers} ${clickhouse_compression_sources}) + + add_headers_and_sources(dbms Disks/IO) add_headers_and_sources(dbms Disks/ObjectStorages) if (TARGET ch_contrib::sqlite) @@ -270,6 +281,7 @@ target_include_directories (clickhouse_common_io PUBLIC "${ClickHouse_SOURCE_DIR if (TARGET ch_contrib::llvm) dbms_target_link_libraries (PUBLIC ch_contrib::llvm) + target_link_libraries (clickhouse_compression PUBLIC ch_contrib::llvm) endif () if (TARGET ch_contrib::gwp_asan) @@ -293,6 +305,18 @@ target_link_libraries (clickhouse_common_io common ch_contrib::double_conversion ch_contrib::dragonbox_to_chars + ch_contrib::libdivide +) + + +target_link_libraries (clickhouse_compression + PUBLIC + string_utils + pcg_random + clickhouse_parsers + PRIVATE + ch_contrib::lz4 + ch_contrib::roaring ) # Use X86 AVX2/AVX512 instructions to accelerate filter operations @@ -336,6 +360,7 @@ if (TARGET ch_contrib::crc32-vpmsum) if (TARGET ch_contrib::ssh) target_link_libraries(clickhouse_common_io PUBLIC ch_contrib::ssh) + target_link_libraries(clickhouse_compression PUBLIC ch_contrib::ssh) endif() dbms_target_link_libraries(PUBLIC ch_contrib::abseil_swiss_tables) @@ -359,10 +384,12 @@ endif() if (TARGET ch_contrib::krb5) dbms_target_link_libraries(PRIVATE ch_contrib::krb5) + target_link_libraries (clickhouse_compression PRIVATE ch_contrib::krb5) endif() if (TARGET ch_contrib::nuraft) dbms_target_link_libraries(PUBLIC ch_contrib::nuraft) + target_link_libraries (clickhouse_compression PUBLIC ch_contrib::nuraft) endif() dbms_target_link_libraries ( @@ -432,6 +459,7 @@ endif () if (TARGET ch_contrib::ldap) dbms_target_link_libraries (PRIVATE ch_contrib::ldap ch_contrib::lber) + target_link_libraries (clickhouse_compression PRIVATE ch_contrib::ldap ch_contrib::lber) endif () dbms_target_link_libraries (PUBLIC ch_contrib::sparsehash) diff --git a/src/Client/ConnectionParameters.cpp b/src/Client/ConnectionParameters.cpp index 3942527e5b5..b8669c72cf7 100644 --- a/src/Client/ConnectionParameters.cpp +++ b/src/Client/ConnectionParameters.cpp @@ -30,7 +30,8 @@ ConnectionParameters::ConnectionParameters(const Poco::Util::AbstractConfigurati , port(connection_port.value_or(getPortFromConfig(config))) { bool is_secure = config.getBool("secure", false); - security = is_secure ? Protocol::Secure::Enable : Protocol::Secure::Disable; + bool is_clickhouse_cloud = connection_host.ends_with(".clickhouse.cloud") || connection_host.ends_with(".clickhouse-staging.com"); + security = (is_secure || is_clickhouse_cloud) ? 
Protocol::Secure::Enable : Protocol::Secure::Disable; default_database = config.getString("database", ""); diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp index c135ced7c76..d01c40a9c34 100644 --- a/src/Client/LocalConnection.cpp +++ b/src/Client/LocalConnection.cpp @@ -131,7 +131,7 @@ void LocalConnection::sendQuery( try { - state->io = executeQuery(state->query, query_context, false, state->stage).second; + state->io = executeQuery(state->query, query_context, QueryFlags{}, state->stage).second; if (state->io.pipeline.pushing()) { diff --git a/src/Client/ReplxxLineReader.cpp b/src/Client/ReplxxLineReader.cpp index 49f44e3d0f9..681d06ce583 100644 --- a/src/Client/ReplxxLineReader.cpp +++ b/src/Client/ReplxxLineReader.cpp @@ -293,7 +293,6 @@ ReplxxLineReader::ReplxxLineReader( , word_break_characters(word_break_characters_) , editor(getEditor()) { - using namespace std::placeholders; using Replxx = replxx::Replxx; if (!history_file_path.empty()) diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index 012ae03bab2..baccfc69147 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -6,9 +6,7 @@ #include #include -#include #include -#include #include @@ -20,8 +18,6 @@ #include -template bool decimalLess(T x, T y, UInt32 x_scale, UInt32 y_scale); - namespace DB { diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 5c299e495eb..d9f515b38b1 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -41,12 +41,6 @@ namespace ErrorCodes void abortOnFailedAssertion(const String & description) { LOG_FATAL(&Poco::Logger::root(), "Logical error: '{}'.", description); - - /// This is to suppress -Wmissing-noreturn - volatile bool always_false = false; - if (always_false) - return; - abort(); } diff --git a/src/Common/Exception.h b/src/Common/Exception.h index b2411e256ed..a7ffa8adcd0 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -19,7 +19,7 @@ namespace Poco { class Logger; } namespace DB { -void abortOnFailedAssertion(const String & description); +[[noreturn]] void abortOnFailedAssertion(const String & description); /// This flag can be set for testing purposes - to check that no exceptions are thrown. 
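Marking abortOnFailedAssertion as [[noreturn]] in Exception.h is what allows the `volatile bool always_false` workaround to be deleted from Exception.cpp: once the compiler knows the call never returns, neither -Wmissing-noreturn in the function nor missing-return warnings at call sites need suppression. A small self-contained illustration of the same effect (not the ClickHouse function itself):

```cpp
#include <cstdio>
#include <cstdlib>

/// With [[noreturn]] the compiler treats everything after a call to this function as unreachable.
[[noreturn]] void abortOnFailedAssertion(const char * description)
{
    std::fprintf(stderr, "Logical error: '%s'.\n", description);
    std::abort(); /// no "if (always_false) return;" trick needed to silence -Wmissing-noreturn
}

int positiveOrDie(int x)
{
    if (x > 0)
        return x;
    abortOnFailedAssertion("x must be positive");
    /// and no unreachable "return 0;" is required here
}

int main()
{
    std::printf("%d\n", positiveOrDie(42));
}
```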
extern bool terminate_on_any_exception; diff --git a/src/Common/ObjectStorageKey.cpp b/src/Common/ObjectStorageKey.cpp new file mode 100644 index 00000000000..ca5617c8aa2 --- /dev/null +++ b/src/Common/ObjectStorageKey.cpp @@ -0,0 +1,68 @@ +#include "ObjectStorageKey.h" + +#include + +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} + +const String & ObjectStorageKey::getPrefix() const +{ + if (!is_relative) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "object key has no prefix, key: {}", key); + + return prefix; +} + +const String & ObjectStorageKey::getSuffix() const +{ + if (!is_relative) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "object key has no suffix, key: {}", key); + return suffix; +} + +const String & ObjectStorageKey::serialize() const +{ + return key; +} + +ObjectStorageKey ObjectStorageKey::createAsRelative(String key_) +{ + ObjectStorageKey object_key; + object_key.suffix = std::move(key_); + object_key.key = object_key.suffix; + object_key.is_relative = true; + return object_key; +} + +ObjectStorageKey ObjectStorageKey::createAsRelative(String prefix_, String suffix_) +{ + ObjectStorageKey object_key; + object_key.prefix = std::move(prefix_); + object_key.suffix = std::move(suffix_); + + if (object_key.prefix.empty()) + object_key.key = object_key.suffix; + else + object_key.key = fs::path(object_key.prefix) / object_key.suffix; + + object_key.is_relative = true; + return object_key; +} + +ObjectStorageKey ObjectStorageKey::createAsAbsolute(String key_) +{ + ObjectStorageKey object_key; + object_key.key = std::move(key_); + object_key.is_relative = false; + return object_key; +} +} diff --git a/src/Common/ObjectStorageKey.h b/src/Common/ObjectStorageKey.h new file mode 100644 index 00000000000..7e509b741e4 --- /dev/null +++ b/src/Common/ObjectStorageKey.h @@ -0,0 +1,29 @@ +#pragma once + +#include + +#include + +namespace DB +{ + struct ObjectStorageKey + { + ObjectStorageKey() = default; + + bool hasPrefix() const { return is_relative; } + const String & getPrefix() const; + const String & getSuffix() const; + const String & serialize() const; + + static ObjectStorageKey createAsRelative(String prefix_, String suffix_); + static ObjectStorageKey createAsRelative(String key_); + static ObjectStorageKey createAsAbsolute(String key_); + + private: + String prefix; + String suffix; + String key; + bool is_relative = false; + }; + +} diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 86d11ea48ac..5e22bbd474b 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -101,6 +101,7 @@ M(ReplicatedPartChecks, "Number of times we had to perform advanced search for a data part on replicas or to clarify the need of an existing data part.") \ M(ReplicatedPartChecksFailed, "Number of times the advanced search for a data part on replicas did not give result or when unexpected part has been found and moved away.") \ M(ReplicatedDataLoss, "Number of times a data part that we wanted doesn't exist on any replica (even on replicas that are offline right now). That data parts are definitely lost. This is normal due to asynchronous replication (if quorum inserts were not enabled), when the replica on which the data part was written was failed and when it became online after fail it doesn't contain that data part.") \ + M(ReplicatedCoveredPartsInZooKeeperOnStart, "For debugging purposes. 
Number of parts in ZooKeeper that have a covering part, but doesn't exist on disk. Checked on server start.") \ \ M(InsertedRows, "Number of rows INSERTed to all tables.") \ M(InsertedBytes, "Number of bytes (uncompressed; for columns as they stored in memory) INSERTed to all tables.") \ @@ -546,6 +547,14 @@ The server successfully detected this situation and will download merged part fr M(IOUringCQEsCompleted, "Total number of successfully completed io_uring CQEs") \ M(IOUringCQEsFailed, "Total number of completed io_uring CQEs with failures") \ \ + M(BackupsOpenedForRead, "Number of backups opened for reading") \ + M(BackupsOpenedForWrite, "Number of backups opened for writing") \ + M(BackupReadMetadataMicroseconds, "Time spent reading backup metadata from .backup file") \ + M(BackupWriteMetadataMicroseconds, "Time spent writing backup metadata to .backup file") \ + M(BackupEntriesCollectorMicroseconds, "Time spent making backup entries") \ + M(BackupEntriesCollectorForTablesDataMicroseconds, "Time spent making backup entries for tables data") \ + M(BackupEntriesCollectorRunPostTasksMicroseconds, "Time spent running post tasks after making backup entries") \ + \ M(ReadTaskRequestsReceived, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for s3Cluster table function and similar). Measured on the initiator server side.") \ M(MergeTreeReadTaskRequestsReceived, "The number of callbacks requested from the remote server back to the initiator server to choose the read task (for MergeTree tables). Measured on the initiator server side.") \ \ diff --git a/src/Common/ZooKeeper/CMakeLists.txt b/src/Common/ZooKeeper/CMakeLists.txt index 3f7e87ff4a7..aa06375bd6a 100644 --- a/src/Common/ZooKeeper/CMakeLists.txt +++ b/src/Common/ZooKeeper/CMakeLists.txt @@ -10,6 +10,7 @@ target_compile_definitions (clickhouse_common_zookeeper PRIVATE -DZOOKEEPER_LOG) target_link_libraries (clickhouse_common_zookeeper PUBLIC clickhouse_common_io + clickhouse_compression common PRIVATE string_utils @@ -20,6 +21,7 @@ add_library(clickhouse_common_zookeeper_no_log ${clickhouse_common_zookeeper_hea target_link_libraries (clickhouse_common_zookeeper_no_log PUBLIC clickhouse_common_io + clickhouse_compression common PRIVATE string_utils diff --git a/src/Common/ZooKeeper/ZooKeeperArgs.cpp b/src/Common/ZooKeeper/ZooKeeperArgs.cpp index 55ba2d02e55..539ecdd0204 100644 --- a/src/Common/ZooKeeper/ZooKeeperArgs.cpp +++ b/src/Common/ZooKeeper/ZooKeeperArgs.cpp @@ -214,6 +214,10 @@ void ZooKeeperArgs::initFromKeeperSection(const Poco::Util::AbstractConfiguratio .max_sec = config.getUInt(config_name + "." + key + ".max"), }; } + else if (key == "use_compression") + { + use_compression = config.getBool(config_name + "." 
+ key); + } else throw KeeperException(Coordination::Error::ZBADARGUMENTS, "Unknown key {} in config file", key); } diff --git a/src/Common/ZooKeeper/ZooKeeperArgs.h b/src/Common/ZooKeeper/ZooKeeperArgs.h index 9c48de02f61..ff44a1b191b 100644 --- a/src/Common/ZooKeeper/ZooKeeperArgs.h +++ b/src/Common/ZooKeeper/ZooKeeperArgs.h @@ -44,6 +44,7 @@ struct ZooKeeperArgs double recv_sleep_probability = 0.0; UInt64 send_sleep_ms = 0; UInt64 recv_sleep_ms = 0; + bool use_compression = false; SessionLifetimeConfiguration fallback_session_lifetime = {}; DB::GetPriorityForLoadBalancing get_priority_load_balancing; diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index e9803d165e1..592d142e925 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -27,7 +27,6 @@ void ZooKeeperResponse::write(WriteBuffer & out) const if (error == Error::ZOK) writeImpl(buf); Coordination::write(buf.str(), out); - out.next(); } std::string ZooKeeperRequest::toString() const @@ -49,7 +48,6 @@ void ZooKeeperRequest::write(WriteBuffer & out) const Coordination::write(getOpNum(), buf); writeImpl(buf); Coordination::write(buf.str(), out); - out.next(); } void ZooKeeperSyncRequest::writeImpl(WriteBuffer & out) const diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.h b/src/Common/ZooKeeper/ZooKeeperConstants.h index a1d1be86a34..1a868963b57 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.h +++ b/src/Common/ZooKeeper/ZooKeeperConstants.h @@ -46,6 +46,7 @@ enum class OpNum : int32_t OpNum getOpNum(int32_t raw_op_num); static constexpr int32_t ZOOKEEPER_PROTOCOL_VERSION = 0; +static constexpr int32_t ZOOKEEPER_PROTOCOL_VERSION_WITH_COMPRESSION = 10; static constexpr int32_t KEEPER_PROTOCOL_VERSION_CONNECTION_REJECT = 42; static constexpr int32_t CLIENT_HANDSHAKE_LENGTH = 44; static constexpr int32_t CLIENT_HANDSHAKE_LENGTH_WITH_READONLY = 45; diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 709dabf1506..fd845016f8a 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include "Coordination/KeeperConstants.h" #include "config.h" @@ -274,13 +277,34 @@ using namespace DB; template void ZooKeeper::write(const T & x) { - Coordination::write(x, *out); + Coordination::write(x, getWriteBuffer()); } template void ZooKeeper::read(T & x) { - Coordination::read(x, *in); + Coordination::read(x, getReadBuffer()); +} + +WriteBuffer & ZooKeeper::getWriteBuffer() +{ + if (compressed_out) + return *compressed_out; + return *out; +} + +void ZooKeeper::flushWriteBuffer() +{ + if (compressed_out) + compressed_out->next(); + out->next(); +} + +ReadBuffer & ZooKeeper::getReadBuffer() +{ + if (compressed_in) + return *compressed_in; + return *in; } static void removeRootPath(String & path, const String & chroot) @@ -345,7 +369,23 @@ ZooKeeper::ZooKeeper( if (args.enable_fault_injections_during_startup) setupFaultDistributions(); - connect(nodes, args.connection_timeout_ms * 1000); + try + { + use_compression = args.use_compression; + connect(nodes, args.connection_timeout_ms * 1000); + } + catch (...) 
+ { + /// If we get exception & compression is enabled, then its possible that keeper does not support compression, + /// try without compression + if (use_compression) + { + use_compression = false; + connect(nodes, args.connection_timeout_ms * 1000); + } + else + throw; + } if (!args.auth_scheme.empty()) sendAuth(args.auth_scheme, args.identity); @@ -424,6 +464,8 @@ void ZooKeeper::connect( in.emplace(socket); out.emplace(socket); + compressed_in.reset(); + compressed_out.reset(); try { @@ -444,7 +486,14 @@ void ZooKeeper::connect( e.addMessage("while receiving handshake from ZooKeeper"); throw; } + connected = true; + if (use_compression) + { + compressed_in.emplace(*in); + compressed_out.emplace(*out, CompressionCodecFactory::instance().get("LZ4", {})); + } + original_index = static_cast(node.original_index); if (i != 0) @@ -511,16 +560,17 @@ void ZooKeeper::sendHandshake() std::array passwd {}; write(handshake_length); - write(ZOOKEEPER_PROTOCOL_VERSION); + if (use_compression) + write(ZOOKEEPER_PROTOCOL_VERSION_WITH_COMPRESSION); + else + write(ZOOKEEPER_PROTOCOL_VERSION); write(last_zxid_seen); write(timeout); write(previous_session_id); write(passwd); - - out->next(); + flushWriteBuffer(); } - void ZooKeeper::receiveHandshake() { int32_t handshake_length; @@ -533,18 +583,22 @@ void ZooKeeper::receiveHandshake() throw Exception(Error::ZMARSHALLINGERROR, "Unexpected handshake length received: {}", handshake_length); read(protocol_version_read); - if (protocol_version_read != ZOOKEEPER_PROTOCOL_VERSION) + + /// Special way to tell a client that server is not ready to serve it. + /// It's better for faster failover than just connection drop. + /// Implemented in clickhouse-keeper. + if (protocol_version_read == KEEPER_PROTOCOL_VERSION_CONNECTION_REJECT) + throw Exception::fromMessage(Error::ZCONNECTIONLOSS, + "Keeper server rejected the connection during the handshake. " + "Possibly it's overloaded, doesn't see leader or stale"); + + if (use_compression) { - /// Special way to tell a client that server is not ready to serve it. - /// It's better for faster failover than just connection drop. - /// Implemented in clickhouse-keeper. - if (protocol_version_read == KEEPER_PROTOCOL_VERSION_CONNECTION_REJECT) - throw Exception::fromMessage(Error::ZCONNECTIONLOSS, - "Keeper server rejected the connection during the handshake. 
" - "Possibly it's overloaded, doesn't see leader or stale"); - else - throw Exception(Error::ZMARSHALLINGERROR, "Unexpected protocol version: {}", protocol_version_read); + if (protocol_version_read != ZOOKEEPER_PROTOCOL_VERSION_WITH_COMPRESSION) + throw Exception(Error::ZMARSHALLINGERROR,"Unexpected protocol version with compression: {}", protocol_version_read); } + else if (protocol_version_read != ZOOKEEPER_PROTOCOL_VERSION) + throw Exception(Error::ZMARSHALLINGERROR, "Unexpected protocol version: {}", protocol_version_read); read(timeout); if (timeout != args.session_timeout_ms) @@ -562,7 +616,8 @@ void ZooKeeper::sendAuth(const String & scheme, const String & data) request.scheme = scheme; request.data = data; request.xid = AUTH_XID; - request.write(*out); + request.write(getWriteBuffer()); + flushWriteBuffer(); int32_t length; XID read_xid; @@ -578,10 +633,14 @@ void ZooKeeper::sendAuth(const String & scheme, const String & data) if (read_xid != AUTH_XID) throw Exception(Error::ZMARSHALLINGERROR, "Unexpected event received in reply to auth request: {}", read_xid); - int32_t actual_length = static_cast(in->count() - count_before_event); - if (length != actual_length) + if (!use_compression) + { + int32_t actual_length = static_cast(in->count() - count_before_event); + if (length != actual_length) throw Exception(Error::ZMARSHALLINGERROR, "Response length doesn't match. Expected: {}, actual: {}", length, actual_length); + } + if (err != Error::ZOK) throw Exception(Error::ZMARSHALLINGERROR, "Error received in reply to auth request. Code: {}. Message: {}", static_cast(err), err); @@ -637,7 +696,8 @@ void ZooKeeper::sendThread() info.request->addRootPath(args.chroot); info.request->probably_sent = true; - info.request->write(*out); + info.request->write(getWriteBuffer()); + flushWriteBuffer(); logOperationIfNeeded(info.request); @@ -653,7 +713,8 @@ void ZooKeeper::sendThread() ZooKeeperHeartbeatRequest request; request.xid = PING_XID; - request.write(*out); + request.write(getWriteBuffer()); + flushWriteBuffer(); } ProfileEvents::increment(ProfileEvents::ZooKeeperBytesSent, out->count() - prev_bytes_sent); @@ -825,7 +886,7 @@ void ZooKeeper::receiveEvent() } else { - response->readImpl(*in); + response->readImpl(getReadBuffer()); response->removeRootPath(args.chroot); } /// Instead of setting the watch in sendEvent, set it in receiveEvent because need to check the response. @@ -858,9 +919,14 @@ void ZooKeeper::receiveEvent() } } - int32_t actual_length = static_cast(in->count() - count_before_event); - if (length != actual_length) - throw Exception(Error::ZMARSHALLINGERROR, "Response length doesn't match. Expected: {}, actual: {}", length, actual_length); + if (!use_compression) + { + int32_t actual_length = static_cast(in->count() - count_before_event); + + if (length != actual_length) + throw Exception(Error::ZMARSHALLINGERROR, "Response length doesn't match. Expected: {}, actual: {}", + length, actual_length); + } logOperationIfNeeded(request_info.request, response, /* finalize= */ false, elapsed_ms); } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index 8b363398200..13e1dc9e3cd 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include @@ -239,8 +241,13 @@ private: Poco::Net::StreamSocket socket; /// To avoid excessive getpeername(2) calls. 
Poco::Net::SocketAddress socket_address; + std::optional in; std::optional out; + std::optional compressed_in; + std::optional compressed_out; + + bool use_compression = false; int64_t session_id = 0; @@ -328,6 +335,10 @@ private: template void read(T &); + WriteBuffer & getWriteBuffer(); + void flushWriteBuffer(); + ReadBuffer & getReadBuffer(); + void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false, UInt64 elapsed_ms = 0); void initFeatureFlags(); diff --git a/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h b/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h index 4887e896e9b..dec3213fbc4 100644 --- a/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h +++ b/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h @@ -7,6 +7,7 @@ #include #include #include "Coordination/KeeperConstants.h" +#include namespace DB { @@ -42,7 +43,7 @@ public: } private: - std::mt19937_64 rndgen; + pcg64_fast rndgen; std::bernoulli_distribution distribution; }; diff --git a/src/Common/ZooKeeper/examples/CMakeLists.txt b/src/Common/ZooKeeper/examples/CMakeLists.txt index e8932fd3088..a99fbe55dd8 100644 --- a/src/Common/ZooKeeper/examples/CMakeLists.txt +++ b/src/Common/ZooKeeper/examples/CMakeLists.txt @@ -2,7 +2,7 @@ clickhouse_add_executable(zkutil_test_commands zkutil_test_commands.cpp) target_link_libraries(zkutil_test_commands PRIVATE clickhouse_common_zookeeper_no_log) clickhouse_add_executable(zkutil_test_commands_new_lib zkutil_test_commands_new_lib.cpp) -target_link_libraries(zkutil_test_commands_new_lib PRIVATE clickhouse_common_zookeeper_no_log string_utils) +target_link_libraries(zkutil_test_commands_new_lib PRIVATE clickhouse_common_zookeeper_no_log clickhouse_compression string_utils) clickhouse_add_executable(zkutil_test_async zkutil_test_async.cpp) target_link_libraries(zkutil_test_async PRIVATE clickhouse_common_zookeeper_no_log) diff --git a/src/Common/mysqlxx/PoolWithFailover.cpp b/src/Common/mysqlxx/PoolWithFailover.cpp index 190522c704a..df2e3b61c33 100644 --- a/src/Common/mysqlxx/PoolWithFailover.cpp +++ b/src/Common/mysqlxx/PoolWithFailover.cpp @@ -2,7 +2,9 @@ #include #include #include +#include #include +#include #include #include @@ -44,10 +46,7 @@ PoolWithFailover::PoolWithFailover( /// PoolWithFailover objects are stored in a cache inside PoolFactory. /// This cache is reset by ExternalDictionariesLoader after every SYSTEM RELOAD DICTIONAR{Y|IES} /// which triggers massive re-constructing of connection pools. - /// The state of PRNGs like std::mt19937 is considered to be quite heavy - /// thus here we attempt to optimize its construction. 
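Several hunks in this area (ZooKeeperWithFaultInjection above, PoolWithFailover right here, and CompressionCodecDeflateQpl further down) swap std::mt19937 for pcg64_fast seeded via randomSeed(), as the replacement line just below shows: the PCG engine keeps only 128 bits of state versus std::mt19937's roughly 2.5 KB, so a thread_local instance is cheap to construct. A sketch of the replacement pattern, assuming the pcg_random.hpp single-header library is on the include path (it ships in ClickHouse's contrib) and using std::random_device in place of ClickHouse's randomSeed():

```cpp
#include <pcg_random.hpp> /// assumption: contrib/pcg-random is on the include path
#include <cstdio>
#include <random>

/// Cheap per-thread generator, seeded once; replaces the removed
/// std::mt19937(hash(thread id) + clock) construction.
static thread_local pcg64_fast rnd_generator(std::random_device{}());

size_t pickReplica(size_t replica_count)
{
    std::uniform_int_distribution<size_t> distribution(0, replica_count - 1);
    return distribution(rnd_generator);
}

int main()
{
    std::printf("chosen replica: %zu\n", pickReplica(3));
}
```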
- static thread_local std::mt19937 rnd_generator(static_cast( - std::hash{}(std::this_thread::get_id()) + std::clock())); + static thread_local pcg64_fast rnd_generator(randomSeed()); for (auto & [_, replicas] : replicas_by_priority) { if (replicas.size() > 1) diff --git a/src/Common/randomSeed.cpp b/src/Common/randomSeed.cpp index e1aa56fa811..e10ef87283f 100644 --- a/src/Common/randomSeed.cpp +++ b/src/Common/randomSeed.cpp @@ -13,10 +13,10 @@ namespace DB { - namespace ErrorCodes - { - extern const int CANNOT_CLOCK_GETTIME; - } +namespace ErrorCodes +{ + extern const int CANNOT_CLOCK_GETTIME; +} } diff --git a/src/Compression/CompressedReadBuffer.h b/src/Compression/CompressedReadBuffer.h index 0c537d171c4..bbbea2e967e 100644 --- a/src/Compression/CompressedReadBuffer.h +++ b/src/Compression/CompressedReadBuffer.h @@ -16,8 +16,8 @@ private: bool nextImpl() override; public: - explicit CompressedReadBuffer(ReadBuffer & in_, bool allow_different_codecs_ = false) - : CompressedReadBufferBase(&in_, allow_different_codecs_), BufferWithOwnMemory(0) + explicit CompressedReadBuffer(ReadBuffer & in_, bool allow_different_codecs_ = false, bool external_data_ = false) + : CompressedReadBufferBase(&in_, allow_different_codecs_, external_data_), BufferWithOwnMemory(0) { } diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index dd19955d010..e416fadc829 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -114,7 +114,8 @@ static void readHeaderAndGetCodecAndSize( CompressionCodecPtr & codec, size_t & size_decompressed, size_t & size_compressed_without_checksum, - bool allow_different_codecs) + bool allow_different_codecs, + bool external_data) { uint8_t method = ICompressionCodec::readMethod(compressed_buffer); @@ -136,8 +137,11 @@ static void readHeaderAndGetCodecAndSize( } } - size_compressed_without_checksum = ICompressionCodec::readCompressedBlockSize(compressed_buffer); - size_decompressed = ICompressionCodec::readDecompressedBlockSize(compressed_buffer); + if (external_data) + codec->setExternalDataFlag(); + + size_compressed_without_checksum = codec->readCompressedBlockSize(compressed_buffer); + size_decompressed = codec->readDecompressedBlockSize(compressed_buffer); /// This is for clang static analyzer. 
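The new external_data flag threaded through CompressedReadBufferBase here (and setExternalDataFlag() in ICompressionCodec further down) only changes which error code a failed header validation throws: CANNOT_DECOMPRESS for blocks received from outside, instead of CORRUPTED_DATA for the server's own on-disk data. A generic, self-contained illustration of that provenance-dependent error reporting (the names are illustrative, not the ClickHouse classes):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>

/// Which error a validation failure is reported as depends on where the block came from.
enum class ErrorCode { CORRUPTED_DATA, CANNOT_DECOMPRESS };

struct DecompressionError : std::runtime_error
{
    ErrorCode code;
    DecompressionError(ErrorCode code_, const std::string & message) : std::runtime_error(message), code(code_) {}
};

class BlockHeaderReader
{
public:
    /// external_data = true: the block came from an untrusted source (e.g. the network),
    /// so a bad header means "cannot decompress", not "our storage is corrupted".
    explicit BlockHeaderReader(bool external_data)
        : error_code(external_data ? ErrorCode::CANNOT_DECOMPRESS : ErrorCode::CORRUPTED_DATA) {}

    uint32_t readCompressedBlockSize(const uint8_t * header) const
    {
        uint32_t size = uint32_t(header[1]) | (uint32_t(header[2]) << 8)
                      | (uint32_t(header[3]) << 16) | (uint32_t(header[4]) << 24);
        if (size == 0)
            throw DecompressionError(error_code, "header is corrupt: compressed block size is 0");
        return size;
    }

private:
    ErrorCode error_code; /// fixed at construction, like ICompressionCodec::decompression_error_code
};

int main()
{
    const uint8_t header[9] = {0x82, 0x10, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00};
    BlockHeaderReader reader(/*external_data=*/ true);
    return reader.readCompressedBlockSize(header) == 0x10 ? 0 : 1;
}
```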
assert(size_decompressed > 0); @@ -170,7 +174,8 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, codec, size_decompressed, size_compressed_without_checksum, - allow_different_codecs); + allow_different_codecs, + external_data); auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); @@ -221,7 +226,8 @@ size_t CompressedReadBufferBase::readCompressedDataBlockForAsynchronous(size_t & codec, size_decompressed, size_compressed_without_checksum, - allow_different_codecs); + allow_different_codecs, + external_data); auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); @@ -254,7 +260,8 @@ size_t CompressedReadBufferBase::readCompressedDataBlockForAsynchronous(size_t & } } -static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, bool allow_different_codecs) +static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, + bool allow_different_codecs, bool external_data) { ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks); ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed); @@ -278,17 +285,20 @@ static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_de getHexUIntLowercase(method), getHexUIntLowercase(codec->getMethodByte())); } } + + if (external_data) + codec->setExternalDataFlag(); } void CompressedReadBufferBase::decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum) { - readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs); + readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs, external_data); codec->decompress(compressed_buffer, static_cast(size_compressed_without_checksum), to); } void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum) { - readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs); + readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs, external_data); if (codec->isNone()) { @@ -320,8 +330,8 @@ void CompressedReadBufferBase::setDecompressMode(ICompressionCodec::CodecMode mo } /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'. -CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_) - : compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_) +CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_, bool external_data_) + : compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_), external_data(external_data_) { } diff --git a/src/Compression/CompressedReadBufferBase.h b/src/Compression/CompressedReadBufferBase.h index baea4d2b855..0a995f012fd 100644 --- a/src/Compression/CompressedReadBufferBase.h +++ b/src/Compression/CompressedReadBufferBase.h @@ -30,6 +30,9 @@ protected: /// Allow reading data, compressed by different codecs from one file. bool allow_different_codecs; + /// Report decompression errors as CANNOT_DECOMPRESS, not CORRUPTED_DATA + bool external_data; + /// Read compressed data into compressed_buffer. Get size of decompressed data from block header. Checksum if need. 
/// /// If always_copy is true then even if the compressed block is already stored in compressed_in.buffer() @@ -67,7 +70,7 @@ protected: public: /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'. - explicit CompressedReadBufferBase(ReadBuffer * in = nullptr, bool allow_different_codecs_ = false); + explicit CompressedReadBufferBase(ReadBuffer * in = nullptr, bool allow_different_codecs_ = false, bool external_data_ = false); virtual ~CompressedReadBufferBase(); /** Disable checksums. diff --git a/src/Compression/CompressionCodecDeflateQpl.cpp b/src/Compression/CompressionCodecDeflateQpl.cpp index 0737e523ba0..76dc5f824e3 100644 --- a/src/Compression/CompressionCodecDeflateQpl.cpp +++ b/src/Compression/CompressionCodecDeflateQpl.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "libaccel_config.h" #include @@ -29,7 +30,7 @@ DeflateQplJobHWPool & DeflateQplJobHWPool::instance() DeflateQplJobHWPool::DeflateQplJobHWPool() : max_hw_jobs(0) - , random_engine(std::random_device()()) + , random_engine(randomSeed()) { Poco::Logger * log = &Poco::Logger::get("DeflateQplJobHWPool"); const char * qpl_version = qpl_get_library_version(); diff --git a/src/Compression/CompressionCodecDeflateQpl.h b/src/Compression/CompressionCodecDeflateQpl.h index 8d73568707e..e0ec791dfe5 100644 --- a/src/Compression/CompressionCodecDeflateQpl.h +++ b/src/Compression/CompressionCodecDeflateQpl.h @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace Poco @@ -41,7 +42,7 @@ private: std::unique_ptr hw_job_ptr_locks; bool job_pool_ready; - std::mt19937 random_engine; + pcg64_fast random_engine; std::uniform_int_distribution distribution; }; diff --git a/src/Compression/CompressionCodecMultiple.cpp b/src/Compression/CompressionCodecMultiple.cpp index dba67749e4d..b1eb7fb50c3 100644 --- a/src/Compression/CompressionCodecMultiple.cpp +++ b/src/Compression/CompressionCodecMultiple.cpp @@ -14,12 +14,6 @@ namespace DB { - -namespace ErrorCodes -{ - extern const int CORRUPTED_DATA; -} - CompressionCodecMultiple::CompressionCodecMultiple(Codecs codecs_) : codecs(codecs_) { @@ -79,7 +73,7 @@ UInt32 CompressionCodecMultiple::doCompressData(const char * source, UInt32 sour void CompressionCodecMultiple::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 decompressed_size) const { if (source_size < 1 || !source[0]) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Wrong compression methods list"); + throw Exception(decompression_error_code, "Wrong compression methods list"); UInt8 compression_methods_size = source[0]; @@ -95,10 +89,10 @@ void CompressionCodecMultiple::doDecompressData(const char * source, UInt32 sour auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); compressed_buf.resize(compressed_buf.size() + additional_size_at_the_end_of_buffer); - UInt32 uncompressed_size = ICompressionCodec::readDecompressedBlockSize(compressed_buf.data()); + UInt32 uncompressed_size = readDecompressedBlockSize(compressed_buf.data()); if (idx == 0 && uncompressed_size != decompressed_size) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Wrong final decompressed size in codec Multiple, got {}, expected {}", + throw Exception(decompression_error_code, "Wrong final decompressed size in codec Multiple, got {}, expected {}", uncompressed_size, decompressed_size); uncompressed_buf.resize(uncompressed_size + additional_size_at_the_end_of_buffer); diff --git a/src/Compression/CompressionCodecT64.cpp 
b/src/Compression/CompressionCodecT64.cpp index 3506c087b54..832b47bdbd0 100644 --- a/src/Compression/CompressionCodecT64.cpp +++ b/src/Compression/CompressionCodecT64.cpp @@ -66,6 +66,7 @@ namespace ErrorCodes extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE; extern const int ILLEGAL_CODEC_PARAMETER; extern const int LOGICAL_ERROR; + extern const int INCORRECT_DATA; } namespace @@ -145,7 +146,7 @@ TypeIndex deserializeTypeId(uint8_t serialized_type_id) case MagicNumber::IPv4: return TypeIndex::IPv4; } - throw Exception(ErrorCodes::LOGICAL_ERROR, "Bad magic number in T64 codec: {}", static_cast(serialized_type_id)); + throw Exception(ErrorCodes::INCORRECT_DATA, "Bad magic number in T64 codec: {}", static_cast(serialized_type_id)); } @@ -378,13 +379,6 @@ void transpose(const T * src, char * dst, UInt32 num_bits, UInt32 tail = 64) /// UInt64[N] transposed matrix -> UIntX[64] template -#if defined(__s390x__) - -/* Compiler Bug for S390x :- https://github.com/llvm/llvm-project/issues/62572 - * Please remove this after the fix is backported - */ - __attribute__((noinline)) -#endif void reverseTranspose(const char * src, T * buf, UInt32 num_bits, UInt32 tail = 64) { UInt64 matrix[64] = {}; @@ -544,12 +538,13 @@ void decompressData(const char * src, UInt32 bytes_size, char * dst, UInt32 unco static constexpr const UInt32 header_size = 2 * sizeof(UInt64); if (bytes_size < header_size) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress, data size {} is less then T64 header", + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress, data size ({}) is less than the size of T64 header", bytes_size); if (uncompressed_size % sizeof(T)) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress, unexpected uncompressed size {}", - uncompressed_size); + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress, unexpected uncompressed size ({})" + " isn't a multiple of the data type size ({})", + uncompressed_size, sizeof(T)); UInt64 num_elements = uncompressed_size / sizeof(T); MinMaxType min; @@ -576,14 +571,20 @@ void decompressData(const char * src, UInt32 bytes_size, char * dst, UInt32 unco UInt32 dst_shift = sizeof(T) * matrix_size; if (!bytes_size || bytes_size % src_shift) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress, data size {} is not multiplier of {}", - bytes_size, toString(src_shift)); + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress, data size ({}) is not a multiplier of {}", + bytes_size, src_shift); UInt32 num_full = bytes_size / src_shift; UInt32 tail = num_elements % matrix_size; if (tail) --num_full; + UInt64 expected = static_cast(num_full) * matrix_size + tail; /// UInt64 to avoid overflow. 
+ if (expected != num_elements) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress, the number of elements in the compressed data ({})" + " is not equal to the expected number of elements in the decompressed data ({})", + expected, num_elements); + T upper_min = 0; T upper_max [[maybe_unused]] = 0; T sign_bit [[maybe_unused]] = 0; diff --git a/src/Compression/ICompressionCodec.cpp b/src/Compression/ICompressionCodec.cpp index a8257c4331f..b4cd6864030 100644 --- a/src/Compression/ICompressionCodec.cpp +++ b/src/Compression/ICompressionCodec.cpp @@ -15,8 +15,6 @@ namespace DB namespace ErrorCodes { - extern const int CANNOT_DECOMPRESS; - extern const int CORRUPTED_DATA; extern const int LOGICAL_ERROR; } @@ -97,14 +95,14 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch UInt8 header_size = getHeaderSize(); if (source_size < header_size) - throw Exception(ErrorCodes::CORRUPTED_DATA, + throw Exception(decompression_error_code, "Can't decompress data: the compressed data size ({}, this should include header size) " "is less than the header size ({})", source_size, static_cast(header_size)); uint8_t our_method = getMethodByte(); uint8_t method = source[0]; if (method != our_method) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Can't decompress data with codec byte {} using codec with byte {}", method, our_method); + throw Exception(decompression_error_code, "Can't decompress data with codec byte {} using codec with byte {}", method, our_method); UInt32 decompressed_size = readDecompressedBlockSize(source); doDecompressData(&source[header_size], source_size - header_size, dest, decompressed_size); @@ -112,20 +110,20 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch return decompressed_size; } -UInt32 ICompressionCodec::readCompressedBlockSize(const char * source) +UInt32 ICompressionCodec::readCompressedBlockSize(const char * source) const { UInt32 compressed_block_size = unalignedLoadLittleEndian(&source[1]); if (compressed_block_size == 0) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: header is corrupt with compressed block size 0"); + throw Exception(decompression_error_code, "Can't decompress data: header is corrupt with compressed block size 0"); return compressed_block_size; } -UInt32 ICompressionCodec::readDecompressedBlockSize(const char * source) +UInt32 ICompressionCodec::readDecompressedBlockSize(const char * source) const { UInt32 decompressed_block_size = unalignedLoadLittleEndian(&source[5]); if (decompressed_block_size == 0) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: header is corrupt with decompressed block size 0"); + throw Exception(decompression_error_code, "Can't decompress data: header is corrupt with decompressed block size 0"); return decompressed_block_size; } diff --git a/src/Compression/ICompressionCodec.h b/src/Compression/ICompressionCodec.h index 6630838fa64..ca794511268 100644 --- a/src/Compression/ICompressionCodec.h +++ b/src/Compression/ICompressionCodec.h @@ -13,6 +13,12 @@ namespace DB extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size); +namespace ErrorCodes +{ + extern const int CANNOT_DECOMPRESS; + extern const int CORRUPTED_DATA; +} + /** * Represents interface for compression codecs like LZ4, ZSTD, etc. 
*/ @@ -59,7 +65,10 @@ public: CodecMode getDecompressMode() const{ return decompressMode; } /// if set mode to CodecMode::Asynchronous, must be followed with flushAsynchronousDecompressRequests - void setDecompressMode(CodecMode mode){ decompressMode = mode; } + void setDecompressMode(CodecMode mode) { decompressMode = mode; } + + /// Report decompression errors as CANNOT_DECOMPRESS, not CORRUPTED_DATA + void setExternalDataFlag() { decompression_error_code = ErrorCodes::CANNOT_DECOMPRESS; } /// Flush result for previous asynchronous decompression requests. /// This function must be called following several requests offload to HW. @@ -82,10 +91,10 @@ public: static constexpr UInt8 getHeaderSize() { return COMPRESSED_BLOCK_HEADER_SIZE; } /// Read size of compressed block from compressed source - static UInt32 readCompressedBlockSize(const char * source); + UInt32 readCompressedBlockSize(const char * source) const; /// Read size of decompressed block from compressed source - static UInt32 readDecompressedBlockSize(const char * source); + UInt32 readDecompressedBlockSize(const char * source) const; /// Read method byte from compressed source static uint8_t readMethod(const char * source); @@ -131,6 +140,8 @@ protected: /// Construct and set codec description from codec name and arguments. Must be called in codec constructor. void setCodecDescription(const String & name, const ASTs & arguments = {}); + int decompression_error_code = ErrorCodes::CORRUPTED_DATA; + private: ASTPtr full_codec_desc; CodecMode decompressMode{CodecMode::Synchronous}; diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 1f27823182a..cdd691f6a79 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -36,7 +36,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco } -const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld,rclc,clrs,ftfl"; +const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld,rclc,clrs,ftfl,ydld"; KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 0398e12d07d..3cbfa3e449d 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -32,6 +32,7 @@ struct Settings; M(Milliseconds, shutdown_timeout, 5000, "How much time we will wait until RAFT shutdown", 0) \ M(Milliseconds, session_shutdown_timeout, 10000, "How much time we will wait until sessions are closed during shutdown", 0) \ M(Milliseconds, startup_timeout, 180000, "How much time we will wait until RAFT to start.", 0) \ + M(Milliseconds, sleep_before_leader_change_ms, 8000, "How much time we will wait before removing leader (so as leader could commit accepted but non-committed commands and they won't be lost -- leader removal is not synchronized with committing)", 0) \ M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. 
Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ M(UInt64, rotate_log_storage_interval, 100000, "How many records will be stored in one log storage file", 0) \ M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \ diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 1bec17f2050..be2c5ebd071 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -172,6 +172,9 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat FourLetterCommandPtr feature_flags_command = std::make_shared(keeper_dispatcher); factory.registerCommand(feature_flags_command); + FourLetterCommandPtr yield_leadership_command = std::make_shared(keeper_dispatcher); + factory.registerCommand(yield_leadership_command); + factory.initializeAllowList(keeper_dispatcher); factory.setInitialize(true); } @@ -579,4 +582,10 @@ String FeatureFlagsCommand::run() return ret.str(); } +String YieldLeadershipCommand::run() +{ + keeper_dispatcher.yieldLeadership(); + return "Sent yield leadership request to leader."; +} + } diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index 0520da06b6d..bb3c616e080 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -415,4 +415,17 @@ struct FeatureFlagsCommand : public IFourLetterCommand ~FeatureFlagsCommand() override = default; }; +/// Yield leadership and become follower. +struct YieldLeadershipCommand : public IFourLetterCommand +{ + explicit YieldLeadershipCommand(KeeperDispatcher & keeper_dispatcher_) + : IFourLetterCommand(keeper_dispatcher_) + { + } + + String name() override { return "ydld"; } + String run() override; + ~YieldLeadershipCommand() override = default; +}; + } diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 9e0cdbd6cd3..ca454c18084 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -803,6 +803,8 @@ void KeeperDispatcher::clusterUpdateWithReconfigDisabledThread() void KeeperDispatcher::clusterUpdateThread() { + using enum KeeperServer::ConfigUpdateState; + bool last_command_was_leader_change = false; auto & shutdown_called = keeper_context->shutdown_called; while (!shutdown_called) { @@ -810,13 +812,18 @@ void KeeperDispatcher::clusterUpdateThread() if (!cluster_update_queue.pop(action)) return; - if (server->applyConfigUpdate(action)) + if (const auto res = server->applyConfigUpdate(action, last_command_was_leader_change); res == Accepted) LOG_DEBUG(log, "Processing config update {}: accepted", action); - else // TODO (myrrc) sleep a random amount? sleep less? + else { + last_command_was_leader_change = res == WaitBeforeChangingLeader; + (void)cluster_update_queue.pushFront(action); LOG_DEBUG(log, "Processing config update {}: declined, backoff", action); - std::this_thread::sleep_for(50ms); + + std::this_thread::sleep_for(last_command_was_leader_change + ? configuration_and_settings->coordination_settings->sleep_before_leader_change_ms + : 50ms); } } } diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index 39941f55d5e..6483de7bd19 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -237,6 +237,12 @@ public: return server->requestLeader(); } + /// Yield leadership and become follower. 
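Taken together, the KeeperDispatcher changes above and the KeeperServer::yieldLeadership() declared just below implement a two-step removal of the current leader: the first attempt is answered with WaitBeforeChangingLeader, the action goes back onto the queue, and only after sleeping sleep_before_leader_change_ms (8000 ms by default) is the leadership change actually issued, giving the leader time to commit what it has already accepted. A condensed, self-contained sketch of that retry loop with stand-in types and a shortened sleep (not the NuRaft-backed implementation):

```cpp
#include <chrono>
#include <cstdio>
#include <deque>
#include <string>
#include <thread>

using namespace std::chrono_literals;

enum class ConfigUpdateState { Accepted, Declined, WaitBeforeChangingLeader };

/// Stand-in for KeeperServer::applyConfigUpdate: the first request to remove the leader
/// asks the dispatcher to wait; the retry actually yields leadership.
ConfigUpdateState applyConfigUpdate(const std::string & action, bool last_command_was_leader_change)
{
    if (action == "remove current leader")
    {
        if (!last_command_was_leader_change)
            return ConfigUpdateState::WaitBeforeChangingLeader;
        std::puts("yielding leadership and removing the server (simulated)");
        /// The real server returns Declined here and lets the new leader finish the removal;
        /// the sketch accepts so the demo loop terminates.
        return ConfigUpdateState::Accepted;
    }
    return ConfigUpdateState::Accepted;
}

int main()
{
    std::deque<std::string> cluster_update_queue{"add server 2", "remove current leader"};
    const auto sleep_before_leader_change = 100ms; /// stands in for the 8000 ms default
    bool last_command_was_leader_change = false;

    while (!cluster_update_queue.empty())
    {
        auto action = cluster_update_queue.front();
        cluster_update_queue.pop_front();

        const auto res = applyConfigUpdate(action, last_command_was_leader_change);
        if (res == ConfigUpdateState::Accepted)
        {
            std::printf("config update '%s': accepted\n", action.c_str());
            continue;
        }

        /// Declined or WaitBeforeChangingLeader: re-queue and back off before retrying.
        last_command_was_leader_change = (res == ConfigUpdateState::WaitBeforeChangingLeader);
        cluster_update_queue.push_front(action);
        std::this_thread::sleep_for(last_command_was_leader_change ? sleep_before_leader_change : 50ms);
    }
}
```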
+ void yieldLeadership() + { + return server->yieldLeadership(); + } + void recalculateStorageStats() { return server->recalculateStorageStats(); diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index fd3db6f1032..656d009e0a7 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -870,36 +870,50 @@ std::vector KeeperServer::getDeadSessions() return state_machine->getDeadSessions(); } -bool KeeperServer::applyConfigUpdate(const ClusterUpdateAction & action) +KeeperServer::ConfigUpdateState KeeperServer::applyConfigUpdate( + const ClusterUpdateAction & action, bool last_command_was_leader_change) { + using enum ConfigUpdateState; std::lock_guard _{server_write_mutex}; if (const auto * add = std::get_if(&action)) { if (raft_instance->get_srv_config(add->id) != nullptr) - return true; + return Accepted; auto resp = raft_instance->add_srv(static_cast(*add)); resp->get(); - return resp->get_accepted(); + return resp->get_accepted() ? Accepted : Declined; } else if (const auto * remove = std::get_if(&action)) { + // This corner case is the most problematic. Issue follows: if we agree on a number + // of commands but don't commit them on leader, and then issue a leadership change via + // yield/request, leader can pause writes before all commits, therefore commands will be lost + // (leadership change is not synchronized with committing in NuRaft). + // However, waiting till some commands get _committed_ instead of _agreed_ is a hard task + // regarding current library design, and this brings lots of levels of complexity + // (see https://github.com/ClickHouse/ClickHouse/pull/53481 history). So, a compromise here + // is a timeout before issuing a leadership change with an ability to change if user knows they + // have a particularly slow network. if (remove->id == raft_instance->get_leader()) { + if (!last_command_was_leader_change) + return WaitBeforeChangingLeader; + if (isLeader()) raft_instance->yield_leadership(); else raft_instance->request_leadership(); - return false; + return Declined; } if (raft_instance->get_srv_config(remove->id) == nullptr) - return true; + return Accepted; auto resp = raft_instance->remove_srv(remove->id); resp->get(); - return resp->get_accepted(); + return resp->get_accepted() ? 
Accepted : Declined; } else if (const auto * update = std::get_if(&action)) { @@ -908,10 +922,10 @@ bool KeeperServer::applyConfigUpdate(const ClusterUpdateAction & action) "Attempt to apply {} but server is not present in Raft", action); else if (ptr->get_priority() == update->priority) - return true; + return Accepted; raft_instance->set_priority(update->id, update->priority, /*broadcast on live leader*/true); - return true; + return Accepted; } UNREACHABLE(); } @@ -1087,6 +1101,12 @@ bool KeeperServer::requestLeader() return isLeader() || raft_instance->request_leadership(); } +void KeeperServer::yieldLeadership() +{ + if (isLeader()) + raft_instance->yield_leadership(); +} + void KeeperServer::recalculateStorageStats() { state_machine->recalculateStorageStats(); diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index ed58418fe5f..fde40d7d60f 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -128,7 +128,10 @@ public: int getServerID() const { return server_id; } - bool applyConfigUpdate(const ClusterUpdateAction& action); + enum class ConfigUpdateState { Accepted, Declined, WaitBeforeChangingLeader }; + ConfigUpdateState applyConfigUpdate( + const ClusterUpdateAction& action, + bool last_command_was_leader_change = false); // TODO (myrrc) these functions should be removed once "reconfig" is stabilized void applyConfigUpdateWithReconfigDisabled(const ClusterUpdateAction& action); @@ -141,6 +144,8 @@ public: bool requestLeader(); + void yieldLeadership(); + void recalculateStorageStats(); }; diff --git a/src/Core/ProtocolDefines.h b/src/Core/ProtocolDefines.h index d0adf7b78d2..0e2e5b3dc60 100644 --- a/src/Core/ProtocolDefines.h +++ b/src/Core/ProtocolDefines.h @@ -30,7 +30,7 @@ #define DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION 1 -#define DBMS_PARALLEL_REPLICAS_PROTOCOL_VERSION 2 +#define DBMS_PARALLEL_REPLICAS_PROTOCOL_VERSION 3 #define DBMS_MIN_REVISION_WITH_PARALLEL_REPLICAS 54453 #define DBMS_MERGE_TREE_PART_INFO_VERSION 1 diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 523301a8933..de0fff35389 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -75,7 +75,7 @@ namespace DB \ M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching at all.", 0) \ M(Int32, dns_cache_update_period, 15, "Internal DNS cache update period in seconds.", 0) \ - M(UInt32, dns_max_consecutive_failures, 1024, "Max DNS resolve failures of a hostname before dropping the hostname from ClickHouse DNS cache.", 0) \ + M(UInt32, dns_max_consecutive_failures, 10, "Max DNS resolve failures of a hostname before dropping the hostname from ClickHouse DNS cache.", 0) \ \ M(UInt64, max_table_size_to_drop, 50000000000lu, "If size of a table is greater than this value (in bytes) than table could not be dropped with any DROP query.", 0) \ M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \ @@ -98,7 +98,8 @@ namespace DB M(Double, total_memory_tracker_sample_probability, 0, "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless to the size of the allocation (can be changed with `memory_profiler_sample_min_allocation_size` and `memory_profiler_sample_max_allocation_size`). Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. 
You may want to set 'max_untracked_memory' to 0 for extra fine grained sampling.", 0) \ M(UInt64, total_memory_profiler_sample_min_allocation_size, 0, "Collect random allocations of size greater or equal than specified value with probability equal to `total_memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold to work as expected.", 0) \ M(UInt64, total_memory_profiler_sample_max_allocation_size, 0, "Collect random allocations of size less or equal than specified value with probability equal to `total_memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold to work as expected.", 0) \ - M(Bool, validate_tcp_client_information, false, "Validate client_information in the query packet over the native TCP protocol.", 0) + M(Bool, validate_tcp_client_information, false, "Validate client_information in the query packet over the native TCP protocol.", 0) \ + M(Bool, storage_metadata_write_full_object_key, false, "Write disk metadata files with VERSION_FULL_OBJECT_KEY format", 0) \ DECLARE_SETTINGS_TRAITS(ServerSettingsTraits, SERVER_SETTINGS) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 609ade4cdc0..5c41c0b0829 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -208,9 +208,8 @@ class IColumn; M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \ \ M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \ - M(Bool, use_mysql_types_in_show_columns, false, "Show native MySQL types in SHOW [FULL] COLUMNS", 0) \ - M(Bool, mysql_map_string_to_text_in_show_columns, false, "If enabled, String type will be mapped to TEXT in SHOW [FULL] COLUMNS, BLOB otherwise. Will only take effect if use_mysql_types_in_show_columns is enabled too", 0) \ - M(Bool, mysql_map_fixed_string_to_text_in_show_columns, false, "If enabled, FixedString type will be mapped to TEXT in SHOW [FULL] COLUMNS, BLOB otherwise. Will only take effect if use_mysql_types_in_show_columns is enabled too", 0) \ + M(Bool, mysql_map_string_to_text_in_show_columns, false, "If enabled, String type will be mapped to TEXT in SHOW [FULL] COLUMNS, BLOB otherwise.", 0) \ + M(Bool, mysql_map_fixed_string_to_text_in_show_columns, false, "If enabled, FixedString type will be mapped to TEXT in SHOW [FULL] COLUMNS, BLOB otherwise.", 0) \ \ M(UInt64, optimize_min_equality_disjunction_chain_length, 3, "The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization ", 0) \ \ @@ -288,7 +287,7 @@ class IColumn; M(Bool, http_write_exception_in_output_format, true, "Write exception in output format to produce valid output. Works with JSON and XML formats.", 0) \ M(UInt64, http_response_buffer_size, 0, "The number of bytes to buffer in the server memory before sending a HTTP response to the client or flushing to disk (when http_wait_end_of_query is enabled).", 0) \ \ - M(Bool, fsync_metadata, true, "Do fsync after changing metadata for tables and databases (.sql files). Could be disabled in case of poor latency on server with high load of DDL queries and high load of disk subsystem.", 0) \ + M(Bool, fsync_metadata, true, "Do fsync after changing metadata for tables and databases (.sql files). 
Could be disabled in case of poor latency on server with high load of DDL queries and high load of disk subsystem.", 0) \ \ M(Bool, join_use_nulls, false, "Use NULLs for non-joined rows of outer JOINs for types that can be inside Nullable. If false, use default value of corresponding columns data type.", IMPORTANT) \ \ @@ -605,6 +604,7 @@ class IColumn; M(Bool, optimize_use_implicit_projections, true, "Automatically choose implicit projections to perform SELECT query", 0) \ M(Bool, force_optimize_projection, false, "If projection optimization is enabled, SELECT queries need to use projection", 0) \ M(String, force_optimize_projection_name, "", "If it is set to a non-empty string, check that this projection is used in the query at least once.", 0) \ + M(String, preferred_optimize_projection_name, "", "If it is set to a non-empty string, ClickHouse tries to apply specified projection", 0) \ M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \ M(Bool, async_query_sending_for_remote, true, "Asynchronously create connections and send query to shards in remote query", 0) \ M(Bool, insert_null_as_default, true, "Insert DEFAULT values instead of NULL in INSERT SELECT (UNION ALL)", 0) \ @@ -837,6 +837,7 @@ class IColumn; MAKE_OBSOLETE(M, Bool, allow_experimental_bigint_types, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_window_functions, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_geo_types, true) \ + MAKE_OBSOLETE(M, Bool, allow_experimental_query_cache, true) \ \ MAKE_OBSOLETE(M, Milliseconds, async_insert_stale_timeout_ms, 0) \ MAKE_OBSOLETE(M, StreamingHandleErrorMode, handle_kafka_error_mode, StreamingHandleErrorMode::DEFAULT) \ @@ -848,6 +849,7 @@ class IColumn; MAKE_OBSOLETE(M, UInt64, merge_tree_clear_old_parts_interval_seconds, 1) \ MAKE_OBSOLETE(M, UInt64, partial_merge_join_optimizations, 0) \ MAKE_OBSOLETE(M, MaxThreads, max_alter_threads, 0) \ + MAKE_OBSOLETE(M, Bool, use_mysql_types_in_show_columns, false) \ /* moved to config.xml: see also src/Core/ServerSettings.h */ \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_buffer_flush_schedule_pool_size, 16) \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_pool_size, 16) \ @@ -884,6 +886,7 @@ class IColumn; M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \ M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ + M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \ M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". 
Braces around array can be omitted.)", 0) \ M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \ diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index f334cd9ff24..a1de6ea18a9 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -363,6 +363,9 @@ struct WhichDataType constexpr bool isNativeInt() const { return isInt8() || isInt16() || isInt32() || isInt64(); } constexpr bool isInt() const { return isNativeInt() || isInt128() || isInt256(); } + constexpr bool isNativeInteger() const { return isNativeInt() || isNativeUInt(); } + constexpr bool isInteger() const { return isInt() || isUInt(); } + constexpr bool isDecimal32() const { return idx == TypeIndex::Decimal32; } constexpr bool isDecimal64() const { return idx == TypeIndex::Decimal64; } constexpr bool isDecimal128() const { return idx == TypeIndex::Decimal128; } @@ -373,6 +376,9 @@ struct WhichDataType constexpr bool isFloat64() const { return idx == TypeIndex::Float64; } constexpr bool isFloat() const { return isFloat32() || isFloat64(); } + constexpr bool isNativeNumber() const { return isNativeInteger() || isFloat(); } + constexpr bool isNumber() const { return isInteger() || isFloat() || isDecimal(); } + constexpr bool isEnum8() const { return idx == TypeIndex::Enum8; } constexpr bool isEnum16() const { return idx == TypeIndex::Enum16; } constexpr bool isEnum() const { return isEnum8() || isEnum16(); } @@ -410,110 +416,60 @@ struct WhichDataType /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) -template -inline bool isDate(const T & data_type) { return WhichDataType(data_type).isDate(); } -template -inline bool isDate32(const T & data_type) { return WhichDataType(data_type).isDate32(); } -template -inline bool isDateOrDate32(const T & data_type) { return WhichDataType(data_type).isDateOrDate32(); } -template -inline bool isDateTime(const T & data_type) { return WhichDataType(data_type).isDateTime(); } -template -inline bool isDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTime64(); } -template -inline bool isDateTimeOrDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTimeOrDateTime64(); } -template -inline bool isDateOrDate32OrDateTimeOrDateTime64(const T & data_type) { return WhichDataType(data_type).isDateOrDate32OrDateTimeOrDateTime64(); } +template inline bool isUInt8(const T & data_type) { return WhichDataType(data_type).isUInt8(); } +template inline bool isUInt16(const T & data_type) { return WhichDataType(data_type).isUInt16(); } +template inline bool isUInt32(const T & data_type) { return WhichDataType(data_type).isUInt32(); } +template inline bool isUInt64(const T & data_type) { return WhichDataType(data_type).isUInt64(); } +template inline bool isNativeUInt(const T & data_type) { return WhichDataType(data_type).isNativeUInt(); } +template inline bool isUInt(const T & data_type) { return WhichDataType(data_type).isUInt(); } -template -inline bool isEnum(const T & data_type) { return WhichDataType(data_type).isEnum(); } -template -inline bool isDecimal(const T & data_type) { return WhichDataType(data_type).isDecimal(); } -template -inline bool isTuple(const T & data_type) { return WhichDataType(data_type).isTuple(); } -template -inline bool isArray(const T & data_type) { return WhichDataType(data_type).isArray(); } -template -inline bool isMap(const T & data_type) {return 
WhichDataType(data_type).isMap(); } -template -inline bool isInterval(const T & data_type) {return WhichDataType(data_type).isInterval(); } -template -inline bool isNothing(const T & data_type) { return WhichDataType(data_type).isNothing(); } -template -inline bool isUUID(const T & data_type) { return WhichDataType(data_type).isUUID(); } -template -inline bool isIPv4(const T & data_type) { return WhichDataType(data_type).isIPv4(); } -template -inline bool isIPv6(const T & data_type) { return WhichDataType(data_type).isIPv6(); } +template inline bool isInt8(const T & data_type) { return WhichDataType(data_type).isInt8(); } +template inline bool isInt16(const T & data_type) { return WhichDataType(data_type).isInt16(); } +template inline bool isInt32(const T & data_type) { return WhichDataType(data_type).isInt32(); } +template inline bool isInt64(const T & data_type) { return WhichDataType(data_type).isInt64(); } +template inline bool isNativeInt(const T & data_type) { return WhichDataType(data_type).isNativeInt(); } +template inline bool isInt(const T & data_type) { return WhichDataType(data_type).isInt(); } -template -inline bool isObject(const T & data_type) { return WhichDataType(data_type).isObject(); } +template inline bool isInteger(const T & data_type) { return WhichDataType(data_type).isInteger(); } +template inline bool isNativeInteger(const T & data_type) { return WhichDataType(data_type).isNativeInteger(); } -template -inline bool isUInt8(const T & data_type) { return WhichDataType(data_type).isUInt8(); } -template -inline bool isUInt16(const T & data_type) { return WhichDataType(data_type).isUInt16(); } -template -inline bool isUInt32(const T & data_type) { return WhichDataType(data_type).isUInt32(); } -template -inline bool isUInt64(const T & data_type) { return WhichDataType(data_type).isUInt64(); } -template -inline bool isNativeUnsignedInteger(const T & data_type) { return WhichDataType(data_type).isNativeUInt(); } -template -inline bool isUnsignedInteger(const T & data_type) { return WhichDataType(data_type).isUInt(); } +template inline bool isDecimal(const T & data_type) { return WhichDataType(data_type).isDecimal(); } -template -inline bool isInt8(const T & data_type) { return WhichDataType(data_type).isInt8(); } -template -inline bool isInt16(const T & data_type) { return WhichDataType(data_type).isInt16(); } -template -inline bool isInt32(const T & data_type) { return WhichDataType(data_type).isInt32(); } -template -inline bool isInt64(const T & data_type) { return WhichDataType(data_type).isInt64(); } -template -inline bool isInt(const T & data_type) { return WhichDataType(data_type).isInt(); } +template inline bool isFloat(const T & data_type) { return WhichDataType(data_type).isFloat(); } -template -inline bool isInteger(const T & data_type) -{ - WhichDataType which(data_type); - return which.isInt() || which.isUInt(); -} +template inline bool isNativeNumber(const T & data_type) { return WhichDataType(data_type).isNativeNumber(); } +template inline bool isNumber(const T & data_type) { return WhichDataType(data_type).isNumber(); } -template -inline bool isFloat(const T & data_type) -{ - WhichDataType which(data_type); - return which.isFloat(); -} +template inline bool isEnum(const T & data_type) { return WhichDataType(data_type).isEnum(); } -template -inline bool isNativeInteger(const T & data_type) -{ - WhichDataType which(data_type); - return which.isNativeInt() || which.isNativeUInt(); -} +template inline bool isDate(const T & data_type) { return 
WhichDataType(data_type).isDate(); } +template inline bool isDate32(const T & data_type) { return WhichDataType(data_type).isDate32(); } +template inline bool isDateOrDate32(const T & data_type) { return WhichDataType(data_type).isDateOrDate32(); } +template inline bool isDateTime(const T & data_type) { return WhichDataType(data_type).isDateTime(); } +template inline bool isDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTime64(); } +template inline bool isDateTimeOrDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTimeOrDateTime64(); } +template inline bool isDateOrDate32OrDateTimeOrDateTime64(const T & data_type) { return WhichDataType(data_type).isDateOrDate32OrDateTimeOrDateTime64(); } +template inline bool isString(const T & data_type) { return WhichDataType(data_type).isString(); } +template inline bool isFixedString(const T & data_type) { return WhichDataType(data_type).isFixedString(); } +template inline bool isStringOrFixedString(const T & data_type) { return WhichDataType(data_type).isStringOrFixedString(); } -template -inline bool isNativeNumber(const T & data_type) -{ - WhichDataType which(data_type); - return which.isNativeInt() || which.isNativeUInt() || which.isFloat(); -} +template inline bool isUUID(const T & data_type) { return WhichDataType(data_type).isUUID(); } +template inline bool isIPv4(const T & data_type) { return WhichDataType(data_type).isIPv4(); } +template inline bool isIPv6(const T & data_type) { return WhichDataType(data_type).isIPv6(); } +template inline bool isArray(const T & data_type) { return WhichDataType(data_type).isArray(); } +template inline bool isTuple(const T & data_type) { return WhichDataType(data_type).isTuple(); } +template inline bool isMap(const T & data_type) {return WhichDataType(data_type).isMap(); } +template inline bool isInterval(const T & data_type) {return WhichDataType(data_type).isInterval(); } +template inline bool isObject(const T & data_type) { return WhichDataType(data_type).isObject(); } -template -inline bool isNumber(const T & data_type) -{ - WhichDataType which(data_type); - return which.isInt() || which.isUInt() || which.isFloat() || which.isDecimal(); -} +template inline bool isNothing(const T & data_type) { return WhichDataType(data_type).isNothing(); } template inline bool isColumnedAsNumber(const T & data_type) { WhichDataType which(data_type); - return which.isInt() || which.isUInt() || which.isFloat() || which.isDateOrDate32() || which.isDateTime() || which.isDateTime64() || which.isUUID() || which.isIPv4() || which.isIPv6(); + return which.isInteger() || which.isFloat() || which.isDateOrDate32OrDateTimeOrDateTime64() || which.isUUID() || which.isIPv4() || which.isIPv6(); } template @@ -531,24 +487,6 @@ inline bool isColumnedAsDecimalT(const DataType & data_type) return (which.isDecimal() || which.isDateTime64()) && which.idx == TypeToTypeIndex; } -template -inline bool isString(const T & data_type) -{ - return WhichDataType(data_type).isString(); -} - -template -inline bool isFixedString(const T & data_type) -{ - return WhichDataType(data_type).isFixedString(); -} - -template -inline bool isStringOrFixedString(const T & data_type) -{ - return WhichDataType(data_type).isStringOrFixedString(); -} - template inline bool isNotCreatable(const T & data_type) { @@ -567,12 +505,6 @@ inline bool isBool(const DataTypePtr & data_type) return data_type->getName() == "Bool"; } -inline bool isAggregateFunction(const DataTypePtr & data_type) -{ - WhichDataType which(data_type); - 
return which.isAggregateFunction(); -} - inline bool isNullableOrLowCardinalityNullable(const DataTypePtr & data_type) { return data_type->isNullable() || data_type->isLowCardinalityNullable(); diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 01afbdcaa57..96c084a261c 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -73,7 +73,7 @@ std::pair createTableFromAST( auto table_function = factory.get(table_function_ast, context); ColumnsDescription columns; if (ast_create_query.columns_list && ast_create_query.columns_list->columns) - columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); + columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true, false); StoragePtr storage = table_function->execute(table_function_ast, context, ast_create_query.getTable(), std::move(columns)); storage->renameInMemory(ast_create_query); return {ast_create_query.getTable(), storage}; @@ -99,7 +99,7 @@ std::pair createTableFromAST( } else { - columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); + columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true, false); constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints); } } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index e21e65ec340..a643eafdd14 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -262,7 +262,11 @@ ClusterPtr DatabaseReplicated::getClusterImpl() const shards.back().push_back(DatabaseReplicaInfo{std::move(hostname), std::move(shard), std::move(replica)}); } - UInt16 default_port = getContext()->getTCPPort(); + UInt16 default_port; + if (cluster_auth_info.cluster_secure_connection) + default_port = getContext()->getTCPPortSecure().value_or(DBMS_DEFAULT_SECURE_PORT); + else + default_port = getContext()->getTCPPort(); bool treat_local_as_remote = false; bool treat_local_port_as_remote = getContext()->getApplicationType() == Context::ApplicationType::LOCAL; @@ -722,7 +726,7 @@ void DatabaseReplicated::checkQueryValid(const ASTPtr & query, ContextPtr query_ } } -BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context, bool internal) +BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context, QueryFlags flags) { if (query_context->getCurrentTransaction() && query_context->getSettingsRef().throw_on_unsupported_query_inside_transaction) @@ -731,7 +735,7 @@ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, Contex if (is_readonly) throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); - if (!internal && (query_context->getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY)) + if (!flags.internal && (query_context->getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY)) throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. 
ON CLUSTER is not allowed for Replicated database."); checkQueryValid(query, query_context); @@ -742,6 +746,7 @@ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, Contex entry.initiator = ddl_worker->getCommonHostID(); entry.setSettingsIfRequired(query_context); entry.tracing_context = OpenTelemetry::CurrentContext(); + entry.is_backup_restore = flags.distributed_backup_restore; String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry, query_context); Strings hosts_to_wait; @@ -919,14 +924,14 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep String query = fmt::format("CREATE DATABASE IF NOT EXISTS {} ENGINE=Ordinary", backQuoteIfNeed(to_db_name)); auto query_context = Context::createCopy(getContext()); query_context->setSetting("allow_deprecated_database_ordinary", 1); - executeQuery(query, query_context, true); + executeQuery(query, query_context, QueryFlags{ .internal = true }); /// But we want to avoid discarding UUID of ReplicatedMergeTree tables, because it will not work /// if zookeeper_path contains {uuid} macro. Replicated database do not recreate replicated tables on recovery, /// so it's ok to save UUID of replicated table. query = fmt::format("CREATE DATABASE IF NOT EXISTS {} ENGINE=Atomic", backQuoteIfNeed(to_db_name_replicated)); query_context = Context::createCopy(getContext()); - executeQuery(query, query_context, true); + executeQuery(query, query_context, QueryFlags{ .internal = true }); } size_t moved_tables = 0; diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 005180624ed..1387ba1cb96 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -46,7 +46,7 @@ public: /// Try to execute DLL query on current host as initial query. If query is succeed, /// then it will be executed on all replicas. - BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context, bool internal) override; + BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context, QueryFlags flags) override; bool canExecuteReplicatedMetadataAlter() const override; diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 3ffa08f8ec7..9b85e7194d3 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -372,6 +372,7 @@ void DatabaseWithOwnTablesBase::createTableRestoredFromBackup(const ASTPtr & cre /// Creates a table by executing a "CREATE TABLE" query. 
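The hunks above migrate executeQuery() and tryEnqueueReplicatedDDL() from a positional `bool internal` argument to a `QueryFlags` aggregate, so call sites name the flag they set and new flags such as `distributed_backup_restore` can be added without touching every caller. A minimal self-contained sketch of that pattern; the struct name and the two fields come from this diff, while the executeQuery signature below is a simplified stand-in rather than the real one:

#include <iostream>
#include <string>

struct QueryFlags
{
    bool internal = false;                    /// query issued by the server itself, not by a client
    bool distributed_backup_restore = false;  /// query is part of a distributed BACKUP/RESTORE
};

/// Simplified stand-in for the real executeQuery(); only the flag handling is illustrated.
static void executeQuery(const std::string & query, QueryFlags flags = {})
{
    std::cout << query << " internal=" << flags.internal
              << " backup_restore=" << flags.distributed_backup_restore << '\n';
}

int main()
{
    /// Designated initializers keep call sites readable, unlike a bare positional `true`.
    executeQuery("CREATE DATABASE IF NOT EXISTS recovery_db", QueryFlags{ .internal = true });
}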
InterpreterCreateQuery interpreter{create_table_query, local_context}; interpreter.setInternal(true); + interpreter.setIsRestoreFromBackup(true); interpreter.execute(); } diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 01d940b0429..e886f1adae3 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -345,7 +346,7 @@ public: virtual bool shouldReplicateQuery(const ContextPtr & /*query_context*/, const ASTPtr & /*query_ptr*/) const { return false; } - virtual BlockIO tryEnqueueReplicatedDDL(const ASTPtr & /*query*/, ContextPtr /*query_context*/, [[maybe_unused]] bool internal = false) /// NOLINT + virtual BlockIO tryEnqueueReplicatedDDL(const ASTPtr & /*query*/, ContextPtr /*query_context*/, [[maybe_unused]] QueryFlags flags = {}) /// NOLINT { throw Exception(ErrorCodes::LOGICAL_ERROR, "Database engine {} does not have replicated DDL queue", getEngineName()); } diff --git a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp index 3d10e66e964..14cd89e1ff6 100644 --- a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp +++ b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp @@ -26,12 +26,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include namespace DB { @@ -75,7 +77,7 @@ static BlockIO tryToExecuteQuery(const String & query_to_execute, ContextMutable if (!database.empty()) query_context->setCurrentDatabase(database); - return executeQuery("/*" + comment + "*/ " + query_to_execute, query_context, true).second; + return executeQuery("/*" + comment + "*/ " + query_to_execute, query_context, QueryFlags{ .internal = true }).second; } catch (...) 
{ @@ -428,9 +430,8 @@ static inline void dumpDataForTables( static inline UInt32 randomNumber() { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( + pcg64_fast rng{randomSeed()}; + std::uniform_int_distribution dist6( std::numeric_limits::min(), std::numeric_limits::max()); return static_cast(dist6(rng)); } diff --git a/src/Dictionaries/ClickHouseDictionarySource.cpp b/src/Dictionaries/ClickHouseDictionarySource.cpp index 92fae2bc495..37f94062ef9 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.cpp +++ b/src/Dictionaries/ClickHouseDictionarySource.cpp @@ -168,7 +168,7 @@ QueryPipeline ClickHouseDictionarySource::createStreamForQuery(const String & qu if (configuration.is_local) { - pipeline = executeQuery(query, context_copy, true).second.pipeline; + pipeline = executeQuery(query, context_copy, QueryFlags{ .internal = true }).second.pipeline; pipeline.convertStructureTo(empty_sample_block.getColumnsWithTypeAndName()); } else @@ -190,7 +190,7 @@ std::string ClickHouseDictionarySource::doInvalidateQuery(const std::string & re if (configuration.is_local) { - return readInvalidateQuery(executeQuery(request, context_copy, true).second.pipeline); + return readInvalidateQuery(executeQuery(request, context_copy, QueryFlags{ .internal = true }).second.pipeline); } else { diff --git a/src/Dictionaries/ExternalQueryBuilder.cpp b/src/Dictionaries/ExternalQueryBuilder.cpp index e21b0842e11..792c4e3e907 100644 --- a/src/Dictionaries/ExternalQueryBuilder.cpp +++ b/src/Dictionaries/ExternalQueryBuilder.cpp @@ -396,18 +396,20 @@ std::string ExternalQueryBuilder::composeLoadKeysQuery( } else { - writeString(query, out); - auto condition_position = query.find(CONDITION_PLACEHOLDER_TO_REPLACE_VALUE); if (condition_position == std::string::npos) { - writeString(" WHERE ", out); + writeString("SELECT * FROM (", out); + writeString(query, out); + writeString(") WHERE ", out); composeKeysCondition(key_columns, requested_rows, method, partition_key_prefix, out); writeString(";", out); return out.str(); } + writeString(query, out); + WriteBufferFromOwnString condition_value_buffer; composeKeysCondition(key_columns, requested_rows, method, partition_key_prefix, condition_value_buffer); const auto & condition_value = condition_value_buffer.str(); diff --git a/src/Dictionaries/HashedArrayDictionary.cpp b/src/Dictionaries/HashedArrayDictionary.cpp index 68c347af9df..21016025d96 100644 --- a/src/Dictionaries/HashedArrayDictionary.cpp +++ b/src/Dictionaries/HashedArrayDictionary.cpp @@ -900,8 +900,6 @@ void registerDictionaryArrayHashed(DictionaryFactory & factory) return std::make_unique>(dict_id, dict_struct, std::move(source_ptr), configuration); }; - using namespace std::placeholders; - factory.registerLayout("hashed_array", [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e, ContextPtr global_context, bool /*created_from_ddl*/) { diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index 562857cd790..0556e2bb266 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -1246,8 +1246,6 @@ void registerDictionaryHashed(DictionaryFactory & factory) } }; - using namespace std::placeholders; - factory.registerLayout("hashed", [=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e, ContextPtr global_context, bool /*created_from_ddl*/){ return create_layout(a, b, c, d, std::move(e), global_context, DictionaryKeyType::Simple, /* sparse = */ false); }, false); 
factory.registerLayout("sparse_hashed", diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index 624a57d65b5..9be9fa1d0d4 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -227,9 +227,7 @@ private: struct KeyAttribute final { RangeStorageTypeContainer container; - RangeStorageTypeContainer invalid_intervals_container; - }; void createAttributes(); diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index bfb418e1c5e..6911fd86db2 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -302,12 +302,14 @@ public: struct LocalPathWithObjectStoragePaths { std::string local_path; - std::string common_prefix_for_objects; StoredObjects objects; LocalPathWithObjectStoragePaths( - const std::string & local_path_, const std::string & common_prefix_for_objects_, StoredObjects && objects_) - : local_path(local_path_), common_prefix_for_objects(common_prefix_for_objects_), objects(std::move(objects_)) {} + const std::string & local_path_, + StoredObjects && objects_) + : local_path(local_path_) + , objects(std::move(objects_)) + {} }; virtual void getRemotePathsRecursive(const String &, std::vector &) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 73be834c1bb..fcb82daca95 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -102,9 +102,9 @@ AzureObjectStorage::AzureObjectStorage( data_source_description.is_encrypted = false; } -std::string AzureObjectStorage::generateBlobNameForPath(const std::string & /* path */) +ObjectStorageKey AzureObjectStorage::generateObjectKeyForPath(const std::string & /* path */) const { - return getRandomASCIIString(32); + return ObjectStorageKey::createAsRelative(getRandomASCIIString(32)); } bool AzureObjectStorage::exists(const StoredObject & object) const @@ -320,18 +320,7 @@ void AzureObjectStorage::removeObjectsIfExist(const StoredObjects & objects) auto client_ptr = client.get(); for (const auto & object : objects) { - try - { - auto delete_info = client_ptr->DeleteBlob(object.remote_path); - } - catch (const Azure::Storage::StorageException & e) - { - /// If object doesn't exist... 
- if (e.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) - return; - tryLogCurrentException(__PRETTY_FUNCTION__); - throw; - } + removeObjectIfExists(object); } } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 5436860818c..8e3d50418d3 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -121,7 +121,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - std::string generateBlobNameForPath(const std::string & path) override; + ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override; bool isRemote() const override { return true; } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp index a09befe84a8..7ba9d21db62 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp @@ -31,11 +31,12 @@ void registerDiskAzureBlobStorage(DiskFactory & factory, bool global_skip_access getAzureBlobContainerClient(config, config_prefix), getAzureBlobStorageSettings(config, config_prefix, context)); - auto metadata_storage = std::make_shared(metadata_disk, ""); + String key_prefix; + auto metadata_storage = std::make_shared(metadata_disk, key_prefix); std::shared_ptr azure_blob_storage_disk = std::make_shared( name, - /* no namespaces */"", + /* no namespaces */ key_prefix, "DiskAzureBlobStorage", std::move(metadata_storage), std::move(azure_object_storage), diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp index d94c26f27e8..e459aae190c 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp @@ -42,9 +42,9 @@ FileCache::Key CachedObjectStorage::getCacheKey(const std::string & path) const return cache->createKeyForPath(path); } -std::string CachedObjectStorage::generateBlobNameForPath(const std::string & path) +ObjectStorageKey CachedObjectStorage::generateObjectKeyForPath(const std::string & path) const { - return object_storage->generateBlobNameForPath(path); + return object_storage->generateObjectKeyForPath(path); } ReadSettings CachedObjectStorage::patchSettings(const ReadSettings & read_settings) const diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 925abbc6932..20b3a42540b 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -92,7 +92,7 @@ public: const std::string & getCacheName() const override { return cache_config_name; } - std::string generateBlobNameForPath(const std::string & path) override; + ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override; bool isRemote() const override { return object_storage->isRemote(); } diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index b4b777bd494..c1f053be7c6 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -48,14 +48,14 @@ DiskTransactionPtr DiskObjectStorage::createObjectStorageTransaction() DiskObjectStorage::DiskObjectStorage( 
const String & name_, - const String & object_storage_root_path_, + const String & object_key_prefix_, const String & log_name, MetadataStoragePtr metadata_storage_, ObjectStoragePtr object_storage_, const Poco::Util::AbstractConfiguration & config, const String & config_prefix) : IDisk(name_, config, config_prefix) - , object_storage_root_path(object_storage_root_path_) + , object_key_prefix(object_key_prefix_) , log (&Poco::Logger::get("DiskObjectStorage(" + log_name + ")")) , metadata_storage(std::move(metadata_storage_)) , object_storage(std::move(object_storage_)) @@ -80,7 +80,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std:: { try { - paths_map.emplace_back(local_path, metadata_storage->getObjectStorageRootPath(), getStorageObjects(local_path)); + paths_map.emplace_back(local_path, getStorageObjects(local_path)); } catch (const Exception & e) { @@ -243,9 +243,9 @@ String DiskObjectStorage::getUniqueId(const String & path) const bool DiskObjectStorage::checkUniqueId(const String & id) const { - if (!id.starts_with(object_storage_root_path)) + if (!id.starts_with(object_key_prefix)) { - LOG_DEBUG(log, "Blob with id {} doesn't start with blob storage prefix {}, Stack {}", id, object_storage_root_path, StackTrace().toString()); + LOG_DEBUG(log, "Blob with id {} doesn't start with blob storage prefix {}, Stack {}", id, object_key_prefix, StackTrace().toString()); return false; } @@ -470,7 +470,7 @@ DiskObjectStoragePtr DiskObjectStorage::createDiskObjectStorage() const auto config_prefix = "storage_configuration.disks." + name; return std::make_shared( getName(), - object_storage_root_path, + object_key_prefix, getName(), metadata_storage, object_storage, @@ -586,7 +586,7 @@ void DiskObjectStorage::restoreMetadataIfNeeded( { metadata_helper->restore(config, config_prefix, context); - auto current_schema_version = metadata_helper->readSchemaVersion(object_storage.get(), object_storage_root_path); + auto current_schema_version = metadata_helper->readSchemaVersion(object_storage.get(), object_key_prefix); if (current_schema_version < DiskObjectStorageRemoteMetadataRestoreHelper::RESTORABLE_SCHEMA_VERSION) metadata_helper->migrateToRestorableSchema(); diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index ccd7e807513..66d1b02aea7 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -37,7 +37,7 @@ friend class DiskObjectStorageRemoteMetadataRestoreHelper; public: DiskObjectStorage( const String & name_, - const String & object_storage_root_path_, + const String & object_key_prefix_, const String & log_name, MetadataStoragePtr metadata_storage_, ObjectStoragePtr object_storage_, @@ -224,7 +224,7 @@ private: String getReadResourceName() const; String getWriteResourceName() const; - const String object_storage_root_path; + const String object_key_prefix; Poco::Logger * log; MetadataStoragePtr metadata_storage; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index dfb84ab386a..3271a190193 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include namespace DB { @@ -17,44 +19,57 @@ namespace ErrorCodes void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) { - UInt32 version; readIntText(version, buf); - if (version < 
VERSION_ABSOLUTE_PATHS || version > VERSION_INLINE_DATA) + if (version < VERSION_ABSOLUTE_PATHS || version > VERSION_FULL_OBJECT_KEY) throw Exception( ErrorCodes::UNKNOWN_FORMAT, "Unknown metadata file version. Path: {}. Version: {}. Maximum expected version: {}", - common_metadata_path + metadata_file_path, toString(version), toString(VERSION_READ_ONLY_FLAG)); + metadata_file_path, toString(version), toString(VERSION_FULL_OBJECT_KEY)); assertChar('\n', buf); - UInt32 storage_objects_count; - readIntText(storage_objects_count, buf); + UInt32 keys_count; + readIntText(keys_count, buf); assertChar('\t', buf); + keys_with_meta.resize(keys_count); + readIntText(total_size, buf); assertChar('\n', buf); - storage_objects.resize(storage_objects_count); - for (size_t i = 0; i < storage_objects_count; ++i) + for (UInt32 i = 0; i < keys_count; ++i) { - String object_relative_path; - size_t object_size; + UInt64 object_size; readIntText(object_size, buf); assertChar('\t', buf); - readEscapedString(object_relative_path, buf); - if (version == VERSION_ABSOLUTE_PATHS) - { - if (!object_relative_path.starts_with(object_storage_root_path)) - throw Exception(ErrorCodes::UNKNOWN_FORMAT, - "Path in metadata does not correspond to root path. Path: {}, root path: {}, disk path: {}", - object_relative_path, object_storage_root_path, common_metadata_path); - object_relative_path = object_relative_path.substr(object_storage_root_path.size()); - } + keys_with_meta[i].metadata.size_bytes = object_size; + + String key_value; + readEscapedString(key_value, buf); assertChar('\n', buf); - storage_objects[i].relative_path = object_relative_path; - storage_objects[i].metadata.size_bytes = object_size; + if (version == VERSION_ABSOLUTE_PATHS) + { + if (!key_value.starts_with(compatible_key_prefix)) + throw Exception( + ErrorCodes::UNKNOWN_FORMAT, + "Path in metadata does not correspond to root path. 
Path: {}, root path: {}, disk path: {}", + key_value, + compatible_key_prefix, + metadata_file_path); + + keys_with_meta[i].key = ObjectStorageKey::createAsRelative( + compatible_key_prefix, key_value.substr(compatible_key_prefix.size())); + } + else if (version < VERSION_FULL_OBJECT_KEY) + { + keys_with_meta[i].key = ObjectStorageKey::createAsRelative(compatible_key_prefix, key_value); + } + else if (version >= VERSION_FULL_OBJECT_KEY) + { + keys_with_meta[i].key = ObjectStorageKey::createAsAbsolute(key_value); + } } readIntText(ref_count, buf); @@ -73,7 +88,7 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) } } -void DiskObjectStorageMetadata::deserializeFromString(const std::string & data) +void DiskObjectStorageMetadata::deserializeFromString(const String & data) { ReadBufferFromString buf(data); deserialize(buf); @@ -81,21 +96,58 @@ void DiskObjectStorageMetadata::deserializeFromString(const std::string & data) void DiskObjectStorageMetadata::serialize(WriteBuffer & buf, bool sync) const { - writeIntText(VERSION_INLINE_DATA, buf); + /// These are the changes for backward compatibility + /// No new file should be written as VERSION_FULL_OBJECT_KEY until storage_metadata_write_full_object_key feature is enabled + /// However, in case of rollback, once file had been written as VERSION_FULL_OBJECT_KEY + /// it has to be always rewritten as VERSION_FULL_OBJECT_KEY + bool storage_metadata_write_full_object_key = getWriteFullObjectKeySetting(); + + if (version == VERSION_FULL_OBJECT_KEY && !storage_metadata_write_full_object_key) + { + Poco::Logger * logger = &Poco::Logger::get("DiskObjectStorageMetadata"); + LOG_WARNING( + logger, + "Metadata file {} is written with VERSION_FULL_OBJECT_KEY version" + "However storage_metadata_write_full_object_key is off.", + metadata_file_path); + } + + UInt32 write_version = version; + if (storage_metadata_write_full_object_key) + write_version = VERSION_FULL_OBJECT_KEY; + + if (!inline_data.empty() && write_version < VERSION_INLINE_DATA) + write_version = VERSION_INLINE_DATA; + + chassert(write_version >= VERSION_ABSOLUTE_PATHS && write_version <= VERSION_FULL_OBJECT_KEY); + writeIntText(write_version, buf); writeChar('\n', buf); - writeIntText(storage_objects.size(), buf); + writeIntText(keys_with_meta.size(), buf); writeChar('\t', buf); writeIntText(total_size, buf); writeChar('\n', buf); - for (const auto & [object_relative_path, object_metadata] : storage_objects) + for (const auto & [object_key, object_meta] : keys_with_meta) { - writeIntText(object_metadata.size_bytes, buf); + writeIntText(object_meta.size_bytes, buf); writeChar('\t', buf); - writeEscapedString(object_relative_path, buf); - writeChar('\n', buf); + + if (write_version == VERSION_FULL_OBJECT_KEY) + { + /// if the metadata file has VERSION_FULL_OBJECT_KEY version + /// all keys inside are written as absolute paths + writeEscapedString(object_key.serialize(), buf); + writeChar('\n', buf); + } + else + { + /// otherwise keys are written as relative paths + /// therefore keys have to have suffix and prefix + writeEscapedString(object_key.getSuffix(), buf); + writeChar('\n', buf); + } } writeIntText(ref_count, buf); @@ -104,20 +156,18 @@ void DiskObjectStorageMetadata::serialize(WriteBuffer & buf, bool sync) const writeBoolText(read_only, buf); writeChar('\n', buf); - /// Metadata version describes the format of the file - /// It determines the possibility of writing and reading a particular set of fields from the file, no matter the fields' values. 
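The serialize() hunk above picks the on-disk version at write time: it starts from the version already tracked for the file, switches to VERSION_FULL_OBJECT_KEY only when the storage_metadata_write_full_object_key server setting is enabled, and bumps to at least VERSION_INLINE_DATA when inline data is present; a file that was already written as VERSION_FULL_OBJECT_KEY keeps that version even if the setting is later turned off (the code only logs a warning). A condensed sketch of that decision, with the constants copied from the header below and the inputs passed explicitly for illustration:

#include <algorithm>
#include <cstdint>
#include <iostream>

static constexpr uint32_t VERSION_READ_ONLY_FLAG = 3;
static constexpr uint32_t VERSION_INLINE_DATA = 4;
static constexpr uint32_t VERSION_FULL_OBJECT_KEY = 5;

/// Mirrors the version-selection logic in DiskObjectStorageMetadata::serialize().
static uint32_t chooseWriteVersion(uint32_t current_version, bool write_full_object_key, bool has_inline_data)
{
    uint32_t write_version = current_version;
    if (write_full_object_key)
        write_version = VERSION_FULL_OBJECT_KEY;
    if (has_inline_data)
        write_version = std::max(write_version, VERSION_INLINE_DATA);
    return write_version;
}

int main()
{
    std::cout << chooseWriteVersion(VERSION_READ_ONLY_FLAG, false, false) << '\n';  // 3
    std::cout << chooseWriteVersion(VERSION_READ_ONLY_FLAG, false, true) << '\n';   // 4
    std::cout << chooseWriteVersion(VERSION_READ_ONLY_FLAG, true, false) << '\n';   // 5
}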
- /// It should not be dependent on field values. - /// We always write inline_data in the file when we declare VERSION_INLINE_DATA as a file version, - /// unless it is impossible to introduce the next version of the format. - writeEscapedString(inline_data, buf); - writeChar('\n', buf); + if (write_version >= VERSION_INLINE_DATA) + { + writeEscapedString(inline_data, buf); + writeChar('\n', buf); + } buf.finalize(); if (sync) buf.sync(); } -std::string DiskObjectStorageMetadata::serializeToString() const +String DiskObjectStorageMetadata::serializeToString() const { WriteBufferFromOwnString result; serialize(result, false); @@ -126,20 +176,44 @@ std::string DiskObjectStorageMetadata::serializeToString() const /// Load metadata by path or create empty if `create` flag is set. DiskObjectStorageMetadata::DiskObjectStorageMetadata( - const std::string & common_metadata_path_, - const String & object_storage_root_path_, - const String & metadata_file_path_) - : common_metadata_path(common_metadata_path_) - , object_storage_root_path(object_storage_root_path_) - , metadata_file_path(metadata_file_path_) + String compatible_key_prefix_, + String metadata_file_path_) + : compatible_key_prefix(std::move(compatible_key_prefix_)) + , metadata_file_path(std::move(metadata_file_path_)) { } -void DiskObjectStorageMetadata::addObject(const String & path, size_t size) +void DiskObjectStorageMetadata::addObject(ObjectStorageKey key, size_t size) { + if (!key.hasPrefix()) + { + version = VERSION_FULL_OBJECT_KEY; + + bool storage_metadata_write_full_object_key = getWriteFullObjectKeySetting(); + if (!storage_metadata_write_full_object_key) + { + Poco::Logger * logger = &Poco::Logger::get("DiskObjectStorageMetadata"); + LOG_WARNING( + logger, + "Metadata file {} has at least one key {} without fixed common key prefix." + "That forces using VERSION_FULL_OBJECT_KEY version for that metadata file." + "However storage_metadata_write_full_object_key is off.", + metadata_file_path, + key.serialize()); + } + } + total_size += size; - storage_objects.emplace_back(path, ObjectMetadata{size, {}, {}}); + keys_with_meta.emplace_back(std::move(key), ObjectMetadata{size, {}, {}}); } +bool DiskObjectStorageMetadata::getWriteFullObjectKeySetting() +{ +#ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD + return Context::getGlobalContextInstance()->getServerSettings().storage_metadata_write_full_object_key; +#else + return false; +#endif +} } diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h index 1abb829c12a..729d93af10d 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h @@ -13,29 +13,30 @@ struct DiskObjectStorageMetadata { private: /// Metadata file version. - static constexpr uint32_t VERSION_ABSOLUTE_PATHS = 1; - static constexpr uint32_t VERSION_RELATIVE_PATHS = 2; - static constexpr uint32_t VERSION_READ_ONLY_FLAG = 3; - static constexpr uint32_t VERSION_INLINE_DATA = 4; + static constexpr UInt32 VERSION_ABSOLUTE_PATHS = 1; + static constexpr UInt32 VERSION_RELATIVE_PATHS = 2; + static constexpr UInt32 VERSION_READ_ONLY_FLAG = 3; + static constexpr UInt32 VERSION_INLINE_DATA = 4; + static constexpr UInt32 VERSION_FULL_OBJECT_KEY = 5; /// only for reading data - const std::string & common_metadata_path; + UInt32 version = VERSION_READ_ONLY_FLAG; - /// Relative paths of blobs. 
- RelativePathsWithMetadata storage_objects; + /// Absolute paths of blobs + ObjectKeysWithMetadata keys_with_meta; - const std::string object_storage_root_path; + const std::string compatible_key_prefix; /// Relative path to metadata file on local FS. const std::string metadata_file_path; /// Total size of all remote FS (S3, HDFS) objects. - size_t total_size = 0; + UInt64 total_size = 0; /// Number of references (hardlinks) to this metadata file. /// /// FIXME: Why we are tracking it explicitly, without /// info from filesystem???? - uint32_t ref_count = 0; + UInt32 ref_count = 0; /// Flag indicates that file is read only. bool read_only = false; @@ -46,11 +47,11 @@ private: public: DiskObjectStorageMetadata( - const std::string & common_metadata_path_, - const std::string & object_storage_root_path_, - const std::string & metadata_file_path_); + String compatible_key_prefix_, + String metadata_file_path_); + + void addObject(ObjectStorageKey key, size_t size); - void addObject(const std::string & path, size_t size); void deserialize(ReadBuffer & buf); void deserializeFromString(const std::string & data); @@ -58,14 +59,9 @@ public: void serialize(WriteBuffer & buf, bool sync) const; std::string serializeToString() const; - std::string getBlobsCommonPrefix() const + const ObjectKeysWithMetadata & getKeysWithMeta() const { - return object_storage_root_path; - } - - RelativePathsWithMetadata getBlobsRelativePaths() const - { - return storage_objects; + return keys_with_meta; } bool isReadOnly() const @@ -73,12 +69,12 @@ public: return read_only; } - uint32_t getRefCount() const + UInt32 getRefCount() const { return ref_count; } - uint64_t getTotalSizeBytes() const + UInt64 getTotalSizeBytes() const { return total_size; } @@ -112,6 +108,8 @@ public: { return inline_data; } + + static bool getWriteFullObjectKeySetting(); }; using DiskObjectStorageMetadataPtr = std::unique_ptr; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp index 0b2d95fff70..33b98cd328c 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp @@ -34,7 +34,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::createFileOperationObject( const String & operation_name, UInt64 revision, const ObjectAttributes & metadata) const { const String relative_path = "operations/r" + revisionToString(revision) + operation_log_suffix + "-" + operation_name; - StoredObject object(fs::path(disk->object_storage_root_path) / relative_path); + StoredObject object(fs::path(disk->object_key_prefix) / relative_path); auto buf = disk->object_storage->writeObject(object, WriteMode::Rewrite, metadata); buf->write('0'); buf->finalize(); @@ -52,8 +52,8 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::findLastRevision() LOG_TRACE(disk->log, "Check object exists with revision prefix {}", revision_prefix); const auto & object_storage = disk->object_storage; - StoredObject revision_object{disk->object_storage_root_path + "r" + revision_prefix}; - StoredObject revision_operation_object{disk->object_storage_root_path + "operations/r" + revision_prefix}; + StoredObject revision_object{disk->object_key_prefix + "r" + revision_prefix}; + StoredObject revision_operation_object{disk->object_key_prefix + "operations/r" + revision_prefix}; /// Check file or operation with such revision prefix exists. 
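Put together, deserialize() and serialize() above define a small line-oriented file: a version number, then "<key count><TAB><total size>", then one "<size><TAB><key>" line per blob (full serialized keys at VERSION_FULL_OBJECT_KEY, bare suffixes otherwise), then the reference count, the read-only flag, and, from VERSION_INLINE_DATA on, the inline data. A hypothetical VERSION_FULL_OBJECT_KEY file for two blobs might look like this (tabs shown as <TAB>, all values invented for illustration):

5
2<TAB>1536
1024<TAB>disks/s3/abc/defghijklmnopqrstuvwxyz01234
512<TAB>disks/s3/xyz/9876543210zyxwvutsrqponmlkjih
1
0
<empty inline-data line>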
if (object_storage->exists(revision_object) || object_storage->exists(revision_operation_object)) @@ -80,7 +80,7 @@ int DiskObjectStorageRemoteMetadataRestoreHelper::readSchemaVersion(IObjectStora void DiskObjectStorageRemoteMetadataRestoreHelper::saveSchemaVersion(const int & version) const { - StoredObject object{fs::path(disk->object_storage_root_path) / SCHEMA_VERSION_OBJECT}; + StoredObject object{fs::path(disk->object_key_prefix) / SCHEMA_VERSION_OBJECT}; auto buf = disk->object_storage->writeObject(object, WriteMode::Rewrite, /* attributes= */ {}, /* buf_size= */ DBMS_DEFAULT_BUFFER_SIZE, write_settings); writeIntText(version, *buf); @@ -187,7 +187,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restore(const Poco::Util::Abs try { RestoreInformation information; - information.source_path = disk->object_storage_root_path; + information.source_path = disk->object_key_prefix; information.source_namespace = disk->object_storage->getObjectsNamespace(); readRestoreInformation(information); @@ -201,11 +201,11 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restore(const Poco::Util::Abs { /// In this case we need to additionally cleanup S3 from objects with later revision. /// Will be simply just restore to different path. - if (information.source_path == disk->object_storage_root_path && information.revision != LATEST_REVISION) + if (information.source_path == disk->object_key_prefix && information.revision != LATEST_REVISION) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Restoring to the same bucket and path is allowed if revision is latest (0)"); /// This case complicates S3 cleanup in case of unsuccessful restore. - if (information.source_path != disk->object_storage_root_path && disk->object_storage_root_path.starts_with(information.source_path)) + if (information.source_path != disk->object_key_prefix && disk->object_key_prefix.starts_with(information.source_path)) throw Exception( ErrorCodes::BAD_ARGUMENTS, "Restoring to the same bucket is allowed only if source path is not a sub-path of configured path in S3 disk"); @@ -224,7 +224,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restore(const Poco::Util::Abs LOG_INFO(disk->log, "Removing old metadata..."); - bool cleanup_s3 = information.source_path != disk->object_storage_root_path; + bool cleanup_s3 = information.source_path != disk->object_key_prefix; for (const auto & root : data_roots) if (disk->exists(root)) disk->removeSharedRecursive(root + '/', !cleanup_s3, {}); @@ -424,18 +424,17 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::processRestoreFiles( continue; disk->createDirectories(directoryPath(path)); - auto relative_key = shrinkKey(source_path, key); - auto full_path = fs::path(disk->object_storage_root_path) / relative_key; + auto object_key = ObjectStorageKey::createAsRelative(disk->object_key_prefix, shrinkKey(source_path, key)); StoredObject object_from{key}; - StoredObject object_to{fs::path(disk->object_storage_root_path) / relative_key}; + StoredObject object_to{object_key.serialize()}; /// Copy object if we restore to different bucket / path. 
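The hunks in this area replace bare blob-name strings with ObjectStorageKey values built through createAsRelative()/createAsAbsolute() and consumed through serialize(), getPrefix(), getSuffix() and hasPrefix(). The real class lives in Common/ObjectStorageKey.h and is not shown in this diff; the following is only a simplified stand-in that captures the semantics those call sites rely on, and it assumes plain concatenation of prefix and suffix since the real joining rule is not visible here:

#include <cassert>
#include <iostream>
#include <string>

/// Simplified stand-in for DB::ObjectStorageKey, only to illustrate the call sites above.
class ObjectStorageKeySketch
{
public:
    static ObjectStorageKeySketch createAsRelative(std::string prefix_, std::string suffix_)
    {
        ObjectStorageKeySketch key;
        key.prefix = std::move(prefix_);
        key.suffix = std::move(suffix_);
        key.is_relative = true;
        return key;
    }

    static ObjectStorageKeySketch createAsAbsolute(std::string key_value)
    {
        ObjectStorageKeySketch key;
        key.suffix = std::move(key_value);
        key.is_relative = false;
        return key;
    }

    bool hasPrefix() const { return is_relative; }
    const std::string & getPrefix() const { return prefix; }
    const std::string & getSuffix() const { return suffix; }

    /// Full path in the object store: prefix + suffix for relative keys, the raw value otherwise.
    std::string serialize() const { return is_relative ? prefix + suffix : suffix; }

private:
    std::string prefix;
    std::string suffix;
    bool is_relative = false;
};

int main()
{
    auto relative = ObjectStorageKeySketch::createAsRelative("disks/s3/", "abc/blob_000001");
    auto absolute = ObjectStorageKeySketch::createAsAbsolute("disks/s3/abc/blob_000001");
    std::cout << relative.serialize() << " hasPrefix=" << relative.hasPrefix() << '\n';
    std::cout << absolute.serialize() << " hasPrefix=" << absolute.hasPrefix() << '\n';
    assert(relative.serialize() == absolute.serialize());
}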
- if (source_object_storage->getObjectsNamespace() != disk->object_storage->getObjectsNamespace() || disk->object_storage_root_path != source_path) + if (source_object_storage->getObjectsNamespace() != disk->object_storage->getObjectsNamespace() || disk->object_key_prefix != source_path) source_object_storage->copyObjectToAnotherObjectStorage(object_from, object_to, read_settings, write_settings, *disk->object_storage); auto tx = disk->metadata_storage->createTransaction(); - tx->addBlobToMetadata(path, relative_key, meta.size_bytes); + tx->addBlobToMetadata(path, object_key, meta.size_bytes); tx->commit(); LOG_TRACE(disk->log, "Restored file {}", path); @@ -464,7 +463,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFileOperations(IObject { /// Enable recording file operations if we restore to different bucket / path. bool send_metadata = source_object_storage->getObjectsNamespace() != disk->object_storage->getObjectsNamespace() - || disk->object_storage_root_path != restore_information.source_path; + || disk->object_key_prefix != restore_information.source_path; std::set renames; auto restore_file_operations = [this, &source_object_storage, &restore_information, &renames, &send_metadata](const RelativePathsWithMetadata & objects) diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index 66ee2e746b4..25de89a9548 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -25,6 +25,7 @@ namespace ErrorCodes extern const int BAD_FILE_TYPE; extern const int FILE_ALREADY_EXISTS; extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; + extern const int LOGICAL_ERROR; } DiskObjectStorageTransaction::DiskObjectStorageTransaction( @@ -511,12 +512,12 @@ struct CopyFileObjectStorageOperation final : public IDiskObjectStorageOperation for (const auto & object_from : source_blobs) { - std::string blob_name = object_storage.generateBlobNameForPath(to_path); - auto object_to = StoredObject(fs::path(metadata_storage.getObjectStorageRootPath()) / blob_name); + auto object_key = object_storage.generateObjectKeyForPath(to_path); + auto object_to = StoredObject(object_key.serialize()); object_storage.copyObject(object_from, object_to, read_settings, write_settings); - tx->addBlobToMetadata(to_path, blob_name, object_from.bytes_size); + tx->addBlobToMetadata(to_path, object_key, object_from.bytes_size); created_objects.push_back(object_to); } @@ -663,46 +664,53 @@ std::unique_ptr DiskObjectStorageTransaction::writeFile const WriteSettings & settings, bool autocommit) { - String blob_name; + auto object_key = object_storage.generateObjectKeyForPath(path); std::optional object_attributes; - blob_name = object_storage.generateBlobNameForPath(path); if (metadata_helper) { + if (!object_key.hasPrefix()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "metadata helper is not supported with absolute paths"); + auto revision = metadata_helper->revision_counter + 1; metadata_helper->revision_counter++; object_attributes = { {"path", path} }; - blob_name = "r" + revisionToString(revision) + "-file-" + blob_name; + + object_key = ObjectStorageKey::createAsRelative( + object_key.getPrefix(), + "r" + revisionToString(revision) + "-file-" + object_key.getSuffix()); } - auto object = StoredObject(fs::path(metadata_storage.getObjectStorageRootPath()) / blob_name); - auto write_operation = std::make_unique(object_storage, metadata_storage, object); + /// seems 
ok + auto object = StoredObject(object_key.serialize()); std::function create_metadata_callback; if (autocommit) { - create_metadata_callback = [tx = shared_from_this(), mode, path, blob_name](size_t count) + create_metadata_callback = [tx = shared_from_this(), mode, path, key_ = std::move(object_key)](size_t count) { if (mode == WriteMode::Rewrite) { - // Otherwise we will produce lost blobs which nobody points to + /// Otherwise we will produce lost blobs which nobody points to /// WriteOnce storages are not affected by the issue if (!tx->object_storage.isWriteOnce() && tx->metadata_storage.exists(path)) tx->object_storage.removeObjectsIfExist(tx->metadata_storage.getStorageObjects(path)); - tx->metadata_transaction->createMetadataFile(path, blob_name, count); + tx->metadata_transaction->createMetadataFile(path, key_, count); } else - tx->metadata_transaction->addBlobToMetadata(path, blob_name, count); + tx->metadata_transaction->addBlobToMetadata(path, key_, count); tx->metadata_transaction->commit(); }; } else { - create_metadata_callback = [object_storage_tx = shared_from_this(), write_op = write_operation.get(), mode, path, blob_name](size_t count) + auto write_operation = std::make_unique(object_storage, metadata_storage, object); + + create_metadata_callback = [object_storage_tx = shared_from_this(), write_op = write_operation.get(), mode, path, key_ = std::move(object_key)](size_t count) { /// This callback called in WriteBuffer finalize method -- only there we actually know /// how many bytes were written. We don't control when this finalize method will be called @@ -714,7 +722,7 @@ std::unique_ptr DiskObjectStorageTransaction::writeFile /// ... /// buf1->finalize() // shouldn't do anything with metadata operations, just memoize what to do /// tx->commit() - write_op->setOnExecute([object_storage_tx, mode, path, blob_name, count](MetadataTransactionPtr tx) + write_op->setOnExecute([object_storage_tx, mode, path, key_, count](MetadataTransactionPtr tx) { if (mode == WriteMode::Rewrite) { @@ -726,15 +734,16 @@ std::unique_ptr DiskObjectStorageTransaction::writeFile object_storage_tx->metadata_storage.getStorageObjects(path)); } - tx->createMetadataFile(path, blob_name, count); + tx->createMetadataFile(path, key_, count); } else - tx->addBlobToMetadata(path, blob_name, count); + tx->addBlobToMetadata(path, key_, count); }); }; + + operations_to_execute.emplace_back(std::move(write_operation)); } - operations_to_execute.emplace_back(std::move(write_operation)); auto impl = object_storage.writeObject( object, @@ -753,20 +762,27 @@ void DiskObjectStorageTransaction::writeFileUsingBlobWritingFunction( const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) { /// This function is a simplified and adapted version of DiskObjectStorageTransaction::writeFile(). 
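Both writeFile() above and writeFileUsingBlobWritingFunction() just below rebuild the generated key when send_metadata is enabled (metadata_helper is set): the prefix is kept and the suffix becomes "r<revision>-file-<old suffix>", so the blob name itself carries the revision. A small string-level sketch of that renaming; revisionToString() is not shown in this diff, so a plain decimal rendering is assumed here:

#include <iostream>
#include <string>

/// Assumed stand-in for revisionToString(); the real padding/format is not part of this diff.
static std::string revisionToStringSketch(unsigned long long revision)
{
    return std::to_string(revision);
}

/// Mirrors the key rewrite done for send_metadata in the transaction write paths above.
static std::string makeRevisionedSuffix(const std::string & suffix, unsigned long long revision)
{
    return "r" + revisionToStringSketch(revision) + "-file-" + suffix;
}

int main()
{
    std::cout << makeRevisionedSuffix("abc/blob_000001", 42) << '\n';  // r42-file-abc/blob_000001
}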
- auto blob_name = object_storage.generateBlobNameForPath(path); + auto object_key = object_storage.generateObjectKeyForPath(path); std::optional object_attributes; if (metadata_helper) { + if (!object_key.hasPrefix()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "metadata helper is not supported with abs paths"); + auto revision = metadata_helper->revision_counter + 1; metadata_helper->revision_counter++; object_attributes = { {"path", path} }; - blob_name = "r" + revisionToString(revision) + "-file-" + blob_name; + + object_key = ObjectStorageKey::createAsRelative( + object_key.getPrefix(), + "r" + revisionToString(revision) + "-file-" + object_key.getSuffix()); } - auto object = StoredObject(fs::path(metadata_storage.getObjectStorageRootPath()) / blob_name); + /// seems ok + auto object = StoredObject(object_key.serialize()); auto write_operation = std::make_unique(object_storage, metadata_storage, object); operations_to_execute.emplace_back(std::move(write_operation)); @@ -788,10 +804,10 @@ void DiskObjectStorageTransaction::writeFileUsingBlobWritingFunction( if (!object_storage.isWriteOnce() && metadata_storage.exists(path)) object_storage.removeObjectsIfExist(metadata_storage.getStorageObjects(path)); - metadata_transaction->createMetadataFile(path, blob_name, object_size); + metadata_transaction->createMetadataFile(path, std::move(object_key), object_size); } else - metadata_transaction->addBlobToMetadata(path, blob_name, object_size); + metadata_transaction->addBlobToMetadata(path, std::move(object_key), object_size); } diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 5eca98aa494..662b20f4d31 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -28,9 +28,10 @@ void HDFSObjectStorage::startup() { } -std::string HDFSObjectStorage::generateBlobNameForPath(const std::string & /* path */) +ObjectStorageKey HDFSObjectStorage::generateObjectKeyForPath(const std::string & /* path */) const { - return getRandomASCIIString(32); + /// what ever data_source_description.description value is, consider that key as relative key + return ObjectStorageKey::createAsRelative(data_source_description.description, getRandomASCIIString(32)); } bool HDFSObjectStorage::exists(const StoredObject & object) const diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 8d770c12d8f..fe0893f963b 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -114,7 +114,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - std::string generateBlobNameForPath(const std::string & path) override; + ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override; bool isRemote() const override { return true; } diff --git a/src/Disks/ObjectStorages/IMetadataStorage.h b/src/Disks/ObjectStorages/IMetadataStorage.h index 6b75e157dee..9e5078736d2 100644 --- a/src/Disks/ObjectStorages/IMetadataStorage.h +++ b/src/Disks/ObjectStorages/IMetadataStorage.h @@ -126,10 +126,10 @@ public: virtual void createEmptyMetadataFile(const std::string & path) = 0; /// Create metadata file on paths with content (blob_name, size_in_bytes) - virtual void createMetadataFile(const std::string & path, const std::string & blob_name, uint64_t size_in_bytes) = 0; + virtual void createMetadataFile(const std::string & path, 
ObjectStorageKey key, uint64_t size_in_bytes) = 0; /// Add to new blob to metadata file (way to implement appends) - virtual void addBlobToMetadata(const std::string & /* path */, const std::string & /* blob_name */, uint64_t /* size_in_bytes */) + virtual void addBlobToMetadata(const std::string & /* path */, ObjectStorageKey /* key */, uint64_t /* size_in_bytes */) { throwNotImplemented(); } @@ -221,8 +221,6 @@ public: /// object_storage_path is absolute. virtual StoredObjects getStorageObjects(const std::string & path) const = 0; - virtual std::string getObjectStorageRootPath() const = 0; - private: [[noreturn]] static void throwNotImplemented() { diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index 3c77de8f5b7..78fbdcaddfa 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -95,21 +95,4 @@ WriteSettings IObjectStorage::patchSettings(const WriteSettings & write_settings return settings; } -std::string IObjectStorage::generateBlobNameForPath(const std::string & /* path */) -{ - /// Path to store the new S3 object. - - /// Total length is 32 a-z characters for enough randomness. - /// First 3 characters are used as a prefix for - /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/ - - constexpr size_t key_name_total_size = 32; - constexpr size_t key_name_prefix_size = 3; - - /// Path to store new S3 object. - return fmt::format("{}/{}", - getRandomASCIIString(key_name_prefix_size), - getRandomASCIIString(key_name_total_size - key_name_prefix_size)); -} - } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 032795b380f..1918c197577 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -17,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +35,7 @@ using ObjectAttributes = std::map; struct ObjectMetadata { - uint64_t size_bytes; + uint64_t size_bytes = 0; std::optional last_modified; std::optional attributes; }; @@ -43,16 +43,31 @@ struct ObjectMetadata struct RelativePathWithMetadata { String relative_path; - ObjectMetadata metadata{}; + ObjectMetadata metadata; RelativePathWithMetadata() = default; - RelativePathWithMetadata(const String & relative_path_, const ObjectMetadata & metadata_) - : relative_path(relative_path_), metadata(metadata_) + RelativePathWithMetadata(String relative_path_, ObjectMetadata metadata_) + : relative_path(std::move(relative_path_)) + , metadata(std::move(metadata_)) + {} +}; + +struct ObjectKeyWithMetadata +{ + ObjectStorageKey key; + ObjectMetadata metadata; + + ObjectKeyWithMetadata() = default; + + ObjectKeyWithMetadata(ObjectStorageKey key_, ObjectMetadata metadata_) + : key(std::move(key_)) + , metadata(std::move(metadata_)) {} }; using RelativePathsWithMetadata = std::vector; +using ObjectKeysWithMetadata = std::vector; class IObjectStorageIterator; using ObjectStorageIteratorPtr = std::shared_ptr; @@ -176,7 +191,7 @@ public: /// Generate blob name for passed absolute local path. /// Path can be generated either independently or based on `path`. 
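The default IObjectStorage::generateBlobNameForPath() removed above produced keys of the form "xxx/yyy...": a 3-character random prefix plus 29 more random characters, chosen to spread keys across S3 partitions per the AWS key-naming note in the removed comment. Each storage now supplies its own generateObjectKeyForPath() instead. A stand-alone sketch of that naming scheme; getRandomASCIIString() and its alphabet are assumptions here, only the 3 + 29 split comes from the removed code:

#include <iostream>
#include <random>
#include <string>

/// Assumed stand-in for DB::getRandomASCIIString(); the real helper is not part of this diff.
static std::string randomASCIIString(size_t length, std::mt19937_64 & rng)
{
    static constexpr char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
    std::uniform_int_distribution<size_t> dist(0, sizeof(alphabet) - 2);
    std::string result(length, ' ');
    for (auto & c : result)
        c = alphabet[dist(rng)];
    return result;
}

/// Mirrors the removed default: a 3-character shard prefix plus 29 random characters.
static std::string generateRandomObjectKey(std::mt19937_64 & rng)
{
    constexpr size_t total = 32;
    constexpr size_t prefix = 3;
    return randomASCIIString(prefix, rng) + "/" + randomASCIIString(total - prefix, rng);
}

int main()
{
    std::mt19937_64 rng(std::random_device{}());
    std::cout << generateRandomObjectKey(rng) << '\n';
}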
- virtual std::string generateBlobNameForPath(const std::string & path); + virtual ObjectStorageKey generateObjectKeyForPath(const std::string & path) const = 0; /// Get unique id for passed absolute path in object storage. virtual std::string getUniqueId(const std::string & path) const { return path; } diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index cc53df956c6..4cf3c23d5a6 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -24,8 +24,9 @@ namespace ErrorCodes extern const int CANNOT_UNLINK; } -LocalObjectStorage::LocalObjectStorage() - : log(&Poco::Logger::get("LocalObjectStorage")) +LocalObjectStorage::LocalObjectStorage(String key_prefix_) + : key_prefix(std::move(key_prefix_)) + , log(&Poco::Logger::get("LocalObjectStorage")) { data_source_description.type = DataSourceType::Local; if (auto block_device_id = tryGetBlockDeviceId("/"); block_device_id.has_value()) @@ -200,10 +201,10 @@ void LocalObjectStorage::applyNewSettings( { } -std::string LocalObjectStorage::generateBlobNameForPath(const std::string & /* path */) +ObjectStorageKey LocalObjectStorage::generateObjectKeyForPath(const std::string & /* path */) const { constexpr size_t key_name_total_size = 32; - return getRandomASCIIString(key_name_total_size); + return ObjectStorageKey::createAsRelative(key_prefix, getRandomASCIIString(key_name_total_size)); } } diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h index aa3a68731e4..263eb3f7832 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h @@ -16,7 +16,7 @@ namespace DB class LocalObjectStorage : public IObjectStorage { public: - LocalObjectStorage(); + LocalObjectStorage(String key_prefix_); DataSourceDescription getDataSourceDescription() const override { return data_source_description; } @@ -78,13 +78,14 @@ public: const std::string & config_prefix, ContextPtr context) override; - std::string generateBlobNameForPath(const std::string & path) override; + ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override; bool isRemote() const override { return false; } ReadSettings patchSettings(const ReadSettings & read_settings) const override; private: + String key_prefix; Poco::Logger * log; DataSourceDescription data_source_description; }; diff --git a/src/Disks/ObjectStorages/Local/registerLocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/registerLocalObjectStorage.cpp index eb9039fed44..0b2c71fa09d 100644 --- a/src/Disks/ObjectStorages/Local/registerLocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/registerLocalObjectStorage.cpp @@ -20,23 +20,25 @@ void registerDiskLocalObjectStorage(DiskFactory & factory, bool global_skip_acce ContextPtr context, const DisksMap & /*map*/) -> DiskPtr { - String path; + String object_key_prefix; UInt64 keep_free_space_bytes; - loadDiskLocalConfig(name, config, config_prefix, context, path, keep_free_space_bytes); - fs::create_directories(path); + loadDiskLocalConfig(name, config, config_prefix, context, object_key_prefix, keep_free_space_bytes); + /// keys are mapped to the fs, object_key_prefix is a directory also + fs::create_directories(object_key_prefix); String type = config.getString(config_prefix + ".type"); chassert(type == "local_blob_storage"); - std::shared_ptr local_storage = std::make_shared(); + 
std::shared_ptr local_storage = std::make_shared(object_key_prefix); MetadataStoragePtr metadata_storage; auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); - metadata_storage = std::make_shared(metadata_disk, path); + metadata_storage = std::make_shared(metadata_disk, object_key_prefix); auto disk = std::make_shared( - name, path, "Local", metadata_storage, local_storage, config, config_prefix); + name, object_key_prefix, "Local", metadata_storage, local_storage, config, config_prefix); disk->startup(context, global_skip_access_check); return disk; + }; factory.registerDiskType("local_blob_storage", creator); } diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp index 53428c2f6e1..91234a3fa05 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp @@ -15,9 +15,9 @@ namespace ErrorCodes extern const int FS_METADATA_ERROR; } -MetadataStorageFromDisk::MetadataStorageFromDisk(DiskPtr disk_, const std::string & object_storage_root_path_) +MetadataStorageFromDisk::MetadataStorageFromDisk(DiskPtr disk_, String compatible_key_prefix_) : disk(disk_) - , object_storage_root_path(object_storage_root_path_) + , compatible_key_prefix(compatible_key_prefix_) { } @@ -85,7 +85,7 @@ std::string MetadataStorageFromDisk::readInlineDataToString(const std::string & DiskObjectStorageMetadataPtr MetadataStorageFromDisk::readMetadataUnlocked(const std::string & path, std::shared_lock &) const { - auto metadata = std::make_unique(disk->getPath(), object_storage_root_path, path); + auto metadata = std::make_unique(compatible_key_prefix, path); auto str = readFileToString(path); metadata->deserializeFromString(str); return metadata; @@ -93,7 +93,7 @@ DiskObjectStorageMetadataPtr MetadataStorageFromDisk::readMetadataUnlocked(const DiskObjectStorageMetadataPtr MetadataStorageFromDisk::readMetadataUnlocked(const std::string & path, std::unique_lock &) const { - auto metadata = std::make_unique(disk->getPath(), object_storage_root_path, path); + auto metadata = std::make_unique(compatible_key_prefix, path); auto str = readFileToString(path); metadata->deserializeFromString(str); return metadata; @@ -135,21 +135,16 @@ MetadataTransactionPtr MetadataStorageFromDisk::createTransaction() StoredObjects MetadataStorageFromDisk::getStorageObjects(const std::string & path) const { auto metadata = readMetadata(path); + const auto & keys_with_meta = metadata->getKeysWithMeta(); - auto object_storage_relative_paths = metadata->getBlobsRelativePaths(); /// Relative paths. - - StoredObjects object_storage_paths; - object_storage_paths.reserve(object_storage_relative_paths.size()); - - /// Relative paths -> absolute. 
- for (auto & [object_relative_path, object_meta] : object_storage_relative_paths) + StoredObjects objects; + objects.reserve(keys_with_meta.size()); + for (const auto & [object_key, object_meta] : keys_with_meta) { - auto object_path = fs::path(metadata->getBlobsCommonPrefix()) / object_relative_path; - StoredObject object{ object_path, object_meta.size_bytes, path }; - object_storage_paths.push_back(object); + objects.emplace_back(object_key.serialize(), object_meta.size_bytes, path); } - return object_storage_paths; + return objects; } uint32_t MetadataStorageFromDisk::getHardlinkCount(const std::string & path) const @@ -253,8 +248,7 @@ void MetadataStorageFromDiskTransaction::writeInlineDataToFile( const std::string & path, const std::string & data) { - auto metadata = std::make_unique( - metadata_storage.getDisk()->getPath(), metadata_storage.getObjectStorageRootPath(), path); + auto metadata = std::make_unique(metadata_storage.compatible_key_prefix, path); metadata->setInlineData(data); writeStringToFile(path, metadata->serializeToString()); } @@ -318,26 +312,23 @@ void MetadataStorageFromDiskTransaction::setReadOnly(const std::string & path) void MetadataStorageFromDiskTransaction::createEmptyMetadataFile(const std::string & path) { - auto metadata = std::make_unique( - metadata_storage.getDisk()->getPath(), metadata_storage.getObjectStorageRootPath(), path); + auto metadata = std::make_unique(metadata_storage.compatible_key_prefix, path); writeStringToFile(path, metadata->serializeToString()); } -void MetadataStorageFromDiskTransaction::createMetadataFile(const std::string & path, const std::string & blob_name, uint64_t size_in_bytes) +void MetadataStorageFromDiskTransaction::createMetadataFile(const std::string & path, ObjectStorageKey object_key, uint64_t size_in_bytes) { - DiskObjectStorageMetadataPtr metadata = std::make_unique( - metadata_storage.getDisk()->getPath(), metadata_storage.getObjectStorageRootPath(), path); - - metadata->addObject(blob_name, size_in_bytes); + auto metadata = std::make_unique(metadata_storage.compatible_key_prefix, path); + metadata->addObject(std::move(object_key), size_in_bytes); auto data = metadata->serializeToString(); if (!data.empty()) addOperation(std::make_unique(path, *metadata_storage.getDisk(), data)); } -void MetadataStorageFromDiskTransaction::addBlobToMetadata(const std::string & path, const std::string & blob_name, uint64_t size_in_bytes) +void MetadataStorageFromDiskTransaction::addBlobToMetadata(const std::string & path, ObjectStorageKey object_key, uint64_t size_in_bytes) { - addOperation(std::make_unique(path, blob_name, metadata_storage.object_storage_root_path, size_in_bytes, *metadata_storage.disk, metadata_storage)); + addOperation(std::make_unique(path, std::move(object_key), size_in_bytes, *metadata_storage.disk, metadata_storage)); } UnlinkMetadataFileOperationOutcomePtr MetadataStorageFromDiskTransaction::unlinkMetadata(const std::string & path) diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDisk.h b/src/Disks/ObjectStorages/MetadataStorageFromDisk.h index b518f5e3622..4116659ab9a 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDisk.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromDisk.h @@ -22,12 +22,11 @@ private: friend class MetadataStorageFromDiskTransaction; mutable SharedMutex metadata_mutex; - DiskPtr disk; - std::string object_storage_root_path; + String compatible_key_prefix; public: - MetadataStorageFromDisk(DiskPtr disk_, const std::string & object_storage_root_path_); + 
MetadataStorageFromDisk(DiskPtr disk_, String compatible_key_prefix); MetadataTransactionPtr createTransaction() override; @@ -67,8 +66,6 @@ public: StoredObjects getStorageObjects(const std::string & path) const override; - std::string getObjectStorageRootPath() const override { return object_storage_root_path; } - DiskObjectStorageMetadataPtr readMetadata(const std::string & path) const; DiskObjectStorageMetadataPtr readMetadataUnlocked(const std::string & path, std::unique_lock & lock) const; @@ -104,9 +101,9 @@ public: void createEmptyMetadataFile(const std::string & path) override; - void createMetadataFile(const std::string & path, const std::string & blob_name, uint64_t size_in_bytes) override; + void createMetadataFile(const std::string & path, ObjectStorageKey object_key, uint64_t size_in_bytes) override; - void addBlobToMetadata(const std::string & path, const std::string & blob_name, uint64_t size_in_bytes) override; + void addBlobToMetadata(const std::string & path, ObjectStorageKey object_key, uint64_t size_in_bytes) override; void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override; diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp index 78e8764f8fc..1357acdfc66 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp @@ -294,9 +294,9 @@ void AddBlobOperation::execute(std::unique_lock & metadata_lock) if (metadata_storage.exists(path)) metadata = metadata_storage.readMetadataUnlocked(path, metadata_lock); else - metadata = std::make_unique(disk.getPath(), root_path, path); + metadata = std::make_unique(disk.getPath(), path); - metadata->addObject(blob_name, size_in_bytes); + metadata->addObject(object_key, size_in_bytes); write_operation = std::make_unique(path, disk, metadata->serializeToString()); diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h index ccb77f6ae7b..e8fda177b95 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h @@ -216,14 +216,12 @@ struct AddBlobOperation final : public IMetadataOperation { AddBlobOperation( const std::string & path_, - const std::string & blob_name_, - const std::string & root_path_, + ObjectStorageKey object_key_, uint64_t size_in_bytes_, IDisk & disk_, const MetadataStorageFromDisk & metadata_storage_) : path(path_) - , blob_name(blob_name_) - , root_path(root_path_) + , object_key(std::move(object_key_)) , size_in_bytes(size_in_bytes_) , disk(disk_) , metadata_storage(metadata_storage_) @@ -235,8 +233,7 @@ struct AddBlobOperation final : public IMetadataOperation private: std::string path; - std::string blob_name; - std::string root_path; + ObjectStorageKey object_key; uint64_t size_in_bytes; IDisk & disk; const MetadataStorageFromDisk & metadata_storage; diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index 022ff86df50..5f1d1f7f7f2 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -12,9 +12,9 @@ namespace DB 
MetadataStorageFromPlainObjectStorage::MetadataStorageFromPlainObjectStorage( ObjectStoragePtr object_storage_, - const std::string & object_storage_root_path_) + String storage_path_prefix_) : object_storage(object_storage_) - , object_storage_root_path(object_storage_root_path_) + , storage_path_prefix(std::move(storage_path_prefix_)) { } @@ -25,19 +25,15 @@ MetadataTransactionPtr MetadataStorageFromPlainObjectStorage::createTransaction( const std::string & MetadataStorageFromPlainObjectStorage::getPath() const { - return object_storage_root_path; -} -std::filesystem::path MetadataStorageFromPlainObjectStorage::getAbsolutePath(const std::string & path) const -{ - return fs::path(object_storage_root_path) / path; + return storage_path_prefix; } bool MetadataStorageFromPlainObjectStorage::exists(const std::string & path) const { /// NOTE: exists() cannot be used here since it works only for existing /// key, and does not work for some intermediate path. - std::string abs_path = getAbsolutePath(path); - return object_storage->existsOrHasAnyChild(abs_path); + auto object_key = object_storage->generateObjectKeyForPath(path); + return object_storage->existsOrHasAnyChild(object_key.serialize()); } bool MetadataStorageFromPlainObjectStorage::isFile(const std::string & path) const @@ -48,7 +44,8 @@ bool MetadataStorageFromPlainObjectStorage::isFile(const std::string & path) con bool MetadataStorageFromPlainObjectStorage::isDirectory(const std::string & path) const { - std::string directory = getAbsolutePath(path); + auto object_key = object_storage->generateObjectKeyForPath(path); + std::string directory = object_key.serialize(); if (!directory.ends_with('/')) directory += '/'; @@ -59,8 +56,8 @@ bool MetadataStorageFromPlainObjectStorage::isDirectory(const std::string & path uint64_t MetadataStorageFromPlainObjectStorage::getFileSize(const String & path) const { - RelativePathsWithMetadata children; - auto metadata = object_storage->tryGetObjectMetadata(getAbsolutePath(path)); + auto object_key = object_storage->generateObjectKeyForPath(path); + auto metadata = object_storage->tryGetObjectMetadata(object_key.serialize()); if (metadata) return metadata->size_bytes; return 0; @@ -68,12 +65,14 @@ uint64_t MetadataStorageFromPlainObjectStorage::getFileSize(const String & path) std::vector MetadataStorageFromPlainObjectStorage::listDirectory(const std::string & path) const { - RelativePathsWithMetadata files; - std::string abs_path = getAbsolutePath(path); - if (!abs_path.ends_with('/')) - abs_path += '/'; + auto object_key = object_storage->generateObjectKeyForPath(path); - object_storage->listObjects(abs_path, files, 0); + RelativePathsWithMetadata files; + std::string abs_key = object_key.serialize(); + if (!abs_key.ends_with('/')) + abs_key += '/'; + + object_storage->listObjects(abs_key, files, 0); std::vector result; for (const auto & path_size : files) @@ -84,8 +83,8 @@ std::vector MetadataStorageFromPlainObjectStorage::listDirectory(co std::unordered_set duplicates_filter; for (auto & row : result) { - chassert(row.starts_with(abs_path)); - row.erase(0, abs_path.size()); + chassert(row.starts_with(abs_key)); + row.erase(0, abs_key.size()); auto slash_pos = row.find_first_of('/'); if (slash_pos != std::string::npos) row.erase(slash_pos, row.size() - slash_pos); @@ -105,10 +104,9 @@ DirectoryIteratorPtr MetadataStorageFromPlainObjectStorage::iterateDirectory(con StoredObjects MetadataStorageFromPlainObjectStorage::getStorageObjects(const std::string & path) const { - std::string blob_name = 
object_storage->generateBlobNameForPath(path); - size_t object_size = getFileSize(blob_name); - auto object = StoredObject(getAbsolutePath(blob_name), object_size, path); - return {std::move(object)}; + size_t object_size = getFileSize(path); + auto object_key = object_storage->generateObjectKeyForPath(path); + return {StoredObject(object_key.serialize(), object_size, path)}; } const IMetadataStorage & MetadataStorageFromPlainObjectStorageTransaction::getStorageForNonTransactionalReads() const @@ -118,7 +116,8 @@ const IMetadataStorage & MetadataStorageFromPlainObjectStorageTransaction::getSt void MetadataStorageFromPlainObjectStorageTransaction::unlinkFile(const std::string & path) { - auto object = StoredObject(metadata_storage.getAbsolutePath(path)); + auto object_key = metadata_storage.object_storage->generateObjectKeyForPath(path); + auto object = StoredObject(object_key.serialize()); metadata_storage.object_storage->removeObject(object); } @@ -131,7 +130,7 @@ void MetadataStorageFromPlainObjectStorageTransaction::createDirectoryRecursive( /// Noop. It is an Object Storage not a filesystem. } void MetadataStorageFromPlainObjectStorageTransaction::addBlobToMetadata( - const std::string &, const std::string & /* blob_name */, uint64_t /* size_in_bytes */) + const std::string &, ObjectStorageKey /* object_key */, uint64_t /* size_in_bytes */) { /// Noop, local metadata files is only one file, it is the metadata file itself. } diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h index bd068c1362f..2ef823d07a4 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h @@ -29,12 +29,10 @@ private: friend class MetadataStorageFromPlainObjectStorageTransaction; ObjectStoragePtr object_storage; - std::string object_storage_root_path; + String storage_path_prefix; public: - MetadataStorageFromPlainObjectStorage( - ObjectStoragePtr object_storage_, - const std::string & object_storage_root_path_); + MetadataStorageFromPlainObjectStorage(ObjectStoragePtr object_storage_, String storage_path_prefix_); MetadataTransactionPtr createTransaction() override; @@ -56,8 +54,6 @@ public: StoredObjects getStorageObjects(const std::string & path) const override; - std::string getObjectStorageRootPath() const override { return object_storage_root_path; } - Poco::Timestamp getLastModified(const std::string & /* path */) const override { /// Required by MergeTree @@ -71,9 +67,6 @@ public: bool supportsChmod() const override { return false; } bool supportsStat() const override { return false; } - -private: - std::filesystem::path getAbsolutePath(const std::string & path) const; }; class MetadataStorageFromPlainObjectStorageTransaction final : public IMetadataTransaction @@ -89,14 +82,14 @@ public: const IMetadataStorage & getStorageForNonTransactionalReads() const override; - void addBlobToMetadata(const std::string & path, const std::string & blob_name, uint64_t size_in_bytes) override; + void addBlobToMetadata(const std::string & path, ObjectStorageKey object_key, uint64_t size_in_bytes) override; void createEmptyMetadataFile(const std::string & /* path */) override { /// No metadata, no need to create anything. 
} - void createMetadataFile(const std::string & /* path */, const std::string & /* blob_name */, uint64_t /* size_in_bytes */) override + void createMetadataFile(const std::string & /* path */, ObjectStorageKey /* object_key */, uint64_t /* size_in_bytes */) override { /// Noop } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 8f020e0d1ac..b36185249af 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -127,7 +128,10 @@ private: result = !objects.empty(); for (const auto & object : objects) - batch.emplace_back(object.GetKey(), ObjectMetadata{static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), {}}); + batch.emplace_back( + object.GetKey(), + ObjectMetadata{static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), {}} + ); if (result) request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); @@ -293,7 +297,12 @@ void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMet break; for (const auto & object : objects) - children.emplace_back(object.GetKey(), ObjectMetadata{static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), {}}); + children.emplace_back( + object.GetKey(), + ObjectMetadata{ + static_cast(object.GetSize()), + Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), + {}}); if (max_keys) { @@ -524,12 +533,33 @@ std::unique_ptr S3ObjectStorage::cloneObjectStorage( return std::make_unique( std::move(new_client), std::move(new_s3_settings), version_id, s3_capabilities, new_namespace, - endpoint); + endpoint, object_key_prefix); } S3ObjectStorage::Clients::Clients(std::shared_ptr client_, const S3ObjectStorageSettings & settings) : client(std::move(client_)), client_with_long_timeout(client->clone(std::nullopt, settings.request_settings.long_request_timeout_ms)) {} +ObjectStorageKey S3ObjectStorage::generateObjectKeyForPath(const std::string &) const +{ + /// Path to store the new S3 object. + + /// Total length is 32 a-z characters for enough randomness. + /// First 3 characters are used as a prefix for + /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/ + + constexpr size_t key_name_total_size = 32; + constexpr size_t key_name_prefix_size = 3; + + /// Path to store new S3 object. 
+ String key = fmt::format("{}/{}", + getRandomASCIIString(key_name_prefix_size), + getRandomASCIIString(key_name_total_size - key_name_prefix_size)); + + /// what ever key_prefix value is, consider that key as relative + return ObjectStorageKey::createAsRelative(object_key_prefix, key); +} + + } #endif diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 6e516b39c88..b1b3fb22366 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -59,8 +59,10 @@ private: String version_id_, const S3Capabilities & s3_capabilities_, String bucket_, - String connection_string) - : bucket(bucket_) + String connection_string, + String object_key_prefix_) + : bucket(std::move(bucket_)) + , object_key_prefix(std::move(object_key_prefix_)) , clients(std::make_unique(std::move(client_), *s3_settings_)) , s3_settings(std::move(s3_settings_)) , s3_capabilities(s3_capabilities_) @@ -170,13 +172,17 @@ public: bool supportParallelWrite() const override { return true; } + ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override; + private: void setNewSettings(std::unique_ptr && s3_settings_); void removeObjectImpl(const StoredObject & object, bool if_exists); void removeObjectsImpl(const StoredObjects & objects, bool if_exists); +private: std::string bucket; + String object_key_prefix; MultiVersion clients; MultiVersion s3_settings; @@ -195,7 +201,11 @@ private: class S3PlainObjectStorage : public S3ObjectStorage { public: - std::string generateBlobNameForPath(const std::string & path) override { return path; } + ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override + { + return ObjectStorageKey::createAsRelative(object_key_prefix, path); + } + std::string getName() const override { return "S3PlainObjectStorage"; } template diff --git a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp index 91f647cbd8b..663d8b777e8 100644 --- a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp +++ b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp @@ -126,12 +126,15 @@ void registerDiskS3(DiskFactory & factory, bool global_skip_access_check) if (config.getBool(config_prefix + ".send_metadata", false)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "s3_plain does not supports send_metadata"); - s3_storage = std::make_shared(std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); + s3_storage = std::make_shared( + std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint, uri.key); + metadata_storage = std::make_shared(s3_storage, uri.key); } else { - s3_storage = std::make_shared(std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); + s3_storage = std::make_shared( + std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint, uri.key); auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); metadata_storage = std::make_shared(metadata_disk, uri.key); } diff --git a/src/Disks/ObjectStorages/StoredObject.h b/src/Disks/ObjectStorages/StoredObject.h index 8afbb116a83..4a03743e310 100644 --- a/src/Disks/ObjectStorages/StoredObject.h +++ b/src/Disks/ObjectStorages/StoredObject.h @@ -1,8 +1,11 @@ #pragma once +#include + +#include + #include #include -#include namespace DB @@ -11,20 +14,32 @@ namespace DB /// Object 
metadata: path, size, path_key_for_cache. struct StoredObject { - std::string remote_path; - std::string local_path; /// or equivalent "metadata_path" + String remote_path; /// abs path + String local_path; /// or equivalent "metadata_path" uint64_t bytes_size = 0; StoredObject() = default; - explicit StoredObject( - const std::string & remote_path_, - uint64_t bytes_size_ = 0, - const std::string & local_path_ = "") - : remote_path(remote_path_) - , local_path(local_path_) - , bytes_size(bytes_size_) {} + explicit StoredObject(String remote_path_) + : remote_path(std::move(remote_path_)) + {} + + StoredObject( + String remote_path_, + uint64_t bytes_size_) + : remote_path(std::move(remote_path_)) + , bytes_size(bytes_size_) + {} + + StoredObject( + String remote_path_, + uint64_t bytes_size_, + String local_path_) + : remote_path(std::move(remote_path_)) + , local_path(std::move(local_path_)) + , bytes_size(bytes_size_) + {} }; using StoredObjects = std::vector; diff --git a/src/Disks/ObjectStorages/Web/MetadataStorageFromStaticFilesWebServer.cpp b/src/Disks/ObjectStorages/Web/MetadataStorageFromStaticFilesWebServer.cpp index fa07ef8590a..2d1ae41eb05 100644 --- a/src/Disks/ObjectStorages/Web/MetadataStorageFromStaticFilesWebServer.cpp +++ b/src/Disks/ObjectStorages/Web/MetadataStorageFromStaticFilesWebServer.cpp @@ -28,7 +28,8 @@ MetadataTransactionPtr MetadataStorageFromStaticFilesWebServer::createTransactio const std::string & MetadataStorageFromStaticFilesWebServer::getPath() const { - return root_path; + static const String no_root; + return no_root; } bool MetadataStorageFromStaticFilesWebServer::exists(const std::string & path) const @@ -96,7 +97,7 @@ std::vector MetadataStorageFromStaticFilesWebServer::listDirectory( for (const auto & [file_path, _] : object_storage.files) { if (file_path.starts_with(path)) - result.push_back(file_path); + result.push_back(file_path); /// It looks more like recursive listing, not sure it is right } return result; } diff --git a/src/Disks/ObjectStorages/Web/MetadataStorageFromStaticFilesWebServer.h b/src/Disks/ObjectStorages/Web/MetadataStorageFromStaticFilesWebServer.h index 96c749ad80c..1b17cac994d 100644 --- a/src/Disks/ObjectStorages/Web/MetadataStorageFromStaticFilesWebServer.h +++ b/src/Disks/ObjectStorages/Web/MetadataStorageFromStaticFilesWebServer.h @@ -16,12 +16,9 @@ private: using FileType = WebObjectStorage::FileType; const WebObjectStorage & object_storage; - std::string root_path; void assertExists(const std::string & path) const; - void initializeImpl(const String & uri_path, const std::unique_lock &) const; - public: explicit MetadataStorageFromStaticFilesWebServer(const WebObjectStorage & object_storage_); @@ -43,8 +40,6 @@ public: StoredObjects getStorageObjects(const std::string & path) const override; - std::string getObjectStorageRootPath() const override { return ""; } - struct stat stat(const String & /* path */) const override { return {}; } Poco::Timestamp getLastModified(const std::string & /* path */) const override @@ -80,7 +75,7 @@ public: /// No metadata, no need to create anything. 
} - void createMetadataFile(const std::string & /* path */, const std::string & /* blob_name */, uint64_t /* size_in_bytes */) override + void createMetadataFile(const std::string & /* path */, ObjectStorageKey /* object_key */, uint64_t /* size_in_bytes */) override { /// Noop } diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.h b/src/Disks/ObjectStorages/Web/WebObjectStorage.h index 089bdb99e71..cadc369a0ec 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.h +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.h @@ -89,7 +89,10 @@ public: const std::string & config_prefix, ContextPtr context) override; - std::string generateBlobNameForPath(const std::string & path) override { return path; } + ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override + { + return ObjectStorageKey::createAsRelative(path); + } bool isRemote() const override { return true; } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index d51ea9ad2d0..7fb355b6c43 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -59,6 +59,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes; format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line; + format_settings.csv.allow_cr_end_of_line = settings.input_format_csv_allow_cr_end_of_line; format_settings.csv.delimiter = settings.format_csv_delimiter; format_settings.csv.tuple_delimiter = settings.format_csv_delimiter; format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 382f8b7173a..9f99a47d4d5 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -150,6 +150,7 @@ struct FormatSettings bool allow_double_quotes = true; bool empty_as_default = false; bool crlf_end_of_line = false; + bool allow_cr_end_of_line = false; bool enum_as_number = false; bool arrays_as_nested_csv = false; String null_representation = "\\N"; diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 31f7f24eb13..57904a8ca1c 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -9,6 +9,7 @@ extract_into_parent_list(clickhouse_functions_sources dbms_sources FunctionHelpers.cpp extractTimeZoneFromFunctionArguments.cpp FunctionsLogical.cpp + CastOverloadResolver.cpp ) extract_into_parent_list(clickhouse_functions_headers dbms_headers IFunction.h @@ -16,6 +17,7 @@ extract_into_parent_list(clickhouse_functions_headers dbms_headers FunctionHelpers.h extractTimeZoneFromFunctionArguments.h FunctionsLogical.h + CastOverloadResolver.h ) add_library(clickhouse_functions_obj OBJECT ${clickhouse_functions_headers} ${clickhouse_functions_sources}) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 20a08e3b60b..7fc46db50f1 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -1,10 +1,156 @@ #include +#include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +/** CastInternal does not preserve nullability of the data type, + * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). + * + * Cast preserves nullability according to setting `cast_keep_nullable`, + * i.e. 
Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1. + */ +template +class CastOverloadResolverImpl : public IFunctionOverloadResolver +{ +public: + using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; + + static constexpr auto name = cast_type == CastType::accurate + ? CastName::accurate_cast_name + : (cast_type == CastType::accurateOrNull ? CastName::accurate_cast_or_null_name : CastName::cast_name); + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + explicit CastOverloadResolverImpl(ContextPtr context_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_) + : context(context_) + , diagnostic(std::move(diagnostic_)) + , keep_nullable(keep_nullable_) + , data_type_validation_settings(data_type_validation_settings_) + { + } + + static FunctionOverloadResolverPtr create(ContextPtr context) + { + const auto & settings_ref = context->getSettingsRef(); + + if constexpr (internal) + return createImpl(context, {}, false /*keep_nullable*/); + + return createImpl(context, {}, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); + } + + static FunctionOverloadResolverPtr createImpl(ContextPtr context, std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) + { + assert(!internal || !keep_nullable); + return std::make_unique(context, std::move(diagnostic), keep_nullable, data_type_validation_settings); + } + + static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) + { + assert(!internal || !keep_nullable); + return std::make_unique(ContextPtr(), std::move(diagnostic), keep_nullable, data_type_validation_settings); + } + +protected: + + FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + DataTypes data_types(arguments.size()); + + for (size_t i = 0; i < arguments.size(); ++i) + data_types[i] = arguments[i].type; + + auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); + return std::make_unique>(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + const auto & column = arguments.back().column; + if (!column) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. " + "Instead there is non-constant column of type {}", getName(), arguments.back().type->getName()); + + const auto * type_col = checkAndGetColumnConst(column.get()); + if (!type_col) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. 
" + "Instead there is a column with the following structure: {}", getName(), column->dumpStructure()); + + DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue()); + validateDataType(type, data_type_validation_settings); + + if constexpr (cast_type == CastType::accurateOrNull) + return makeNullable(type); + + if constexpr (internal) + return type; + + if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable()) + return makeNullable(type); + + return type; + } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForNothing() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + +private: + ContextPtr context; + std::optional diagnostic; + bool keep_nullable; + DataTypeValidationSettings data_type_validation_settings; +}; + + +struct CastOverloadName +{ + static constexpr auto cast_name = "CAST"; + static constexpr auto accurate_cast_name = "accurateCast"; + static constexpr auto accurate_cast_or_null_name = "accurateCastOrNull"; +}; + +struct CastInternalOverloadName +{ + static constexpr auto cast_name = "_CAST"; + static constexpr auto accurate_cast_name = "accurate_Cast"; + static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull"; +}; + +template +using CastOverloadResolver = CastOverloadResolverImpl; + +template +using CastInternalOverloadResolver = CastOverloadResolverImpl; + + +FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic) +{ + switch (type) + { + case CastType::nonAccurate: + return CastInternalOverloadResolver::createImpl(diagnostic); + case CastType::accurate: + return CastInternalOverloadResolver::createImpl(diagnostic); + case CastType::accurateOrNull: + return CastInternalOverloadResolver::createImpl(diagnostic); + } +} + + REGISTER_FUNCTION(CastOverloadResolvers) { factory.registerFunction>({}, FunctionFactory::CaseInsensitive); diff --git a/src/Functions/CastOverloadResolver.h b/src/Functions/CastOverloadResolver.h index 670cd364a29..4346478e5b6 100644 --- a/src/Functions/CastOverloadResolver.h +++ b/src/Functions/CastOverloadResolver.h @@ -1,138 +1,29 @@ #pragma once -#include -#include + +#include +#include +#include + namespace DB { -namespace ErrorCodes +class IFunctionOverloadResolver; +using FunctionOverloadResolverPtr = std::shared_ptr; + +enum class CastType { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} - -/** CastInternal does not preserve nullability of the data type, - * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). - * - * Cast preserves nullability according to setting `cast_keep_nullable`, - * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1. - */ -template -class CastOverloadResolverImpl : public IFunctionOverloadResolver -{ -public: - using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; - using Diagnostic = FunctionCastBase::Diagnostic; - - static constexpr auto name = cast_type == CastType::accurate - ? CastName::accurate_cast_name - : (cast_type == CastType::accurateOrNull ? 
CastName::accurate_cast_or_null_name : CastName::cast_name); - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 2; } - - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - - explicit CastOverloadResolverImpl(ContextPtr context_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_) - : context(context_) - , diagnostic(std::move(diagnostic_)) - , keep_nullable(keep_nullable_) - , data_type_validation_settings(data_type_validation_settings_) - { - } - - static FunctionOverloadResolverPtr create(ContextPtr context) - { - const auto & settings_ref = context->getSettingsRef(); - - if constexpr (internal) - return createImpl(context, {}, false /*keep_nullable*/); - - return createImpl(context, {}, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); - } - - static FunctionOverloadResolverPtr createImpl(ContextPtr context, std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) - { - assert(!internal || !keep_nullable); - return std::make_unique(context, std::move(diagnostic), keep_nullable, data_type_validation_settings); - } - - static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) - { - assert(!internal || !keep_nullable); - return std::make_unique(ContextPtr(), std::move(diagnostic), keep_nullable, data_type_validation_settings); - } - -protected: - - FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override - { - DataTypes data_types(arguments.size()); - - for (size_t i = 0; i < arguments.size(); ++i) - data_types[i] = arguments[i].type; - - auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); - return std::make_unique>(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); - } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - const auto & column = arguments.back().column; - if (!column) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. " - "Instead there is non-constant column of type {}", getName(), arguments.back().type->getName()); - - const auto * type_col = checkAndGetColumnConst(column.get()); - if (!type_col) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. 
" - "Instead there is a column with the following structure: {}", getName(), column->dumpStructure()); - - DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue()); - validateDataType(type, data_type_validation_settings); - - if constexpr (cast_type == CastType::accurateOrNull) - return makeNullable(type); - - if constexpr (internal) - return type; - - if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable()) - return makeNullable(type); - - return type; - } - - bool useDefaultImplementationForNulls() const override { return false; } - bool useDefaultImplementationForNothing() const override { return false; } - bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } - -private: - ContextPtr context; - std::optional diagnostic; - bool keep_nullable; - DataTypeValidationSettings data_type_validation_settings; + nonAccurate, + accurate, + accurateOrNull }; - -struct CastOverloadName +struct CastDiagnostic { - static constexpr auto cast_name = "CAST"; - static constexpr auto accurate_cast_name = "accurateCast"; - static constexpr auto accurate_cast_or_null_name = "accurateCastOrNull"; + std::string column_from; + std::string column_to; }; -struct CastInternalOverloadName -{ - static constexpr auto cast_name = "_CAST"; - static constexpr auto accurate_cast_name = "accurate_Cast"; - static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull"; -}; - -template -using CastOverloadResolver = CastOverloadResolverImpl; - -template -using CastInternalOverloadResolver = CastOverloadResolverImpl; +FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic); } diff --git a/src/Functions/FunctionBitTestMany.h b/src/Functions/FunctionBitTestMany.h index 35af3a2a771..71e94b1e71d 100644 --- a/src/Functions/FunctionBitTestMany.h +++ b/src/Functions/FunctionBitTestMany.h @@ -49,7 +49,7 @@ public: { const auto & pos_arg = arguments[i]; - if (!isUnsignedInteger(pos_arg)) + if (!isUInt(pos_arg)) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of {} argument of function {}", pos_arg->getName(), i, getName()); } diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp index f85b2596530..6dc68134502 100644 --- a/src/Functions/FunctionGenerateRandomStructure.cpp +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -365,7 +365,7 @@ DataTypePtr FunctionGenerateRandomStructure::getReturnTypeImpl(const DataTypes & for (size_t i = 0; i != arguments.size(); ++i) { - if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) + if (!isUInt(arguments[i]) && !arguments[i]->onlyNull()) { throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, diff --git a/src/Functions/FunctionTokens.h b/src/Functions/FunctionTokens.h new file mode 100644 index 00000000000..5c4e582c637 --- /dev/null +++ b/src/Functions/FunctionTokens.h @@ -0,0 +1,211 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + + +/** Functions that split strings into an array of strings or vice versa. 
+ * + * splitByChar(sep, s[, max_substrings]) + * splitByString(sep, s[, max_substrings]) + * splitByRegexp(regexp, s[, max_substrings]) + * + * splitByWhitespace(s[, max_substrings]) - split the string by whitespace characters + * splitByNonAlpha(s[, max_substrings]) - split the string by whitespace and punctuation characters + * + * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp. + * - first subpattern, if regexp has subpattern; + * - zero subpattern (the match part, otherwise); + * - otherwise, an empty array + * + * alphaTokens(s[, max_substrings]) - select from the string subsequence `[a-zA-Z]+`. + * + * URL functions are located separately. + */ + + +/// A function that takes a string, and returns an array of substrings created by some generator. +template +class FunctionTokens : public IFunction +{ +private: + using Pos = const char *; + bool max_substrings_includes_remaining_string; + +public: + static constexpr auto name = Generator::name; + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionTokens(ContextPtr context) + { + const Settings & settings = context->getSettingsRef(); + max_substrings_includes_remaining_string = settings.splitby_max_substrings_includes_remaining_string; + } + + String getName() const override { return name; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + bool isVariadic() const override { return Generator::isVariadic(); } + + size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + Generator::checkArguments(*this, arguments); + + return std::make_shared(std::make_shared()); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override + { + Generator generator; + generator.init(arguments, max_substrings_includes_remaining_string); + + const auto & array_argument = arguments[generator.strings_argument_position]; + + const ColumnString * col_str = checkAndGetColumn(array_argument.column.get()); + const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get()); + + auto col_res = ColumnArray::create(ColumnString::create()); + + ColumnString & res_strings = typeid_cast(col_res->getData()); + ColumnString::Chars & res_strings_chars = res_strings.getChars(); + ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets(); + + ColumnArray::Offsets & res_offsets = col_res->getOffsets(); + + if (col_str) + { + const ColumnString::Chars & src_chars = col_str->getChars(); + const ColumnString::Offsets & src_offsets = col_str->getOffsets(); + + res_offsets.reserve(src_offsets.size()); + res_strings_offsets.reserve(src_offsets.size() * 5); /// Constant 5 - at random. 
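// ---- Editorial aside (not part of this patch) -------------------------------
// FunctionTokens drives a token "generator" through the protocol used in this
// file: set(pos, end) once per input string, then repeated get(token_begin,
// token_end) calls until it returns false. A toy, self-contained generator with
// the same shape (hard-coded comma separator, no max_substrings handling) is
// sketched below; the names are illustrative and not ClickHouse code.
#include <string>
#include <vector>

namespace sketch
{

using Pos = const char *;

struct SplitByCommaSketch
{
    Pos pos = nullptr;
    Pos end = nullptr;

    void set(Pos pos_, Pos end_) { pos = pos_; end = end_; }

    /// Yields the next token as [token_begin, token_end); returns false when exhausted.
    bool get(Pos & token_begin, Pos & token_end)
    {
        if (pos == nullptr || pos > end)
            return false;
        token_begin = pos;
        while (pos < end && *pos != ',')
            ++pos;
        token_end = pos;
        ++pos; /// step over the separator; going past `end` marks exhaustion
        return true;
    }
};

/// Collects tokens the same way executeImpl does, just into a std::vector.
inline std::vector<std::string> splitByComma(const std::string & s)
{
    SplitByCommaSketch generator;
    generator.set(s.data(), s.data() + s.size());

    std::vector<std::string> tokens;
    Pos token_begin = nullptr;
    Pos token_end = nullptr;
    while (generator.get(token_begin, token_end))
        tokens.emplace_back(token_begin, token_end - token_begin);
    return tokens; /// "a,b,c" -> {"a", "b", "c"}
}

}
// ------------------------------------------------------------------------------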
+ res_strings_chars.reserve(src_chars.size()); + + Pos token_begin = nullptr; + Pos token_end = nullptr; + + size_t size = src_offsets.size(); + ColumnString::Offset current_src_offset = 0; + ColumnArray::Offset current_dst_offset = 0; + ColumnString::Offset current_dst_strings_offset = 0; + for (size_t i = 0; i < size; ++i) + { + Pos pos = reinterpret_cast(&src_chars[current_src_offset]); + current_src_offset = src_offsets[i]; + Pos end = reinterpret_cast(&src_chars[current_src_offset]) - 1; + + generator.set(pos, end); + size_t j = 0; + while (generator.get(token_begin, token_end)) + { + size_t token_size = token_end - token_begin; + + res_strings_chars.resize(res_strings_chars.size() + token_size + 1); + memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size); + res_strings_chars[current_dst_strings_offset + token_size] = 0; + + current_dst_strings_offset += token_size + 1; + res_strings_offsets.push_back(current_dst_strings_offset); + ++j; + } + + current_dst_offset += j; + res_offsets.push_back(current_dst_offset); + } + + return col_res; + } + else if (col_str_const) + { + String src = col_str_const->getValue(); + Array dst; + + generator.set(src.data(), src.data() + src.size()); + Pos token_begin = nullptr; + Pos token_end = nullptr; + + while (generator.get(token_begin, token_end)) + dst.push_back(String(token_begin, token_end - token_begin)); + + return result_type->createColumnConst(col_str_const->size(), dst); + } + else + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}", + array_argument.column->getName(), array_argument.column->getName(), getName()); + } +}; + + +/// Helper functions for implementations +static inline std::optional extractMaxSplits( + const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position) +{ + if (max_substrings_argument_position >= arguments.size()) + return std::nullopt; + + if (const ColumnConst * column = checkAndGetColumn(arguments[max_substrings_argument_position].column.get())) + { + size_t res = column->getUInt(0); + if (res) + return res; + } + + return std::nullopt; +} + +static inline void checkArgumentsWithSeparatorAndOptionalMaxSubstrings( + const IFunction & func, const ColumnsWithTypeAndName & arguments) +{ + FunctionArgumentDescriptors mandatory_args{ + {"separator", &isString, isColumnConst, "const String"}, + {"s", &isString, nullptr, "String"} + }; + + FunctionArgumentDescriptors optional_args{ + {"max_substrings", &isNativeInteger, isColumnConst, "const Number"}, + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args); +} + +static inline void checkArgumentsWithOptionalMaxSubstrings(const IFunction & func, const ColumnsWithTypeAndName & arguments) +{ + FunctionArgumentDescriptors mandatory_args{ + {"s", &isString, nullptr, "String"}, + }; + + FunctionArgumentDescriptors optional_args{ + {"max_substrings", &isNativeInteger, isColumnConst, "const Number"}, + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args); +} + +} diff --git a/src/Functions/FunctionUnixTimestamp64.h b/src/Functions/FunctionUnixTimestamp64.h index a2065465501..d74237afd77 100644 --- a/src/Functions/FunctionUnixTimestamp64.h +++ b/src/Functions/FunctionUnixTimestamp64.h @@ -26,11 +26,13 @@ namespace ErrorCodes class FunctionToUnixTimestamp64 : public IFunction { private: - size_t target_scale; + const size_t target_scale; const char * name; + public: FunctionToUnixTimestamp64(size_t 
target_scale_, const char * name_) - : target_scale(target_scale_), name(name_) + : target_scale(target_scale_) + , name(name_) { } @@ -42,8 +44,10 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (!isDateTime64(arguments[0].type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The only argument for function {} must be DateTime64", name); + FunctionArgumentDescriptors args{ + {"value", &isDateTime64, nullptr, "DateTime64"} + }; + validateFunctionArgumentTypes(*this, arguments, args); return std::make_shared(); } @@ -98,9 +102,10 @@ public: class FunctionFromUnixTimestamp64 : public IFunction { private: - size_t target_scale; + const size_t target_scale; const char * name; const bool allow_nonconst_timezone_arguments; + public: FunctionFromUnixTimestamp64(size_t target_scale_, const char * name_, ContextPtr context) : target_scale(target_scale_) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index bda5fce1ac8..e3ec7ebd320 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -2033,7 +2034,7 @@ static inline bool isDateTime64(const ColumnsWithTypeAndName & arguments) else if constexpr (std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v) { - return (arguments.size() == 2 && isUnsignedInteger(arguments[1].type)) || arguments.size() == 3; + return (arguments.size() == 2 && isUInt(arguments[1].type)) || arguments.size() == 3; } return false; @@ -3127,14 +3128,8 @@ class ExecutableFunctionCast : public IExecutableFunction public: using WrapperType = std::function; - struct Diagnostic - { - std::string column_from; - std::string column_to; - }; - explicit ExecutableFunctionCast( - WrapperType && wrapper_function_, const char * name_, std::optional diagnostic_) + WrapperType && wrapper_function_, const char * name_, std::optional diagnostic_) : wrapper_function(std::move(wrapper_function_)), name(name_), diagnostic(std::move(diagnostic_)) {} String getName() const override { return name; } @@ -3170,24 +3165,16 @@ protected: private: WrapperType wrapper_function; const char * name; - std::optional diagnostic; + std::optional diagnostic; }; struct CastName { static constexpr auto name = "CAST"; }; struct CastInternalName { static constexpr auto name = "_CAST"; }; -enum class CastType -{ - nonAccurate, - accurate, - accurateOrNull -}; - class FunctionCastBase : public IFunctionBase { public: using MonotonicityForRange = std::function; - using Diagnostic = ExecutableFunctionCast::Diagnostic; }; template @@ -3201,7 +3188,7 @@ public: , MonotonicityForRange && monotonicity_for_range_ , const DataTypes & argument_types_ , const DataTypePtr & return_type_ - , std::optional diagnostic_ + , std::optional diagnostic_ , CastType cast_type_) : cast_name(cast_name_), monotonicity_for_range(std::move(monotonicity_for_range_)) , argument_types(argument_types_), return_type(return_type_), diagnostic(std::move(diagnostic_)) @@ -3251,7 +3238,7 @@ private: DataTypes argument_types; DataTypePtr return_type; - std::optional diagnostic; + std::optional diagnostic; CastType cast_type; ContextPtr context; @@ -4172,6 +4159,61 @@ arguments, result_type, input_rows_count); \ }; } + template + WrapperType createEnumToStringWrapper() const + { + const char * function_name = cast_name; + return [function_name] ( + ColumnsWithTypeAndName & arguments, const DataTypePtr & res_type, const 
ColumnNullable * nullable_col, size_t /*input_rows_count*/) + { + using ColumnEnumType = EnumType::ColumnType; + + const auto & first_col = arguments.front().column.get(); + const auto & first_type = arguments.front().type.get(); + + const ColumnEnumType * enum_col = typeid_cast(first_col); + const EnumType * enum_type = typeid_cast(first_type); + + if (enum_col && nullable_col && nullable_col->size() != enum_col->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ColumnNullable is not compatible with original"); + + if (enum_col && enum_type) + { + const auto size = enum_col->size(); + const auto & enum_data = enum_col->getData(); + + auto res = res_type->createColumn(); + + if (nullable_col) + { + for (size_t i = 0; i < size; ++i) + { + if (!nullable_col->isNullAt(i)) + { + const auto & value = enum_type->getNameForValue(enum_data[i]); + res->insertData(value.data, value.size); + } + else + res->insertDefault(); + } + } + else + { + for (size_t i = 0; i < size; ++i) + { + const auto & value = enum_type->getNameForValue(enum_data[i]); + res->insertData(value.data, value.size); + } + } + + return res; + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected column {} as first argument of function {}", + first_col->getName(), function_name); + }; + } + static WrapperType createIdentityWrapper(const DataTypePtr &) { return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t /*input_rows_count*/) @@ -4559,7 +4601,12 @@ arguments, result_type, input_rows_count); \ if constexpr (WhichDataType(ToDataType::type_id).isStringOrFixedString()) { - if (from_type->getCustomSerialization()) + if constexpr (WhichDataType(FromDataType::type_id).isEnum()) + { + ret = createEnumToStringWrapper(); + return true; + } + else if (from_type->getCustomSerialization()) { ret = [](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr { diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 00d989f388e..18b411e9839 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -60,7 +60,7 @@ public: if (!isString(arguments[0])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); - if (!isUnsignedInteger(arguments[1])) + if (!isUInt(arguments[1])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[1]->getName(), getName()); const DataTypeArray * array_type = checkAndGetDataType(arguments[2].get()); diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp deleted file mode 100644 index 4afee55704f..00000000000 --- a/src/Functions/FunctionsStringArray.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; -} - -template -std::optional extractMaxSplitsImpl(const ColumnWithTypeAndName & argument) -{ - const auto * col = checkAndGetColumnConst>(argument.column.get()); - if (!col) - return std::nullopt; - - auto value = col->template getValue(); - return static_cast(value); -} - -std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position) -{ - if (max_substrings_argument_position >= arguments.size()) - return std::nullopt; - - std::optional max_splits; - if (!((max_splits 
= extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) - || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) - || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) - || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])))) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {}, which is {}-th argument", - arguments[max_substrings_argument_position].column->getName(), - max_substrings_argument_position + 1); - - if (*max_splits <= 0) - return std::nullopt; - - return max_splits; -} - -DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const -{ - FunctionArgumentDescriptors mandatory_args{ - {"arr", &isArray, nullptr, "Array"}, - }; - - FunctionArgumentDescriptors optional_args{ - {"separator", &isString, isColumnConst, "const String"}, - }; - - validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); - - return std::make_shared(); -} - -REGISTER_FUNCTION(StringArray) -{ - factory.registerFunction(); - - factory.registerFunction(); - factory.registerAlias("splitByAlpha", FunctionSplitByAlpha::name); - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); -} - -} diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h deleted file mode 100644 index d7d7e3b5100..00000000000 --- a/src/Functions/FunctionsStringArray.h +++ /dev/null @@ -1,990 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; - extern const int ILLEGAL_COLUMN; -} - - -/** Functions that split strings into an array of strings or vice versa. - * - * splitByChar(sep, s[, max_substrings]) - * splitByString(sep, s[, max_substrings]) - * splitByRegexp(regexp, s[, max_substrings]) - * - * splitByWhitespace(s[, max_substrings]) - split the string by whitespace characters - * splitByNonAlpha(s[, max_substrings]) - split the string by whitespace and punctuation characters - * - * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp. - * - first subpattern, if regexp has subpattern; - * - zero subpattern (the match part, otherwise); - * - otherwise, an empty array - * - * arrayStringConcat(arr) - * arrayStringConcat(arr, delimiter) - * - join an array of strings into one string via a separator. - * - * alphaTokens(s[, max_substrings]) - select from the string subsequence `[a-zA-Z]+`. - * - * URL functions are located separately. - */ - - -using Pos = const char *; - -std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position); - -/// Substring generators. All of them have a common interface. 
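All of the substring generators below share one small protocol: `set()` binds the byte range of a single input string, and repeated `get()` calls hand out consecutive tokens until they return false. A minimal standalone sketch of that protocol, with an illustrative `SplitByCharGenerator` (not the ClickHouse class, just the same shape):

#include <cstring>
#include <iostream>
#include <string>

// Illustrative stand-in for the generator interface: set() binds one input
// string, get() yields the next token or returns false when exhausted.
struct SplitByCharGenerator
{
    const char * pos = nullptr;
    const char * end = nullptr;
    char separator;

    explicit SplitByCharGenerator(char separator_) : separator(separator_) {}

    void set(const char * begin_, const char * end_) { pos = begin_; end = end_; }

    bool get(const char *& token_begin, const char *& token_end)
    {
        if (!pos)
            return false;
        token_begin = pos;
        const char * found = static_cast<const char *>(memchr(pos, separator, end - pos));
        if (found)
        {
            token_end = found;
            pos = found + 1;   // continue after the separator
        }
        else
        {
            token_end = end;
            pos = nullptr;     // last token emitted
        }
        return true;
    }
};

int main()
{
    std::string s = "a,b,,c";
    SplitByCharGenerator gen(',');
    gen.set(s.data(), s.data() + s.size());
    const char * b;
    const char * e;
    while (gen.get(b, e))
        std::cout << '[' << std::string(b, e) << "]\n";   // [a] [b] [] [c]
}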
- -class SplitByAlphaImpl -{ -private: - Pos pos; - Pos end; - std::optional max_splits; - size_t splits; - bool max_substrings_includes_remaining_string; - -public: - static constexpr auto name = "alphaTokens"; - static String getName() { return name; } - - static bool isVariadic() { return true; } - - static size_t getNumberOfArguments() { return 0; } - - static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) - { - FunctionArgumentDescriptors mandatory_args{ - {"s", &isString, nullptr, "String"}, - }; - - FunctionArgumentDescriptors optional_args{ - {"max_substrings", &isNativeInteger, isColumnConst, "const Number"}, - }; - - validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args); - } - - static constexpr auto strings_argument_position = 0uz; - - void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) - { - max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; - max_splits = extractMaxSplits(arguments, 1); - } - - /// Called for each next string. - void set(Pos pos_, Pos end_) - { - pos = pos_; - end = end_; - splits = 0; - } - - /// Get the next token, if any, or return false. - bool get(Pos & token_begin, Pos & token_end) - { - /// Skip garbage - while (pos < end && !isAlphaASCII(*pos)) - ++pos; - - if (pos == end) - return false; - - token_begin = pos; - - if (max_splits) - { - if (max_substrings_includes_remaining_string) - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - } - else - if (splits == *max_splits) - return false; - } - - while (pos < end && isAlphaASCII(*pos)) - ++pos; - - token_end = pos; - ++splits; - - return true; - } -}; - -class SplitByNonAlphaImpl -{ -private: - Pos pos; - Pos end; - std::optional max_splits; - size_t splits; - bool max_substrings_includes_remaining_string; - -public: - /// Get the name of the function. - static constexpr auto name = "splitByNonAlpha"; - static String getName() { return name; } - - static bool isVariadic() { return true; } - static size_t getNumberOfArguments() { return 0; } - - static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) - { - SplitByAlphaImpl::checkArguments(func, arguments); - } - - static constexpr auto strings_argument_position = 0uz; - - void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) - { - max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; - max_splits = extractMaxSplits(arguments, 1); - } - - /// Called for each next string. - void set(Pos pos_, Pos end_) - { - pos = pos_; - end = end_; - splits = 0; - } - - /// Get the next token, if any, or return false. 
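The `max_splits` bookkeeping in these generators implements the `max_substrings` argument, and `max_substrings_includes_remaining_string` decides what happens once the limit is reached: either the last emitted token swallows the rest of the string, or splitting simply stops. A rough standalone illustration of the two semantics (simplified, plain `std::string` code instead of the column machinery):

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Illustrative only: the flag selects between "last substring takes the remainder"
// and "stop splitting once the limit is reached".
std::vector<std::string> splitByChar(const std::string & s, char sep,
                                     std::optional<size_t> max_substrings,
                                     bool includes_remaining_string)
{
    std::vector<std::string> result;
    size_t pos = 0;
    while (true)
    {
        if (max_substrings && includes_remaining_string && result.size() + 1 == *max_substrings)
        {
            result.push_back(s.substr(pos));   // last slot takes the whole remainder
            return result;
        }
        if (max_substrings && !includes_remaining_string && result.size() == *max_substrings)
            return result;                     // simply stop splitting

        size_t next = s.find(sep, pos);
        if (next == std::string::npos)
        {
            result.push_back(s.substr(pos));
            return result;
        }
        result.push_back(s.substr(pos, next - pos));
        pos = next + 1;
    }
}

int main()
{
    for (const auto & t : splitByChar("a,b,c,d", ',', 2, true))
        std::cout << '[' << t << ']';          // [a][b,c,d]
    std::cout << '\n';
    for (const auto & t : splitByChar("a,b,c,d", ',', 2, false))
        std::cout << '[' << t << ']';          // [a][b]
    std::cout << '\n';
}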
- bool get(Pos & token_begin, Pos & token_end) - { - /// Skip garbage - while (pos < end && (isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) - ++pos; - - if (pos == end) - return false; - - token_begin = pos; - - if (max_splits) - { - if (max_substrings_includes_remaining_string) - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - } - else - if (splits == *max_splits) - return false; - } - - while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) - ++pos; - - token_end = pos; - splits++; - - return true; - } -}; - -class SplitByWhitespaceImpl -{ -private: - Pos pos; - Pos end; - std::optional max_splits; - size_t splits; - bool max_substrings_includes_remaining_string; - -public: - static constexpr auto name = "splitByWhitespace"; - static String getName() { return name; } - - static bool isVariadic() { return true; } - static size_t getNumberOfArguments() { return 0; } - - static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) - { - return SplitByNonAlphaImpl::checkArguments(func, arguments); - } - - static constexpr auto strings_argument_position = 0uz; - - void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) - { - max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; - max_splits = extractMaxSplits(arguments, 1); - } - - /// Called for each next string. - void set(Pos pos_, Pos end_) - { - pos = pos_; - end = end_; - splits = 0; - } - - /// Get the next token, if any, or return false. - bool get(Pos & token_begin, Pos & token_end) - { - /// Skip garbage - while (pos < end && isWhitespaceASCII(*pos)) - ++pos; - - if (pos == end) - return false; - - token_begin = pos; - - if (max_splits) - { - if (max_substrings_includes_remaining_string) - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - } - else - if (splits == *max_splits) - return false; - } - - while (pos < end && !isWhitespaceASCII(*pos)) - ++pos; - - token_end = pos; - splits++; - - return true; - } -}; - -class SplitByCharImpl -{ -private: - Pos pos; - Pos end; - char separator; - std::optional max_splits; - size_t splits; - bool max_substrings_includes_remaining_string; - -public: - static constexpr auto name = "splitByChar"; - static String getName() { return name; } - static bool isVariadic() { return true; } - static size_t getNumberOfArguments() { return 0; } - - static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) - { - FunctionArgumentDescriptors mandatory_args{ - {"separator", &isString, isColumnConst, "const String"}, - {"s", &isString, nullptr, "String"} - }; - - FunctionArgumentDescriptors optional_args{ - {"max_substrings", &isNativeInteger, isColumnConst, "const Number"}, - }; - - validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args); - } - - static constexpr auto strings_argument_position = 1uz; - - void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) - { - const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); - - if (!col) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " - "Must be constant string.", arguments[0].column->getName(), getName()); - - String sep_str = col->getValue(); - - if (sep_str.size() != 1) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. 
Must be exactly one byte.", getName()); - - separator = sep_str[0]; - - max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; - max_splits = extractMaxSplits(arguments, 2); - } - - void set(Pos pos_, Pos end_) - { - pos = pos_; - end = end_; - splits = 0; - } - - bool get(Pos & token_begin, Pos & token_end) - { - if (!pos) - return false; - - token_begin = pos; - - if (max_splits) - { - if (max_substrings_includes_remaining_string) - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = nullptr; - return true; - } - } - else - if (splits == *max_splits) - return false; - } - - pos = reinterpret_cast(memchr(pos, separator, end - pos)); - if (pos) - { - token_end = pos; - ++pos; - ++splits; - } - else - token_end = end; - - return true; - } -}; - - -class SplitByStringImpl -{ -private: - Pos pos; - Pos end; - String separator; - std::optional max_splits; - size_t splits; - bool max_substrings_includes_remaining_string; - -public: - static constexpr auto name = "splitByString"; - static String getName() { return name; } - static bool isVariadic() { return true; } - static size_t getNumberOfArguments() { return 0; } - - static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) - { - SplitByCharImpl::checkArguments(func, arguments); - } - - static constexpr auto strings_argument_position = 1uz; - - void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) - { - const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); - - if (!col) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " - "Must be constant string.", arguments[0].column->getName(), getName()); - - separator = col->getValue(); - - max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; - max_splits = extractMaxSplits(arguments, 2); - } - - /// Called for each next string. - void set(Pos pos_, Pos end_) - { - pos = pos_; - end = end_; - splits = 0; - } - - /// Get the next token, if any, or return false. 
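`SplitByStringImpl` has two branches: with an empty separator every byte becomes its own token, otherwise the separator is located with `memmem` and consumed between tokens. A simplified sketch of those semantics (using `std::string::find` instead of `memmem`, and ignoring `max_substrings`):

#include <iostream>
#include <string>
#include <vector>

// Illustrative sketch of the two splitByString branches: an empty separator
// yields one-byte tokens, a non-empty one splits on each occurrence of it.
std::vector<std::string> splitByString(const std::string & sep, const std::string & s)
{
    std::vector<std::string> result;
    if (sep.empty())
    {
        for (char c : s)
            result.emplace_back(1, c);
        return result;
    }
    size_t pos = 0;
    while (true)
    {
        size_t next = s.find(sep, pos);      // stands in for memmem() over raw bytes
        if (next == std::string::npos)
        {
            result.push_back(s.substr(pos));
            return result;
        }
        result.push_back(s.substr(pos, next - pos));
        pos = next + sep.size();
    }
}

int main()
{
    for (const auto & t : splitByString("", "abc"))
        std::cout << '[' << t << ']';        // [a][b][c]
    std::cout << '\n';
    for (const auto & t : splitByString("->", "1->2->3"))
        std::cout << '[' << t << ']';        // [1][2][3]
    std::cout << '\n';
}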
- bool get(Pos & token_begin, Pos & token_end) - { - if (separator.empty()) - { - if (pos == end) - return false; - - token_begin = pos; - - if (max_splits) - { - if (max_substrings_includes_remaining_string) - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - } - else - if (splits == *max_splits) - return false; - } - - pos += 1; - token_end = pos; - ++splits; - } - else - { - if (!pos) - return false; - - token_begin = pos; - - if (max_splits) - { - if (max_substrings_includes_remaining_string) - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = nullptr; - return true; - } - } - else - if (splits == *max_splits) - return false; - } - - pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size())); - if (pos) - { - token_end = pos; - pos += separator.size(); - ++splits; - } - else - token_end = end; - } - - return true; - } -}; - -class SplitByRegexpImpl -{ -private: - Regexps::RegexpPtr re; - OptimizedRegularExpression::MatchVec matches; - - Pos pos; - Pos end; - - std::optional max_splits; - size_t splits; - bool max_substrings_includes_remaining_string; - -public: - static constexpr auto name = "splitByRegexp"; - static String getName() { return name; } - - static bool isVariadic() { return true; } - static size_t getNumberOfArguments() { return 0; } - - static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) - { - SplitByStringImpl::checkArguments(func, arguments); - } - - static constexpr auto strings_argument_position = 1uz; - - void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) - { - const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); - - if (!col) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " - "Must be constant string.", arguments[0].column->getName(), getName()); - - if (!col->getValue().empty()) - re = std::make_shared(Regexps::createRegexp(col->getValue())); - - max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; - max_splits = extractMaxSplits(arguments, 2); - } - - /// Called for each next string. - void set(Pos pos_, Pos end_) - { - pos = pos_; - end = end_; - splits = 0; - } - - /// Get the next token, if any, or return false. 
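`SplitByRegexpImpl` returns the fragments between successive matches of the pattern, falling back to per-byte splitting for an empty pattern. The sketch below only illustrates the intended result shape; it uses `std::regex` purely for demonstration, whereas the real code goes through `OptimizedRegularExpression`:

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Illustration of the splitByRegexp result: tokens are the fragments between
// successive matches of the pattern.
std::vector<std::string> splitByRegexp(const std::string & pattern, const std::string & s)
{
    std::vector<std::string> result;
    if (pattern.empty())
    {
        for (char c : s)
            result.emplace_back(1, c);      // empty pattern: per-byte split
        return result;
    }
    std::regex re(pattern);
    std::sregex_token_iterator it(s.begin(), s.end(), re, -1);  // -1 = the parts between matches
    for (std::sregex_token_iterator end; it != end; ++it)
        result.push_back(*it);
    return result;
}

int main()
{
    for (const auto & t : splitByRegexp("[0-9]+", "a12bc23de345f"))
        std::cout << '[' << t << ']';   // [a][bc][de][f]
    std::cout << '\n';
}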
- bool get(Pos & token_begin, Pos & token_end) - { - if (!re) - { - if (pos == end) - return false; - - token_begin = pos; - - if (max_splits) - { - if (max_substrings_includes_remaining_string) - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - } - else - if (splits == *max_splits) - return false; - } - - pos += 1; - token_end = pos; - ++splits; - } - else - { - if (!pos || pos > end) - return false; - - token_begin = pos; - - if (max_splits) - { - if (max_substrings_includes_remaining_string) - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = nullptr; - return true; - } - } - else - if (splits == *max_splits) - return false; - } - - if (!re->match(pos, end - pos, matches) || !matches[0].length) - { - token_end = end; - pos = end + 1; - } - else - { - token_end = pos + matches[0].offset; - pos = token_end + matches[0].length; - ++splits; - } - } - - return true; - } -}; - -class ExtractAllImpl -{ -private: - Regexps::RegexpPtr re; - OptimizedRegularExpression::MatchVec matches; - size_t capture; - - Pos pos; - Pos end; -public: - static constexpr auto name = "extractAll"; - static String getName() { return name; } - static bool isVariadic() { return false; } - static size_t getNumberOfArguments() { return 2; } - - static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) - { - FunctionArgumentDescriptors mandatory_args{ - {"haystack", &isString, nullptr, "String"}, - {"pattern", &isString, isColumnConst, "const String"} - }; - - validateFunctionArgumentTypes(func, arguments, mandatory_args); - } - - static constexpr auto strings_argument_position = 0uz; - - void init(const ColumnsWithTypeAndName & arguments, bool /*max_substrings_includes_remaining_string*/) - { - const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - - if (!col) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " - "Must be constant string.", arguments[1].column->getName(), getName()); - - re = std::make_shared(Regexps::createRegexp(col->getValue())); - capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0; - - matches.resize(capture + 1); - } - - /// Called for each next string. - void set(Pos pos_, Pos end_) - { - pos = pos_; - end = end_; - } - - /// Get the next token, if any, or return false. - bool get(Pos & token_begin, Pos & token_end) - { - if (!pos || pos > end) - return false; - - if (!re->match(pos, end - pos, matches) || !matches[0].length) - return false; - - if (matches[capture].offset == std::string::npos) - { - /// Empty match. - token_begin = pos; - token_end = pos; - } - else - { - token_begin = pos + matches[capture].offset; - token_end = token_begin + matches[capture].length; - } - - pos += matches[0].offset + matches[0].length; - - return true; - } -}; - -/// A function that takes a string, and returns an array of substrings created by some generator. 
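`FunctionTokens` is the generic driver behind all of these: it instantiates one generator, then loops over every row, calling `set()` once per string and draining `get()` into the result arrays. A condensed standalone version of that driver, with a hypothetical whitespace generator standing in for the real implementations:

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Any type with set()/get() works as a generator; this one splits on whitespace.
struct WhitespaceGenerator
{
    const char * pos = nullptr;
    const char * end = nullptr;

    void set(const char * begin_, const char * end_) { pos = begin_; end = end_; }

    bool get(const char *& token_begin, const char *& token_end)
    {
        while (pos < end && std::isspace(static_cast<unsigned char>(*pos)))
            ++pos;
        if (pos == end)
            return false;
        token_begin = pos;
        while (pos < end && !std::isspace(static_cast<unsigned char>(*pos)))
            ++pos;
        token_end = pos;
        return true;
    }
};

// Generic driver: run the generator over every row, collect tokens per row.
template <typename Generator>
std::vector<std::vector<std::string>> tokenizeRows(const std::vector<std::string> & rows)
{
    std::vector<std::vector<std::string>> result(rows.size());
    Generator generator;
    for (size_t i = 0; i < rows.size(); ++i)
    {
        generator.set(rows[i].data(), rows[i].data() + rows[i].size());
        const char * b;
        const char * e;
        while (generator.get(b, e))
            result[i].emplace_back(b, e);
    }
    return result;
}

int main()
{
    for (const auto & row : tokenizeRows<WhitespaceGenerator>({"hello  world", " split by whitespace "}))
    {
        for (const auto & token : row)
            std::cout << '[' << token << ']';
        std::cout << '\n';
    }
}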
-template -class FunctionTokens : public IFunction -{ -private: - bool max_substrings_includes_remaining_string; - -public: - static constexpr auto name = Generator::name; - static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } - - explicit FunctionTokens(ContextPtr context) - { - const Settings & settings = context->getSettingsRef(); - max_substrings_includes_remaining_string = settings.splitby_max_substrings_includes_remaining_string; - } - - String getName() const override { return name; } - - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - - bool isVariadic() const override { return Generator::isVariadic(); } - - size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - Generator::checkArguments(*this, arguments); - - return std::make_shared(std::make_shared()); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override - { - Generator generator; - generator.init(arguments, max_substrings_includes_remaining_string); - - const auto & array_argument = arguments[generator.strings_argument_position]; - - const ColumnString * col_str = checkAndGetColumn(array_argument.column.get()); - const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get()); - - auto col_res = ColumnArray::create(ColumnString::create()); - - ColumnString & res_strings = typeid_cast(col_res->getData()); - ColumnString::Chars & res_strings_chars = res_strings.getChars(); - ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets(); - - ColumnArray::Offsets & res_offsets = col_res->getOffsets(); - - if (col_str) - { - const ColumnString::Chars & src_chars = col_str->getChars(); - const ColumnString::Offsets & src_offsets = col_str->getOffsets(); - - res_offsets.reserve(src_offsets.size()); - res_strings_offsets.reserve(src_offsets.size() * 5); /// Constant 5 - at random. 
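The result is assembled in the flat `ColumnArray(ColumnString)` layout: one chars buffer where every token is followed by a terminating zero byte, string offsets that point one past each terminator, and array offsets counting the tokens accumulated per row. A small self-contained model of that bookkeeping:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Model of the flat result layout: chars + per-string offsets + per-row offsets.
int main()
{
    std::vector<std::vector<std::string>> rows = {{"ab", "c"}, {}, {"def"}};

    std::vector<uint8_t> chars;            // concatenated token bytes, '\0' after each token
    std::vector<uint64_t> string_offsets;  // end position of every token in chars
    std::vector<uint64_t> array_offsets;   // running token count after every row

    uint64_t total_tokens = 0;
    for (const auto & row : rows)
    {
        for (const auto & token : row)
        {
            chars.insert(chars.end(), token.begin(), token.end());
            chars.push_back(0);
            string_offsets.push_back(chars.size());
            ++total_tokens;
        }
        array_offsets.push_back(total_tokens);
    }

    std::cout << "chars size: " << chars.size() << '\n';           // 3 + 2 + 4 = 9
    std::cout << "string offsets:";
    for (auto o : string_offsets) std::cout << ' ' << o;           // 3 5 9
    std::cout << "\narray offsets:";
    for (auto o : array_offsets) std::cout << ' ' << o;            // 2 2 3
    std::cout << '\n';
}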
- res_strings_chars.reserve(src_chars.size()); - - Pos token_begin = nullptr; - Pos token_end = nullptr; - - size_t size = src_offsets.size(); - ColumnString::Offset current_src_offset = 0; - ColumnArray::Offset current_dst_offset = 0; - ColumnString::Offset current_dst_strings_offset = 0; - for (size_t i = 0; i < size; ++i) - { - Pos pos = reinterpret_cast(&src_chars[current_src_offset]); - current_src_offset = src_offsets[i]; - Pos end = reinterpret_cast(&src_chars[current_src_offset]) - 1; - - generator.set(pos, end); - size_t j = 0; - while (generator.get(token_begin, token_end)) - { - size_t token_size = token_end - token_begin; - - res_strings_chars.resize(res_strings_chars.size() + token_size + 1); - memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size); - res_strings_chars[current_dst_strings_offset + token_size] = 0; - - current_dst_strings_offset += token_size + 1; - res_strings_offsets.push_back(current_dst_strings_offset); - ++j; - } - - current_dst_offset += j; - res_offsets.push_back(current_dst_offset); - } - - return col_res; - } - else if (col_str_const) - { - String src = col_str_const->getValue(); - Array dst; - - generator.set(src.data(), src.data() + src.size()); - Pos token_begin = nullptr; - Pos token_end = nullptr; - - while (generator.get(token_begin, token_end)) - dst.push_back(String(token_begin, token_end - token_begin)); - - return result_type->createColumnConst(col_str_const->size(), dst); - } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}", - array_argument.column->getName(), array_argument.column->getName(), getName()); - } -}; - - -/// Joins an array of type serializable to string into one string via a separator. -class FunctionArrayStringConcat : public IFunction -{ -private: - static void executeInternal( - const ColumnString::Chars & src_chars, - const ColumnString::Offsets & src_string_offsets, - const ColumnArray::Offsets & src_array_offsets, - const char * delimiter, - const size_t delimiter_size, - ColumnString::Chars & dst_chars, - ColumnString::Offsets & dst_string_offsets, - const char8_t * null_map) - { - size_t size = src_array_offsets.size(); - - if (!size) - return; - - /// With a small margin - as if the separator goes after the last string of the array. - dst_chars.resize( - src_chars.size() - + delimiter_size * src_string_offsets.size() /// Separators after each string... - + src_array_offsets.size() /// Zero byte after each joined string - - src_string_offsets.size()); /// The former zero byte after each string of the array - - /// There will be as many strings as there were arrays. - dst_string_offsets.resize(src_array_offsets.size()); - - ColumnArray::Offset current_src_array_offset = 0; - - ColumnString::Offset current_dst_string_offset = 0; - - /// Loop through the array of strings. - for (size_t i = 0; i < size; ++i) - { - bool first_non_null = true; - /// Loop through the rows within the array. /// NOTE You can do everything in one copy, if the separator has a size of 1. 
- for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset) - { - if (null_map && null_map[current_src_array_offset]) [[unlikely]] - continue; - - if (!first_non_null) - { - memcpy(&dst_chars[current_dst_string_offset], delimiter, delimiter_size); - current_dst_string_offset += delimiter_size; - } - first_non_null = false; - - const auto current_src_string_offset = current_src_array_offset ? src_string_offsets[current_src_array_offset - 1] : 0; - size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - current_src_string_offset - 1; - - memcpySmallAllowReadWriteOverflow15( - &dst_chars[current_dst_string_offset], &src_chars[current_src_string_offset], bytes_to_copy); - - current_dst_string_offset += bytes_to_copy; - } - - dst_chars[current_dst_string_offset] = 0; - ++current_dst_string_offset; - - dst_string_offsets[i] = current_dst_string_offset; - } - - dst_chars.resize(dst_string_offsets.back()); - } - - static void executeInternal( - const ColumnString & col_string, - const ColumnArray & col_arr, - const String & delimiter, - ColumnString & col_res, - const char8_t * null_map = nullptr) - { - executeInternal( - col_string.getChars(), - col_string.getOffsets(), - col_arr.getOffsets(), - delimiter.data(), - delimiter.size(), - col_res.getChars(), - col_res.getOffsets(), - null_map); - } - - static ColumnPtr serializeNestedColumn(const ColumnArray & col_arr, const DataTypePtr & nested_type) - { - if (isString(nested_type)) - { - return col_arr.getDataPtr(); - } - else if (const ColumnNullable * col_nullable = checkAndGetColumn(col_arr.getData()); - col_nullable && isString(col_nullable->getNestedColumn().getDataType())) - { - return col_nullable->getNestedColumnPtr(); - } - else - { - ColumnsWithTypeAndName cols; - cols.emplace_back(col_arr.getDataPtr(), nested_type, "tmp"); - return ConvertImplGenericToString::execute(cols, std::make_shared(), col_arr.size()); - } - } - -public: - static constexpr auto name = "arrayStringConcat"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } - - String getName() const override - { - return name; - } - - bool isVariadic() const override { return true; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - size_t getNumberOfArguments() const override { return 0; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override; - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override - { - String delimiter; - if (arguments.size() == 2) - { - const ColumnConst * col_delim = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - if (!col_delim) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant string.", getName()); - - delimiter = col_delim->getValue(); - } - - const auto & nested_type = assert_cast(*arguments[0].type).getNestedType(); - if (const ColumnConst * col_const_arr = checkAndGetColumnConst(arguments[0].column.get()); - col_const_arr && isString(nested_type)) - { - Array src_arr = col_const_arr->getValue(); - String dst_str; - bool first_non_null = true; - for (size_t i = 0, size = src_arr.size(); i < size; ++i) - { - if (src_arr[i].isNull()) - continue; - if (!first_non_null) - dst_str += delimiter; - first_non_null = false; - dst_str += src_arr[i].get(); - } - - return 
result_type->createColumnConst(col_const_arr->size(), dst_str); - } - - ColumnPtr src_column = arguments[0].column->convertToFullColumnIfConst(); - const ColumnArray & col_arr = assert_cast(*src_column.get()); - - ColumnPtr str_subcolumn = serializeNestedColumn(col_arr, nested_type); - const ColumnString & col_string = assert_cast(*str_subcolumn.get()); - - auto col_res = ColumnString::create(); - if (const ColumnNullable * col_nullable = checkAndGetColumn(col_arr.getData())) - executeInternal(col_string, col_arr, delimiter, *col_res, col_nullable->getNullMapData().data()); - else - executeInternal(col_string, col_arr, delimiter, *col_res); - return col_res; - } -}; - - -using FunctionSplitByAlpha = FunctionTokens; -using FunctionSplitByNonAlpha = FunctionTokens; -using FunctionSplitByWhitespace = FunctionTokens; -using FunctionSplitByChar = FunctionTokens; -using FunctionSplitByString = FunctionTokens; -using FunctionSplitByRegexp = FunctionTokens; -using FunctionExtractAll = FunctionTokens; - -} diff --git a/src/Functions/FunctionsStringDistance.cpp b/src/Functions/FunctionsStringDistance.cpp index 98a04170dd4..3098d02630a 100644 --- a/src/Functions/FunctionsStringDistance.cpp +++ b/src/Functions/FunctionsStringDistance.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef __SSE4_2__ # include @@ -14,6 +15,7 @@ namespace DB { namespace ErrorCodes { +extern const int BAD_ARGUMENTS; extern const int TOO_LARGE_STRING_SIZE; } @@ -59,8 +61,8 @@ struct FunctionStringDistanceImpl size_t size = res.size(); for (size_t i = 0; i < size; ++i) { - res[i] - = Op::process(haystack_data, haystack_size, needle + needle_offsets[i - 1], needle_offsets[i] - needle_offsets[i - 1] - 1); + res[i] = Op::process(haystack_data, haystack_size, + needle + needle_offsets[i - 1], needle_offsets[i] - needle_offsets[i - 1] - 1); } } @@ -108,6 +110,117 @@ struct ByteHammingDistanceImpl } }; +template +struct ByteJaccardIndexImpl +{ + using ResultType = Float64; + static ResultType inline process( + const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) + { + if (haystack_size == 0 || needle_size == 0) + return 0; + + const char * haystack_end = haystack + haystack_size; + const char * needle_end = needle + needle_size; + + /// For byte strings use plain array as a set + constexpr size_t max_size = std::numeric_limits::max() + 1; + std::array haystack_set; + std::array needle_set; + + /// For UTF-8 strings we also use sets of code points greater than max_size + std::set haystack_utf8_set; + std::set needle_utf8_set; + + haystack_set.fill(0); + needle_set.fill(0); + + while (haystack < haystack_end) + { + size_t len = 1; + if constexpr (is_utf8) + len = UTF8::seqLength(*haystack); + + if (len == 1) + { + haystack_set[static_cast(*haystack)] = 1; + ++haystack; + } + else + { + auto code_point = UTF8::convertUTF8ToCodePoint(haystack, haystack_end - haystack); + if (code_point.has_value()) + { + haystack_utf8_set.insert(code_point.value()); + haystack += len; + } + else + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(haystack, haystack_end - haystack)); + } + } + } + + while (needle < needle_end) + { + + size_t len = 1; + if constexpr (is_utf8) + len = UTF8::seqLength(*needle); + + if (len == 1) + { + needle_set[static_cast(*needle)] = 1; + ++needle; + } + else + { + auto code_point = UTF8::convertUTF8ToCodePoint(needle, needle_end - needle); + if (code_point.has_value()) + { + 
needle_utf8_set.insert(code_point.value()); + needle += len; + } + else + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(needle, needle_end - needle)); + } + } + } + + UInt8 intersection = 0; + UInt8 union_size = 0; + + if constexpr (is_utf8) + { + auto lit = haystack_utf8_set.begin(); + auto rit = needle_utf8_set.begin(); + while (lit != haystack_utf8_set.end() && rit != needle_utf8_set.end()) + { + if (*lit == *rit) + { + ++intersection; + ++lit; + ++rit; + } + else if (*lit < *rit) + ++lit; + else + ++rit; + } + union_size = haystack_utf8_set.size() + needle_utf8_set.size() - intersection; + } + + for (size_t i = 0; i < max_size; ++i) + { + intersection += haystack_set[i] & needle_set[i]; + union_size += haystack_set[i] | needle_set[i]; + } + + return static_cast(intersection) / static_cast(union_size); + } +}; + struct ByteEditDistanceImpl { using ResultType = UInt64; @@ -123,9 +236,8 @@ struct ByteEditDistanceImpl if (haystack_size > max_string_size || needle_size > max_string_size) throw Exception( ErrorCodes::TOO_LARGE_STRING_SIZE, - "The string size is too big for function byteEditDistance. " - "Should be at most {}", - max_string_size); + "The string size is too big for function editDistance, " + "should be at most {}", max_string_size); PaddedPODArray distances0(haystack_size + 1, 0); PaddedPODArray distances1(haystack_size + 1, 0); @@ -163,15 +275,25 @@ struct NameByteHammingDistance { static constexpr auto name = "byteHammingDistance"; }; +using FunctionByteHammingDistance = FunctionsStringSimilarity, NameByteHammingDistance>; struct NameEditDistance { static constexpr auto name = "editDistance"; }; +using FunctionEditDistance = FunctionsStringSimilarity, NameEditDistance>; -using FunctionByteHammingDistance = FunctionsStringSimilarity, NameByteHammingDistance>; +struct NameJaccardIndex +{ + static constexpr auto name = "stringJaccardIndex"; +}; +using FunctionStringJaccardIndex = FunctionsStringSimilarity>, NameJaccardIndex>; -using FunctionByteEditDistance = FunctionsStringSimilarity, NameEditDistance>; +struct NameJaccardIndexUTF8 +{ + static constexpr auto name = "stringJaccardIndexUTF8"; +}; +using FunctionStringJaccardIndexUTF8 = FunctionsStringSimilarity>, NameJaccardIndexUTF8>; REGISTER_FUNCTION(StringDistance) { @@ -179,9 +301,13 @@ REGISTER_FUNCTION(StringDistance) FunctionDocumentation{.description = R"(Calculates Hamming distance between two byte-strings.)"}); factory.registerAlias("mismatches", NameByteHammingDistance::name); - factory.registerFunction( + factory.registerFunction( FunctionDocumentation{.description = R"(Calculates the edit distance between two byte-strings.)"}); - factory.registerAlias("levenshteinDistance", NameEditDistance::name); + + factory.registerFunction( + FunctionDocumentation{.description = R"(Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings.)"}); + factory.registerFunction( + FunctionDocumentation{.description = R"(Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two UTF8 strings.)"}); } } diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index b6ebc4b9410..d951e77395e 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -64,7 +64,7 @@ public: if (arguments.size() > 1) { - if (!isUnsignedInteger(arguments[1].type)) + if (!isUInt(arguments[1].type)) throw 
Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument (shingle size) of function {} must be unsigned integer, got {}", getName(), arguments[1].type->getName()); @@ -85,7 +85,7 @@ public: "Function {} expect no more than two arguments (text, shingle size), got {}", getName(), arguments.size()); - if (!isUnsignedInteger(arguments[2].type)) + if (!isUInt(arguments[2].type)) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Third argument (num hashes) of function {} must be unsigned integer, got {}", getName(), arguments[2].type->getName()); diff --git a/src/Functions/FunctionsStringSearch.h b/src/Functions/FunctionsStringSearch.h index c9de29697bf..41b476ccc56 100644 --- a/src/Functions/FunctionsStringSearch.h +++ b/src/Functions/FunctionsStringSearch.h @@ -119,7 +119,7 @@ public: if (arguments.size() >= 3) { - if (!isUnsignedInteger(arguments[2])) + if (!isUInt(arguments[2])) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", diff --git a/src/Functions/URL/URLHierarchy.cpp b/src/Functions/URL/URLHierarchy.cpp index 96b64d3182b..25c6c9ef40b 100644 --- a/src/Functions/URL/URLHierarchy.cpp +++ b/src/Functions/URL/URLHierarchy.cpp @@ -1,9 +1,15 @@ #include -#include +#include + namespace DB { +namespace +{ + +using Pos = const char *; + class URLPathHierarchyImpl { private: @@ -14,7 +20,6 @@ private: public: static constexpr auto name = "URLPathHierarchy"; - static String getName() { return name; } static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 1; } @@ -95,9 +100,10 @@ public: }; -struct NameURLPathHierarchy { static constexpr auto name = "URLPathHierarchy"; }; using FunctionURLPathHierarchy = FunctionTokens; +} + REGISTER_FUNCTION(URLPathHierarchy) { factory.registerFunction(); diff --git a/src/Functions/URL/URLPathHierarchy.cpp b/src/Functions/URL/URLPathHierarchy.cpp index 7fd6601d780..9a60d4cf989 100644 --- a/src/Functions/URL/URLPathHierarchy.cpp +++ b/src/Functions/URL/URLPathHierarchy.cpp @@ -1,9 +1,14 @@ #include -#include +#include namespace DB { +namespace +{ + +using Pos = const char *; + class URLHierarchyImpl { private: @@ -13,7 +18,6 @@ private: public: static constexpr auto name = "URLHierarchy"; - static String getName() { return name; } static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 1; } @@ -97,9 +101,10 @@ public: }; -struct NameURLHierarchy { static constexpr auto name = "URLHierarchy"; }; using FunctionURLHierarchy = FunctionTokens; +} + REGISTER_FUNCTION(URLHierarchy) { factory.registerFunction(); diff --git a/src/Functions/URL/extractURLParameterNames.cpp b/src/Functions/URL/extractURLParameterNames.cpp index b792d9140d6..08da148b43e 100644 --- a/src/Functions/URL/extractURLParameterNames.cpp +++ b/src/Functions/URL/extractURLParameterNames.cpp @@ -1,9 +1,14 @@ #include -#include +#include namespace DB { +namespace +{ + +using Pos = const char *; + class ExtractURLParameterNamesImpl { private: @@ -13,7 +18,6 @@ private: public: static constexpr auto name = "extractURLParameterNames"; - static String getName() { return name; } static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 1; } @@ -80,9 +84,10 @@ public: } }; -struct NameExtractURLParameterNames { static constexpr auto name = "extractURLParameterNames"; }; using FunctionExtractURLParameterNames = FunctionTokens; +} + REGISTER_FUNCTION(ExtractURLParameterNames) { factory.registerFunction(); diff --git 
a/src/Functions/URL/extractURLParameters.cpp b/src/Functions/URL/extractURLParameters.cpp index e1243d8fbcd..939622dd9d1 100644 --- a/src/Functions/URL/extractURLParameters.cpp +++ b/src/Functions/URL/extractURLParameters.cpp @@ -1,9 +1,15 @@ #include -#include +#include + namespace DB { +namespace +{ + +using Pos = const char *; + class ExtractURLParametersImpl { private: @@ -13,7 +19,6 @@ private: public: static constexpr auto name = "extractURLParameters"; - static String getName() { return name; } static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 1; } @@ -88,9 +93,10 @@ public: } }; -struct NameExtractURLParameters { static constexpr auto name = "extractURLParameters"; }; using FunctionExtractURLParameters = FunctionTokens; +} + REGISTER_FUNCTION(ExtractURLParameters) { factory.registerFunction(); diff --git a/src/Functions/alphaTokens.cpp b/src/Functions/alphaTokens.cpp new file mode 100644 index 00000000000..35cacdbdbb8 --- /dev/null +++ b/src/Functions/alphaTokens.cpp @@ -0,0 +1,104 @@ + +#include +#include +#include + + +namespace DB +{ + +/** Functions that split strings into an array of strings or vice versa. + * + * alphaTokens(s[, max_substrings]) - select from the string subsequence `[a-zA-Z]+`. + */ +namespace +{ + +using Pos = const char *; + +class SplitByAlphaImpl +{ +private: + Pos pos; + Pos end; + std::optional max_splits; + size_t splits; + bool max_substrings_includes_remaining_string; + +public: + static constexpr auto name = "alphaTokens"; + + static bool isVariadic() { return true; } + + static size_t getNumberOfArguments() { return 0; } + + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) + { + checkArgumentsWithOptionalMaxSubstrings(func, arguments); + } + + static constexpr auto strings_argument_position = 0uz; + + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) + { + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 1); + } + + /// Called for each next string. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + splits = 0; + } + + /// Get the next token, if any, or return false. 
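The `get()` scan that follows is a two-phase loop: skip everything that is not a letter, then consume one `[a-zA-Z]+` run per token. A compact standalone equivalent (using `std::isalpha`, which is locale-dependent, unlike the ASCII-only `isAlphaASCII` used in the patch):

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Sketch of the alphaTokens scan: skip non-letters, then consume one letter run per token.
std::vector<std::string> alphaTokens(const std::string & s)
{
    std::vector<std::string> result;
    size_t pos = 0;
    while (pos < s.size())
    {
        while (pos < s.size() && !std::isalpha(static_cast<unsigned char>(s[pos])))
            ++pos;                                   // skip garbage between tokens
        if (pos == s.size())
            break;
        size_t begin = pos;
        while (pos < s.size() && std::isalpha(static_cast<unsigned char>(s[pos])))
            ++pos;                                   // consume the letter run
        result.push_back(s.substr(begin, pos - begin));
    }
    return result;
}

int main()
{
    for (const auto & t : alphaTokens("abc1def2ghi"))
        std::cout << '[' << t << ']';                // [abc][def][ghi]
    std::cout << '\n';
}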
+ bool get(Pos & token_begin, Pos & token_end) + { + /// Skip garbage + while (pos < end && !isAlphaASCII(*pos)) + ++pos; + + if (pos == end) + return false; + + token_begin = pos; + + if (max_splits) + { + if (max_substrings_includes_remaining_string) + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + } + else + if (splits == *max_splits) + return false; + } + + while (pos < end && isAlphaASCII(*pos)) + ++pos; + + token_end = pos; + ++splits; + + return true; + } +}; + +using FunctionSplitByAlpha = FunctionTokens; + +} + +REGISTER_FUNCTION(SplitByAlpha) +{ + factory.registerFunction(); + factory.registerAlias("splitByAlpha", FunctionSplitByAlpha::name); +} + +} diff --git a/src/Functions/appendTrailingCharIfAbsent.cpp b/src/Functions/appendTrailingCharIfAbsent.cpp index 62c0bbd4598..7ff35e599be 100644 --- a/src/Functions/appendTrailingCharIfAbsent.cpp +++ b/src/Functions/appendTrailingCharIfAbsent.cpp @@ -4,7 +4,6 @@ #include #include #include -#include namespace DB @@ -46,10 +45,10 @@ private: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of the first argument of function {}", arguments[0]->getName(), getName()); if (!isString(arguments[1])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[1]->getName(), getName()); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of the second argument of function {}", arguments[1]->getName(), getName()); return std::make_shared(); } diff --git a/src/Functions/array/arrayRandomSample.cpp b/src/Functions/array/arrayRandomSample.cpp index 908ca9fa30a..1e28e089a2a 100644 --- a/src/Functions/array/arrayRandomSample.cpp +++ b/src/Functions/array/arrayRandomSample.cpp @@ -1,11 +1,13 @@ -#include #include +#include +#include #include #include #include #include #include -#include "Columns/ColumnsNumber.h" +#include +#include namespace DB { @@ -13,7 +15,6 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; } /// arrayRandomSample(arr, k) - Returns k random elements from the input array @@ -35,78 +36,64 @@ public: { FunctionArgumentDescriptors args{ {"array", &isArray, nullptr, "Array"}, - {"samples", &isUnsignedInteger, isColumnConst, "const UInt*"}, + {"samples", &isUInt, isColumnConst, "const UInt*"}, }; validateFunctionArgumentTypes(*this, arguments, args); // Return an array with the same nested type as the input array const DataTypePtr & array_type = arguments[0].type; const DataTypeArray * array_data_type = checkAndGetDataType(array_type.get()); - - // Get the nested data type of the array const DataTypePtr & nested_type = array_data_type->getNestedType(); - return std::make_shared(nested_type); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const ColumnArray * column_array = checkAndGetColumn(arguments[0].column.get()); - if (!column_array) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument must be an array"); + const ColumnArray * col_array = checkAndGetColumn(arguments[0].column.get()); + if (!col_array) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument of function {} must be an array", 
getName()); const IColumn * col_samples = arguments[1].column.get(); if (!col_samples) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "The second argument is empty or null, type = {}", arguments[1].type->getName()); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "The second argument of function {} is empty or null, type = {}", + getName(), arguments[1].type->getName()); - UInt64 samples; - try - { - samples = col_samples->getUInt(0); - } - catch (...) - { - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Failed to fetch UInt64 from the second argument column, type = {}", - arguments[1].type->getName()); - } + const size_t samples = col_samples->getUInt(0); - std::random_device rd; - std::mt19937 gen(rd()); + pcg64_fast rng(randomSeed()); - auto nested_column = column_array->getDataPtr()->cloneEmpty(); - auto offsets_column = ColumnUInt64::create(); + auto col_res_data = col_array->getDataPtr()->cloneEmpty(); + auto col_res_offsets = ColumnUInt64::create(input_rows_count); + auto col_res = ColumnArray::create(std::move(col_res_data), std::move(col_res_offsets)); - auto res_data = ColumnArray::create(std::move(nested_column), std::move(offsets_column)); + const auto & array_offsets = col_array->getOffsets(); + auto & res_offsets = col_res->getOffsets(); - const auto & input_offsets = column_array->getOffsets(); - auto & res_offsets = res_data->getOffsets(); - res_offsets.resize(input_rows_count); - - UInt64 cur_samples; - size_t current_offset = 0; + std::vector indices; + size_t prev_array_offset = 0; + size_t prev_res_offset = 0; for (size_t row = 0; row < input_rows_count; row++) { - size_t row_size = input_offsets[row] - current_offset; + const size_t num_elements = array_offsets[row] - prev_array_offset; + const size_t cur_samples = std::min(num_elements, samples); - std::vector indices(row_size); - std::iota(indices.begin(), indices.end(), 0); - std::shuffle(indices.begin(), indices.end(), gen); + indices.resize(num_elements); + std::iota(indices.begin(), indices.end(), prev_array_offset); + std::shuffle(indices.begin(), indices.end(), rng); - cur_samples = std::min(samples, static_cast(row_size)); + for (UInt64 i = 0; i < cur_samples; i++) + col_res->getData().insertFrom(col_array->getData(), indices[i]); - for (UInt64 j = 0; j < cur_samples; j++) - { - size_t source_index = indices[j]; - res_data->getData().insertFrom(column_array->getData(), source_index); - } + res_offsets[row] = prev_res_offset + cur_samples; - res_offsets[row] = current_offset + cur_samples; - current_offset += cur_samples; + prev_array_offset += num_elements; + prev_res_offset += cur_samples; + indices.clear(); } - return res_data; + return col_res; } }; diff --git a/src/Functions/arrayStringConcat.cpp b/src/Functions/arrayStringConcat.cpp new file mode 100644 index 00000000000..0194cc4871a --- /dev/null +++ b/src/Functions/arrayStringConcat.cpp @@ -0,0 +1,202 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/** arrayStringConcat(arr) + * arrayStringConcat(arr, delimiter) + * - join an array of strings into one string via a separator. + */ +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +/// Joins an array of type serializable to string into one string via a separator. 
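`arrayStringConcat` walks each array, skips NULL elements entirely, and writes the delimiter only between elements that were actually kept (the `first_non_null` flag in the code below). The same joining rule in a few self-contained lines:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Sketch of the arrayStringConcat joining rule: NULL elements are skipped and
// the delimiter is written only between elements that were actually kept.
std::string joinSkippingNulls(const std::vector<std::optional<std::string>> & arr,
                              const std::string & delimiter)
{
    std::string result;
    bool first_non_null = true;
    for (const auto & elem : arr)
    {
        if (!elem)
            continue;                    // NULLs contribute nothing, not even a delimiter
        if (!first_non_null)
            result += delimiter;
        first_non_null = false;
        result += *elem;
    }
    return result;
}

int main()
{
    std::vector<std::optional<std::string>> arr = {"a", std::nullopt, "b", "c", std::nullopt};
    std::cout << joinSkippingNulls(arr, ", ") << '\n';   // a, b, c
}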
+class FunctionArrayStringConcat : public IFunction +{ +private: + static void executeInternal( + const ColumnString::Chars & src_chars, + const ColumnString::Offsets & src_string_offsets, + const ColumnArray::Offsets & src_array_offsets, + const char * delimiter, + const size_t delimiter_size, + ColumnString::Chars & dst_chars, + ColumnString::Offsets & dst_string_offsets, + const char8_t * null_map) + { + size_t size = src_array_offsets.size(); + + if (!size) + return; + + /// With a small margin - as if the separator goes after the last string of the array. + dst_chars.resize( + src_chars.size() + + delimiter_size * src_string_offsets.size() /// Separators after each string... + + src_array_offsets.size() /// Zero byte after each joined string + - src_string_offsets.size()); /// The former zero byte after each string of the array + + /// There will be as many strings as there were arrays. + dst_string_offsets.resize(src_array_offsets.size()); + + ColumnArray::Offset current_src_array_offset = 0; + + ColumnString::Offset current_dst_string_offset = 0; + + /// Loop through the array of strings. + for (size_t i = 0; i < size; ++i) + { + bool first_non_null = true; + /// Loop through the rows within the array. /// NOTE You can do everything in one copy, if the separator has a size of 1. + for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset) + { + if (null_map && null_map[current_src_array_offset]) [[unlikely]] + continue; + + if (!first_non_null) + { + memcpy(&dst_chars[current_dst_string_offset], delimiter, delimiter_size); + current_dst_string_offset += delimiter_size; + } + first_non_null = false; + + const auto current_src_string_offset = current_src_array_offset ? src_string_offsets[current_src_array_offset - 1] : 0; + size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - current_src_string_offset - 1; + + memcpySmallAllowReadWriteOverflow15( + &dst_chars[current_dst_string_offset], &src_chars[current_src_string_offset], bytes_to_copy); + + current_dst_string_offset += bytes_to_copy; + } + + dst_chars[current_dst_string_offset] = 0; + ++current_dst_string_offset; + + dst_string_offsets[i] = current_dst_string_offset; + } + + dst_chars.resize(dst_string_offsets.back()); + } + + static void executeInternal( + const ColumnString & col_string, + const ColumnArray & col_arr, + const String & delimiter, + ColumnString & col_res, + const char8_t * null_map = nullptr) + { + executeInternal( + col_string.getChars(), + col_string.getOffsets(), + col_arr.getOffsets(), + delimiter.data(), + delimiter.size(), + col_res.getChars(), + col_res.getOffsets(), + null_map); + } + + static ColumnPtr serializeNestedColumn(const ColumnArray & col_arr, const DataTypePtr & nested_type) + { + DataTypePtr type = nested_type; + ColumnPtr column = col_arr.getDataPtr(); + + if (type->isNullable()) + { + type = removeNullable(type); + column = assert_cast(*column).getNestedColumnPtr(); + } + + return castColumn({column, type, "tmp"}, std::make_shared()); + } + +public: + static constexpr auto name = "arrayStringConcat"; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override + { + return name; + } + + bool isVariadic() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + + bool 
useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + FunctionArgumentDescriptors mandatory_args + { + {"arr", &isArray, nullptr, "Array"}, + }; + + FunctionArgumentDescriptors optional_args + { + {"separator", &isString, isColumnConst, "const String"}, + }; + + validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override + { + String delimiter; + if (arguments.size() == 2) + { + const ColumnConst * col_delim = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); + if (!col_delim) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant string.", getName()); + + delimiter = col_delim->getValue(); + } + + const auto & nested_type = assert_cast(*arguments[0].type).getNestedType(); + const ColumnArray & col_arr = assert_cast(*arguments[0].column); + + ColumnPtr str_subcolumn = serializeNestedColumn(col_arr, nested_type); + const ColumnString & col_string = assert_cast(*str_subcolumn.get()); + + auto col_res = ColumnString::create(); + if (const ColumnNullable * col_nullable = checkAndGetColumn(col_arr.getData())) + executeInternal(col_string, col_arr, delimiter, *col_res, col_nullable->getNullMapData().data()); + else + executeInternal(col_string, col_arr, delimiter, *col_res); + + return col_res; + } +}; + +} + +REGISTER_FUNCTION(ArrayStringConcat) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/extractAll.cpp b/src/Functions/extractAll.cpp new file mode 100644 index 00000000000..ad49f32f769 --- /dev/null +++ b/src/Functions/extractAll.cpp @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + + +/** Functions that split strings into an array of strings or vice versa. + * + * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp. 
+ * - first subpattern, if regexp has subpattern; + * - zero subpattern (the match part, otherwise); + * - otherwise, an empty array + */ +namespace +{ + +using Pos = const char *; + +class ExtractAllImpl +{ +private: + Regexps::RegexpPtr re; + OptimizedRegularExpression::MatchVec matches; + size_t capture; + + Pos pos; + Pos end; +public: + static constexpr auto name = "extractAll"; + static String getName() { return name; } + static bool isVariadic() { return false; } + static size_t getNumberOfArguments() { return 2; } + + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) + { + FunctionArgumentDescriptors mandatory_args{ + {"haystack", &isString, nullptr, "String"}, + {"pattern", &isString, isColumnConst, "const String"} + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args); + } + + static constexpr auto strings_argument_position = 0uz; + + void init(const ColumnsWithTypeAndName & arguments, bool /*max_substrings_includes_remaining_string*/) + { + const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); + + if (!col) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " + "Must be constant string.", arguments[1].column->getName(), getName()); + + re = std::make_shared(Regexps::createRegexp(col->getValue())); + capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0; + + matches.resize(capture + 1); + } + + /// Called for each next string. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + } + + /// Get the next token, if any, or return false. + bool get(Pos & token_begin, Pos & token_end) + { + if (!pos || pos > end) + return false; + + if (!re->match(pos, end - pos, matches) || !matches[0].length) + return false; + + if (matches[capture].offset == std::string::npos) + { + /// Empty match. 
+ token_begin = pos; + token_end = pos; + } + else + { + token_begin = pos + matches[capture].offset; + token_end = token_begin + matches[capture].length; + } + + pos += matches[0].offset + matches[0].length; + + return true; + } +}; + +using FunctionExtractAll = FunctionTokens; + +} + +REGISTER_FUNCTION(ExtractAll) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/formatDateTime.cpp b/src/Functions/formatDateTime.cpp index 7b93f5e063a..1fb47832418 100644 --- a/src/Functions/formatDateTime.cpp +++ b/src/Functions/formatDateTime.cpp @@ -9,12 +9,14 @@ #include #include #include -#include #include #include #include #include +#include +#include + #include #include @@ -803,18 +805,7 @@ public: { if (arguments.size() == 1) { - if (!castType(arguments[0].type.get(), [&](const auto & type) - { - using FromDataType = std::decay_t; - res = ConvertImpl::execute(arguments, result_type, input_rows_count); - return true; - })) - { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of function {}, must be Integer, Date, Date32, DateTime " - "or DateTime64 when arguments size is 1.", - arguments[0].column->getName(), getName()); - } + return castColumn(arguments[0], result_type); } else { diff --git a/src/Functions/formatQuery.cpp b/src/Functions/formatQuery.cpp index f4cb937eed4..2f6bc6f9903 100644 --- a/src/Functions/formatQuery.cpp +++ b/src/Functions/formatQuery.cpp @@ -1,7 +1,9 @@ +#include #include +#include #include #include -#include +#include #include #include #include @@ -15,7 +17,19 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } -template +enum class OutputFormatting +{ + SingleLine, + MultiLine +}; + +enum class ErrorHandling +{ + Exception, + Null +}; + +template class FunctionFormatQuery : public IFunction { public: @@ -27,70 +41,127 @@ public: } FunctionFormatQuery(size_t max_query_size_, size_t max_parser_depth_) - : max_query_size(max_query_size_), max_parser_depth(max_parser_depth_) + : max_query_size(max_query_size_) + , max_parser_depth(max_parser_depth_) { } String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 1; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - FunctionArgumentDescriptors mandatory_args{{"query", &isString, nullptr, "String"}}; - validateFunctionArgumentTypes(*this, arguments, mandatory_args); - return arguments[0].type; + FunctionArgumentDescriptors args{ + {"query", &isString, nullptr, "String"} + }; + validateFunctionArgumentTypes(*this, arguments, args); + + DataTypePtr string_type = std::make_shared(); + if constexpr (error_handling == ErrorHandling::Null) + return std::make_shared(string_type); + else + return string_type; } - bool useDefaultImplementationForConstants() const override { return true; } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const ColumnPtr column = arguments[0].column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + const ColumnPtr col_query = arguments[0].column; + + ColumnUInt8::MutablePtr col_null_map; + if constexpr (error_handling == ErrorHandling::Null) + 
col_null_map = ColumnUInt8::create(input_rows_count, 0); + + if (const ColumnString * col_query_string = checkAndGetColumn(col_query.get())) { auto col_res = ColumnString::create(); - formatVector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets()); - return col_res; + formatVector(col_query_string->getChars(), col_query_string->getOffsets(), col_res->getChars(), col_res->getOffsets(), col_null_map); + + if constexpr (error_handling == ErrorHandling::Null) + return ColumnNullable::create(std::move(col_res), std::move(col_null_map)); + else + return col_res; } else - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName()); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", col_query->getName(), getName()); } private: - void formatQueryImpl(const char * begin, const char * end, ColumnString::Chars & output) const - { - ParserQuery parser{end}; - auto ast = parseQuery(parser, begin, end, {}, max_query_size, max_parser_depth); - WriteBufferFromVector buf(output, AppendModeTag{}); - formatAST(*ast, buf, /* hilite */ false, /* one_line */ one_line); - buf.finalize(); - } void formatVector( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets) const + ColumnString::Offsets & res_offsets, + ColumnUInt8::MutablePtr & res_null_map) const { const size_t size = offsets.size(); res_offsets.resize(size); - res_data.reserve(data.size()); + res_data.resize(data.size()); + + size_t prev_offset = 0; + size_t res_data_size = 0; - size_t prev_in_offset = 0; for (size_t i = 0; i < size; ++i) { - const auto * begin = reinterpret_cast(&data[prev_in_offset]); - const char * end = begin + offsets[i] - 1; - formatQueryImpl(begin, end, res_data); - res_offsets[i] = res_data.size() + 1; - prev_in_offset = offsets[i]; + const char * begin = reinterpret_cast(&data[prev_offset]); + const char * end = begin + offsets[i] - prev_offset - 1; + + ParserQuery parser(end); + ASTPtr ast; + WriteBufferFromOwnString buf; + + try + { + ast = parseQuery(parser, begin, end, /*query_description*/ {}, max_query_size, max_parser_depth); + } + catch (...) 
+ { + if constexpr (error_handling == ErrorHandling::Null) + { + const size_t res_data_new_size = res_data_size + 1; + if (res_data_new_size > res_data.size()) + res_data.resize(2 * res_data_new_size); + + res_data[res_data_size] = '\0'; + res_data_size += 1; + + res_offsets[i] = res_data_size; + prev_offset = offsets[i]; + + res_null_map->getData()[i] = 1; + + continue; + } + else + { + static_assert(error_handling == ErrorHandling::Exception); + throw; + } + } + + formatAST(*ast, buf, /*hilite*/ false, /*single_line*/ output_formatting == OutputFormatting::SingleLine); + auto formatted = buf.stringView(); + + const size_t res_data_new_size = res_data_size + formatted.size() + 1; + if (res_data_new_size > res_data.size()) + res_data.resize(2 * res_data_new_size); + + memcpy(&res_data[res_data_size], formatted.begin(), formatted.size()); + res_data_size += formatted.size(); + + res_data[res_data_size] = '\0'; + res_data_size += 1; + + res_offsets[i] = res_data_size; + prev_offset = offsets[i]; } + + res_data.resize(res_data_size); } - size_t max_query_size; - size_t max_parser_depth; + + const size_t max_query_size; + const size_t max_parser_depth; }; struct NameFormatQuery @@ -98,15 +169,25 @@ struct NameFormatQuery static constexpr auto name = "formatQuery"; }; +struct NameFormatQueryOrNull +{ + static constexpr auto name = "formatQueryOrNull"; +}; + struct NameFormatQuerySingleLine { static constexpr auto name = "formatQuerySingleLine"; }; +struct NameFormatQuerySingleLineOrNull +{ + static constexpr auto name = "formatQuerySingleLineOrNull"; +}; + REGISTER_FUNCTION(formatQuery) { - factory.registerFunction>(FunctionDocumentation{ - .description = "Returns a formatted, possibly multi-line, version of the given SQL query.\n[example:multiline]", + factory.registerFunction>(FunctionDocumentation{ + .description = "Returns a formatted, possibly multi-line, version of the given SQL query. Throws in case of a parsing error.\n[example:multiline]", .syntax = "formatQuery(query)", .arguments = {{"query", "The SQL query to be formatted. [String](../../sql-reference/data-types/string.md)"}}, .returned_value = "The formatted query. [String](../../sql-reference/data-types/string.md).", @@ -121,10 +202,28 @@ REGISTER_FUNCTION(formatQuery) .categories{"Other"}}); } +REGISTER_FUNCTION(formatQueryOrNull) +{ + factory.registerFunction>(FunctionDocumentation{ + .description = "Returns a formatted, possibly multi-line, version of the given SQL query. Returns NULL in case of a parsing error.\n[example:multiline]", + .syntax = "formatQueryOrNull(query)", + .arguments = {{"query", "The SQL query to be formatted. [String](../../sql-reference/data-types/string.md)"}}, + .returned_value = "The formatted query. [String](../../sql-reference/data-types/string.md).", + .examples{ + {"multiline", + "SELECT formatQuery('select a, b FRom tab WHERE a > 3 and b < 3');", + "SELECT\n" + " a,\n" + " b\n" + "FROM tab\n" + "WHERE (a > 3) AND (b < 3)"}}, + .categories{"Other"}}); +} + REGISTER_FUNCTION(formatQuerySingleLine) { - factory.registerFunction>(FunctionDocumentation{ - .description = "Like formatQuery() but the returned formatted string contains no line breaks.\n[example:multiline]", + factory.registerFunction>(FunctionDocumentation{ + .description = "Like formatQuery() but the returned formatted string contains no line breaks. Throws in case of a parsing error.\n[example:multiline]", .syntax = "formatQuerySingleLine(query)", .arguments = {{"query", "The SQL query to be formatted. 
[String](../../sql-reference/data-types/string.md)"}}, .returned_value = "The formatted query. [String](../../sql-reference/data-types/string.md).", @@ -134,4 +233,19 @@ REGISTER_FUNCTION(formatQuerySingleLine) "SELECT a, b FROM tab WHERE (a > 3) AND (b < 3)"}}, .categories{"Other"}}); } + +REGISTER_FUNCTION(formatQuerySingleLineOrNull) +{ + factory.registerFunction>(FunctionDocumentation{ + .description = "Like formatQuery() but the returned formatted string contains no line breaks. Returns NULL in case of a parsing error.\n[example:multiline]", + .syntax = "formatQuerySingleLineOrNull(query)", + .arguments = {{"query", "The SQL query to be formatted. [String](../../sql-reference/data-types/string.md)"}}, + .returned_value = "The formatted query. [String](../../sql-reference/data-types/string.md).", + .examples{ + {"multiline", + "SELECT formatQuerySingleLine('select a, b FRom tab WHERE a > 3 and b < 3');", + "SELECT a, b FROM tab WHERE (a > 3) AND (b < 3)"}}, + .categories{"Other"}}); +} + } diff --git a/src/Functions/fromDaysSinceYearZero.cpp b/src/Functions/fromDaysSinceYearZero.cpp index 804a243cda0..36a05f1cbf9 100644 --- a/src/Functions/fromDaysSinceYearZero.cpp +++ b/src/Functions/fromDaysSinceYearZero.cpp @@ -59,7 +59,7 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { FunctionArgumentDescriptors args{ - {"days", &isNativeUnsignedInteger, nullptr, "UInt*"} + {"days", &isNativeUInt, nullptr, "UInt*"} }; validateFunctionArgumentTypes(*this, arguments, args); diff --git a/src/Functions/parseDateTime.cpp b/src/Functions/parseDateTime.cpp index fdab85c4640..c6721b29c1c 100644 --- a/src/Functions/parseDateTime.cpp +++ b/src/Functions/parseDateTime.cpp @@ -7,15 +7,16 @@ #include #include -#include #include #include #include +#include + #include -#include #include + namespace DB { namespace ErrorCodes diff --git a/src/Functions/randomFixedString.cpp b/src/Functions/randomFixedString.cpp index 508fae3e824..914800386d5 100644 --- a/src/Functions/randomFixedString.cpp +++ b/src/Functions/randomFixedString.cpp @@ -41,7 +41,7 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (!isUnsignedInteger(arguments[0].type)) + if (!isUInt(arguments[0].type)) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be unsigned integer", getName()); if (!arguments[0].column || !isColumnConst(*arguments[0].column)) diff --git a/src/Functions/splitByChar.cpp b/src/Functions/splitByChar.cpp new file mode 100644 index 00000000000..d537039dc23 --- /dev/null +++ b/src/Functions/splitByChar.cpp @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int ILLEGAL_COLUMN; +} + + +/** Functions that split strings into an array of strings or vice versa. 
+ * + * splitByChar(sep, s[, max_substrings]) + */ +namespace +{ + +using Pos = const char *; + +class SplitByCharImpl +{ +private: + Pos pos; + Pos end; + char separator; + std::optional max_splits; + size_t splits; + bool max_substrings_includes_remaining_string; + +public: + static constexpr auto name = "splitByChar"; + static bool isVariadic() { return true; } + static size_t getNumberOfArguments() { return 0; } + + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) + { + checkArgumentsWithSeparatorAndOptionalMaxSubstrings(func, arguments); + } + + static constexpr auto strings_argument_position = 1uz; + + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) + { + const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); + + if (!col) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " + "Must be constant string.", arguments[0].column->getName(), name); + + String sep_str = col->getValue(); + + if (sep_str.size() != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. Must be exactly one byte.", name); + + separator = sep_str[0]; + + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 2); + } + + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + splits = 0; + } + + bool get(Pos & token_begin, Pos & token_end) + { + if (!pos) + return false; + + token_begin = pos; + + if (max_splits) + { + if (max_substrings_includes_remaining_string) + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = nullptr; + return true; + } + } + else + if (splits == *max_splits) + return false; + } + + pos = reinterpret_cast(memchr(pos, separator, end - pos)); + if (pos) + { + token_end = pos; + ++pos; + ++splits; + } + else + token_end = end; + + return true; + } +}; + +using FunctionSplitByChar = FunctionTokens; + +} + +REGISTER_FUNCTION(SplitByChar) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/splitByNonAlpha.cpp b/src/Functions/splitByNonAlpha.cpp new file mode 100644 index 00000000000..467e7b0b5c3 --- /dev/null +++ b/src/Functions/splitByNonAlpha.cpp @@ -0,0 +1,113 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +/** Functions that split strings into an array of strings or vice versa. + * + * splitByNonAlpha(s[, max_substrings]) - split the string by whitespace and punctuation characters + */ +namespace +{ + +using Pos = const char *; + +class SplitByNonAlphaImpl +{ +private: + Pos pos; + Pos end; + std::optional max_splits; + size_t splits; + bool max_substrings_includes_remaining_string; + +public: + /// Get the name of the function. 
+ static constexpr auto name = "splitByNonAlpha"; + + static bool isVariadic() { return true; } + static size_t getNumberOfArguments() { return 0; } + + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) + { + checkArgumentsWithOptionalMaxSubstrings(func, arguments); + } + + static constexpr auto strings_argument_position = 0uz; + + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) + { + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 1); + } + + /// Called for each next string. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + splits = 0; + } + + /// Get the next token, if any, or return false. + bool get(Pos & token_begin, Pos & token_end) + { + /// Skip garbage + while (pos < end && (isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) + ++pos; + + if (pos == end) + return false; + + token_begin = pos; + + if (max_splits) + { + if (max_substrings_includes_remaining_string) + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + } + else + if (splits == *max_splits) + return false; + } + + while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) + ++pos; + + token_end = pos; + splits++; + + return true; + } +}; + +using FunctionSplitByNonAlpha = FunctionTokens; + +} + +REGISTER_FUNCTION(SplitByNonAlpha) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/splitByRegexp.cpp b/src/Functions/splitByRegexp.cpp new file mode 100644 index 00000000000..77328205c01 --- /dev/null +++ b/src/Functions/splitByRegexp.cpp @@ -0,0 +1,156 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + + +/** Functions that split strings into an array of strings or vice versa. + * + * splitByRegexp(regexp, s[, max_substrings]) + */ +namespace +{ + +using Pos = const char *; + +class SplitByRegexpImpl +{ +private: + Regexps::RegexpPtr re; + OptimizedRegularExpression::MatchVec matches; + + Pos pos; + Pos end; + + std::optional max_splits; + size_t splits; + bool max_substrings_includes_remaining_string; + +public: + static constexpr auto name = "splitByRegexp"; + + static bool isVariadic() { return true; } + static size_t getNumberOfArguments() { return 0; } + + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) + { + checkArgumentsWithSeparatorAndOptionalMaxSubstrings(func, arguments); + } + + static constexpr auto strings_argument_position = 1uz; + + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) + { + const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); + + if (!col) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " + "Must be constant string.", arguments[0].column->getName(), name); + + if (!col->getValue().empty()) + re = std::make_shared(Regexps::createRegexp(col->getValue())); + + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 2); + } + + /// Called for each next string. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + splits = 0; + } + + /// Get the next token, if any, or return false. 
+ bool get(Pos & token_begin, Pos & token_end) + { + if (!re) + { + if (pos == end) + return false; + + token_begin = pos; + + if (max_splits) + { + if (max_substrings_includes_remaining_string) + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + } + else + if (splits == *max_splits) + return false; + } + + pos += 1; + token_end = pos; + ++splits; + } + else + { + if (!pos || pos > end) + return false; + + token_begin = pos; + + if (max_splits) + { + if (max_substrings_includes_remaining_string) + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = nullptr; + return true; + } + } + else + if (splits == *max_splits) + return false; + } + + if (!re->match(pos, end - pos, matches) || !matches[0].length) + { + token_end = end; + pos = end + 1; + } + else + { + token_end = pos + matches[0].offset; + pos = token_end + matches[0].length; + ++splits; + } + } + + return true; + } +}; + +using FunctionSplitByRegexp = FunctionTokens; + +} + +REGISTER_FUNCTION(SplitByRegexp) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/splitByString.cpp b/src/Functions/splitByString.cpp new file mode 100644 index 00000000000..7d6803b2f27 --- /dev/null +++ b/src/Functions/splitByString.cpp @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + + +/** Functions that split strings into an array of strings or vice versa. + * + * splitByString(sep, s[, max_substrings]) + */ +namespace +{ + +using Pos = const char *; + +class SplitByStringImpl +{ +private: + Pos pos; + Pos end; + String separator; + std::optional max_splits; + size_t splits; + bool max_substrings_includes_remaining_string; + +public: + static constexpr auto name = "splitByString"; + static bool isVariadic() { return true; } + static size_t getNumberOfArguments() { return 0; } + + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) + { + checkArgumentsWithSeparatorAndOptionalMaxSubstrings(func, arguments); + } + + static constexpr auto strings_argument_position = 1uz; + + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) + { + const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); + + if (!col) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " + "Must be constant string.", arguments[0].column->getName(), name); + + separator = col->getValue(); + + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 2); + } + + /// Called for each next string. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + splits = 0; + } + + /// Get the next token, if any, or return false. 
+ bool get(Pos & token_begin, Pos & token_end) + { + if (separator.empty()) + { + if (pos == end) + return false; + + token_begin = pos; + + if (max_splits) + { + if (max_substrings_includes_remaining_string) + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + } + else + if (splits == *max_splits) + return false; + } + + pos += 1; + token_end = pos; + ++splits; + } + else + { + if (!pos) + return false; + + token_begin = pos; + + if (max_splits) + { + if (max_substrings_includes_remaining_string) + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = nullptr; + return true; + } + } + else + if (splits == *max_splits) + return false; + } + + pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size())); + if (pos) + { + token_end = pos; + pos += separator.size(); + ++splits; + } + else + token_end = end; + } + + return true; + } +}; + +using FunctionSplitByString = FunctionTokens; + +} + +REGISTER_FUNCTION(SplitByString) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/splitByWhitespace.cpp b/src/Functions/splitByWhitespace.cpp new file mode 100644 index 00000000000..168e429c6f5 --- /dev/null +++ b/src/Functions/splitByWhitespace.cpp @@ -0,0 +1,101 @@ +#include +#include +#include + + +namespace DB +{ + +/** Functions that split strings into an array of strings or vice versa. + * + * splitByWhitespace(s[, max_substrings]) - split the string by whitespace characters + */ +namespace +{ + +using Pos = const char *; + +class SplitByWhitespaceImpl +{ +private: + Pos pos; + Pos end; + std::optional max_splits; + size_t splits; + bool max_substrings_includes_remaining_string; + +public: + static constexpr auto name = "splitByWhitespace"; + + static bool isVariadic() { return true; } + static size_t getNumberOfArguments() { return 0; } + + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) + { + checkArgumentsWithOptionalMaxSubstrings(func, arguments); + } + + static constexpr auto strings_argument_position = 0uz; + + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) + { + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 1); + } + + /// Called for each next string. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + splits = 0; + } + + /// Get the next token, if any, or return false. 
+ bool get(Pos & token_begin, Pos & token_end) + { + /// Skip garbage + while (pos < end && isWhitespaceASCII(*pos)) + ++pos; + + if (pos == end) + return false; + + token_begin = pos; + + if (max_splits) + { + if (max_substrings_includes_remaining_string) + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + } + else + if (splits == *max_splits) + return false; + } + + while (pos < end && !isWhitespaceASCII(*pos)) + ++pos; + + token_end = pos; + splits++; + + return true; + } +}; + +using FunctionSplitByWhitespace = FunctionTokens; + +} + +REGISTER_FUNCTION(SplitByWhitespace) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/toBool.cpp b/src/Functions/toBool.cpp index 765da0c3206..6f2c436c1ea 100644 --- a/src/Functions/toBool.cpp +++ b/src/Functions/toBool.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -12,6 +11,14 @@ namespace { class FunctionToBool : public IFunction { + private: + ContextPtr context; + + static String getReturnTypeName(const DataTypePtr & argument) + { + return argument->isNullable() ? "Nullable(Bool)" : "Bool"; + } + public: static constexpr auto name = "toBool"; @@ -32,8 +39,7 @@ namespace DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - auto bool_type = DataTypeFactory::instance().get("Bool"); - return arguments[0]->isNullable() ? makeNullable(bool_type) : bool_type; + return DataTypeFactory::instance().get(getReturnTypeName(arguments[0])); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override @@ -42,18 +48,17 @@ namespace { arguments[0], { - DataTypeString().createColumnConst(arguments[0].column->size(), arguments[0].type->isNullable() ? "Nullable(Bool)" : "Bool"), + DataTypeString().createColumnConst(arguments[0].column->size(), getReturnTypeName(arguments[0].type)), std::make_shared(), "" } }; - FunctionOverloadResolverPtr func_builder_cast = CastInternalOverloadResolver::createImpl(); + FunctionOverloadResolverPtr func_builder_cast = createInternalCastOverloadResolver(CastType::nonAccurate, {}); auto func_cast = func_builder_cast->build(cast_args); return func_cast->execute(cast_args, result_type, arguments[0].column->size()); } }; - } REGISTER_FUNCTION(ToBool) diff --git a/src/Functions/toFixedString.h b/src/Functions/toFixedString.h index 6d14f0f1380..7bee666c5dd 100644 --- a/src/Functions/toFixedString.h +++ b/src/Functions/toFixedString.h @@ -47,7 +47,7 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (!isUnsignedInteger(arguments[1].type)) + if (!isUInt(arguments[1].type)) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be unsigned integer", getName()); if (!arguments[1].column) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant", getName()); diff --git a/src/Functions/vectorFunctions.cpp b/src/Functions/vectorFunctions.cpp index 35ba49e4545..33b0e9f6039 100644 --- a/src/Functions/vectorFunctions.cpp +++ b/src/Functions/vectorFunctions.cpp @@ -1147,7 +1147,7 @@ public: double p; if (isFloat(p_column.column->getDataType())) p = p_column.column->getFloat64(0); - else if (isUnsignedInteger(p_column.column->getDataType())) + else if (isUInt(p_column.column->getDataType())) p = p_column.column->getUInt(0); else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be either constant Float64 or constant UInt", getName()); 
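
Editor's note: the five new splitBy*.cpp files and the new ExtractAllImpl above all implement the same plug-in contract that FunctionTokens consumes: a static name, checkArguments(), a strings_argument_position constant, init() for per-query state, set() for each input string, and get() to yield one token at a time (plus the optional max_substrings bookkeeping). As a rough illustration of that contract and of the driver loop around it, here is a minimal stand-alone sketch; TinySplitByCharImpl and tokenize are invented names, not the real classes, and max_substrings handling is omitted.

#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-in for the Impl classes plugged into FunctionTokens.
// It only mirrors the set()/get() protocol; it is not ClickHouse code.
class TinySplitByCharImpl
{
public:
    explicit TinySplitByCharImpl(char separator_) : separator(separator_) {}

    /// Called once per input string.
    void set(const char * begin, const char * end_)
    {
        pos = begin;
        end = end_;
    }

    /// Yields the next token; returns false when the string is exhausted.
    bool get(const char *& token_begin, const char *& token_end)
    {
        if (!pos)
            return false;
        token_begin = pos;
        const char * found = static_cast<const char *>(std::memchr(pos, separator, end - pos));
        if (found)
        {
            token_end = found;
            pos = found + 1;     /// continue after the separator
        }
        else
        {
            token_end = end;
            pos = nullptr;       /// the trailing token was the last one
        }
        return true;
    }

private:
    const char * pos = nullptr;
    const char * end = nullptr;
    char separator;
};

/// The per-row driver loop that FunctionTokens conceptually runs.
std::vector<std::string> tokenize(TinySplitByCharImpl impl, const std::string & s)
{
    std::vector<std::string> result;
    impl.set(s.data(), s.data() + s.size());
    const char * token_begin;
    const char * token_end;
    while (impl.get(token_begin, token_end))
        result.emplace_back(token_begin, token_end);
    return result;
}

int main()
{
    for (const auto & token : tokenize(TinySplitByCharImpl{','}, "a,b,,c"))
        std::cout << '[' << token << ']';
    std::cout << '\n';   /// prints [a][b][][c]
}
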
diff --git a/src/Functions/ztest.cpp b/src/Functions/ztest.cpp index 9ced926d239..55e1b59a897 100644 --- a/src/Functions/ztest.cpp +++ b/src/Functions/ztest.cpp @@ -57,7 +57,7 @@ public: { for (size_t i = 0; i < 4; ++i) { - if (!isUnsignedInteger(arguments[i].type)) + if (!isUInt(arguments[i].type)) { throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, diff --git a/src/IO/HTTPChunkedReadBuffer.cpp b/src/IO/HTTPChunkedReadBuffer.cpp index 29034b35e16..41788fa8ce7 100644 --- a/src/IO/HTTPChunkedReadBuffer.cpp +++ b/src/IO/HTTPChunkedReadBuffer.cpp @@ -13,7 +13,7 @@ namespace ErrorCodes { extern const int ARGUMENT_OUT_OF_BOUND; extern const int UNEXPECTED_END_OF_FILE; - extern const int CORRUPTED_DATA; + extern const int BAD_REQUEST_PARAMETER; } size_t HTTPChunkedReadBuffer::readChunkHeader() @@ -22,7 +22,7 @@ size_t HTTPChunkedReadBuffer::readChunkHeader() throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected end of file while reading chunk header of HTTP chunked data"); if (!isHexDigit(*in->position())) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Unexpected data instead of HTTP chunk header"); + throw Exception(ErrorCodes::BAD_REQUEST_PARAMETER, "Unexpected data instead of HTTP chunk header"); size_t res = 0; do diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 9b9374ff05a..19750906fdb 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -835,7 +835,7 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & /// Check for single '\r' not followed by '\n' /// We should not stop in this case. - if (*buf.position() == '\r') + if (*buf.position() == '\r' && !settings.allow_cr_end_of_line) { ++buf.position(); if (!buf.eof() && *buf.position() != '\n') diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 40f812050db..c5a456d70f6 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -984,20 +984,31 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons template inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut) { + static constexpr bool throw_exception = std::is_same_v; + time_t whole = 0; bool is_negative_timestamp = (!buf.eof() && *buf.position() == '-'); bool is_empty = buf.eof(); if (!is_empty) { - try + if constexpr (throw_exception) { - readDateTimeTextImpl(whole, buf, date_lut); + try + { + readDateTimeTextImpl(whole, buf, date_lut); + } + catch (const DB::ParsingException &) + { + if (buf.eof() || *buf.position() != '.') + throw; + } } - catch (const DB::ParsingException & exception) + else { - if (buf.eof() || *buf.position() != '.') - throw exception; + auto ok = readDateTimeTextImpl(whole, buf, date_lut); + if (!ok && (buf.eof() || *buf.position() != '.')) + return ReturnType(false); } } diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 44ab01ba959..ceb7d275299 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -387,27 +387,44 @@ Model::CompleteMultipartUploadOutcome Client::CompleteMultipartUpload(const Comp auto outcome = doRequestWithRetryNetworkErrors( request, [this](const Model::CompleteMultipartUploadRequest & req) { return CompleteMultipartUpload(req); }); - if (!outcome.IsSuccess() || provider_type != ProviderType::GCS) - return outcome; - const auto & key = request.GetKey(); const auto & bucket = request.GetBucket(); - /// For GCS we will try to compose object at the end, otherwise we cannot do a native copy - /// for the object (e.g. 
for backups) - /// We don't care if the compose fails, because the upload was still successful, only the - /// performance for copying the object will be affected - S3::ComposeObjectRequest compose_req; - compose_req.SetBucket(bucket); - compose_req.SetKey(key); - compose_req.SetComponentNames({key}); - compose_req.SetContentType("binary/octet-stream"); - auto compose_outcome = ComposeObject(compose_req); + if (!outcome.IsSuccess() + && outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_UPLOAD) + { + auto check_request = HeadObjectRequest() + .WithBucket(bucket) + .WithKey(key); + auto check_outcome = HeadObject(check_request); - if (compose_outcome.IsSuccess()) - LOG_TRACE(log, "Composing object was successful"); - else - LOG_INFO(log, "Failed to compose object. Message: {}, Key: {}, Bucket: {}", compose_outcome.GetError().GetMessage(), key, bucket); + /// if the key exists, than MultipartUpload has been completed at some of the retries + /// rewrite outcome with success status + if (check_outcome.IsSuccess()) + outcome = Aws::S3::Model::CompleteMultipartUploadOutcome(Aws::S3::Model::CompleteMultipartUploadResult()); + } + + if (outcome.IsSuccess() && provider_type == ProviderType::GCS) + { + /// For GCS we will try to compose object at the end, otherwise we cannot do a native copy + /// for the object (e.g. for backups) + /// We don't care if the compose fails, because the upload was still successful, only the + /// performance for copying the object will be affected + S3::ComposeObjectRequest compose_req; + compose_req.SetBucket(bucket); + compose_req.SetKey(key); + compose_req.SetComponentNames({key}); + compose_req.SetContentType("binary/octet-stream"); + auto compose_outcome = ComposeObject(compose_req); + + if (compose_outcome.IsSuccess()) + LOG_TRACE(log, "Composing object was successful"); + else + LOG_INFO( + log, + "Failed to compose object. Message: {}, Key: {}, Bucket: {}", + compose_outcome.GetError().GetMessage(), key, bucket); + } return outcome; } diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index fa3ddeabd58..e1b9c17efe9 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -582,7 +582,7 @@ void WriteBufferFromS3::completeMultipartUpload() if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) { /// For unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests - /// BTW, NO_SUCH_UPLOAD is expected error and we shouldn't retry it + /// BTW, NO_SUCH_UPLOAD is expected error and we shouldn't retry it here, DB::S3::Client take care of it LOG_INFO(log, "Multipart upload failed with NO_SUCH_KEY error, will retry. 
{}, Parts: {}", getVerboseLogDetails(), multipart_tags.size()); } else diff --git a/src/Interpreters/Access/InterpreterShowAccessEntitiesQuery.cpp b/src/Interpreters/Access/InterpreterShowAccessEntitiesQuery.cpp index b0937dc2f66..bffb47ac714 100644 --- a/src/Interpreters/Access/InterpreterShowAccessEntitiesQuery.cpp +++ b/src/Interpreters/Access/InterpreterShowAccessEntitiesQuery.cpp @@ -23,7 +23,7 @@ InterpreterShowAccessEntitiesQuery::InterpreterShowAccessEntitiesQuery(const AST BlockIO InterpreterShowAccessEntitiesQuery::execute() { - return executeQuery(getRewrittenQuery(), getContext(), true).second; + return executeQuery(getRewrittenQuery(), getContext(), QueryFlags{ .internal = true }).second; } diff --git a/src/Interpreters/Access/InterpreterShowPrivilegesQuery.cpp b/src/Interpreters/Access/InterpreterShowPrivilegesQuery.cpp index 213e3c813fa..1a0b441a06d 100644 --- a/src/Interpreters/Access/InterpreterShowPrivilegesQuery.cpp +++ b/src/Interpreters/Access/InterpreterShowPrivilegesQuery.cpp @@ -12,7 +12,7 @@ InterpreterShowPrivilegesQuery::InterpreterShowPrivilegesQuery(const ASTPtr & qu BlockIO InterpreterShowPrivilegesQuery::execute() { - return executeQuery("SELECT * FROM system.privileges", context, true).second; + return executeQuery("SELECT * FROM system.privileges", context, QueryFlags{ .internal = true }).second; } } diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 04dee2ed6e6..3aad4cf9247 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -3,9 +3,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -21,6 +21,7 @@ #include #include + namespace DB { @@ -248,7 +249,7 @@ const ActionsDAG::Node & ActionsDAG::addCast(const Node & node_to_cast, const Da const auto * cast_type_constant_node = &addColumn(std::move(column)); ActionsDAG::NodeRawConstPtrs children = {&node_to_cast, cast_type_constant_node}; - FunctionOverloadResolverPtr func_builder_cast = CastInternalOverloadResolver::createImpl(); + FunctionOverloadResolverPtr func_builder_cast = createInternalCastOverloadResolver(CastType::nonAccurate, {}); return addFunction(func_builder_cast, std::move(children), result_name); } @@ -335,6 +336,28 @@ const ActionsDAG::Node * ActionsDAG::tryFindInOutputs(const std::string & name) return nullptr; } +ActionsDAG::NodeRawConstPtrs ActionsDAG::findInOutpus(const Names & names) const +{ + NodeRawConstPtrs required_nodes; + required_nodes.reserve(names.size()); + + std::unordered_map names_map; + for (const auto * node : outputs) + names_map[node->result_name] = node; + + for (const auto & name : names) + { + auto it = names_map.find(name); + if (it == names_map.end()) + throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, + "Unknown column: {}, there are only columns {}", name, dumpDAG()); + + required_nodes.push_back(it->second); + } + + return required_nodes; +} + void ActionsDAG::addOrReplaceInOutputs(const Node & node) { for (auto & output_node : outputs) @@ -441,35 +464,26 @@ void ActionsDAG::removeUnusedActions(const NameSet & required_names, bool allow_ void ActionsDAG::removeUnusedActions(const Names & required_names, bool allow_remove_inputs, bool allow_constant_folding) { - NodeRawConstPtrs required_nodes; - required_nodes.reserve(required_names.size()); - - std::unordered_map names_map; - for (const auto * node : outputs) - names_map[node->result_name] = node; - - for (const auto & name : required_names) - { - auto it = names_map.find(name); - if (it == 
names_map.end()) - throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, - "Unknown column: {}, there are only columns {}", name, dumpDAG()); - - required_nodes.push_back(it->second); - } - + auto required_nodes = findInOutpus(required_names); outputs.swap(required_nodes); removeUnusedActions(allow_remove_inputs, allow_constant_folding); } void ActionsDAG::removeUnusedActions(bool allow_remove_inputs, bool allow_constant_folding) { - std::unordered_set visited_nodes; std::unordered_set used_inputs; - std::stack stack; + if (!allow_remove_inputs) + { + for (const auto * input : inputs) + used_inputs.insert(input); + } + removeUnusedActions(used_inputs, allow_constant_folding); +} - for (const auto * input : inputs) - used_inputs.insert(input); +void ActionsDAG::removeUnusedActions(const std::unordered_set & used_inputs, bool allow_constant_folding) +{ + std::unordered_set visited_nodes; + std::stack stack; for (const auto * node : outputs) { @@ -488,7 +502,7 @@ void ActionsDAG::removeUnusedActions(bool allow_remove_inputs, bool allow_consta stack.push(&node); } - if (node.type == ActionType::INPUT && !allow_remove_inputs && used_inputs.contains(&node)) + if (node.type == ActionType::INPUT && used_inputs.contains(&node)) visited_nodes.insert(&node); } @@ -528,6 +542,62 @@ void ActionsDAG::removeUnusedActions(bool allow_remove_inputs, bool allow_consta std::erase_if(inputs, [&](const Node * node) { return !visited_nodes.contains(node); }); } +ActionsDAGPtr ActionsDAG::cloneSubDAG(const NodeRawConstPtrs & outputs, bool remove_aliases) +{ + auto actions = std::make_shared(); + std::unordered_map copy_map; + + struct Frame + { + const Node * node = nullptr; + size_t next_child = 0; + }; + + std::stack stack; + + for (const auto * output : outputs) + { + if (copy_map.contains(output)) + continue; + + stack.push(Frame{output}); + while (!stack.empty()) + { + auto & frame = stack.top(); + const auto & children = frame.node->children; + while (frame.next_child < children.size() && copy_map.contains(children[frame.next_child])) + ++frame.next_child; + + if (frame.next_child < children.size()) + { + stack.push(Frame{children[frame.next_child]}); + continue; + } + + auto & copy_node = copy_map[frame.node]; + + if (remove_aliases && frame.node->type == ActionType::ALIAS) + copy_node = copy_map[frame.node->children.front()]; + else + copy_node = &actions->nodes.emplace_back(*frame.node); + + if (frame.node->type == ActionType::INPUT) + actions->inputs.push_back(copy_node); + + stack.pop(); + } + } + + for (auto & node : actions->nodes) + for (auto & child : node.children) + child = copy_map[child]; + + for (const auto * output : outputs) + actions->outputs.push_back(copy_map[output]); + + return actions; +} + static ColumnWithTypeAndName executeActionForHeader(const ActionsDAG::Node * node, ColumnsWithTypeAndName arguments) { ColumnWithTypeAndName res_column; @@ -1312,9 +1382,9 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( const auto * right_arg = &actions_dag->addColumn(std::move(column)); const auto * left_arg = dst_node; - FunctionCastBase::Diagnostic diagnostic = {dst_node->result_name, res_elem.name}; + CastDiagnostic diagnostic = {dst_node->result_name, res_elem.name}; FunctionOverloadResolverPtr func_builder_cast - = CastInternalOverloadResolver::createImpl(std::move(diagnostic)); + = createInternalCastOverloadResolver(CastType::nonAccurate, std::move(diagnostic)); NodeRawConstPtrs children = { left_arg, right_arg }; dst_node = &actions_dag->addFunction(func_builder_cast, std::move(children), {}); 
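
Editor's note: the new ActionsDAG::cloneSubDAG above copies only the part of the graph reachable from the given outputs, using an explicit stack of Frame records so that every node is visited after its children, and collapsing ALIAS nodes onto their child when remove_aliases is set. Below is a compact stand-alone sketch of the same traversal on a toy node type; it is illustrative only (ToyNode and cloneSubDag are made up, and the child-pointer remapping is folded into the visit instead of being done in a final pass as in the patch).

#include <iostream>
#include <memory>
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyNode
{
    std::string name;
    bool is_alias = false;                     /// stand-in for ActionType::ALIAS
    std::vector<const ToyNode *> children;
};

/// Copy the sub-DAG reachable from `outputs`; each node is cloned only after all of
/// its children, and alias nodes are collapsed onto their child when requested.
std::vector<std::unique_ptr<ToyNode>> cloneSubDag(const std::vector<const ToyNode *> & outputs, bool remove_aliases)
{
    std::vector<std::unique_ptr<ToyNode>> cloned;
    std::unordered_map<const ToyNode *, const ToyNode *> copy_map;

    struct Frame { const ToyNode * node; size_t next_child = 0; };
    std::stack<Frame> stack;

    for (const auto * output : outputs)
    {
        if (copy_map.contains(output))
            continue;
        stack.push(Frame{output});
        while (!stack.empty())
        {
            auto & frame = stack.top();
            const auto & children = frame.node->children;

            /// Skip children that were already copied, descend into the first one that was not.
            while (frame.next_child < children.size() && copy_map.contains(children[frame.next_child]))
                ++frame.next_child;
            if (frame.next_child < children.size())
            {
                stack.push(Frame{children[frame.next_child]});
                continue;
            }

            const ToyNode * copy = nullptr;
            if (remove_aliases && frame.node->is_alias && !children.empty())
                copy = copy_map.at(children.front());              /// collapse the alias
            else
            {
                cloned.push_back(std::make_unique<ToyNode>(*frame.node));
                for (auto & child : cloned.back()->children)
                    child = copy_map.at(child);                    /// children are already cloned
                copy = cloned.back().get();
            }
            copy_map[frame.node] = copy;
            stack.pop();
        }
    }
    return cloned;
}

int main()
{
    ToyNode input{"x"};
    ToyNode alias{"alias_of_x", true, {&input}};
    ToyNode func{"plus", false, {&alias, &input}};

    for (const auto & node : cloneSubDag({&func}, /*remove_aliases=*/ true))
        std::cout << node->name << " has " << node->children.size() << " children\n";
}
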
@@ -2187,14 +2257,6 @@ ActionsDAGPtr ActionsDAG::cloneActionsForFilterPushDown( { /// If filter column is not needed, remove it from output nodes. std::erase_if(outputs, [&](const Node * node) { return node == predicate; }); - - /// At the very end of this method we'll call removeUnusedActions() with allow_remove_inputs=false, - /// so we need to manually remove predicate if it is an input node. - if (predicate->type == ActionType::INPUT) - { - std::erase_if(inputs, [&](const Node * node) { return node == predicate; }); - nodes.remove_if([&](const Node & node) { return &node == predicate; }); - } } else { @@ -2261,7 +2323,15 @@ ActionsDAGPtr ActionsDAG::cloneActionsForFilterPushDown( } } - removeUnusedActions(false); + std::unordered_set used_inputs; + for (const auto * input : inputs) + { + if (can_remove_filter && input == predicate) + continue; + used_inputs.insert(input); + } + + removeUnusedActions(used_inputs); return actions; } diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index 48ed03d7347..94b6b1ac41d 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -157,6 +157,9 @@ public: /// Same, but return nullptr if node not found. const Node * tryFindInOutputs(const std::string & name) const; + /// Same, but for the list of names. + NodeRawConstPtrs findInOutpus(const Names & names) const; + /// Find first node with the same name in output nodes and replace it. /// If was not found, add node to outputs end. void addOrReplaceInOutputs(const Node & node); @@ -182,6 +185,9 @@ public: /// Remove actions that are not needed to compute output nodes void removeUnusedActions(bool allow_remove_inputs = true, bool allow_constant_folding = true); + /// Remove actions that are not needed to compute output nodes. Keep inputs from used_inputs. + void removeUnusedActions(const std::unordered_set & used_inputs, bool allow_constant_folding = true); + /// Remove actions that are not needed to compute output nodes with required names void removeUnusedActions(const Names & required_names, bool allow_remove_inputs = true, bool allow_constant_folding = true); @@ -257,6 +263,8 @@ public: ActionsDAGPtr clone() const; + static ActionsDAGPtr cloneSubDAG(const NodeRawConstPtrs & outputs, bool remove_aliases); + /// Execute actions for header. Input block must have empty columns. /// Result should be equal to the execution of ExpressionActions built from this DAG. /// Actions are not changed, no expressions are compiled. 
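
Editor's note: in cloneActionsForFilterPushDown the manual removal of the predicate input is gone; the function now builds a used_inputs set that pins every input except, when can_remove_filter holds, the pushed-down predicate, and hands it to the new removeUnusedActions(used_inputs, ...) overload, which keeps exactly the pinned inputs and sweeps whatever is unreachable from the outputs. A small self-contained mark-and-sweep sketch of that idea follows; Step, markAndSweep and the literal node names are invented for illustration.

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

struct Step
{
    std::string name;
    std::vector<const Step *> children;
};

/// Keep everything reachable from `outputs` plus the explicitly pinned inputs;
/// report what a sweep would drop.
void markAndSweep(
    const std::vector<const Step *> & all_nodes,
    const std::vector<const Step *> & outputs,
    const std::unordered_set<const Step *> & pinned_inputs)
{
    std::unordered_set<const Step *> visited(pinned_inputs.begin(), pinned_inputs.end());
    std::vector<const Step *> stack(outputs.begin(), outputs.end());
    while (!stack.empty())
    {
        const Step * node = stack.back();
        stack.pop_back();
        if (!visited.insert(node).second)
            continue;
        for (const auto * child : node->children)
            stack.push_back(child);
    }

    for (const auto * node : all_nodes)
        if (!visited.contains(node))
            std::cout << "would remove: " << node->name << '\n';
}

int main()
{
    Step a{"a"};
    Step predicate{"pushed_down_predicate"};
    Step result{"f(a)", {&a}};

    /// The predicate input is deliberately not pinned, so it becomes removable.
    markAndSweep({&a, &predicate, &result}, {&result}, /*pinned_inputs=*/ {&a});
}
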
diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp index 33cb124f3bc..e8b52bbc6a0 100644 --- a/src/Interpreters/Cache/QueryCache.cpp +++ b/src/Interpreters/Cache/QueryCache.cpp @@ -147,17 +147,18 @@ QueryCache::Key::Key(ASTPtr ast_, const String & user_name_) { } +/// Hashing of ASTs must consider aliases (issue #56258) +constexpr bool ignore_aliases = false; + bool QueryCache::Key::operator==(const Key & other) const { - return ast->getTreeHash() == other.ast->getTreeHash(); + return ast->getTreeHash(ignore_aliases) == other.ast->getTreeHash(ignore_aliases); } size_t QueryCache::KeyHasher::operator()(const Key & key) const { - SipHash hash; - hash.update(key.ast->getTreeHash()); - auto res = hash.get64(); - return res; + IAST::Hash hash = key.ast->getTreeHash(ignore_aliases); + return hash.low64; } size_t QueryCache::QueryCacheEntryWeight::operator()(const Entry & entry) const diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index 82c3d48bc05..78c708b96da 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include +#include namespace DB { @@ -660,8 +662,7 @@ namespace void shuffleReplicas(std::vector & replicas, const Settings & settings, size_t replicas_needed) { - std::random_device rd; - std::mt19937 gen{rd()}; + pcg64_fast gen{randomSeed()}; if (settings.prefer_localhost_replica) { diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 138741a2f2b..ac552a3969c 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -846,7 +846,7 @@ Strings Context::getWarnings() const } res = res + "]" + (single_element ? " is" : " are") + " changed. 
" - "Please check 'select * from system.settings where changed and is_obsolete' and read the changelog."; + "Please check 'SELECT * FROM system.settings WHERE changed AND is_obsolete' and read the changelog at https://github.com/ClickHouse/ClickHouse/blob/master/CHANGELOG.md"; common_warnings.emplace_back(res); } diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index 8755daa1dc8..005450c2a2c 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -1,13 +1,8 @@ #include #include -#include -#include -#include #include #include #include -#include -#include #include #include #include @@ -15,11 +10,11 @@ #include #include #include -#include #include #include + namespace DB { @@ -117,7 +112,7 @@ std::map> moveExpressionToJoinOn( std::map> asts_to_join_on; for (const auto & node : splitConjunctionsAst(ast)) { - if (const auto * func = node->as(); func && func->name == NameEquals::name) + if (const auto * func = node->as(); func && func->name == "equals") { if (!func->arguments || func->arguments->children.size() != 2) return {}; @@ -154,7 +149,7 @@ ASTPtr makeOnExpression(const std::vector & expressions) for (const auto & ast : expressions) arguments.emplace_back(ast->clone()); - return makeASTFunction(NameAnd::name, std::move(arguments)); + return makeASTFunction("and", std::move(arguments)); } std::vector getTables(const ASTSelectQuery & select) diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 4e684f5899f..632061a8ec5 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -113,6 +113,9 @@ String DDLLogEntry::toString() const writeChar('\n', wb); } + if (version >= BACKUP_RESTORE_FLAG_IN_ZK_VERSION) + wb << "is_backup_restore: " << is_backup_restore << "\n"; + return wb.str(); } @@ -165,6 +168,12 @@ void DDLLogEntry::parse(const String & data) checkChar('\n', rb); } + if (version >= BACKUP_RESTORE_FLAG_IN_ZK_VERSION) + { + checkString("is_backup_restore: ", rb); + readBoolText(is_backup_restore, rb); + checkChar('\n', rb); + } assertEOF(rb); diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index e92b1f9a885..db8b0628b4b 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -72,10 +72,11 @@ struct DDLLogEntry static constexpr const UInt64 NORMALIZE_CREATE_ON_INITIATOR_VERSION = 3; static constexpr const UInt64 OPENTELEMETRY_ENABLED_VERSION = 4; static constexpr const UInt64 PRESERVE_INITIAL_QUERY_ID_VERSION = 5; + static constexpr const UInt64 BACKUP_RESTORE_FLAG_IN_ZK_VERSION = 6; /// Add new version here /// Remember to update the value below once new version is added - static constexpr const UInt64 DDL_ENTRY_FORMAT_MAX_VERSION = 5; + static constexpr const UInt64 DDL_ENTRY_FORMAT_MAX_VERSION = 6; UInt64 version = 1; String query; @@ -84,6 +85,7 @@ struct DDLLogEntry std::optional settings; OpenTelemetry::TracingContext tracing_context; String initial_query_id; + bool is_backup_restore = false; void setSettingsIfRequired(ContextPtr context); String toString() const; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index da46aad0329..a9930036e7e 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -489,7 +489,8 @@ bool DDLWorker::tryExecuteQuery(DDLTaskBase & task, const ZooKeeperPtr & zookeep if (!task.is_initial_query) query_scope.emplace(query_context); - executeQuery(istr, ostr, !task.is_initial_query, query_context, {}); + + 
executeQuery(istr, ostr, !task.is_initial_query, query_context, {}, QueryFlags{ .internal = false, .distributed_backup_restore = task.entry.is_backup_restore }); if (auto txn = query_context->getZooKeeperMetadataTransaction()) { @@ -513,6 +514,7 @@ bool DDLWorker::tryExecuteQuery(DDLTaskBase & task, const ZooKeeperPtr & zookeep /// get the same exception again. So we return false only for several special exception codes, /// and consider query as executed with status "failed" and return true in other cases. bool no_sense_to_retry = e.code() != ErrorCodes::KEEPER_EXCEPTION && + e.code() != ErrorCodes::UNFINISHED && e.code() != ErrorCodes::NOT_A_LEADER && e.code() != ErrorCodes::TABLE_IS_READ_ONLY && e.code() != ErrorCodes::CANNOT_ASSIGN_ALTER && @@ -725,7 +727,8 @@ bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, const Stora // Setting alters should be executed on all replicas if (alter->isSettingsAlter() || alter->isFreezeAlter() || - alter->isMovePartitionToDiskOrVolumeAlter()) + alter->isMovePartitionToDiskOrVolumeAlter() || + alter->isCommentAlter()) return false; } @@ -793,11 +796,15 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( // Has to get with zk fields to get active replicas field replicated_storage->getStatus(status, true); - // Should return as soon as possible if the table is dropped. + // Should return as soon as possible if the table is dropped or detached, so we will release StoragePtr bool replica_dropped = storage->is_dropped; bool all_replicas_likely_detached = status.active_replicas == 0 && !DatabaseCatalog::instance().isTableExist(storage->getStorageID(), context); if (replica_dropped || all_replicas_likely_detached) { + /// We have to exit (and release StoragePtr) if the replica is being restarted, + /// but we can retry in this case, so don't write execution status + if (storage->is_being_restarted) + throw Exception(ErrorCodes::UNFINISHED, "Cannot execute replicated DDL query, table is dropped or detached permanently"); LOG_WARNING(log, ", task {} will not be executed.", task.entry_name); task.execution_status = ExecutionStatus(ErrorCodes::UNFINISHED, "Cannot execute replicated DDL query, table is dropped or detached permanently"); return false; diff --git a/src/Interpreters/GatherFunctionQuantileVisitor.cpp b/src/Interpreters/GatherFunctionQuantileVisitor.cpp index 805fcfec181..664bb9e9383 100644 --- a/src/Interpreters/GatherFunctionQuantileVisitor.cpp +++ b/src/Interpreters/GatherFunctionQuantileVisitor.cpp @@ -1,10 +1,9 @@ #include -#include #include -#include #include + namespace DB { @@ -14,22 +13,23 @@ namespace ErrorCodes } /// Mapping from quantile functions for single value to plural -static const std::unordered_map quantile_fuse_name_mapping = { - {NameQuantile::name, NameQuantiles::name}, - {NameQuantileBFloat16::name, NameQuantilesBFloat16::name}, - {NameQuantileBFloat16Weighted::name, NameQuantilesBFloat16Weighted::name}, - {NameQuantileDeterministic::name, NameQuantilesDeterministic::name}, - {NameQuantileExact::name, NameQuantilesExact::name}, - {NameQuantileExactExclusive::name, NameQuantilesExactExclusive::name}, - {NameQuantileExactHigh::name, NameQuantilesExactHigh::name}, - {NameQuantileExactInclusive::name, NameQuantilesExactInclusive::name}, - {NameQuantileExactLow::name, NameQuantilesExactLow::name}, - {NameQuantileExactWeighted::name, NameQuantilesExactWeighted::name}, - {NameQuantileInterpolatedWeighted::name, NameQuantilesInterpolatedWeighted::name}, - {NameQuantileTDigest::name, NameQuantilesTDigest::name}, - 
{NameQuantileTDigestWeighted::name, NameQuantilesTDigestWeighted::name}, - {NameQuantileTiming::name, NameQuantilesTiming::name}, - {NameQuantileTimingWeighted::name, NameQuantilesTimingWeighted::name}, +static const std::unordered_map quantile_fuse_name_mapping = +{ + {"quantile", "quantiles"}, + {"quantileBFloat16", "quantilesBFloat16"}, + {"quantileBFloat16Weighted", "quantilesBFloat16Weighted"}, + {"quantileDeterministic", "quantilesDeterministic"}, + {"quantileExact", "quantilesExact"}, + {"quantileExactExclusive", "quantilesExactExclusive"}, + {"quantileExactHigh", "quantilesExactHigh"}, + {"quantileExactInclusive", "quantilesExactInclusive"}, + {"quantileExactLow", "quantilesExactLow"}, + {"quantileExactWeighted", "quantilesExactWeighted"}, + {"quantileInterpolatedWeighted", "quantilesInterpolatedWeighted"}, + {"quantileTDigest", "quantilesTDigest"}, + {"quantileTDigestWeighted", "quantilesTDigestWeighted"}, + {"quantileTiming", "quantilesTiming"}, + {"quantileTimingWeighted", "quantilesTimingWeighted"}, }; String GatherFunctionQuantileData::toFusedNameOrSelf(const String & func_name) @@ -63,9 +63,9 @@ void GatherFunctionQuantileData::FuseQuantileAggregatesData::addFuncNode(ASTPtr const auto & arguments = func->arguments->children; - bool need_two_args = func->name == NameQuantileDeterministic::name || func->name == NameQuantileExactWeighted::name - || func->name == NameQuantileInterpolatedWeighted::name || func->name == NameQuantileTimingWeighted::name - || func->name == NameQuantileTDigestWeighted::name || func->name == NameQuantileBFloat16Weighted::name; + bool need_two_args = func->name == "quantileDeterministic" || func->name == "quantileExactWeighted" + || func->name == "quantileInterpolatedWeighted" || func->name == "quantileTimingWeighted" + || func->name == "quantileTDigestWeighted" || func->name == "quantileBFloat16Weighted"; if (arguments.size() != (need_two_args ? 
2 : 1)) return; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index f851607000c..6eebc1450b4 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -42,6 +42,7 @@ namespace ErrorCodes extern const int TABLE_IS_READ_ONLY; extern const int BAD_ARGUMENTS; extern const int UNKNOWN_TABLE; + extern const int UNKNOWN_DATABASE; } @@ -74,9 +75,14 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) if (!UserDefinedSQLFunctionFactory::instance().empty()) UserDefinedSQLFunctionVisitor::visit(query_ptr); - auto table_id = getContext()->resolveStorageID(alter, Context::ResolveOrdinary); - query_ptr->as().setDatabase(table_id.database_name); - StoragePtr table = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); + auto table_id = getContext()->tryResolveStorageID(alter, Context::ResolveOrdinary); + StoragePtr table; + + if (table_id) + { + query_ptr->as().setDatabase(table_id.database_name); + table = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); + } if (!alter.cluster.empty() && !maybeRemoveOnCluster(query_ptr, getContext())) { @@ -90,6 +96,9 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) getContext()->checkAccess(getRequiredAccess()); + if (!table_id) + throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database {} does not exist", backQuoteIfNeed(alter.getDatabase())); + DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (database->shouldReplicateQuery(getContext(), query_ptr)) { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 252f45677ef..581d0905478 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -480,7 +480,7 @@ ASTPtr InterpreterCreateQuery::formatProjections(const ProjectionsDescription & } ColumnsDescription InterpreterCreateQuery::getColumnsDescription( - const ASTExpressionList & columns_ast, ContextPtr context_, bool attach) + const ASTExpressionList & columns_ast, ContextPtr context_, bool attach, bool is_restore_from_backup) { /// First, deduce implicit types. 
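
Editor's note: a smaller behavioural change sits in the InterpreterAlterQuery hunk above. resolveStorageID, which throws as soon as the database is unknown, is replaced by tryResolveStorageID, and the UNKNOWN_DATABASE error is only raised after the access check has run, so permission failures surface before existence errors. A minimal sketch of that reordering with placeholder helpers (tryResolve, checkAccess and executeAlter are invented names, not ClickHouse's interfaces):

#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>

struct StorageID { std::string database; std::string table; };

/// Placeholder standing in for Context::tryResolveStorageID.
std::optional<StorageID> tryResolve(const std::string & database, const std::string & table)
{
    if (database == "existing_db")
        return StorageID{database, table};
    return std::nullopt;   /// unknown database: do not throw here
}

/// Placeholder standing in for the access check; would throw for unauthorized users.
void checkAccess(const std::string & /*database*/, const std::string & /*table*/) {}

void executeAlter(const std::string & database, const std::string & table)
{
    /// 1. Try to resolve, but do not fail yet if the database is unknown.
    std::optional<StorageID> id = tryResolve(database, table);

    /// 2. Access rights are validated before any existence error is reported.
    checkAccess(database, table);

    /// 3. Only now does a missing database surface (UNKNOWN_DATABASE in the patch).
    if (!id)
        throw std::runtime_error("Database " + database + " does not exist");

    std::cout << "altering " << id->database << '.' << id->table << '\n';
}

int main()
{
    executeAlter("existing_db", "t");
    try
    {
        executeAlter("missing_db", "t");
    }
    catch (const std::exception & e)
    {
        std::cout << e.what() << '\n';
    }
}
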
@@ -489,7 +489,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( ASTPtr default_expr_list = std::make_shared(); NamesAndTypesList column_names_and_types; - bool make_columns_nullable = !attach && context_->getSettingsRef().data_type_default_nullable; + bool make_columns_nullable = !attach && !is_restore_from_backup && context_->getSettingsRef().data_type_default_nullable; for (const auto & ast : columns_ast.children) { @@ -645,7 +645,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( res.add(std::move(column)); } - if (!attach && context_->getSettingsRef().flatten_nested) + if (!attach && !is_restore_from_backup && context_->getSettingsRef().flatten_nested) res.flattenNested(); if (res.getAllPhysical().empty()) @@ -692,7 +692,7 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti if (create.columns_list->columns) { - properties.columns = getColumnsDescription(*create.columns_list->columns, getContext(), create.attach); + properties.columns = getColumnsDescription(*create.columns_list->columns, getContext(), create.attach, is_restore_from_backup); } if (create.columns_list->indices) @@ -752,7 +752,6 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti } else if (create.select) { - Block as_select_sample; if (getContext()->getSettingsRef().allow_experimental_analyzer) @@ -1077,7 +1076,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, create.getTable()); create.setDatabase(database_name); guard->releaseTableLock(); - return database->tryEnqueueReplicatedDDL(query_ptr, getContext(), internal); + return database->tryEnqueueReplicatedDDL(query_ptr, getContext(), QueryFlags{ .internal = internal, .distributed_backup_restore = is_restore_from_backup }); } if (!create.cluster.empty()) @@ -1225,15 +1224,15 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) DatabasePtr database; bool need_add_to_database = !create.temporary; if (need_add_to_database) - database = DatabaseCatalog::instance().getDatabase(database_name); + database = DatabaseCatalog::instance().tryGetDatabase(database_name); - if (need_add_to_database && database->shouldReplicateQuery(getContext(), query_ptr)) + if (need_add_to_database && database && database->shouldReplicateQuery(getContext(), query_ptr)) { chassert(!ddl_guard); auto guard = DatabaseCatalog::instance().getDDLGuard(create.getDatabase(), create.getTable()); assertOrSetUUID(create, database); guard->releaseTableLock(); - return database->tryEnqueueReplicatedDDL(query_ptr, getContext(), internal); + return database->tryEnqueueReplicatedDDL(query_ptr, getContext(), QueryFlags{ .internal = internal, .distributed_backup_restore = is_restore_from_backup }); } if (!create.cluster.empty()) @@ -1242,6 +1241,9 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) return executeQueryOnCluster(create); } + if (need_add_to_database && !database) + throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database {} does not exist", backQuoteIfNeed(database_name)); + if (create.replace_table) { chassert(!ddl_guard); diff --git a/src/Interpreters/InterpreterCreateQuery.h b/src/Interpreters/InterpreterCreateQuery.h index 67339dea928..0843a7ad15a 100644 --- a/src/Interpreters/InterpreterCreateQuery.h +++ b/src/Interpreters/InterpreterCreateQuery.h @@ -66,9 +66,14 @@ public: need_ddl_guard = false; } + void setIsRestoreFromBackup(bool is_restore_from_backup_) + { + 
is_restore_from_backup = is_restore_from_backup_; + } + /// Obtain information about columns, their types, default values and column comments, /// for case when columns in CREATE query is specified explicitly. - static ColumnsDescription getColumnsDescription(const ASTExpressionList & columns, ContextPtr context, bool attach); + static ColumnsDescription getColumnsDescription(const ASTExpressionList & columns, ContextPtr context, bool attach, bool is_restore_from_backup); static ConstraintsDescription getConstraintsDescription(const ASTExpressionList * constraints); static void prepareOnClusterQuery(ASTCreateQuery & create, ContextPtr context, const String & cluster_name); @@ -116,6 +121,7 @@ private: bool force_attach = false; bool load_database_without_tables = false; bool need_ddl_guard = true; + bool is_restore_from_backup = false; mutable String as_database_saved; mutable String as_table_saved; diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index dd52b6c2e14..b8c9d5dabb5 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -267,7 +267,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(ContextPtr context_, ASTDropQue bool check_loading_deps = !check_ref_deps && getContext()->getSettingsRef().check_table_dependencies; DatabaseCatalog::instance().checkTableCanBeRemovedOrRenamed(table_id, check_ref_deps, check_loading_deps, is_drop_or_detach_database); - table->flushAndShutdown(); + table->flushAndShutdown(true); TableExclusiveLockHolder table_lock; if (database->getUUID() == UUIDHelpers::Nil) diff --git a/src/Interpreters/InterpreterKillQueryQuery.cpp b/src/Interpreters/InterpreterKillQueryQuery.cpp index 1c2e3ff6777..6e1422f2938 100644 --- a/src/Interpreters/InterpreterKillQueryQuery.cpp +++ b/src/Interpreters/InterpreterKillQueryQuery.cpp @@ -420,7 +420,7 @@ Block InterpreterKillQueryQuery::getSelectResult(const String & columns, const S if (where_expression) select_query += " WHERE " + queryToString(where_expression); - auto io = executeQuery(select_query, getContext(), true).second; + auto io = executeQuery(select_query, getContext(), QueryFlags{ .internal = true }).second; PullingPipelineExecutor executor(io.pipeline); Block res; while (!res && executor.pull(res)); diff --git a/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp b/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp index b8cace5e0ad..f6f8b346a2d 100644 --- a/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp +++ b/src/Interpreters/InterpreterSelectQueryAnalyzer.cpp @@ -257,4 +257,10 @@ void InterpreterSelectQueryAnalyzer::addStorageLimits(const StorageLimitsList & planner.addStorageLimits(storage_limits); } +void InterpreterSelectQueryAnalyzer::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & /*ast*/, ContextPtr /*context*/) const +{ + for (const auto & used_row_policy : planner.getUsedRowPolicies()) + elem.used_row_policies.emplace(used_row_policy); +} + } diff --git a/src/Interpreters/InterpreterSelectQueryAnalyzer.h b/src/Interpreters/InterpreterSelectQueryAnalyzer.h index 37ec04abecd..15ed2b25fba 100644 --- a/src/Interpreters/InterpreterSelectQueryAnalyzer.h +++ b/src/Interpreters/InterpreterSelectQueryAnalyzer.h @@ -57,14 +57,16 @@ public: void addStorageLimits(const StorageLimitsList & storage_limits); + void extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & /*ast*/, ContextPtr /*context*/) const override; + bool supportsTransactions() const override { return true; } bool 
ignoreLimits() const override { return select_query_options.ignore_limits; } bool ignoreQuota() const override { return select_query_options.ignore_quota; } - const Planner & getPlanner() const { return planner; } + Planner & getPlanner() { return planner; } const QueryTreeNodePtr & getQueryTree() const { return query_tree; } diff --git a/src/Interpreters/InterpreterShowColumnsQuery.cpp b/src/Interpreters/InterpreterShowColumnsQuery.cpp index c8fb64e37f2..a5b22387448 100644 --- a/src/Interpreters/InterpreterShowColumnsQuery.cpp +++ b/src/Interpreters/InterpreterShowColumnsQuery.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -25,8 +26,10 @@ String InterpreterShowColumnsQuery::getRewrittenQuery() { const auto & query = query_ptr->as(); + ClientInfo::Interface client_interface = getContext()->getClientInfo().interface; + const bool use_mysql_types = (client_interface == ClientInfo::Interface::MYSQL); // connection made through MySQL wire protocol + const auto & settings = getContext()->getSettingsRef(); - const bool use_mysql_types = settings.use_mysql_types_in_show_columns; const bool remap_string_as_text = settings.mysql_map_string_to_text_in_show_columns; const bool remap_fixed_string_as_text = settings.mysql_map_fixed_string_to_text_in_show_columns; @@ -39,7 +42,6 @@ String InterpreterShowColumnsQuery::getRewrittenQuery() if (use_mysql_types) { /// Cheapskate SQL-based mapping from native types to MySQL types, see https://dev.mysql.com/doc/refman/8.0/en/data-types.html - /// Only used with setting 'use_mysql_types_in_show_columns = 1' /// Known issues: /// - Enums are translated to TEXT rewritten_query += fmt::format( @@ -159,7 +161,7 @@ WHERE BlockIO InterpreterShowColumnsQuery::execute() { - return executeQuery(getRewrittenQuery(), getContext(), true).second; + return executeQuery(getRewrittenQuery(), getContext(), QueryFlags{ .internal = true }).second; } diff --git a/src/Interpreters/InterpreterShowEngineQuery.cpp b/src/Interpreters/InterpreterShowEngineQuery.cpp index a2367e9bfdf..2927fbd0f2d 100644 --- a/src/Interpreters/InterpreterShowEngineQuery.cpp +++ b/src/Interpreters/InterpreterShowEngineQuery.cpp @@ -12,7 +12,7 @@ namespace DB BlockIO InterpreterShowEnginesQuery::execute() { - return executeQuery("SELECT * FROM system.table_engines ORDER BY name", getContext(), true).second; + return executeQuery("SELECT * FROM system.table_engines ORDER BY name", getContext(), QueryFlags{ .internal = true }).second; } } diff --git a/src/Interpreters/InterpreterShowFunctionsQuery.cpp b/src/Interpreters/InterpreterShowFunctionsQuery.cpp index ace22ca4bb6..a9da01b0988 100644 --- a/src/Interpreters/InterpreterShowFunctionsQuery.cpp +++ b/src/Interpreters/InterpreterShowFunctionsQuery.cpp @@ -15,7 +15,7 @@ InterpreterShowFunctionsQuery::InterpreterShowFunctionsQuery(const ASTPtr & quer BlockIO InterpreterShowFunctionsQuery::execute() { - return executeQuery(getRewrittenQuery(), getContext(), true).second; + return executeQuery(getRewrittenQuery(), getContext(), QueryFlags{ .internal = true }).second; } String InterpreterShowFunctionsQuery::getRewrittenQuery() diff --git a/src/Interpreters/InterpreterShowIndexesQuery.cpp b/src/Interpreters/InterpreterShowIndexesQuery.cpp index 549afd32506..09b70e951db 100644 --- a/src/Interpreters/InterpreterShowIndexesQuery.cpp +++ b/src/Interpreters/InterpreterShowIndexesQuery.cpp @@ -101,7 +101,7 @@ ORDER BY index_type, expression, column_name, seq_in_index;)", database, table, BlockIO InterpreterShowIndexesQuery::execute() { - 
return executeQuery(getRewrittenQuery(), getContext(), true).second; + return executeQuery(getRewrittenQuery(), getContext(), QueryFlags{ .internal = true }).second; } diff --git a/src/Interpreters/InterpreterShowProcesslistQuery.cpp b/src/Interpreters/InterpreterShowProcesslistQuery.cpp index 4ed5f4171c6..f711cc0dac9 100644 --- a/src/Interpreters/InterpreterShowProcesslistQuery.cpp +++ b/src/Interpreters/InterpreterShowProcesslistQuery.cpp @@ -12,7 +12,7 @@ namespace DB BlockIO InterpreterShowProcesslistQuery::execute() { - return executeQuery("SELECT * FROM system.processes ORDER BY elapsed DESC", getContext(), true).second; + return executeQuery("SELECT * FROM system.processes ORDER BY elapsed DESC", getContext(), QueryFlags{ .internal = true }).second; } } diff --git a/src/Interpreters/InterpreterShowSettingQuery.cpp b/src/Interpreters/InterpreterShowSettingQuery.cpp index 7567e77d28f..45e9b8a1f1c 100644 --- a/src/Interpreters/InterpreterShowSettingQuery.cpp +++ b/src/Interpreters/InterpreterShowSettingQuery.cpp @@ -26,9 +26,8 @@ String InterpreterShowSettingQuery::getRewrittenQuery() BlockIO InterpreterShowSettingQuery::execute() { - return executeQuery(getRewrittenQuery(), getContext(), true).second; + return executeQuery(getRewrittenQuery(), getContext(), QueryFlags{ .internal = true }).second; } } - diff --git a/src/Interpreters/InterpreterShowTablesQuery.cpp b/src/Interpreters/InterpreterShowTablesQuery.cpp index a8db55a317a..0ca6578128d 100644 --- a/src/Interpreters/InterpreterShowTablesQuery.cpp +++ b/src/Interpreters/InterpreterShowTablesQuery.cpp @@ -214,7 +214,7 @@ BlockIO InterpreterShowTablesQuery::execute() return res; } - return executeQuery(getRewrittenQuery(), getContext(), true).second; + return executeQuery(getRewrittenQuery(), getContext(), QueryFlags{ .internal = true }).second; } /// (*) Sorting is strictly speaking not necessary but 1. it is convenient for users, 2. 
SQL currently does not allow to diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 07a1ae7d170..ea0e95c2b27 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -728,6 +728,11 @@ StoragePtr InterpreterSystemQuery::tryRestartReplica(const StorageID & replica, if (!table || !dynamic_cast(table.get())) return nullptr; + SCOPE_EXIT({ + if (table) + table->is_being_restarted = false; + }); + table->is_being_restarted = true; table->flushAndShutdown(); { /// If table was already dropped by anyone, an exception will be thrown @@ -745,7 +750,7 @@ StoragePtr InterpreterSystemQuery::tryRestartReplica(const StorageID & replica, auto & create = create_ast->as(); create.attach = true; - auto columns = InterpreterCreateQuery::getColumnsDescription(*create.columns_list->columns, system_context, true); + auto columns = InterpreterCreateQuery::getColumnsDescription(*create.columns_list->columns, system_context, true, false); auto constraints = InterpreterCreateQuery::getConstraintsDescription(create.columns_list->constraints); auto data_path = database->getTableDataPath(create); diff --git a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.h b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.h index 27f467a12ae..484fd6a0207 100644 --- a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.h +++ b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.h @@ -75,7 +75,7 @@ public: ASTs rewritten_queries = InterpreterImpl::getRewrittenQueries(query, getContext(), mapped_to_database, mysql_database); for (const auto & rewritten_query : rewritten_queries) - executeQuery("/* Rewritten MySQL DDL Query */ " + queryToString(rewritten_query), getContext(), true); + executeQuery("/* Rewritten MySQL DDL Query */ " + queryToString(rewritten_query), getContext(), QueryFlags{ .internal = true }); return BlockIO{}; } diff --git a/src/Interpreters/addMissingDefaults.cpp b/src/Interpreters/addMissingDefaults.cpp index 3febcfc74a8..fbf17d7efb7 100644 --- a/src/Interpreters/addMissingDefaults.cpp +++ b/src/Interpreters/addMissingDefaults.cpp @@ -8,8 +8,7 @@ #include #include #include -#include -#include +#include namespace DB diff --git a/src/Interpreters/castColumn.cpp b/src/Interpreters/castColumn.cpp index 44e669a21ab..906dfb84b14 100644 --- a/src/Interpreters/castColumn.cpp +++ b/src/Interpreters/castColumn.cpp @@ -1,13 +1,16 @@ #include - -#include #include +#include +#include +#include +#include +#include + namespace DB { -template -static ColumnPtr castColumn(const ColumnWithTypeAndName & arg, const DataTypePtr & type, InternalCastFunctionCache * cache = nullptr) +static ColumnPtr castColumn(CastType cast_type, const ColumnWithTypeAndName & arg, const DataTypePtr & type, InternalCastFunctionCache * cache = nullptr) { if (arg.type->equals(*type) && cast_type != CastType::accurateOrNull) return arg.column; @@ -23,37 +26,34 @@ static ColumnPtr castColumn(const ColumnWithTypeAndName & arg, const DataTypePtr "" } }; - auto get_cast_func = [&arguments] + auto get_cast_func = [cast_type, &arguments] { - FunctionOverloadResolverPtr func_builder_cast = CastInternalOverloadResolver::createImpl(); + + FunctionOverloadResolverPtr func_builder_cast = createInternalCastOverloadResolver(cast_type, {}); return func_builder_cast->build(arguments); }; FunctionBasePtr func_cast = cache ? 
cache->getOrSet(cast_type, from_name, to_name, std::move(get_cast_func)) : get_cast_func(); - if constexpr (cast_type == CastType::accurateOrNull) - { + if (cast_type == CastType::accurateOrNull) return func_cast->execute(arguments, makeNullable(type), arg.column->size()); - } else - { return func_cast->execute(arguments, type, arg.column->size()); - } } ColumnPtr castColumn(const ColumnWithTypeAndName & arg, const DataTypePtr & type, InternalCastFunctionCache * cache) { - return castColumn(arg, type, cache); + return castColumn(CastType::nonAccurate, arg, type, cache); } ColumnPtr castColumnAccurate(const ColumnWithTypeAndName & arg, const DataTypePtr & type, InternalCastFunctionCache * cache) { - return castColumn(arg, type, cache); + return castColumn(CastType::accurate, arg, type, cache); } ColumnPtr castColumnAccurateOrNull(const ColumnWithTypeAndName & arg, const DataTypePtr & type, InternalCastFunctionCache * cache) { - return castColumn(arg, type, cache); + return castColumn(CastType::accurateOrNull, arg, type, cache); } } diff --git a/src/Interpreters/castColumn.h b/src/Interpreters/castColumn.h index 8d2c05025bb..b9ed3403d0f 100644 --- a/src/Interpreters/castColumn.h +++ b/src/Interpreters/castColumn.h @@ -2,11 +2,15 @@ #include #include -#include +#include + namespace DB { +class IFunctionBase; +using FunctionBasePtr = std::shared_ptr; + struct InternalCastFunctionCache { private: @@ -15,7 +19,7 @@ private: std::map, FunctionBasePtr> impl; mutable std::mutex mutex; public: - template + template FunctionBasePtr getOrSet(CastType cast_type, const String & from, const String & to, Getter && getter) { std::lock_guard lock{mutex}; diff --git a/src/Interpreters/examples/string_hash_map.cpp b/src/Interpreters/examples/string_hash_map.cpp index f55ed983fbc..58bf7858b39 100644 --- a/src/Interpreters/examples/string_hash_map.cpp +++ b/src/Interpreters/examples/string_hash_map.cpp @@ -14,24 +14,25 @@ /** +#include #include #include +#include using namespace std; int main() { std::string s; - std::random_device dev; - std::mt19937 rng(dev()); - std::uniform_int_distribution dist(0, 25); - std::binomial_distribution binomial1(100, 0.01); - std::binomial_distribution binomial2(100, 0.02); - std::binomial_distribution binomial4(100, 0.04); - std::binomial_distribution binomial8(100, 0.08); - std::binomial_distribution binomial16(100, 0.16); - std::binomial_distribution binomial24(100, 0.24); - std::binomial_distribution binomial48(100, 0.48); + pcg64_fast rng{randomSeed()}; + std::uniform_int_distribution dist(0, 25); + std::binomial_distribution binomial1(100, 0.01); + std::binomial_distribution binomial2(100, 0.02); + std::binomial_distribution binomial4(100, 0.04); + std::binomial_distribution binomial8(100, 0.08); + std::binomial_distribution binomial16(100, 0.16); + std::binomial_distribution binomial24(100, 0.24); + std::binomial_distribution binomial48(100, 0.48); // 11GB std::ofstream f("/tmp/terms.csv"); size_t l1, l2, l4, l8, l16, l24, l48; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index a5c0d9421d9..8cd3c8ab848 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -646,10 +647,12 @@ static std::tuple executeQueryImpl( const char * begin, const char * end, ContextMutablePtr context, - bool internal, + QueryFlags flags, QueryProcessingStage::Enum stage, ReadBuffer * istr) { + const bool internal = flags.internal; + /// 
query_span is a special span, when this function exits, it's lifetime is not ended, but ends when the query finishes. /// Some internal queries might call this function recursively by setting 'internal' parameter to 'true', /// to make sure SpanHolders in current stack ends in correct order, we disable this span for these internal queries @@ -1085,6 +1088,9 @@ static std::tuple executeQueryImpl( insert_interpreter->addBuffer(std::move(insert_data_buffer_holder)); } + if (auto * create_interpreter = typeid_cast(&*interpreter)) + create_interpreter->setIsRestoreFromBackup(flags.distributed_backup_restore); + { std::unique_ptr span; if (OpenTelemetry::CurrentContext().isTraceEnabled()) @@ -1257,13 +1263,13 @@ static std::tuple executeQueryImpl( std::pair executeQuery( const String & query, ContextMutablePtr context, - bool internal, + QueryFlags flags, QueryProcessingStage::Enum stage) { ASTPtr ast; BlockIO res; - std::tie(ast, res) = executeQueryImpl(query.data(), query.data() + query.size(), context, internal, stage, nullptr); + std::tie(ast, res) = executeQueryImpl(query.data(), query.data() + query.size(), context, flags, stage, nullptr); if (const auto * ast_query_with_output = dynamic_cast(ast.get())) { @@ -1284,6 +1290,7 @@ void executeQuery( bool allow_into_outfile, ContextMutablePtr context, SetResultDetailsFunc set_result_details, + QueryFlags flags, const std::optional & output_format_settings, HandleExceptionInOutputFormatFunc handle_exception_in_output_format) { @@ -1372,7 +1379,7 @@ void executeQuery( try { - std::tie(ast, streams) = executeQueryImpl(begin, end, context, false, QueryProcessingStage::Complete, &istr); + std::tie(ast, streams) = executeQueryImpl(begin, end, context, flags, QueryProcessingStage::Complete, &istr); } catch (...) { diff --git a/src/Interpreters/executeQuery.h b/src/Interpreters/executeQuery.h index 6f14f54d7d6..0f599922668 100644 --- a/src/Interpreters/executeQuery.h +++ b/src/Interpreters/executeQuery.h @@ -29,6 +29,13 @@ struct QueryResultDetails using SetResultDetailsFunc = std::function; using HandleExceptionInOutputFormatFunc = std::function; +struct QueryFlags +{ + bool internal = false; /// If true, this query is caused by another query and thus needn't be registered in the ProcessList. + bool distributed_backup_restore = false; /// If true, this query is a part of backup restore. +}; + + /// Parse and execute a query. void executeQuery( ReadBuffer & istr, /// Where to read query from (and data for INSERT, if present). @@ -36,6 +43,7 @@ void executeQuery( bool allow_into_outfile, /// If true and the query contains INTO OUTFILE section, redirect output to that file. ContextMutablePtr context, /// DB, tables, data types, storage engines, functions, aggregate functions... SetResultDetailsFunc set_result_details, /// If a non-empty callback is passed, it will be called with the query id, the content-type, the format, and the timezone. + QueryFlags flags = {}, const std::optional & output_format_settings = std::nullopt, /// Format settings for output format, will be calculated from the context if not set. HandleExceptionInOutputFormatFunc handle_exception_in_output_format = {} /// If a non-empty callback is passed, it will be called on exception with created output format. ); @@ -58,7 +66,7 @@ void executeQuery( std::pair executeQuery( const String & query, /// Query text without INSERT data. The latter must be written to BlockIO::out. ContextMutablePtr context, /// DB, tables, data types, storage engines, functions, aggregate functions... 
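A minimal usage sketch of the new QueryFlags argument (not part of the patch; 'context' stands for any ContextMutablePtr the caller already holds, and 'create_query_text' for the DDL text being replayed):

#include <Interpreters/executeQuery.h>

/// Internal query: not registered in the ProcessList.
auto io = executeQuery("SELECT 1", context, QueryFlags{ .internal = true }).second;

/// DDL replayed during a distributed backup RESTORE: executeQueryImpl() forwards the flag
/// to InterpreterCreateQuery::setIsRestoreFromBackup() for this query.
auto restore_io = executeQuery(create_query_text, context,
                               QueryFlags{ .internal = true, .distributed_backup_restore = true }).second;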
- bool internal = false, /// If true, this query is caused by another query and thus needn't be registered in the ProcessList. + QueryFlags flags = {}, QueryProcessingStage::Enum stage = QueryProcessingStage::Complete /// To which stage the query must be executed. ); diff --git a/src/Interpreters/fuzzers/execute_query_fuzzer.cpp b/src/Interpreters/fuzzers/execute_query_fuzzer.cpp index 0f6bfc1ae58..40e2325e46e 100644 --- a/src/Interpreters/fuzzers/execute_query_fuzzer.cpp +++ b/src/Interpreters/fuzzers/execute_query_fuzzer.cpp @@ -42,7 +42,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) static bool initialized = initialize(); (void) initialized; - auto io = DB::executeQuery(input, context, true, QueryProcessingStage::Complete).second; + auto io = DB::executeQuery(input, context, QueryFlags{ .internal = true }, QueryProcessingStage::Complete).second; PullingPipelineExecutor executor(io.pipeline); Block res; diff --git a/src/Interpreters/loadMetadata.cpp b/src/Interpreters/loadMetadata.cpp index faa1dcda2c0..3612dbfdc4e 100644 --- a/src/Interpreters/loadMetadata.cpp +++ b/src/Interpreters/loadMetadata.cpp @@ -282,7 +282,7 @@ static void convertOrdinaryDatabaseToAtomic(Poco::Logger * log, ContextMutablePt LOG_INFO(log, "Will convert database {} from Ordinary to Atomic", name_quoted); String create_database_query = fmt::format("CREATE DATABASE IF NOT EXISTS {}", tmp_name_quoted); - auto res = executeQuery(create_database_query, context, true).second; + auto res = executeQuery(create_database_query, context, QueryFlags{ .internal = true }).second; executeTrivialBlockIO(res, context); res = {}; auto tmp_database = DatabaseCatalog::instance().getDatabase(tmp_name); @@ -322,7 +322,7 @@ static void convertOrdinaryDatabaseToAtomic(Poco::Logger * log, ContextMutablePt String tmp_qualified_quoted_name = id.getFullTableName(); String move_table_query = fmt::format("RENAME TABLE {} TO {}", qualified_quoted_name, tmp_qualified_quoted_name); - res = executeQuery(move_table_query, context, true).second; + res = executeQuery(move_table_query, context, QueryFlags{ .internal = true }).second; executeTrivialBlockIO(res, context); res = {}; } @@ -334,12 +334,12 @@ static void convertOrdinaryDatabaseToAtomic(Poco::Logger * log, ContextMutablePt String drop_query = fmt::format("DROP DATABASE {}", name_quoted); context->setSetting("force_remove_data_recursively_on_drop", false); - res = executeQuery(drop_query, context, true).second; + res = executeQuery(drop_query, context, QueryFlags{ .internal = true }).second; executeTrivialBlockIO(res, context); res = {}; String rename_query = fmt::format("RENAME DATABASE {} TO {}", tmp_name_quoted, name_quoted); - res = executeQuery(rename_query, context, true).second; + res = executeQuery(rename_query, context, QueryFlags{ .internal = true }).second; executeTrivialBlockIO(res, context); LOG_INFO(log, "Finished database engine conversion of {}", name_quoted); @@ -409,7 +409,7 @@ static void maybeConvertOrdinaryDatabaseToAtomic(ContextMutablePtr context, cons /// Reload database just in case (and update logger name) String detach_query = fmt::format("DETACH DATABASE {}", backQuoteIfNeed(database_name)); - auto res = executeQuery(detach_query, context, true).second; + auto res = executeQuery(detach_query, context, QueryFlags{ .internal = true }).second; executeTrivialBlockIO(res, context); res = {}; diff --git a/src/Interpreters/parseColumnsListForTableFunction.cpp b/src/Interpreters/parseColumnsListForTableFunction.cpp index 
196053fe509..87f76f7f824 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.cpp +++ b/src/Interpreters/parseColumnsListForTableFunction.cpp @@ -73,7 +73,7 @@ ColumnsDescription parseColumnsListFromString(const std::string & structure, con if (!columns_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not cast AST to ASTExpressionList"); - auto columns = InterpreterCreateQuery::getColumnsDescription(*columns_list, context, false); + auto columns = InterpreterCreateQuery::getColumnsDescription(*columns_list, context, false, false); auto validation_settings = DataTypeValidationSettings(context->getSettingsRef()); for (const auto & [name, type] : columns.getAll()) validateDataType(type, validation_settings); @@ -100,7 +100,7 @@ bool tryParseColumnsListFromString(const std::string & structure, ColumnsDescrip try { - columns = InterpreterCreateQuery::getColumnsDescription(*columns_list, context, false); + columns = InterpreterCreateQuery::getColumnsDescription(*columns_list, context, false, false); auto validation_settings = DataTypeValidationSettings(context->getSettingsRef()); for (const auto & [name, type] : columns.getAll()) validateDataType(type, validation_settings); diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index 955320c318c..aba6803a8ab 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -486,6 +486,11 @@ bool ASTAlterQuery::isDropPartitionAlter() const return isOneCommandTypeOnly(ASTAlterCommand::DROP_PARTITION) || isOneCommandTypeOnly(ASTAlterCommand::DROP_DETACHED_PARTITION); } +bool ASTAlterQuery::isCommentAlter() const +{ + return isOneCommandTypeOnly(ASTAlterCommand::COMMENT_COLUMN) || isOneCommandTypeOnly(ASTAlterCommand::MODIFY_COMMENT); +} + bool ASTAlterQuery::isMovePartitionToDiskOrVolumeAlter() const { if (command_list) diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index 30cf0cac4ce..f8e3fb96dd3 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -239,6 +239,8 @@ public: bool isMovePartitionToDiskOrVolumeAlter() const; + bool isCommentAlter() const; + String getID(char) const override; ASTPtr clone() const override; diff --git a/src/Parsers/ASTColumnsMatcher.cpp b/src/Parsers/ASTColumnsMatcher.cpp index aff7d9fa833..30b172ecbb8 100644 --- a/src/Parsers/ASTColumnsMatcher.cpp +++ b/src/Parsers/ASTColumnsMatcher.cpp @@ -46,11 +46,11 @@ void ASTColumnsRegexpMatcher::appendColumnName(WriteBuffer & ostr) const writeChar(')', ostr); } -void ASTColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state) const +void ASTColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(original_pattern.size()); hash_state.update(original_pattern); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } void ASTColumnsRegexpMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const @@ -201,11 +201,11 @@ const std::shared_ptr & ASTQualifiedColumnsRegexpMatcher::getMatcher() return column_matcher; } -void ASTQualifiedColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state) const +void ASTQualifiedColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(original_pattern.size()); hash_state.update(original_pattern); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } void ASTQualifiedColumnsRegexpMatcher::formatImpl(const 
FormatSettings & settings, FormatState & state, FormatStateStacked frame) const diff --git a/src/Parsers/ASTColumnsMatcher.h b/src/Parsers/ASTColumnsMatcher.h index f31a8bd9a22..6fc5581a4eb 100644 --- a/src/Parsers/ASTColumnsMatcher.h +++ b/src/Parsers/ASTColumnsMatcher.h @@ -27,7 +27,7 @@ public: const String & getPattern() const; const std::shared_ptr & getMatcher() const; bool isColumnMatching(const String & column_name) const; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; ASTPtr expression; ASTPtr transformers; @@ -65,7 +65,7 @@ public: const std::shared_ptr & getMatcher() const; void setPattern(String pattern, bool set_matcher = true); void setMatcher(std::shared_ptr matcher); - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; ASTPtr qualifier; ASTPtr transformers; diff --git a/src/Parsers/ASTColumnsTransformers.cpp b/src/Parsers/ASTColumnsTransformers.cpp index 27d56dec283..6976683678e 100644 --- a/src/Parsers/ASTColumnsTransformers.cpp +++ b/src/Parsers/ASTColumnsTransformers.cpp @@ -151,15 +151,15 @@ void ASTColumnsApplyTransformer::appendColumnName(WriteBuffer & ostr) const } } -void ASTColumnsApplyTransformer::updateTreeHashImpl(SipHash & hash_state) const +void ASTColumnsApplyTransformer::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(func_name.size()); hash_state.update(func_name); if (parameters) - parameters->updateTreeHashImpl(hash_state); + parameters->updateTreeHashImpl(hash_state, ignore_aliases); if (lambda) - lambda->updateTreeHashImpl(hash_state); + lambda->updateTreeHashImpl(hash_state, ignore_aliases); hash_state.update(lambda_arg.size()); hash_state.update(lambda_arg); @@ -167,7 +167,7 @@ void ASTColumnsApplyTransformer::updateTreeHashImpl(SipHash & hash_state) const hash_state.update(column_name_prefix.size()); hash_state.update(column_name_prefix); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } void ASTColumnsExceptTransformer::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const @@ -216,13 +216,13 @@ void ASTColumnsExceptTransformer::appendColumnName(WriteBuffer & ostr) const writeChar(')', ostr); } -void ASTColumnsExceptTransformer::updateTreeHashImpl(SipHash & hash_state) const +void ASTColumnsExceptTransformer::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(is_strict); hash_state.update(original_pattern.size()); hash_state.update(original_pattern); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } void ASTColumnsExceptTransformer::transform(ASTs & nodes) const @@ -312,14 +312,14 @@ void ASTColumnsReplaceTransformer::Replacement::appendColumnName(WriteBuffer & o writeProbablyBackQuotedString(name, ostr); } -void ASTColumnsReplaceTransformer::Replacement::updateTreeHashImpl(SipHash & hash_state) const +void ASTColumnsReplaceTransformer::Replacement::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { assert(children.size() == 1); hash_state.update(name.size()); hash_state.update(name); - children[0]->updateTreeHashImpl(hash_state); - IAST::updateTreeHashImpl(hash_state); + children[0]->updateTreeHashImpl(hash_state, ignore_aliases); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } void 
ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const @@ -361,10 +361,10 @@ void ASTColumnsReplaceTransformer::appendColumnName(WriteBuffer & ostr) const writeChar(')', ostr); } -void ASTColumnsReplaceTransformer::updateTreeHashImpl(SipHash & hash_state) const +void ASTColumnsReplaceTransformer::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(is_strict); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } void ASTColumnsReplaceTransformer::replaceChildren(ASTPtr & node, const ASTPtr & replacement, const String & name) diff --git a/src/Parsers/ASTColumnsTransformers.h b/src/Parsers/ASTColumnsTransformers.h index e42949ebfd8..a2a138e13c9 100644 --- a/src/Parsers/ASTColumnsTransformers.h +++ b/src/Parsers/ASTColumnsTransformers.h @@ -48,7 +48,7 @@ public: } void transform(ASTs & nodes) const override; void appendColumnName(WriteBuffer & ostr) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; // Case 1 APPLY (quantile(0.9)) String func_name; @@ -80,7 +80,7 @@ public: const std::shared_ptr & getMatcher() const; bool isColumnMatching(const String & column_name) const; void appendColumnName(WriteBuffer & ostr) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; protected: void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; @@ -103,7 +103,7 @@ public: } void appendColumnName(WriteBuffer & ostr) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; String name; @@ -121,7 +121,7 @@ public: } void transform(ASTs & nodes) const override; void appendColumnName(WriteBuffer & ostr) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; protected: void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index 267148ee62b..80d9f2fb4a5 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -599,11 +599,11 @@ ASTPtr ASTFunction::clone() const } -void ASTFunction::updateTreeHashImpl(SipHash & hash_state) const +void ASTFunction::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(name.size()); hash_state.update(name); - IAST::updateTreeHashImpl(hash_state); + ASTWithAlias::updateTreeHashImpl(hash_state, ignore_aliases); } template diff --git a/src/Parsers/ASTFunction.h b/src/Parsers/ASTFunction.h index 4a036c5e94a..fe30b7c6e95 100644 --- a/src/Parsers/ASTFunction.h +++ b/src/Parsers/ASTFunction.h @@ -63,7 +63,7 @@ public: ASTPtr clone() const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; ASTSelectWithUnionQuery * tryGetQueryArgument() const; diff --git a/src/Parsers/ASTFunctionWithKeyValueArguments.cpp b/src/Parsers/ASTFunctionWithKeyValueArguments.cpp index 2c28e342610..a5467bef363 100644 --- a/src/Parsers/ASTFunctionWithKeyValueArguments.cpp +++ 
b/src/Parsers/ASTFunctionWithKeyValueArguments.cpp @@ -53,12 +53,12 @@ bool ASTPair::hasSecretParts() const } -void ASTPair::updateTreeHashImpl(SipHash & hash_state) const +void ASTPair::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(first.size()); hash_state.update(first); hash_state.update(second_with_brackets); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } @@ -92,12 +92,12 @@ void ASTFunctionWithKeyValueArguments::formatImpl(const FormatSettings & setting } -void ASTFunctionWithKeyValueArguments::updateTreeHashImpl(SipHash & hash_state) const +void ASTFunctionWithKeyValueArguments::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(name.size()); hash_state.update(name); hash_state.update(has_brackets); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } } diff --git a/src/Parsers/ASTFunctionWithKeyValueArguments.h b/src/Parsers/ASTFunctionWithKeyValueArguments.h index 75a8ae0415e..ec2a793154f 100644 --- a/src/Parsers/ASTFunctionWithKeyValueArguments.h +++ b/src/Parsers/ASTFunctionWithKeyValueArguments.h @@ -32,7 +32,7 @@ public: bool hasSecretParts() const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; void forEachPointerToChild(std::function f) override { @@ -66,7 +66,7 @@ public: void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; }; } diff --git a/src/Parsers/ASTIdentifier.cpp b/src/Parsers/ASTIdentifier.cpp index 042b4d9085d..80a618170c6 100644 --- a/src/Parsers/ASTIdentifier.cpp +++ b/src/Parsers/ASTIdentifier.cpp @@ -87,6 +87,11 @@ void ASTIdentifier::setShortName(const String & new_name) semantic->table = table; } +void ASTIdentifier::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const +{ + ASTWithAlias::updateTreeHashImpl(hash_state, ignore_aliases); +} + const String & ASTIdentifier::name() const { if (children.empty()) @@ -244,10 +249,10 @@ void ASTTableIdentifier::resetTable(const String & database_name, const String & uuid = identifier->uuid; } -void ASTTableIdentifier::updateTreeHashImpl(SipHash & hash_state) const +void ASTTableIdentifier::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(uuid); - IAST::updateTreeHashImpl(hash_state); + ASTIdentifier::updateTreeHashImpl(hash_state, ignore_aliases); } String getIdentifierName(const IAST * ast) diff --git a/src/Parsers/ASTIdentifier.h b/src/Parsers/ASTIdentifier.h index 0e030c797ce..d986b9170f3 100644 --- a/src/Parsers/ASTIdentifier.h +++ b/src/Parsers/ASTIdentifier.h @@ -47,6 +47,8 @@ public: const String & shortName() const { return name_parts.back(); } const String & name() const; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_alias) const override; + void restoreTable(); // TODO(ilezhankin): get rid of this std::shared_ptr createTable() const; // returns |nullptr| if identifier is not table. @@ -91,7 +93,7 @@ public: // FIXME: used only when it's needed to rewrite distributed table name to real remote table name. 
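The 'ignore_aliases' flag threaded through every updateTreeHashImpl() override forms a delegation chain; condensed from the hunks in this patch (shown together only for readability), a table identifier's hash is now composed as:

void ASTTableIdentifier::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const
{
    hash_state.update(uuid);                                        /// table-specific state
    ASTIdentifier::updateTreeHashImpl(hash_state, ignore_aliases);  /// simply forwards to ASTWithAlias
}

void ASTWithAlias::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const
{
    if (!alias.empty() && !ignore_aliases)
        hash_state.update(alias);                                   /// alias participates only on request
    IAST::updateTreeHashImpl(hash_state, ignore_aliases);           /// hashes getID(); children are mixed in by IAST::updateTreeHash()
}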
void resetTable(const String & database_name, const String & table_name); // TODO(ilezhankin): get rid of this - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; }; } diff --git a/src/Parsers/ASTInsertQuery.cpp b/src/Parsers/ASTInsertQuery.cpp index ecb2d4e331b..88e087dd4ee 100644 --- a/src/Parsers/ASTInsertQuery.cpp +++ b/src/Parsers/ASTInsertQuery.cpp @@ -138,13 +138,13 @@ void ASTInsertQuery::formatImpl(const FormatSettings & settings, FormatState & s } } -void ASTInsertQuery::updateTreeHashImpl(SipHash & hash_state) const +void ASTInsertQuery::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(table_id.database_name); hash_state.update(table_id.table_name); hash_state.update(table_id.uuid); hash_state.update(format); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } diff --git a/src/Parsers/ASTInsertQuery.h b/src/Parsers/ASTInsertQuery.h index 45fd3d97950..6a4ce078f79 100644 --- a/src/Parsers/ASTInsertQuery.h +++ b/src/Parsers/ASTInsertQuery.h @@ -72,7 +72,7 @@ public: protected: void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; }; } diff --git a/src/Parsers/ASTLiteral.cpp b/src/Parsers/ASTLiteral.cpp index 425e5c73bee..8dedc5dc95d 100644 --- a/src/Parsers/ASTLiteral.cpp +++ b/src/Parsers/ASTLiteral.cpp @@ -10,11 +10,13 @@ namespace DB { -void ASTLiteral::updateTreeHashImpl(SipHash & hash_state) const +void ASTLiteral::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { const char * prefix = "Literal_"; hash_state.update(prefix, strlen(prefix)); applyVisitor(FieldVisitorHash(hash_state), value); + if (!ignore_aliases) + ASTWithAlias::updateTreeHashImpl(hash_state, ignore_aliases); } ASTPtr ASTLiteral::clone() const diff --git a/src/Parsers/ASTLiteral.h b/src/Parsers/ASTLiteral.h index e57bcfcd9d5..0c55aceb068 100644 --- a/src/Parsers/ASTLiteral.h +++ b/src/Parsers/ASTLiteral.h @@ -41,7 +41,7 @@ public: ASTPtr clone() const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; protected: void formatImplWithoutAlias(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; diff --git a/src/Parsers/ASTOrderByElement.cpp b/src/Parsers/ASTOrderByElement.cpp index 884d69a18e3..318849812aa 100644 --- a/src/Parsers/ASTOrderByElement.cpp +++ b/src/Parsers/ASTOrderByElement.cpp @@ -7,13 +7,13 @@ namespace DB { -void ASTOrderByElement::updateTreeHashImpl(SipHash & hash_state) const +void ASTOrderByElement::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(direction); hash_state.update(nulls_direction); hash_state.update(nulls_direction_was_explicitly_specified); hash_state.update(with_fill); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } void ASTOrderByElement::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const diff --git a/src/Parsers/ASTOrderByElement.h b/src/Parsers/ASTOrderByElement.h index 468d2161dff..4cebc30be31 100644 --- a/src/Parsers/ASTOrderByElement.h +++ b/src/Parsers/ASTOrderByElement.h @@ -32,7 +32,7 @@ public: 
return clone; } - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; protected: void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/ASTQueryParameter.cpp b/src/Parsers/ASTQueryParameter.cpp index c10cced23ce..9e98252e779 100644 --- a/src/Parsers/ASTQueryParameter.cpp +++ b/src/Parsers/ASTQueryParameter.cpp @@ -23,4 +23,9 @@ void ASTQueryParameter::appendColumnNameImpl(WriteBuffer & ostr) const writeString(name, ostr); } +void ASTQueryParameter::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const +{ + ASTWithAlias::updateTreeHashImpl(hash_state, ignore_aliases); +} + } diff --git a/src/Parsers/ASTQueryParameter.h b/src/Parsers/ASTQueryParameter.h index 858b23a0250..dd7f9bff863 100644 --- a/src/Parsers/ASTQueryParameter.h +++ b/src/Parsers/ASTQueryParameter.h @@ -21,6 +21,8 @@ public: ASTPtr clone() const override { return std::make_shared(*this); } + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; + protected: void formatImplWithoutAlias(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; void appendColumnNameImpl(WriteBuffer & ostr) const override; diff --git a/src/Parsers/ASTSelectQuery.cpp b/src/Parsers/ASTSelectQuery.cpp index 2d82708c70d..7c96db006c4 100644 --- a/src/Parsers/ASTSelectQuery.cpp +++ b/src/Parsers/ASTSelectQuery.cpp @@ -42,14 +42,14 @@ ASTPtr ASTSelectQuery::clone() const } -void ASTSelectQuery::updateTreeHashImpl(SipHash & hash_state) const +void ASTSelectQuery::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(distinct); hash_state.update(group_by_with_totals); hash_state.update(group_by_with_rollup); hash_state.update(group_by_with_cube); hash_state.update(limit_with_ties); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } diff --git a/src/Parsers/ASTSelectQuery.h b/src/Parsers/ASTSelectQuery.h index 101dbe9d02c..57f45a8aacd 100644 --- a/src/Parsers/ASTSelectQuery.h +++ b/src/Parsers/ASTSelectQuery.h @@ -137,7 +137,7 @@ public: void replaceDatabaseAndTable(const String & database_name, const String & table_name); void replaceDatabaseAndTable(const StorageID & table_id); void addTableFunction(ASTPtr & table_function_ptr); - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; void setFinal(); diff --git a/src/Parsers/ASTSetQuery.cpp b/src/Parsers/ASTSetQuery.cpp index 1b7b76fe231..e2c60e8369d 100644 --- a/src/Parsers/ASTSetQuery.cpp +++ b/src/Parsers/ASTSetQuery.cpp @@ -9,7 +9,7 @@ namespace DB { -void ASTSetQuery::updateTreeHashImpl(SipHash & hash_state) const +void ASTSetQuery::updateTreeHashImpl(SipHash & hash_state, bool /*ignore_aliases*/) const { for (const auto & change : changes) { diff --git a/src/Parsers/ASTSetQuery.h b/src/Parsers/ASTSetQuery.h index beed052c79a..944f08dcbaa 100644 --- a/src/Parsers/ASTSetQuery.h +++ b/src/Parsers/ASTSetQuery.h @@ -34,7 +34,7 @@ public: void formatImpl(const FormatSettings & format, FormatState &, FormatStateStacked) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; QueryKind getQueryKind() const override { return QueryKind::Set; } diff --git a/src/Parsers/ASTSubquery.cpp 
b/src/Parsers/ASTSubquery.cpp index 92adad666ed..75dfccd6e13 100644 --- a/src/Parsers/ASTSubquery.cpp +++ b/src/Parsers/ASTSubquery.cpp @@ -51,11 +51,11 @@ void ASTSubquery::formatImplWithoutAlias(const FormatSettings & settings, Format settings.ostr << nl_or_nothing << indent_str << ")"; } -void ASTSubquery::updateTreeHashImpl(SipHash & hash_state) const +void ASTSubquery::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { if (!cte_name.empty()) hash_state.update(cte_name); - IAST::updateTreeHashImpl(hash_state); + ASTWithAlias::updateTreeHashImpl(hash_state, ignore_aliases); } String ASTSubquery::getAliasOrColumnName() const diff --git a/src/Parsers/ASTSubquery.h b/src/Parsers/ASTSubquery.h index e4de766621a..ef277a63126 100644 --- a/src/Parsers/ASTSubquery.h +++ b/src/Parsers/ASTSubquery.h @@ -14,7 +14,7 @@ class ASTSubquery : public ASTWithAlias public: // Stored the name when the subquery is defined in WITH clause. For example: // WITH (SELECT 1) AS a SELECT * FROM a AS b; cte_name will be `a`. - std::string cte_name; + String cte_name; /** Get the text that identifies this element. */ String getID(char) const override { return "Subquery"; } @@ -26,7 +26,7 @@ public: return clone; } - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; String getAliasOrColumnName() const override; String tryGetAlias() const override; diff --git a/src/Parsers/ASTTablesInSelectQuery.cpp b/src/Parsers/ASTTablesInSelectQuery.cpp index 75c0ef26c07..e4e8c00879e 100644 --- a/src/Parsers/ASTTablesInSelectQuery.cpp +++ b/src/Parsers/ASTTablesInSelectQuery.cpp @@ -21,10 +21,10 @@ do \ while (false) -void ASTTableExpression::updateTreeHashImpl(SipHash & hash_state) const +void ASTTableExpression::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(final); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } @@ -42,12 +42,12 @@ ASTPtr ASTTableExpression::clone() const return res; } -void ASTTableJoin::updateTreeHashImpl(SipHash & hash_state) const +void ASTTableJoin::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(locality); hash_state.update(strictness); hash_state.update(kind); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } ASTPtr ASTTableJoin::clone() const @@ -61,10 +61,10 @@ ASTPtr ASTTableJoin::clone() const return res; } -void ASTArrayJoin::updateTreeHashImpl(SipHash & hash_state) const +void ASTArrayJoin::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const { hash_state.update(kind); - IAST::updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); } ASTPtr ASTArrayJoin::clone() const diff --git a/src/Parsers/ASTTablesInSelectQuery.h b/src/Parsers/ASTTablesInSelectQuery.h index a004cbf9847..67370eaee14 100644 --- a/src/Parsers/ASTTablesInSelectQuery.h +++ b/src/Parsers/ASTTablesInSelectQuery.h @@ -59,7 +59,7 @@ struct ASTTableExpression : public IAST String getID(char) const override { return "TableExpression"; } ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; }; @@ -81,7 +81,7 @@ struct ASTTableJoin : public IAST void 
formatImplBeforeTable(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const; void formatImplAfterTable(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const; void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; }; /// Specification of ARRAY JOIN. @@ -102,7 +102,7 @@ struct ASTArrayJoin : public IAST String getID(char) const override { return "ArrayJoin"; } ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; }; diff --git a/src/Parsers/ASTTransactionControl.cpp b/src/Parsers/ASTTransactionControl.cpp index 3106d432c90..6964441622d 100644 --- a/src/Parsers/ASTTransactionControl.cpp +++ b/src/Parsers/ASTTransactionControl.cpp @@ -39,7 +39,7 @@ IAST::QueryKind ASTTransactionControl::getQueryKind() const } } -void ASTTransactionControl::updateTreeHashImpl(SipHash & hash_state) const +void ASTTransactionControl::updateTreeHashImpl(SipHash & hash_state, bool /*ignore_aliases*/) const { hash_state.update(action); } diff --git a/src/Parsers/ASTTransactionControl.h b/src/Parsers/ASTTransactionControl.h index fb0058144dd..84a1dcf0970 100644 --- a/src/Parsers/ASTTransactionControl.h +++ b/src/Parsers/ASTTransactionControl.h @@ -20,13 +20,13 @@ public: UInt64 snapshot; /// For SET TRANSACTION SNAPSHOT ... - ASTTransactionControl(QueryType action_) : action(action_) {} + explicit ASTTransactionControl(QueryType action_) : action(action_) {} String getID(char /*delimiter*/) const override { return "ASTTransactionControl"; } ASTPtr clone() const override { return std::make_shared(*this); } void formatImpl(const FormatSettings & format, FormatState & /*state*/, FormatStateStacked /*frame*/) const override; - void updateTreeHashImpl(SipHash & hash_state) const override; + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; QueryKind getQueryKind() const override; }; diff --git a/src/Parsers/ASTWithAlias.cpp b/src/Parsers/ASTWithAlias.cpp index 1b5397654fd..5d1122ae4d8 100644 --- a/src/Parsers/ASTWithAlias.cpp +++ b/src/Parsers/ASTWithAlias.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -42,6 +43,13 @@ void ASTWithAlias::formatImpl(const FormatSettings & settings, FormatState & sta } } +void ASTWithAlias::updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const +{ + if (!alias.empty() && !ignore_aliases) + hash_state.update(alias); + IAST::updateTreeHashImpl(hash_state, ignore_aliases); +} + void ASTWithAlias::appendColumnName(WriteBuffer & ostr) const { if (prefer_alias_to_column_name && !alias.empty()) diff --git a/src/Parsers/ASTWithAlias.h b/src/Parsers/ASTWithAlias.h index ea4419402b0..452e2038e55 100644 --- a/src/Parsers/ASTWithAlias.h +++ b/src/Parsers/ASTWithAlias.h @@ -27,7 +27,9 @@ public: void setAlias(const String & to) override { alias = to; } /// Calls formatImplWithoutAlias, and also outputs an alias. If necessary, encloses the entire expression in brackets. 
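The practical effect of hashing the alias only on request, as a sketch: 'ast_a' and 'ast_b' are assumed to be ASTs parsed from "x + 1 AS a" and "x + 1 AS b" respectively, and chassert comes from base/defines.h.

/// Aliases are ignored by default, so both trees collapse to one hash,
/// which is exactly what common subexpression elimination relies on.
chassert(ast_a->getTreeHash() == ast_b->getTreeHash());

/// Opting out keeps the alias-sensitive behaviour.
chassert(ast_a->getTreeHash(/*ignore_aliases=*/ false) != ast_b->getTreeHash(/*ignore_aliases=*/ false));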
- void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override final; + void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const final; + + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; virtual void formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const = 0; diff --git a/src/Parsers/IAST.cpp b/src/Parsers/IAST.cpp index a494a528cd2..37d7f458d61 100644 --- a/src/Parsers/IAST.cpp +++ b/src/Parsers/IAST.cpp @@ -114,24 +114,24 @@ size_t IAST::checkSize(size_t max_size) const } -IAST::Hash IAST::getTreeHash() const +IAST::Hash IAST::getTreeHash(bool ignore_aliases) const { SipHash hash_state; - updateTreeHash(hash_state); + updateTreeHash(hash_state, ignore_aliases); return getSipHash128AsPair(hash_state); } -void IAST::updateTreeHash(SipHash & hash_state) const +void IAST::updateTreeHash(SipHash & hash_state, bool ignore_aliases) const { - updateTreeHashImpl(hash_state); + updateTreeHashImpl(hash_state, ignore_aliases); hash_state.update(children.size()); for (const auto & child : children) - child->updateTreeHash(hash_state); + child->updateTreeHash(hash_state, ignore_aliases); } -void IAST::updateTreeHashImpl(SipHash & hash_state) const +void IAST::updateTreeHashImpl(SipHash & hash_state, bool /*ignore_aliases*/) const { auto id = getID(); hash_state.update(id.data(), id.size()); diff --git a/src/Parsers/IAST.h b/src/Parsers/IAST.h index 812fd082476..9afd59caa05 100644 --- a/src/Parsers/IAST.h +++ b/src/Parsers/IAST.h @@ -78,11 +78,13 @@ public: virtual ASTPtr clone() const = 0; /** Get hash code, identifying this element and its subtree. + * Hashing by default ignores aliases (e.g. identifier aliases, function aliases, literal aliases) which is + * useful for common subexpression elimination. Set 'ignore_aliases = false' if you don't want that behavior. 
*/ using Hash = CityHash_v1_0_2::uint128; - Hash getTreeHash() const; - void updateTreeHash(SipHash & hash_state) const; - virtual void updateTreeHashImpl(SipHash & hash_state) const; + Hash getTreeHash(bool ignore_aliases = true) const; + void updateTreeHash(SipHash & hash_state, bool ignore_aliases = true) const; + virtual void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const; void dumpTree(WriteBuffer & ostr, size_t indent = 0) const; std::string dumpTree(size_t indent = 0) const; diff --git a/src/Planner/CollectTableExpressionData.cpp b/src/Planner/CollectTableExpressionData.cpp index 4b44374e8eb..5ba318dab6a 100644 --- a/src/Planner/CollectTableExpressionData.cpp +++ b/src/Planner/CollectTableExpressionData.cpp @@ -206,12 +206,16 @@ void collectTableExpressionData(QueryTreeNodePtr & query_node, PlannerContextPtr if (auto * table_node = table_expression_node->as()) { bool storage_is_remote = table_node->getStorage()->isRemote(); + bool storage_is_merge_tree = table_node->getStorage()->isMergeTree(); table_expression_data.setIsRemote(storage_is_remote); + table_expression_data.setIsMergeTree(storage_is_merge_tree); } else if (auto * table_function_node = table_expression_node->as()) { bool storage_is_remote = table_function_node->getStorage()->isRemote(); + bool storage_is_merge_tree = table_function_node->getStorage()->isMergeTree(); table_expression_data.setIsRemote(storage_is_remote); + table_expression_data.setIsMergeTree(storage_is_merge_tree); } } diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index 08fe1d56a18..5b354ccda46 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -381,6 +381,17 @@ Aggregator::Params getAggregatorParams(const PlannerContextPtr & planner_context return aggregator_params; } +SortDescription getSortDescriptionFromNames(const Names & names) +{ + SortDescription order_descr; + order_descr.reserve(names.size()); + + for (const auto & name : names) + order_descr.emplace_back(name, 1, 1); + + return order_descr; +} + void addAggregationStep(QueryPlan & query_plan, const AggregationAnalysisResult & aggregation_analysis_result, const QueryAnalysisResult & query_analysis_result, @@ -393,6 +404,12 @@ void addAggregationStep(QueryPlan & query_plan, SortDescription sort_description_for_merging; SortDescription group_by_sort_description; + if (settings.force_aggregation_in_order) + { + group_by_sort_description = getSortDescriptionFromNames(aggregation_analysis_result.aggregation_keys); + sort_description_for_merging = group_by_sort_description; + } + auto merge_threads = settings.max_threads; auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads ? 
static_cast(settings.aggregation_memory_efficient_merge_threads) @@ -464,12 +481,14 @@ void addMergingAggregatedStep(QueryPlan & query_plan, settings.max_block_size); bool is_remote_storage = false; + bool parallel_replicas_from_merge_tree = false; const auto & table_expression_node_to_data = planner_context->getTableExpressionNodeToData(); if (table_expression_node_to_data.size() == 1) { auto it = table_expression_node_to_data.begin(); is_remote_storage = it->second.isRemote(); + parallel_replicas_from_merge_tree = it->second.isMergeTree() && query_context->canUseParallelReplicasOnInitiator(); } SortDescription group_by_sort_description; @@ -479,7 +498,7 @@ void addMergingAggregatedStep(QueryPlan & query_plan, params, query_analysis_result.aggregate_final, /// Grouping sets don't work with distributed_aggregation_memory_efficient enabled (#43989) - settings.distributed_aggregation_memory_efficient && is_remote_storage && !query_analysis_result.aggregation_with_rollup_or_cube_or_grouping_sets, + settings.distributed_aggregation_memory_efficient && (is_remote_storage || parallel_replicas_from_merge_tree) && !query_analysis_result.aggregation_with_rollup_or_cube_or_grouping_sets, settings.max_threads, settings.aggregation_memory_efficient_merge_threads, query_analysis_result.aggregation_should_produce_results_in_order_of_bucket_number, @@ -1201,6 +1220,8 @@ void Planner::buildPlanForUnionNode() { Planner query_planner(query_node, select_query_options); query_planner.buildQueryPlanIfNeeded(); + for (const auto & row_policy : query_planner.getUsedRowPolicies()) + used_row_policies.insert(row_policy); auto query_node_plan = std::make_unique(std::move(query_planner).extractQueryPlan()); query_plans_headers.push_back(query_node_plan->getCurrentDataStream().header); query_plans.push_back(std::move(query_node_plan)); @@ -1348,8 +1369,10 @@ void Planner::buildPlanForQueryNode() select_query_options, top_level_identifiers, planner_context); + auto from_stage = join_tree_query_plan.from_stage; query_plan = std::move(join_tree_query_plan.query_plan); + used_row_policies = std::move(join_tree_query_plan.used_row_policies); LOG_TRACE(&Poco::Logger::get("Planner"), "Query {} from stage {} to stage {}{}", query_tree->formatConvertedASTForErrorMessage(), diff --git a/src/Planner/Planner.h b/src/Planner/Planner.h index f8d151365cf..2177ed59fc6 100644 --- a/src/Planner/Planner.h +++ b/src/Planner/Planner.h @@ -44,6 +44,11 @@ public: return query_plan; } + const std::set & getUsedRowPolicies() const + { + return used_row_policies; + } + void buildQueryPlanIfNeeded(); QueryPlan && extractQueryPlan() && @@ -70,6 +75,7 @@ private: PlannerContextPtr planner_context; QueryPlan query_plan; StorageLimitsList storage_limits; + std::set used_row_policies; }; } diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 6ffef7cda3c..f8770ca7c9c 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -126,8 +126,8 @@ bool shouldIgnoreQuotaAndLimits(const TableNode & table_node) return false; if (storage_id.database_name == DatabaseCatalog::SYSTEM_DATABASE) { - static const boost::container::flat_set tables_ignoring_quota{"quotas", "quota_limits", "quota_usage", "quotas_usage", "one"}; - if (tables_ignoring_quota.count(storage_id.table_name)) + static const boost::container::flat_set tables_ignoring_quota{"quotas", "quota_limits", "quota_usage", "quotas_usage", "one"}; + if (tables_ignoring_quota.contains(storage_id.table_name)) return true; } return false; @@ -441,7 
+441,8 @@ void updatePrewhereOutputsIfNeeded(SelectQueryInfo & table_expression_query_info FilterDAGInfo buildRowPolicyFilterIfNeeded(const StoragePtr & storage, SelectQueryInfo & table_expression_query_info, - PlannerContextPtr & planner_context) + PlannerContextPtr & planner_context, + std::set & used_row_policies) { auto storage_id = storage->getStorageID(); const auto & query_context = planner_context->getQueryContext(); @@ -450,6 +451,12 @@ FilterDAGInfo buildRowPolicyFilterIfNeeded(const StoragePtr & storage, if (!row_policy_filter || row_policy_filter->empty()) return {}; + for (const auto & row_policy : row_policy_filter->policies) + { + auto name = row_policy->getFullName().toString(); + used_row_policies.emplace(std::move(name)); + } + return buildFilterInfo(row_policy_filter->expression, table_expression_query_info.table_expression, planner_context); } @@ -586,6 +593,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres auto * union_node = table_expression->as(); QueryPlan query_plan; + std::set used_row_policies; if (table_node || table_function_node) { @@ -781,7 +789,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres } }; - auto row_policy_filter_info = buildRowPolicyFilterIfNeeded(storage, table_expression_query_info, planner_context); + auto row_policy_filter_info = buildRowPolicyFilterIfNeeded(storage, table_expression_query_info, planner_context, used_row_policies); add_filter(row_policy_filter_info, "Row-level security filter"); if (row_policy_filter_info.actions) table_expression_data.setRowLevelFilterActions(row_policy_filter_info.actions); @@ -940,7 +948,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres } } - return {std::move(query_plan), from_stage}; + return {std::move(query_plan), from_stage, std::move(used_row_policies)}; } JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_expression, @@ -1146,12 +1154,13 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_ const auto & join_clause_right_key_nodes = join_clause.getRightKeyNodes(); size_t join_clause_key_nodes_size = join_clause_left_key_nodes.size(); - assert(join_clause_key_nodes_size == join_clause_right_key_nodes.size()); + chassert(join_clause_key_nodes_size == join_clause_right_key_nodes.size()); for (size_t i = 0; i < join_clause_key_nodes_size; ++i) { - table_join_clause.key_names_left.push_back(join_clause_left_key_nodes[i]->result_name); - table_join_clause.key_names_right.push_back(join_clause_right_key_nodes[i]->result_name); + table_join_clause.addKey(join_clause_left_key_nodes[i]->result_name, + join_clause_right_key_nodes[i]->result_name, + join_clause.isNullsafeCompareKey(i)); } const auto & join_clause_get_left_filter_condition_nodes = join_clause.getLeftFilterConditionNodes(); @@ -1398,7 +1407,10 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_ drop_unused_columns_after_join_transform_step->setStepDescription("DROP unused columns after JOIN"); result_plan.addStep(std::move(drop_unused_columns_after_join_transform_step)); - return {std::move(result_plan), QueryProcessingStage::FetchColumns}; + for (const auto & right_join_tree_query_plan_row_policy : right_join_tree_query_plan.used_row_policies) + left_join_tree_query_plan.used_row_policies.insert(right_join_tree_query_plan_row_policy); + + return {std::move(result_plan), QueryProcessingStage::FetchColumns, 
std::move(left_join_tree_query_plan.used_row_policies)}; } JoinTreeQueryPlan buildQueryPlanForArrayJoinNode(const QueryTreeNodePtr & array_join_table_expression, @@ -1476,7 +1488,7 @@ JoinTreeQueryPlan buildQueryPlanForArrayJoinNode(const QueryTreeNodePtr & array_ array_join_step->setStepDescription("ARRAY JOIN"); plan.addStep(std::move(array_join_step)); - return {std::move(plan), QueryProcessingStage::FetchColumns}; + return {std::move(plan), QueryProcessingStage::FetchColumns, std::move(join_tree_query_plan.used_row_policies)}; } } diff --git a/src/Planner/PlannerJoinTree.h b/src/Planner/PlannerJoinTree.h index 9d3b98175d0..d4d6c173847 100644 --- a/src/Planner/PlannerJoinTree.h +++ b/src/Planner/PlannerJoinTree.h @@ -15,6 +15,7 @@ struct JoinTreeQueryPlan { QueryPlan query_plan; QueryProcessingStage::Enum from_stage; + std::set used_row_policies; }; /// Build JOIN TREE query plan for query node diff --git a/src/Planner/PlannerJoins.cpp b/src/Planner/PlannerJoins.cpp index 2f7c08b25ba..5f53c8e1fce 100644 --- a/src/Planner/PlannerJoins.cpp +++ b/src/Planner/PlannerJoins.cpp @@ -191,7 +191,7 @@ void buildJoinClause(ActionsDAGPtr join_expression_dag, auto asof_inequality = getASOFJoinInequality(function_name); bool is_asof_join_inequality = join_node.getStrictness() == JoinStrictness::Asof && asof_inequality != ASOFJoinInequality::None; - if (function_name == "equals" || is_asof_join_inequality) + if (function_name == "equals" || function_name == "isNotDistinctFrom" || is_asof_join_inequality) { const auto * left_child = join_expressions_actions_node->children.at(0); const auto * right_child = join_expressions_actions_node->children.at(1); @@ -253,7 +253,8 @@ void buildJoinClause(ActionsDAGPtr join_expression_dag, } else { - join_clause.addKey(left_key, right_key); + bool null_safe_comparison = function_name == "isNotDistinctFrom"; + join_clause.addKey(left_key, right_key, null_safe_comparison); } } else @@ -474,6 +475,24 @@ JoinClausesAndActions buildJoinClausesAndActions(const ColumnsWithTypeAndName & right_key_node = &join_expression_actions->addCast(*right_key_node, common_type, {}); } + if (join_clause.isNullsafeCompareKey(i) && left_key_node->result_type->isNullable() && right_key_node->result_type->isNullable()) + { + /** + * In case of null-safe comparison (a IS NOT DISTINCT FROM b), + * we need to wrap the keys in a non-nullable type. + * The type `tuple` can be used for this purpose, + * because the value tuple(NULL) is not NULL itself (moreover, it has type Tuple(Nullable(T)), which is not Nullable). + * Thus, the join algorithm will match keys with values tuple(NULL).
+ * Example: + * SELECT * FROM t1 JOIN t2 ON t1.a <=> t2.b + * This will be semantically transformed to: + * SELECT * FROM t1 JOIN t2 ON tuple(t1.a) == tuple(t2.b) + */ + auto wrap_nullsafe_function = FunctionFactory::instance().get("tuple", planner_context->getQueryContext()); + left_key_node = &join_expression_actions->addFunction(wrap_nullsafe_function, {left_key_node}, {}); + right_key_node = &join_expression_actions->addFunction(wrap_nullsafe_function, {right_key_node}, {}); + } + join_expression_actions->addOrReplaceInOutputs(*left_key_node); join_expression_actions->addOrReplaceInOutputs(*right_key_node); diff --git a/src/Planner/PlannerJoins.h b/src/Planner/PlannerJoins.h index c61bce932e0..94f32e7ad51 100644 --- a/src/Planner/PlannerJoins.h +++ b/src/Planner/PlannerJoins.h @@ -53,10 +53,12 @@ class JoinClause { public: /// Add keys - void addKey(const ActionsDAG::Node * left_key_node, const ActionsDAG::Node * right_key_node) + void addKey(const ActionsDAG::Node * left_key_node, const ActionsDAG::Node * right_key_node, bool null_safe_comparison = false) { left_key_nodes.emplace_back(left_key_node); right_key_nodes.emplace_back(right_key_node); + if (null_safe_comparison) + nullsafe_compare_key_indexes.emplace(left_key_nodes.size() - 1); } void addASOFKey(const ActionsDAG::Node * left_key_node, const ActionsDAG::Node * right_key_node, ASOFJoinInequality asof_inequality) @@ -97,6 +99,11 @@ public: return right_key_nodes; } + bool isNullsafeCompareKey(size_t idx) const + { + return nullsafe_compare_key_indexes.contains(idx); + } + /// Returns true if JOIN clause has ASOF conditions, false otherwise bool hasASOF() const { @@ -147,6 +154,8 @@ private: ActionsDAG::NodeRawConstPtrs left_filter_condition_nodes; ActionsDAG::NodeRawConstPtrs right_filter_condition_nodes; + + std::unordered_set nullsafe_compare_key_indexes; }; using JoinClauses = std::vector; diff --git a/src/Planner/TableExpressionData.h b/src/Planner/TableExpressionData.h index 6a89aecdfb9..ee5a05fe7da 100644 --- a/src/Planner/TableExpressionData.h +++ b/src/Planner/TableExpressionData.h @@ -240,6 +240,16 @@ public: is_remote = is_remote_value; } + bool isMergeTree() const + { + return is_merge_tree; + } + + void setIsMergeTree(bool is_merge_tree_value) + { + is_merge_tree = is_merge_tree_value; + } + const ActionsDAGPtr & getPrewhereFilterActions() const { return prewhere_filter_actions; @@ -305,6 +315,9 @@ private: /// Is storage remote bool is_remote = false; + + /// Is storage merge tree + bool is_merge_tree = false; }; } diff --git a/src/Processors/Chunk.cpp b/src/Processors/Chunk.cpp index 3839a8963b2..c91df285539 100644 --- a/src/Processors/Chunk.cpp +++ b/src/Processors/Chunk.cpp @@ -73,7 +73,7 @@ void Chunk::checkNumRowsIsConsistent() auto & column = columns[i]; if (column->size() != num_rows) throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid number of rows in Chunk column {}: expected {}, got {}", - column->getName()+ " position " + toString(i), toString(num_rows), toString(column->size())); + column->getName() + " position " + toString(i), toString(num_rows), toString(column->size())); } } diff --git a/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h b/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h index cf8c5848db5..24ffdc10581 100644 --- a/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h +++ b/src/Processors/Formats/Impl/ArrowFieldIndexUtil.h @@ -1,22 +1,33 @@ #pragma once + #include "config.h" + #if USE_PARQUET || USE_ORC + #include #include #include +#include +#include +#include #include #include 
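Aside: the null-safe JOIN key handling introduced above (keys from isNotDistinctFrom wrapped in tuple()) can be illustrated with a small standalone program. This is only a sketch of the semantics; std::optional stands in for a Nullable column value, and nothing below is ClickHouse code or part of the patch.

    #include <iostream>
    #include <optional>
    #include <tuple>

    int main()
    {
        std::optional<int> left;   // stand-in for a NULL join key
        std::optional<int> right;  // stand-in for a NULL join key

        // Plain SQL equality: NULL = NULL does not produce a match.
        bool plain_match = left.has_value() && right.has_value() && *left == *right;

        // Wrapped keys: tuple(NULL) is an ordinary non-NULL value, so equal wrappers do match,
        // which is exactly the IS NOT DISTINCT FROM (<=>) semantics described in the comment above.
        bool wrapped_match = std::make_tuple(left) == std::make_tuple(right);

        std::cout << plain_match << ' ' << wrapped_match << '\n';  // prints: 0 1
    }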
#include #include #include + + namespace arrow { -class Schema; -class DataType; -class Field; + class Schema; + class DataType; + class Field; } + + namespace DB { + namespace ErrorCodes { extern const int THERE_IS_NO_COLUMN; @@ -211,5 +222,7 @@ private: } } }; + } + #endif diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index baaff8b497b..9ea42de3d32 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -177,7 +177,7 @@ void CSVFormatReader::skipRow() } } -static void skipEndOfLine(ReadBuffer & in) +static void skipEndOfLine(ReadBuffer & in, bool allow_cr_end_of_line) { /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) @@ -192,7 +192,7 @@ static void skipEndOfLine(ReadBuffer & in) ++in.position(); if (!in.eof() && *in.position() == '\n') ++in.position(); - else + else if (!allow_cr_end_of_line) throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r."); @@ -258,7 +258,7 @@ void CSVFormatReader::skipRowEndDelimiter() if (buf->eof()) return; - skipEndOfLine(*buf); + skipEndOfLine(*buf, format_settings.csv.allow_cr_end_of_line); } void CSVFormatReader::skipHeaderRow() @@ -343,7 +343,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return false; } - skipEndOfLine(*buf); + skipEndOfLine(*buf, format_settings.csv.allow_cr_end_of_line); return true; } diff --git a/src/Processors/ISource.h b/src/Processors/ISource.h index 767a73d0924..8c140d0d0a3 100644 --- a/src/Processors/ISource.h +++ b/src/Processors/ISource.h @@ -29,7 +29,7 @@ protected: virtual Chunk generate(); virtual std::optional tryGenerate(); - virtual void progress(size_t read_rows, size_t read_bytes); + void progress(size_t read_rows, size_t read_bytes); public: explicit ISource(Block header, bool enable_auto_progress = true); diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp index cef72fa02b3..c5e42e76653 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp @@ -411,7 +411,6 @@ struct MinMaxProjectionCandidate { AggregateProjectionCandidate candidate; Block block; - MergeTreeData::DataPartsVector normal_parts; }; struct AggregateProjectionCandidates @@ -444,6 +443,7 @@ AggregateProjectionCandidates getAggregateProjectionCandidates( const auto & projections = metadata->projections; std::vector agg_projections; + for (const auto & projection : projections) if (projection.type == ProjectionDescription::Type::Aggregate) agg_projections.push_back(&projection); @@ -476,7 +476,6 @@ AggregateProjectionCandidates getAggregateProjectionCandidates( { // LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Projection analyzed DAG {}", proj_dag->dumpDAG()); AggregateProjectionCandidate candidate{.info = std::move(info), .dag = std::move(proj_dag)}; - MergeTreeData::DataPartsVector minmax_projection_normal_parts; // LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Projection sample block {}", sample_block.dumpStructure()); auto block = reading.getMergeTreeData().getMinMaxCountProjectionBlock( @@ -485,13 +484,12 @@ AggregateProjectionCandidates getAggregateProjectionCandidates( dag.filter_node != nullptr, query_info, parts, - 
minmax_projection_normal_parts, max_added_blocks.get(), context); // LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Projection sample block 2 {}", block.dumpStructure()); - // minmax_count_projection cannot be used used when there is no data to process, because + // minmax_count_projection cannot be used when there is no data to process, because // it will produce incorrect result during constant aggregation. // See https://github.com/ClickHouse/ClickHouse/issues/36728 if (block) @@ -499,7 +497,6 @@ AggregateProjectionCandidates getAggregateProjectionCandidates( MinMaxProjectionCandidate minmax; minmax.candidate = std::move(candidate); minmax.block = std::move(block); - minmax.normal_parts = std::move(minmax_projection_normal_parts); minmax.candidate.projection = projection; candidates.minmax_projection.emplace(std::move(minmax)); } @@ -508,6 +505,18 @@ AggregateProjectionCandidates getAggregateProjectionCandidates( if (!candidates.minmax_projection) { + auto it = std::find_if(agg_projections.begin(), agg_projections.end(), [&](const auto * projection) + { + return projection->name == context->getSettings().preferred_optimize_projection_name.value; + }); + + if (it != agg_projections.end()) + { + const ProjectionDescription * preferred_projection = *it; + agg_projections.clear(); + agg_projections.push_back(preferred_projection); + } + candidates.real.reserve(agg_projections.size()); for (const auto * projection : agg_projections) { @@ -569,49 +578,74 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & auto candidates = getAggregateProjectionCandidates(node, *aggregating, *reading, max_added_blocks, allow_implicit_projections); - AggregateProjectionCandidate * best_candidate = nullptr; - if (candidates.minmax_projection) - best_candidate = &candidates.minmax_projection->candidate; - else if (candidates.real.empty()) - return false; - const auto & parts = reading->getParts(); + const auto & alter_conversions = reading->getAlterConvertionsForParts(); const auto & query_info = reading->getQueryInfo(); const auto metadata = reading->getStorageMetadata(); ContextPtr context = reading->getContext(); MergeTreeDataSelectExecutor reader(reading->getMergeTreeData()); - - auto ordinary_reading_select_result = reading->selectRangesToRead(parts, /* alter_conversions = */ {}); - size_t ordinary_reading_marks = ordinary_reading_select_result->marks(); - - /// Selecting best candidate. - for (auto & candidate : candidates.real) + AggregateProjectionCandidate * best_candidate = nullptr; + if (candidates.minmax_projection) { - auto required_column_names = candidate.dag->getRequiredColumnsNames(); - ActionDAGNodes added_filter_nodes; - if (candidates.has_filter) - added_filter_nodes.nodes.push_back(candidate.dag->getOutputs().front()); - - bool analyzed = analyzeProjectionCandidate( - candidate, *reading, reader, required_column_names, parts, - metadata, query_info, context, max_added_blocks, added_filter_nodes); - - if (!analyzed) - continue; - - if (candidate.sum_marks > ordinary_reading_marks) - continue; - - if (best_candidate == nullptr || best_candidate->sum_marks > candidate.sum_marks) - best_candidate = &candidate; + best_candidate = &candidates.minmax_projection->candidate; } - - if (!best_candidate) + else if (!candidates.real.empty()) + { + auto ordinary_reading_select_result = reading->selectRangesToRead(parts, alter_conversions); + size_t ordinary_reading_marks = ordinary_reading_select_result->marks(); + + /// Nothing to read. Ignore projections. 
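Aside: a standalone sketch of the candidate-narrowing pattern used above for preferred_optimize_projection_name (the same pattern is applied to normal projections later in this patch). The Projection type and names are invented for illustration; only the find-if / clear / push-back shape mirrors the real code.

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    struct Projection { std::string name; };

    // If a candidate with the preferred name exists, drop every other candidate.
    void narrowToPreferred(std::vector<const Projection *> & candidates, const std::string & preferred_name)
    {
        auto it = std::find_if(candidates.begin(), candidates.end(),
                               [&](const Projection * p) { return p->name == preferred_name; });
        if (it == candidates.end())
            return;  // no projection with that name: keep the full candidate list

        const Projection * preferred = *it;  // save the pointer before clear() invalidates the iterator
        candidates.clear();
        candidates.push_back(preferred);
    }

    int main()
    {
        Projection by_day{"agg_by_day"};
        Projection by_user{"agg_by_user"};
        std::vector<const Projection *> candidates{&by_day, &by_user};

        narrowToPreferred(candidates, "agg_by_user");
        std::cout << candidates.size() << ' ' << candidates.front()->name << '\n';  // prints: 1 agg_by_user
    }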
+ if (ordinary_reading_marks == 0) + { + reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); + return false; + } + + const auto & parts_with_ranges = ordinary_reading_select_result->partsWithRanges(); + + /// Selecting best candidate. + for (auto & candidate : candidates.real) + { + auto required_column_names = candidate.dag->getRequiredColumnsNames(); + ActionDAGNodes added_filter_nodes; + if (candidates.has_filter) + added_filter_nodes.nodes.push_back(candidate.dag->getOutputs().front()); + + bool analyzed = analyzeProjectionCandidate( + candidate, + *reading, + reader, + required_column_names, + parts_with_ranges, + metadata, + query_info, + context, + max_added_blocks, + added_filter_nodes); + + if (!analyzed) + continue; + + if (candidate.sum_marks > ordinary_reading_marks) + continue; + + if (best_candidate == nullptr || best_candidate->sum_marks > candidate.sum_marks) + best_candidate = &candidate; + } + + if (!best_candidate) + { + reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); + return false; + } + } + else { - reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); return false; } + chassert(best_candidate != nullptr); + QueryPlanStepPtr projection_reading; bool has_ordinary_parts; @@ -632,9 +666,7 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes & .storage_id = reading->getMergeTreeData().getStorageID(), .projection_name = candidates.minmax_projection->candidate.projection->name, }); - has_ordinary_parts = !candidates.minmax_projection->normal_parts.empty(); - if (has_ordinary_parts) - reading->resetParts(std::move(candidates.minmax_projection->normal_parts)); + has_ordinary_parts = false; } else { diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp index 727afcb1a99..6880d21facb 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp @@ -7,8 +7,10 @@ #include #include #include +#include +#include #include -#include +#include namespace DB::QueryPlanOptimizations { @@ -107,6 +109,19 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) if (normal_projections.empty()) return false; + ContextPtr context = reading->getContext(); + auto it = std::find_if(normal_projections.begin(), normal_projections.end(), [&](const auto * projection) + { + return projection->name == context->getSettings().preferred_optimize_projection_name.value; + }); + + if (it != normal_projections.end()) + { + const ProjectionDescription * preferred_projection = *it; + normal_projections.clear(); + normal_projections.push_back(preferred_projection); + } + QueryDAG query; { auto & child = iter->node->children[iter->next_child - 1]; @@ -122,13 +137,22 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) const Names & required_columns = reading->getRealColumnNames(); const auto & parts = reading->getParts(); + const auto & alter_conversions = reading->getAlterConvertionsForParts(); const auto & query_info = reading->getQueryInfo(); - ContextPtr context = reading->getContext(); MergeTreeDataSelectExecutor reader(reading->getMergeTreeData()); - auto ordinary_reading_select_result = reading->selectRangesToRead(parts, /* alter_conversions = */ {}); + auto ordinary_reading_select_result = reading->selectRangesToRead(parts, alter_conversions); size_t ordinary_reading_marks = 
ordinary_reading_select_result->marks(); + /// Nothing to read. Ignore projections. + if (ordinary_reading_marks == 0) + { + reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); + return false; + } + + const auto & parts_with_ranges = ordinary_reading_select_result->partsWithRanges(); + std::shared_ptr max_added_blocks = getMaxAddedBlocks(reading); for (const auto * projection : normal_projections) @@ -144,8 +168,16 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes) added_filter_nodes.nodes.push_back(query.filter_node); bool analyzed = analyzeProjectionCandidate( - candidate, *reading, reader, required_columns, parts, - metadata, query_info, context, max_added_blocks, added_filter_nodes); + candidate, + *reading, + reader, + required_columns, + parts_with_ranges, + metadata, + query_info, + context, + max_added_blocks, + added_filter_nodes); if (!analyzed) continue; diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index 7ddda29cad4..c3b3449857b 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -210,7 +210,7 @@ bool analyzeProjectionCandidate( const ReadFromMergeTree & reading, const MergeTreeDataSelectExecutor & reader, const Names & required_column_names, - const MergeTreeData::DataPartsVector & parts, + const RangesInDataParts & parts_with_ranges, const StorageMetadataPtr & metadata, const SelectQueryInfo & query_info, const ContextPtr & context, @@ -219,14 +219,20 @@ bool analyzeProjectionCandidate( { MergeTreeData::DataPartsVector projection_parts; MergeTreeData::DataPartsVector normal_parts; - for (const auto & part : parts) + std::vector alter_conversions; + for (const auto & part_with_ranges : parts_with_ranges) { - const auto & created_projections = part->getProjectionParts(); + const auto & created_projections = part_with_ranges.data_part->getProjectionParts(); auto it = created_projections.find(candidate.projection->name); if (it != created_projections.end()) + { projection_parts.push_back(it->second); + } else - normal_parts.push_back(part); + { + normal_parts.push_back(part_with_ranges.data_part); + alter_conversions.push_back(part_with_ranges.alter_conversions); + } } if (projection_parts.empty()) @@ -252,7 +258,8 @@ bool analyzeProjectionCandidate( if (!normal_parts.empty()) { - auto normal_result_ptr = reading.selectRangesToRead(std::move(normal_parts), /* alter_conversions = */ {}); + /// TODO: We can reuse existing analysis_result by filtering out projection parts + auto normal_result_ptr = reading.selectRangesToRead(std::move(normal_parts), std::move(alter_conversions)); if (normal_result_ptr->error()) return false; diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.h b/src/Processors/QueryPlan/Optimizations/projectionsCommon.h index 35daccad115..055ca5d4084 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.h +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.h @@ -19,6 +19,7 @@ using MergeTreeDataSelectAnalysisResultPtr = std::shared_ptr; using DataPartsVector = std::vector; +struct RangesInDataParts; struct StorageInMemoryMetadata; using StorageMetadataPtr = std::shared_ptr; @@ -71,7 +72,7 @@ bool analyzeProjectionCandidate( const ReadFromMergeTree & reading, const MergeTreeDataSelectExecutor & reader, const Names & required_column_names, - const DataPartsVector & parts, + const RangesInDataParts 
& parts_with_ranges, const StorageMetadataPtr & metadata, const SelectQueryInfo & query_info, const ContextPtr & context, diff --git a/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp b/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp index f90d10b31d5..124cb735d5a 100644 --- a/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp +++ b/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp @@ -95,7 +95,7 @@ bool allOutputsDependsOnlyOnAllowedNodes( { const auto & match = matches.at(node); /// Function could be mapped into its argument. In this case .monotonicity != std::nullopt (see matchTrees) - if (match.node && match.node->result_name == node->result_name && !match.monotonicity) + if (match.node && !match.monotonicity) res = irreducible_nodes.contains(match.node); } @@ -155,9 +155,10 @@ bool isPartitionKeySuitsGroupByKey( return false; /// We are interested only in calculations required to obtain group by keys (and not aggregate function arguments for example). - group_by_actions->removeUnusedActions(aggregating.getParams().keys); + auto key_nodes = group_by_actions->findInOutpus(aggregating.getParams().keys); + auto group_by_key_actions = ActionsDAG::cloneSubDAG(key_nodes, /*remove_aliases=*/ true); - const auto & gb_key_required_columns = group_by_actions->getRequiredColumnsNames(); + const auto & gb_key_required_columns = group_by_key_actions->getRequiredColumnsNames(); const auto & partition_actions = reading.getStorageMetadata()->getPartitionKey().expression->getActionsDAG(); @@ -166,9 +167,9 @@ bool isPartitionKeySuitsGroupByKey( if (std::ranges::find(gb_key_required_columns, col) == gb_key_required_columns.end()) return false; - const auto irreducibe_nodes = removeInjectiveFunctionsFromResultsRecursively(group_by_actions); + const auto irreducibe_nodes = removeInjectiveFunctionsFromResultsRecursively(group_by_key_actions); - const auto matches = matchTrees(group_by_actions->getOutputs(), partition_actions); + const auto matches = matchTrees(group_by_key_actions->getOutputs(), partition_actions); return allOutputsDependsOnlyOnAllowedNodes(partition_actions, irreducibe_nodes, matches); } @@ -206,7 +207,7 @@ size_t tryAggregatePartitionsIndependently(QueryPlan::Node * node, QueryPlan::No return 0; if (!reading->willOutputEachPartitionThroughSeparatePort() - && isPartitionKeySuitsGroupByKey(*reading, expression_step->getExpression()->clone(), *aggregating_step)) + && isPartitionKeySuitsGroupByKey(*reading, expression_step->getExpression(), *aggregating_step)) { if (reading->requestOutputEachPartitionThroughSeparatePort()) aggregating_step->skipMerging(); diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 80fcc317d61..617de8c8530 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -343,7 +343,6 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas( /// We have a special logic for local replica. It has to read less data, because in some cases it should /// merge states of aggregate functions or do some other important stuff other than reading from Disk. 
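/// (Illustrative aside, not part of the patch.) A worked example of the statement below: the setting simply scales the task size, so with min_marks_for_concurrent_read = 128 and parallel_replicas_single_task_marks_count_multiplier = 0.5 the adjusted value is 128 * 0.5 = 64 marks per task.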
pool_settings.min_marks_for_concurrent_read = static_cast(pool_settings.min_marks_for_concurrent_read * context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier); - size_t total_rows = parts_with_range.getRowsCountAllParts(); auto pool = std::make_shared( std::move(extension), @@ -371,14 +370,6 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas( actions_settings, block_size_copy, reader_settings, virt_column_names); auto source = std::make_shared(std::move(processor)); - - /// Set the approximate number of rows for the first source only - /// In case of parallel processing on replicas do not set approximate rows at all. - /// Because the value will be identical on every replicas and will be accounted - /// multiple times (settings.max_parallel_replicas times more) - if (i == 0 && !client_info.collaborate_with_initiator) - source->addTotalRowsApprox(total_rows); - pipes.emplace_back(std::move(source)); } @@ -2267,10 +2258,7 @@ size_t MergeTreeDataSelectAnalysisResult::marks() const if (std::holds_alternative(result)) std::rethrow_exception(std::get(result)); - const auto & index_stats = std::get(result).index_stats; - if (index_stats.empty()) - return 0; - return index_stats.back().num_granules_after; + return std::get(result).selected_marks; } UInt64 MergeTreeDataSelectAnalysisResult::rows() const @@ -2278,9 +2266,15 @@ UInt64 MergeTreeDataSelectAnalysisResult::rows() const if (std::holds_alternative(result)) std::rethrow_exception(std::get(result)); - const auto & index_stats = std::get(result).index_stats; - if (index_stats.empty()) - return 0; return std::get(result).selected_rows; } + +const RangesInDataParts & MergeTreeDataSelectAnalysisResult::partsWithRanges() const +{ + if (std::holds_alternative(result)) + std::rethrow_exception(std::get(result)); + + return std::get(result).parts_with_ranges; +} + } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index d5948ddd9bf..35310e14416 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -197,13 +197,9 @@ public: bool hasAnalyzedResult() const { return analyzed_result_ptr != nullptr; } void setAnalyzedResult(MergeTreeDataSelectAnalysisResultPtr analyzed_result_ptr_) { analyzed_result_ptr = std::move(analyzed_result_ptr_); } - void resetParts(MergeTreeData::DataPartsVector parts) - { - prepared_parts = std::move(parts); - alter_conversions_for_parts = {}; - } - const MergeTreeData::DataPartsVector & getParts() const { return prepared_parts; } + const std::vector & getAlterConvertionsForParts() const { return alter_conversions_for_parts; } + const MergeTreeData & getMergeTreeData() const { return data; } size_t getMaxBlockSize() const { return block_size.max_block_size_rows; } size_t getNumStreams() const { return requested_num_streams; } @@ -310,6 +306,7 @@ struct MergeTreeDataSelectAnalysisResult bool error() const; size_t marks() const; UInt64 rows() const; + const RangesInDataParts & partsWithRanges() const; }; } diff --git a/src/Processors/Sinks/IOutputChunkGenerator.h b/src/Processors/Sinks/IOutputChunkGenerator.h deleted file mode 100644 index 824313a2394..00000000000 --- a/src/Processors/Sinks/IOutputChunkGenerator.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -/// This interface is meant to be used by the SinkToStorage processor -/// SinkToStorage delegates on it the creation of the data chunk that will deliver to the next stages of the query 
pipeline -/// Default implementation (createDefault() factory method) just forwards everything that it receives -class IOutputChunkGenerator -{ -public: - static std::unique_ptr createCopyRanges(bool deduplicate_later); - static std::unique_ptr createDefault(); - - virtual ~IOutputChunkGenerator() = default; - - virtual void onNewChunkArrived(Chunk chunk) = 0; - virtual void onRowsProcessed(size_t row_count, bool append) = 0; - - virtual Chunk generateChunk() = 0; -}; - -} diff --git a/src/Processors/Sinks/OutputChunkGenerator.cpp b/src/Processors/Sinks/OutputChunkGenerator.cpp deleted file mode 100644 index 942bf49a2d4..00000000000 --- a/src/Processors/Sinks/OutputChunkGenerator.cpp +++ /dev/null @@ -1,91 +0,0 @@ -#include - -namespace DB -{ - -/// Default implementation. The new chunk received is forwarded as-is to the next stages of the query -class ForwardEverythingGenerator : public IOutputChunkGenerator -{ -public: - - explicit ForwardEverythingGenerator() = default; - - void onNewChunkArrived(Chunk chunk) override - { - in_chunk = chunk.clone(); - } - - void onRowsProcessed(size_t /*row_count*/, bool /*append*/) override - {} - - Chunk generateChunk() override - { - return std::move(in_chunk); - } - -private: - Chunk in_chunk; -}; - -/// Specific implementation which generates a chunk with just a subset of the rows received originally -/// Rows are assumed to be processed in the same order than they appear in the original chunk -/// Is up to the client to decide how many rows process at once, but after each range processed, -/// onRowsProcessed() has to be called, indicating whether append that range to the output chunk or not -class CopyRangesGenerator : public IOutputChunkGenerator -{ -public: - explicit CopyRangesGenerator() = default; - - void onNewChunkArrived(Chunk chunk) override - { - out_cols = chunk.cloneEmptyColumns(); - in_chunk = std::move(chunk); - row_offset = 0; - final_chunk_rows = 0; - } - - void onRowsProcessed(size_t row_count, bool append) override - { - if (append) - { - const Columns& in_cols = in_chunk.getColumns(); - for (size_t i = 0; i < out_cols.size(); i++) - { - out_cols[i]->insertRangeFrom(*(in_cols[i]), row_offset, row_count); - } - final_chunk_rows += row_count; - } - - row_offset += row_count; - } - - Chunk generateChunk() override - { - return Chunk(std::move(out_cols), final_chunk_rows); - } - -private: - Chunk in_chunk; - MutableColumns out_cols; - size_t row_offset = 0; - size_t final_chunk_rows = 0; -}; - -std::unique_ptr IOutputChunkGenerator::createCopyRanges(bool deduplicate_later) -{ - // If MV is responsible for deduplication, block won't be considered duplicated. 
- // So default implementation, forwarding all the data, is used - if (deduplicate_later) - { - return createDefault(); - } - - return std::make_unique(); -} - -std::unique_ptr IOutputChunkGenerator::createDefault() -{ - return std::make_unique(); -} - -} diff --git a/src/Processors/Sinks/SinkToStorage.cpp b/src/Processors/Sinks/SinkToStorage.cpp index 84743306446..5f9f9f9b1a1 100644 --- a/src/Processors/Sinks/SinkToStorage.cpp +++ b/src/Processors/Sinks/SinkToStorage.cpp @@ -4,12 +4,7 @@ namespace DB { -SinkToStorage::SinkToStorage(const Block & header) : SinkToStorage(header, IOutputChunkGenerator::createDefault()) {} - -SinkToStorage::SinkToStorage(const Block & header, std::unique_ptr output_generator_) - : ExceptionKeepingTransform(header, header, false), - output_generator(std::move(output_generator_)) -{ } +SinkToStorage::SinkToStorage(const Block & header) : ExceptionKeepingTransform(header, header, false) {} void SinkToStorage::onConsume(Chunk chunk) { @@ -20,15 +15,15 @@ void SinkToStorage::onConsume(Chunk chunk) */ Nested::validateArraySizes(getHeader().cloneWithColumns(chunk.getColumns())); - output_generator->onNewChunkArrived(chunk.clone()); consume(chunk.clone()); + if (!lastBlockIsDuplicate()) + cur_chunk = std::move(chunk); } SinkToStorage::GenerateResult SinkToStorage::onGenerate() { GenerateResult res; - - res.chunk = output_generator->generateChunk(); + res.chunk = std::move(cur_chunk); res.is_done = true; return res; } diff --git a/src/Processors/Sinks/SinkToStorage.h b/src/Processors/Sinks/SinkToStorage.h index dc7ba23b52a..023bbd8b094 100644 --- a/src/Processors/Sinks/SinkToStorage.h +++ b/src/Processors/Sinks/SinkToStorage.h @@ -1,7 +1,6 @@ #pragma once #include #include -#include namespace DB { @@ -14,15 +13,13 @@ friend class PartitionedSink; public: explicit SinkToStorage(const Block & header); - explicit SinkToStorage(const Block & header, std::unique_ptr output_generator_); const Block & getHeader() const { return inputs.front().getHeader(); } void addTableLock(const TableLockHolder & lock) { table_locks.push_back(lock); } protected: virtual void consume(Chunk chunk) = 0; - - IOutputChunkGenerator& getOutputGenerator() { return *output_generator; } + virtual bool lastBlockIsDuplicate() const { return false; } private: std::vector table_locks; @@ -30,7 +27,7 @@ private: void onConsume(Chunk chunk) override; GenerateResult onGenerate() override; - std::unique_ptr output_generator; + Chunk cur_chunk; }; using SinkToStoragePtr = std::shared_ptr; diff --git a/src/Processors/Sources/RemoteSource.cpp b/src/Processors/Sources/RemoteSource.cpp index 74ab3649068..6d2d0dd248b 100644 --- a/src/Processors/Sources/RemoteSource.cpp +++ b/src/Processors/Sources/RemoteSource.cpp @@ -25,6 +25,16 @@ RemoteSource::RemoteSource(RemoteQueryExecutorPtr executor, bool add_aggregation for (auto & type : sample.getDataTypes()) if (typeid_cast(type.get())) add_aggregation_info = true; + + /// Progress method will be called on Progress packet. + query_executor->setProgressCallback([this](const Progress & value) + { + if (value.total_rows_to_read) + addTotalRowsApprox(value.total_rows_to_read); + if (value.total_bytes_to_read) + addTotalBytes(value.total_bytes_to_read); + progress(value.read_rows, value.read_bytes); + }); } RemoteSource::~RemoteSource() = default; @@ -72,16 +82,6 @@ std::optional RemoteSource::tryGenerate() if (!was_query_sent) { - /// Progress method will be called on Progress packet. 
- query_executor->setProgressCallback([this](const Progress & value) - { - if (value.total_rows_to_read) - addTotalRowsApprox(value.total_rows_to_read); - if (value.total_bytes_to_read) - addTotalBytes(value.total_bytes_to_read); - progress(value.read_rows, value.read_bytes); - }); - /// Get rows_before_limit result for remote query from ProfileInfo packet. query_executor->setProfileInfoCallback([this](const ProfileInfo & info) { diff --git a/src/Processors/Transforms/FillingTransform.cpp b/src/Processors/Transforms/FillingTransform.cpp index 9577f7ca7ff..6d6f4b87cef 100644 --- a/src/Processors/Transforms/FillingTransform.cpp +++ b/src/Processors/Transforms/FillingTransform.cpp @@ -226,7 +226,7 @@ FillingTransform::FillingTransform( throw Exception(ErrorCodes::INVALID_WITH_FILL_EXPRESSION, "Incompatible types of WITH FILL expression values with column type {}", type->getName()); - if (isUnsignedInteger(type) && + if (isUInt(type) && ((!descr.fill_from.isNull() && less(descr.fill_from, Field{0}, 1)) || (!descr.fill_to.isNull() && less(descr.fill_to, Field{0}, 1)))) { diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index da40c197b47..e4cfe6a4d35 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -88,9 +88,7 @@ RemoteQueryExecutor::RemoteQueryExecutor( const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_, std::optional extension_) - : header(header_), query(query_), context(context_) - , scalars(scalars_), external_tables(external_tables_), stage(stage_) - , extension(extension_) + : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_) { create_connections = [this, connections_, throttler, extension_](AsyncCallback) mutable { auto res = std::make_unique(std::move(connections_), context->getSettingsRef(), throttler); @@ -105,9 +103,7 @@ RemoteQueryExecutor::RemoteQueryExecutor( const String & query_, const Block & header_, ContextPtr context_, const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_, std::optional extension_) - : header(header_), query(query_), context(context_) - , scalars(scalars_), external_tables(external_tables_), stage(stage_) - , extension(extension_) + : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_) { create_connections = [this, pool, throttler](AsyncCallback async_callback)->std::unique_ptr { @@ -773,4 +769,12 @@ bool RemoteQueryExecutor::hasThrownException() const return got_exception_from_replica || got_unknown_packet_from_replica; } +void RemoteQueryExecutor::setProgressCallback(ProgressCallback callback) +{ + progress_callback = std::move(callback); + + if (extension && extension->parallel_reading_coordinator) + extension->parallel_reading_coordinator->setProgressCallback(progress_callback); +} + } diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index 9972d4dd45d..8884ea091b6 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -165,7 +165,7 @@ public: Block getExtremes() { return std::move(extremes); } /// Set callback for progress. It will be called on Progress packet. 
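/// (Illustrative aside, not part of the patch.) The setter is moved out of line because, as defined in RemoteQueryExecutor.cpp above, it now also forwards the callback to extension->parallel_reading_coordinator when one is attached, so the parallel-replicas coordinator reports progress through the same callback.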
- void setProgressCallback(ProgressCallback callback) { progress_callback = std::move(callback); } + void setProgressCallback(ProgressCallback callback); /// Set callback for profile info. It will be called on ProfileInfo packet. void setProfileInfoCallback(ProfileInfoCallback callback) { profile_info_callback = std::move(callback); } diff --git a/src/Server/GRPCServer.cpp b/src/Server/GRPCServer.cpp index 812c2b5489d..6bb6ba139ad 100644 --- a/src/Server/GRPCServer.cpp +++ b/src/Server/GRPCServer.cpp @@ -161,7 +161,7 @@ namespace case GRPCObsoleteTransportCompression::NO_COMPRESSION: res.algorithm = GRPC_COMPRESS_NONE; break; case GRPCObsoleteTransportCompression::DEFLATE: res.algorithm = GRPC_COMPRESS_DEFLATE; break; case GRPCObsoleteTransportCompression::GZIP: res.algorithm = GRPC_COMPRESS_GZIP; break; - case GRPCObsoleteTransportCompression::STREAM_GZIP: res.algorithm = GRPC_COMPRESS_STREAM_GZIP; break; + case GRPCObsoleteTransportCompression::STREAM_GZIP: throw Exception(ErrorCodes::INVALID_GRPC_QUERY_INFO, "STREAM_GZIP is no longer supported"); /// was flagged experimental in gRPC, removed as per v1.44 default: throw Exception(ErrorCodes::INVALID_GRPC_QUERY_INFO, "Unknown compression algorithm: {}", GRPCObsoleteTransportCompression::CompressionAlgorithm_Name(query_info.obsolete_result_compression().algorithm())); } @@ -206,7 +206,7 @@ namespace else if (str == "gzip") algorithm = GRPC_COMPRESS_GZIP; else if (str == "stream_gzip") - algorithm = GRPC_COMPRESS_STREAM_GZIP; + throw Exception(ErrorCodes::INVALID_GRPC_QUERY_INFO, "STREAM_GZIP is no longer supported"); /// was flagged experimental in gRPC, removed as per v1.44 else throw Exception(error_code, "Unknown compression algorithm: '{}'", str); } diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index a2d067af387..f9cd3b40f4a 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -886,6 +886,7 @@ void HTTPHandler::processQuery( /* allow_into_outfile = */ false, context, set_query_result, + QueryFlags{}, {}, handle_exception_in_output_format); diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index 742300f9b2e..9b8fd069531 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -110,13 +111,13 @@ struct SocketInterruptablePollWrapper return pipe.fds_rw[1]; } - PollResult poll(Poco::Timespan remaining_time, const std::shared_ptr & in) + PollResult poll(Poco::Timespan remaining_time, const ReadBufferFromPocoSocket & in) { bool socket_ready = false; bool fd_ready = false; - if (in->available() != 0) + if (in.available() != 0) socket_ready = true; if (response_in.available() != 0) @@ -242,12 +243,15 @@ KeeperTCPHandler::KeeperTCPHandler( KeeperTCPHandler::registerConnection(this); } -void KeeperTCPHandler::sendHandshake(bool has_leader) +void KeeperTCPHandler::sendHandshake(bool has_leader, bool & use_compression) { Coordination::write(Coordination::SERVER_HANDSHAKE_LENGTH, *out); if (has_leader) { - Coordination::write(Coordination::ZOOKEEPER_PROTOCOL_VERSION, *out); + if (use_compression) + Coordination::write(Coordination::ZOOKEEPER_PROTOCOL_VERSION_WITH_COMPRESSION, *out); + else + Coordination::write(Coordination::ZOOKEEPER_PROTOCOL_VERSION, *out); } else { @@ -269,7 +273,7 @@ void KeeperTCPHandler::run() runImpl(); } -Poco::Timespan KeeperTCPHandler::receiveHandshake(int32_t handshake_length) +Poco::Timespan KeeperTCPHandler::receiveHandshake(int32_t handshake_length, 
bool & use_compression) { int32_t protocol_version; int64_t last_zxid_seen; @@ -282,9 +286,11 @@ Poco::Timespan KeeperTCPHandler::receiveHandshake(int32_t handshake_length) Coordination::read(protocol_version, *in); - if (protocol_version != Coordination::ZOOKEEPER_PROTOCOL_VERSION) + if (protocol_version != Coordination::ZOOKEEPER_PROTOCOL_VERSION && protocol_version != Coordination::ZOOKEEPER_PROTOCOL_VERSION_WITH_COMPRESSION) throw Exception(ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT, "Unexpected protocol version: {}", toString(protocol_version)); + use_compression = (protocol_version == Coordination::ZOOKEEPER_PROTOCOL_VERSION_WITH_COMPRESSION); + Coordination::read(last_zxid_seen, *in); Coordination::read(timeout_ms, *in); @@ -309,8 +315,12 @@ void KeeperTCPHandler::runImpl() socket().setSendTimeout(send_timeout); socket().setNoDelay(true); - in = std::make_shared(socket()); - out = std::make_shared(socket()); + in.emplace(socket()); + out.emplace(socket()); + compressed_in.reset(); + compressed_out.reset(); + + bool use_compression = false; if (in->eof()) { @@ -343,7 +353,7 @@ void KeeperTCPHandler::runImpl() try { int32_t handshake_length = header; - auto client_timeout = receiveHandshake(handshake_length); + auto client_timeout = receiveHandshake(handshake_length, use_compression); if (client_timeout.totalMilliseconds() == 0) client_timeout = Poco::Timespan(Coordination::DEFAULT_SESSION_TIMEOUT_MS * Poco::Timespan::MILLISECONDS); @@ -367,20 +377,26 @@ void KeeperTCPHandler::runImpl() catch (const Exception & e) { LOG_WARNING(log, "Cannot receive session id {}", e.displayText()); - sendHandshake(false); + sendHandshake(/* has_leader */ false, use_compression); return; } - sendHandshake(true); + sendHandshake(/* has_leader */ true, use_compression); } else { LOG_WARNING(log, "Ignoring user request, because the server is not active yet"); - sendHandshake(false); + sendHandshake(/* has_leader */ false, use_compression); return; } + if (use_compression) + { + compressed_in.emplace(*in); + compressed_out.emplace(*out, CompressionCodecFactory::instance().get("LZ4",{})); + } + auto response_fd = poll_wrapper->getResponseFD(); auto response_callback = [responses_ = this->responses, response_fd](const Coordination::ZooKeeperResponsePtr & response) { @@ -415,7 +431,7 @@ void KeeperTCPHandler::runImpl() { using namespace std::chrono_literals; - PollResult result = poll_wrapper->poll(session_timeout, in); + PollResult result = poll_wrapper->poll(session_timeout, *in); log_long_operation("Polling socket"); if (result.has_requests && !close_received) { @@ -467,7 +483,8 @@ void KeeperTCPHandler::runImpl() updateStats(response); packageSent(); - response->write(*out); + response->write(getWriteBuffer()); + flushWriteBuffer(); log_long_operation("Sending response"); if (response->error == Coordination::Error::ZSESSIONEXPIRED) { @@ -525,7 +542,7 @@ bool KeeperTCPHandler::tryExecuteFourLetterWordCmd(int32_t command) try { String res = command_ptr->run(); - out->write(res.data(), res.size()); + out->write(res.data(),res.size()); out->next(); } catch (...) 
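Aside: a standalone sketch of the handshake negotiation added above. The numeric constants are placeholders (the real ones are the Coordination::ZOOKEEPER_PROTOCOL_VERSION* constants referenced in the patch); only the accept-or-throw decision logic mirrors receiveHandshake().

    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Placeholder values for illustration only.
    constexpr int32_t PROTOCOL_VERSION = 0;
    constexpr int32_t PROTOCOL_VERSION_WITH_COMPRESSION = 10;

    // Returns whether compression should be enabled for the connection.
    bool negotiateCompression(int32_t client_protocol_version)
    {
        if (client_protocol_version != PROTOCOL_VERSION
            && client_protocol_version != PROTOCOL_VERSION_WITH_COMPRESSION)
            throw std::runtime_error("Unexpected protocol version: " + std::to_string(client_protocol_version));

        // Compression is used only when the client explicitly asked for it in the handshake.
        return client_protocol_version == PROTOCOL_VERSION_WITH_COMPRESSION;
    }

    int main()
    {
        std::cout << negotiateCompression(PROTOCOL_VERSION) << ' '
                  << negotiateCompression(PROTOCOL_VERSION_WITH_COMPRESSION) << '\n';  // prints: 0 1
    }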
@@ -537,19 +554,41 @@ bool KeeperTCPHandler::tryExecuteFourLetterWordCmd(int32_t command) } } +WriteBuffer & KeeperTCPHandler::getWriteBuffer() +{ + if (compressed_out) + return *compressed_out; + return *out; +} + +void KeeperTCPHandler::flushWriteBuffer() +{ + if (compressed_out) + compressed_out->next(); + out->next(); +} + +ReadBuffer & KeeperTCPHandler::getReadBuffer() +{ + if (compressed_in) + return *compressed_in; + return *in; +} + std::pair KeeperTCPHandler::receiveRequest() { + auto & read_buffer = getReadBuffer(); int32_t length; - Coordination::read(length, *in); + Coordination::read(length, read_buffer); int32_t xid; - Coordination::read(xid, *in); + Coordination::read(xid, read_buffer); Coordination::OpNum opnum; - Coordination::read(opnum, *in); + Coordination::read(opnum, read_buffer); Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(opnum); request->xid = xid; - request->readImpl(*in); + request->readImpl(read_buffer); if (!keeper_dispatcher->putRequest(request, session_id)) throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Session {} already disconnected", session_id); diff --git a/src/Server/KeeperTCPHandler.h b/src/Server/KeeperTCPHandler.h index 588cdf6305e..adb1baa084f 100644 --- a/src/Server/KeeperTCPHandler.h +++ b/src/Server/KeeperTCPHandler.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include namespace DB { @@ -78,15 +80,21 @@ private: Coordination::XID close_xid = Coordination::CLOSE_XID; /// Streams for reading/writing from/to client connection socket. - std::shared_ptr in; - std::shared_ptr out; + std::optional in; + std::optional out; + std::optional compressed_in; + std::optional compressed_out; std::atomic connected{false}; void runImpl(); - void sendHandshake(bool has_leader); - Poco::Timespan receiveHandshake(int32_t handshake_length); + WriteBuffer & getWriteBuffer(); + void flushWriteBuffer(); + ReadBuffer & getReadBuffer(); + + void sendHandshake(bool has_leader, bool & use_compression); + Poco::Timespan receiveHandshake(int32_t handshake_length, bool & use_compression); static bool isHandShake(int32_t handshake_length); bool tryExecuteFourLetterWordCmd(int32_t command); diff --git a/src/Server/MySQLHandler.cpp b/src/Server/MySQLHandler.cpp index f9155a07e2b..21fa7f7227a 100644 --- a/src/Server/MySQLHandler.cpp +++ b/src/Server/MySQLHandler.cpp @@ -378,7 +378,7 @@ void MySQLHandler::comQuery(ReadBuffer & payload, bool binary_protocol) } }; - executeQuery(should_replace ? replacement : payload, *out, false, query_context, set_result_details, format_settings); + executeQuery(should_replace ? 
replacement : payload, *out, false, query_context, set_result_details, QueryFlags{}, format_settings); if (!with_output) packet_endpoint->sendPacket(OKPacket(0x00, client_capabilities, affected_rows, 0, 0), true); diff --git a/src/Server/PostgreSQLHandler.cpp b/src/Server/PostgreSQLHandler.cpp index 7b078154252..3956f795657 100644 --- a/src/Server/PostgreSQLHandler.cpp +++ b/src/Server/PostgreSQLHandler.cpp @@ -7,9 +7,10 @@ #include "PostgreSQLHandler.h" #include #include +#include #include #include -#include +#include #include "config_version.h" @@ -284,8 +285,7 @@ void PostgreSQLHandler::processQuery() if (!parse_res.second) throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse and execute the following part of query: {}", String(parse_res.first)); - std::random_device rd; - std::mt19937 gen(rd()); + pcg64_fast gen{randomSeed()}; std::uniform_int_distribution dis(0, INT32_MAX); for (const auto & spl_query : queries) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 5082d8e4f3b..1da9806b4f5 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -496,7 +496,7 @@ void TCPHandler::runImpl() }); /// Processing Query - std::tie(state.parsed_query, state.io) = executeQuery(state.query, query_context, false, state.stage); + std::tie(state.parsed_query, state.io) = executeQuery(state.query, query_context, QueryFlags{}, state.stage); after_check_cancelled.restart(); after_send_progress.restart(); diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index c6fa17583b5..1f0dcb625f9 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -44,6 +44,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int DUPLICATE_COLUMN; extern const int NOT_IMPLEMENTED; + extern const int SUPPORT_IS_DISABLED; } namespace @@ -1083,6 +1084,13 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const throw Exception(ErrorCodes::BAD_ARGUMENTS, "Data type have to be specified for column {} to add", backQuote(column_name)); + /// FIXME: Adding a new column of type Object(JSON) is broken. + /// The problem seems to be related to the default expression for such a column (the method `getDefault` is not implemented for the data type Object). + /// But after ALTER TABLE ADD COLUMN we need to fill existing rows with something (namely, the default value). + /// So we don't allow it for now. + if (command.data_type->hasDynamicSubcolumns()) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Adding a new column of a type which has dynamic subcolumns to an existing table is not allowed. It has known bugs"); + if (column_name == LightweightDeleteDescription::FILTER_COLUMN.name && std::dynamic_pointer_cast(table)) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot add column {}: " "this column name is reserved for lightweight delete feature", backQuote(column_name)); @@ -1145,17 +1153,22 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const } } - /// The change of data type to/from Object is broken, so disable it for now + /// FIXME: Modifying the column to/from Object(JSON) is broken. + /// The problem seems to be related to the default expression for such a column (the method `getDefault` is not implemented for the data type Object). + /// But after ALTER TABLE MODIFY COLUMN we need to fill existing rows with something (namely, the default value) or calculate the common type for it. + /// So we don't allow it for now.
if (command.data_type) { const GetColumnsOptions options(GetColumnsOptions::AllPhysical); const auto old_data_type = all_columns.getColumn(options, column_name).type; - if (command.data_type->getName().contains("Object") - || old_data_type->getName().contains("Object")) + bool new_type_has_object = command.data_type->hasDynamicSubcolumns(); + bool old_type_has_object = old_data_type->hasDynamicSubcolumns(); + + if (new_type_has_object || old_type_has_object) throw Exception( ErrorCodes::BAD_ARGUMENTS, - "The change of data type {} of column {} to {} is not allowed", + "The change of data type {} of column {} to {} is not allowed. It has known bugs", old_data_type->getName(), backQuote(column_name), command.data_type->getName()); } diff --git a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp b/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp index eae17078577..b8bffb267e5 100644 --- a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp +++ b/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp @@ -157,16 +157,12 @@ struct DeltaLakeMetadataParser::Impl if (json.has("add")) { const auto path = json["add"]["path"].getString(); - const auto [_, inserted] = result.insert(fs::path(configuration.getPath()) / path); - if (!inserted) - throw Exception(ErrorCodes::INCORRECT_DATA, "File already exists {}", path); + result.insert(fs::path(configuration.getPath()) / path); } else if (json.has("remove")) { const auto path = json["remove"]["path"].getString(); - const bool erase = result.erase(fs::path(configuration.getPath()) / path); - if (!erase) - throw Exception(ErrorCodes::INCORRECT_DATA, "File doesn't exist {}", path); + result.erase(fs::path(configuration.getPath()) / path); } } } diff --git a/src/Storages/FileLog/StorageFileLog.cpp b/src/Storages/FileLog/StorageFileLog.cpp index de41ede8a5c..2eea619d654 100644 --- a/src/Storages/FileLog/StorageFileLog.cpp +++ b/src/Storages/FileLog/StorageFileLog.cpp @@ -382,7 +382,7 @@ void StorageFileLog::startup() task->holder->activateAndSchedule(); } -void StorageFileLog::shutdown() +void StorageFileLog::shutdown(bool) { if (task) { diff --git a/src/Storages/FileLog/StorageFileLog.h b/src/Storages/FileLog/StorageFileLog.h index 0fd62a22a18..3cb6ac1fbbf 100644 --- a/src/Storages/FileLog/StorageFileLog.h +++ b/src/Storages/FileLog/StorageFileLog.h @@ -47,7 +47,7 @@ public: bool noPushingToViews() const override { return true; } void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; Pipe read( const Names & column_names, diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index ef00fc70e22..642ad206097 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -29,7 +29,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int NETWORK_ERROR; + extern const int HDFS_ERROR; #if USE_KRB5 extern const int EXCESSIVE_ELEMENT_IN_CONFIG; extern const int KERBEROS_ERROR; @@ -127,7 +127,7 @@ HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::A HDFSBuilderWrapper builder; if (builder.get() == nullptr) - throw Exception(ErrorCodes::NETWORK_ERROR, "Unable to create builder to connect to HDFS: {} {}", + throw Exception(ErrorCodes::HDFS_ERROR, "Unable to create builder to connect to HDFS: {} {}", uri.toString(), String(hdfsGetLastError())); hdfsBuilderConfSetStr(builder.get(), "input.read.timeout", "60000"); // 1 min @@ -178,7 +178,7 @@ HDFSFSPtr createHDFSFS(hdfsBuilder * builder) { HDFSFSPtr 
fs(hdfsBuilderConnect(builder)); if (fs == nullptr) - throw Exception(ErrorCodes::NETWORK_ERROR, "Unable to connect to HDFS: {}", String(hdfsGetLastError())); + throw Exception(ErrorCodes::HDFS_ERROR, "Unable to connect to HDFS: {}", String(hdfsGetLastError())); return fs; } diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 2e0e09c4b18..d827353ad8e 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -35,7 +35,6 @@ #include #include -#include #include #include diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index f5ceca625af..0521f14e0f9 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -564,10 +564,10 @@ public: * @see shutdown() * @see flushAndPrepareForShutdown() */ - void flushAndShutdown() + void flushAndShutdown(bool is_drop = false) { flushAndPrepareForShutdown(); - shutdown(); + shutdown(is_drop); } /** If the table have to do some complicated work when destroying an object - do it in advance. @@ -575,7 +575,7 @@ public: * By default, does nothing. * Can be called simultaneously from different threads, even after a call to drop(). */ - virtual void shutdown() {} + virtual void shutdown(bool is_drop = false) { UNUSED(is_drop); } // NOLINT /// Called before shutdown() to flush data to underlying storage /// Data in memory need to be persistent @@ -593,6 +593,7 @@ public: std::atomic is_dropped{false}; std::atomic is_detached{false}; + std::atomic is_being_restarted{false}; /// Does table support index for IN sections virtual bool supportsIndexForIn() const { return false; } diff --git a/src/Storages/KVStorageUtils.cpp b/src/Storages/KVStorageUtils.cpp index 16ab99d03b4..1884671a41d 100644 --- a/src/Storages/KVStorageUtils.cpp +++ b/src/Storages/KVStorageUtils.cpp @@ -1,5 +1,8 @@ #include +#include +#include + #include #include #include @@ -10,6 +13,9 @@ #include #include +#include + + namespace DB { @@ -121,6 +127,121 @@ bool traverseASTFilter( } return false; } + +bool traverseDAGFilter( + const std::string & primary_key, const DataTypePtr & primary_key_type, const ActionsDAG::Node * elem, const ContextPtr & context, FieldVectorPtr & res) +{ + if (elem->type == ActionsDAG::ActionType::ALIAS) + return traverseDAGFilter(primary_key, primary_key_type, elem->children.at(0), context, res); + + if (elem->type != ActionsDAG::ActionType::FUNCTION) + return false; + + auto func_name = elem->function_base->getName(); + + if (func_name == "and") + { + // one child has the key filter condition is ok + for (const auto * child : elem->children) + if (traverseDAGFilter(primary_key, primary_key_type, child, context, res)) + return true; + return false; + } + else if (func_name == "or") + { + // make sure every child has the key filter condition + for (const auto * child : elem->children) + if (!traverseDAGFilter(primary_key, primary_key_type, child, context, res)) + return false; + return true; + } + else if (func_name == "equals" || func_name == "in") + { + if (elem->children.size() != 2) + return false; + + if (func_name == "in") + { + const auto * key = elem->children.at(0); + while (key->type == ActionsDAG::ActionType::ALIAS) + key = key->children.at(0); + + if (key->type != ActionsDAG::ActionType::INPUT) + return false; + + if (key->result_name != primary_key) + return false; + + const auto * value = elem->children.at(1); + if (value->type != ActionsDAG::ActionType::COLUMN) + return false; + + const IColumn * value_col = value->column.get(); + if (const auto * col_const = 
typeid_cast(value_col)) + value_col = &col_const->getDataColumn(); + + const auto * col_set = typeid_cast(value_col); + if (!col_set) + return false; + + auto future_set = col_set->getData(); + future_set->buildOrderedSetInplace(context); + + auto set = future_set->get(); + if (!set) + return false; + + if (!set->hasExplicitSetElements()) + return false; + + set->checkColumnsNumber(1); + const auto & set_column = *set->getSetElements()[0]; + + if (set_column.getDataType() != primary_key_type->getTypeId()) + return false; + + for (size_t row = 0; row < set_column.size(); ++row) + res->push_back(set_column[row]); + return true; + } + else + { + const auto * key = elem->children.at(0); + while (key->type == ActionsDAG::ActionType::ALIAS) + key = key->children.at(0); + + if (key->type != ActionsDAG::ActionType::INPUT) + return false; + + if (key->result_name != primary_key) + return false; + + const auto * value = elem->children.at(1); + if (value->type != ActionsDAG::ActionType::COLUMN) + return false; + + auto converted_field = convertFieldToType((*value->column)[0], *primary_key_type); + if (!converted_field.isNull()) + res->push_back(converted_field); + return true; + } + } + return false; +} +} + +std::pair getFilterKeys( + const String & primary_key, const DataTypePtr & primary_key_type, const ActionDAGNodes & filter_nodes, const ContextPtr & context) +{ + if (filter_nodes.nodes.empty()) + return {{}, true}; + + auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes, {}, context); + const auto * predicate = filter_actions_dag->getOutputs().at(0); + + FieldVectorPtr res = std::make_shared(); + auto matched_keys = traverseDAGFilter(primary_key, primary_key_type, predicate, context, res); + return std::make_pair(res, !matched_keys); } std::pair getFilterKeys( diff --git a/src/Storages/KVStorageUtils.h b/src/Storages/KVStorageUtils.h index c3bb2aefa62..c6d63b800df 100644 --- a/src/Storages/KVStorageUtils.h +++ b/src/Storages/KVStorageUtils.h @@ -21,6 +21,9 @@ using DataTypePtr = std::shared_ptr; std::pair getFilterKeys( const std::string & primary_key, const DataTypePtr & primary_key_type, const SelectQueryInfo & query_info, const ContextPtr & context); +std::pair getFilterKeys( + const String & primary_key, const DataTypePtr & primary_key_type, const ActionDAGNodes & filter_nodes, const ContextPtr & context); + template void fillColumns(const K & key, const V & value, size_t key_pos, const Block & header, MutableColumns & columns) { diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index b9abe175e5f..423d295cdf2 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -435,7 +435,7 @@ void StorageKafka::startup() } -void StorageKafka::shutdown() +void StorageKafka::shutdown(bool) { for (auto & task : tasks) { diff --git a/src/Storages/Kafka/StorageKafka.h b/src/Storages/Kafka/StorageKafka.h index e08baf9fc80..9280809be0e 100644 --- a/src/Storages/Kafka/StorageKafka.h +++ b/src/Storages/Kafka/StorageKafka.h @@ -50,7 +50,7 @@ public: bool noPushingToViews() const override { return true; } void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; Pipe read( const Names & column_names, diff --git a/src/Storages/LiveView/StorageLiveView.cpp b/src/Storages/LiveView/StorageLiveView.cpp index aec2405b973..3c116321083 100644 --- a/src/Storages/LiveView/StorageLiveView.cpp +++ b/src/Storages/LiveView/StorageLiveView.cpp @@ -253,7 +253,7 @@ 
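Aside: the traverseDAGFilter() logic above follows a simple recursive rule: an AND node needs only one child that restricts the primary key, an OR node needs every child to restrict it, and an equality on the key contributes one candidate value. A minimal standalone sketch of that rule over an invented Node type (not the real ActionsDAG):

    #include <iostream>
    #include <string>
    #include <vector>

    // Invented stand-in for a predicate node, for illustration only.
    struct Node
    {
        std::string kind;                  // "and", "or", "equals" or "other"
        std::string column;                // for "equals": column name
        int value = 0;                     // for "equals": constant on the right-hand side
        std::vector<const Node *> children;
    };

    // Returns true if the predicate restricts the primary key; collects candidate key values into `keys`.
    bool traverse(const Node & node, const std::string & primary_key, std::vector<int> & keys)
    {
        if (node.kind == "equals")
        {
            if (node.column != primary_key)
                return false;
            keys.push_back(node.value);
            return true;
        }

        if (node.kind == "and")
        {
            // One child restricting the key is enough: the conjunction is at least as selective.
            for (const auto * child : node.children)
                if (traverse(*child, primary_key, keys))
                    return true;
            return false;
        }

        if (node.kind == "or")
        {
            // Every branch must restrict the key, otherwise the disjunction may match arbitrary keys.
            for (const auto * child : node.children)
                if (!traverse(*child, primary_key, keys))
                    return false;
            return true;
        }

        return false;
    }

    int main()
    {
        Node k1{"equals", "key", 1, {}};
        Node k2{"equals", "key", 2, {}};
        Node other{"other", "", 0, {}};
        Node disjunction{"or", "", 0, {&k1, &k2}};
        Node predicate{"and", "", 0, {&disjunction, &other}};  // (key = 1 OR key = 2) AND <something else>

        std::vector<int> keys;
        std::cout << traverse(predicate, "key", keys) << ' ' << keys.size() << '\n';  // prints: 1 2
    }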
StorageLiveView::StorageLiveView( StorageLiveView::~StorageLiveView() { - shutdown(); + shutdown(false); } NamesAndTypesList StorageLiveView::getVirtuals() const @@ -289,7 +289,7 @@ void StorageLiveView::startup() periodic_refresh_task->activate(); } -void StorageLiveView::shutdown() +void StorageLiveView::shutdown(bool) { shutdown_called = true; diff --git a/src/Storages/LiveView/StorageLiveView.h b/src/Storages/LiveView/StorageLiveView.h index 92ffd4dc642..e0566d586ee 100644 --- a/src/Storages/LiveView/StorageLiveView.h +++ b/src/Storages/LiveView/StorageLiveView.h @@ -81,7 +81,7 @@ public: void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; Pipe read( const Names & column_names, diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 55aea13c639..9bc72577b25 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -381,11 +381,6 @@ void IMergeTreeDataPart::setState(MergeTreeDataPartState new_state) const incrementStateMetric(state); } -MergeTreeDataPartState IMergeTreeDataPart::getState() const -{ - return state; -} - std::pair IMergeTreeDataPart::getMinMaxDate() const { diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 30c9b19fcbc..a9659d2f5f4 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -54,6 +54,7 @@ enum class DataPartRemovalState NON_UNIQUE_OWNERSHIP, NOT_REACHED_REMOVAL_TIME, HAS_SKIPPED_MUTATION_PARENT, + EMPTY_PART_COVERS_OTHER_PARTS, REMOVED, }; @@ -209,6 +210,8 @@ public: private: String mutable_name; + mutable MergeTreeDataPartState state{MergeTreeDataPartState::Temporary}; + public: const String & name; // const ref to private mutable_name MergeTreePartInfo info; @@ -273,7 +276,7 @@ public: /// Current state of the part. 
If the part is in working set already, it should be accessed via data_parts mutex void setState(MergeTreeDataPartState new_state) const; - MergeTreeDataPartState getState() const; + ALWAYS_INLINE MergeTreeDataPartState getState() const { return state; } static constexpr std::string_view stateString(MergeTreeDataPartState state) { return magic_enum::enum_name(state); } constexpr std::string_view stateString() const { return stateString(state); } @@ -675,8 +678,6 @@ private: void incrementStateMetric(MergeTreeDataPartState state) const; void decrementStateMetric(MergeTreeDataPartState state) const; - mutable MergeTreeDataPartState state{MergeTreeDataPartState::Temporary}; - /// This ugly flag is needed for debug assertions only mutable bool part_is_probably_removed_from_disk = false; }; diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 4ac1662c741..2932bce4262 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1,6 +1,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -10,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -18,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1842,7 +1845,7 @@ bool KeyCondition::extractAtomFromTree(const RPNBuilderTreeNode & node, RPNEleme ColumnsWithTypeAndName arguments{ {nullptr, key_expr_type, ""}, {DataTypeString().createColumnConst(1, common_type_maybe_nullable->getName()), common_type_maybe_nullable, ""}}; - FunctionOverloadResolverPtr func_builder_cast = CastInternalOverloadResolver::createImpl(); + FunctionOverloadResolverPtr func_builder_cast = createInternalCastOverloadResolver(CastType::nonAccurate, {}); auto func_cast = func_builder_cast->build(arguments); /// If we know the given range only contains one value, then we treat all functions as positive monotonic. 
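Stepping back to the KVStorageUtils.cpp addition earlier in this diff: traverseDAGFilter() reduces a filter to primary-key lookups only when the predicate structure allows it — for "and" a single key-only child suffices, for "or" every child must be key-only, and "equals"/"in" contribute the concrete key values. A minimal standalone sketch of that rule (Node and collectKeys() are illustrative stand-ins, not the real ActionsDAG API); collecting a superset of candidate keys is safe here because it only widens what is read.

#include <iostream>
#include <memory>
#include <vector>

struct Node
{
    enum Type { And, Or, EqualsKey, Other } type = Other;
    std::vector<std::shared_ptr<Node>> children;
    int key_value = 0; // value compared against the primary key (EqualsKey only)
};

// Returns true if the subtree can be answered by key lookups; collects the keys into res.
bool collectKeys(const Node & node, std::vector<int> & res)
{
    switch (node.type)
    {
        case Node::EqualsKey:
            res.push_back(node.key_value);
            return true;
        case Node::And:
            // One key-only conjunct is enough: the other conjuncts only shrink the result.
            for (const auto & child : node.children)
                if (collectKeys(*child, res))
                    return true;
            return false;
        case Node::Or:
            // Every disjunct must be key-only, otherwise rows outside the keys could match.
            for (const auto & child : node.children)
                if (!collectKeys(*child, res))
                    return false;
            return true;
        default:
            return false;
    }
}

int main()
{
    // (k = 1 OR k = 2) AND something_else  ->  keys {1, 2}, no full scan needed
    auto eq1 = std::make_shared<Node>(Node{Node::EqualsKey, {}, 1});
    auto eq2 = std::make_shared<Node>(Node{Node::EqualsKey, {}, 2});
    auto or_node = std::make_shared<Node>(Node{Node::Or, {eq1, eq2}, 0});
    auto other = std::make_shared<Node>(Node{Node::Other, {}, 0});
    Node root{Node::And, {or_node, other}, 0};

    std::vector<int> keys;
    bool key_only = collectKeys(root, keys);
    std::cout << "key_only=" << key_only << " keys=" << keys.size() << "\n";
}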
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index aa0b6b2ff37..1c0f9208fef 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2211,6 +2211,15 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) continue; } + /// First remove all covered parts, then remove covering empty part + /// Avoids resurrection of old parts for MergeTree and issues with unexpected parts for Replicated + if (part->rows_count == 0 && !getCoveredOutdatedParts(part, parts_lock).empty()) + { + part->removal_state.store(DataPartRemovalState::EMPTY_PART_COVERS_OTHER_PARTS, std::memory_order_relaxed); + skipped_parts.push_back(part->info); + continue; + } + auto part_remove_time = part->remove_time.load(std::memory_order_relaxed); bool reached_removal_time = part_remove_time <= time_now && time_now - part_remove_time >= getSettings()->old_parts_lifetime.totalSeconds(); if ((reached_removal_time && !has_skipped_mutation_parent(part)) @@ -2627,18 +2636,6 @@ size_t MergeTreeData::clearEmptyParts() if (!part->version.getCreationTID().isPrehistoric() && !part->version.isVisible(TransactionLog::instance().getLatestSnapshot())) continue; - /// Don't drop empty parts that cover other parts - /// Otherwise covered parts resurrect - { - auto lock = lockParts(); - if (part->getState() != DataPartState::Active) - continue; - - DataPartsVector covered_parts = getCoveredOutdatedParts(part, lock); - if (!covered_parts.empty()) - continue; - } - parts_names_to_drop.emplace_back(part->name); } } @@ -3445,8 +3442,6 @@ MergeTreeData::PartHierarchy MergeTreeData::getPartHierarchy( if ((*end)->info == part_info) { result.duplicate_part = *end; - result.covering_parts.clear(); - return result; } if (!part_info.contains((*end)->info)) @@ -3466,6 +3461,9 @@ MergeTreeData::PartHierarchy MergeTreeData::getPartHierarchy( ++end; } + if (begin != committed_parts_range.end() && (*begin)->info == part_info) + ++begin; + result.covered_parts.insert(result.covered_parts.end(), begin, end); return result; @@ -3475,10 +3473,11 @@ MergeTreeData::DataPartsVector MergeTreeData::getCoveredOutdatedParts( const DataPartPtr & part, DataPartsLock & data_parts_lock) const { - part->assertState({DataPartState::Active, DataPartState::PreActive}); + part->assertState({DataPartState::Active, DataPartState::PreActive, DataPartState::Outdated}); + bool is_outdated_part = part->getState() == DataPartState::Outdated; PartHierarchy hierarchy = getPartHierarchy(part->info, DataPartState::Outdated, data_parts_lock); - if (hierarchy.duplicate_part) + if (hierarchy.duplicate_part && !is_outdated_part) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected duplicate part {}. It is a bug.", hierarchy.duplicate_part->getNameWithState()); return hierarchy.covered_parts; @@ -3653,6 +3652,10 @@ bool MergeTreeData::renameTempPartAndReplaceImpl( part->name, hierarchy.intersected_parts.back()->getNameWithState(), hierarchy.intersected_parts.size()); } + if (hierarchy.duplicate_part) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected duplicate part {}. 
It is a bug.", hierarchy.duplicate_part->getNameWithState()); + + if (part->hasLightweightDelete()) has_lightweight_delete_parts.store(true); @@ -3774,7 +3777,7 @@ void MergeTreeData::removePartsFromWorkingSet( void MergeTreeData::removePartsInRangeFromWorkingSet(MergeTreeTransaction * txn, const MergeTreePartInfo & drop_range, DataPartsLock & lock) { - removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper(txn, drop_range, lock); + removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper(txn, drop_range, lock, /*create_empty_part*/ false); } DataPartsVector MergeTreeData::grabActivePartsToRemoveForDropRange( @@ -3849,7 +3852,7 @@ DataPartsVector MergeTreeData::grabActivePartsToRemoveForDropRange( } MergeTreeData::PartsToRemoveFromZooKeeper MergeTreeData::removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper( - MergeTreeTransaction * txn, const MergeTreePartInfo & drop_range, DataPartsLock & lock) + MergeTreeTransaction * txn, const MergeTreePartInfo & drop_range, DataPartsLock & lock, bool create_empty_part) { #ifndef NDEBUG { @@ -3870,6 +3873,42 @@ MergeTreeData::PartsToRemoveFromZooKeeper MergeTreeData::removePartsInRangeFromW /// FIXME refactor removePartsFromWorkingSet(...), do not remove parts twice removePartsFromWorkingSet(txn, parts_to_remove, clear_without_timeout, lock); + bool is_new_syntax = format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING; + if (create_empty_part && !parts_to_remove.empty() && is_new_syntax) + { + /// We are going to remove a lot of parts from zookeeper just after returning from this function. + /// And we will remove parts from disk later (because some queries may use them). + /// But if the server restarts in-between, then it will notice a lot of unexpected parts, + /// so it may refuse to start. Let's create an empty part that covers them. + /// We don't need to commit it to zk, and don't even need to activate it. + + MergeTreePartInfo empty_info = drop_range; + empty_info.level = empty_info.mutation = 0; + if (!empty_info.min_block) + empty_info.min_block = MergeTreePartInfo::MAX_BLOCK_NUMBER; + for (const auto & part : parts_to_remove) + { + empty_info.min_block = std::min(empty_info.min_block, part->info.min_block); + empty_info.level = std::max(empty_info.level, part->info.level); + empty_info.mutation = std::max(empty_info.mutation, part->info.mutation); + } + empty_info.level += 1; + + const auto & partition = parts_to_remove.front()->partition; + String empty_part_name = empty_info.getPartNameAndCheckFormat(format_version); + auto [new_data_part, tmp_dir_holder] = createEmptyPart(empty_info, partition, empty_part_name, NO_TRANSACTION_PTR); + + MergeTreeData::Transaction transaction(*this, NO_TRANSACTION_RAW); + renameTempPartAndAdd(new_data_part, transaction, lock); /// All covered parts must be already removed + + /// It will add the empty part to the set of Outdated parts without making it Active (exactly what we need) + transaction.rollback(&lock); + new_data_part->remove_time.store(0, std::memory_order_relaxed); + /// Such parts are always local, they don't participate in replication, they don't have shared blobs. + /// So we don't have locks for shared data in zk for them, and can just remove blobs (this avoids leaving garbage in S3) + new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::REMOVE_BLOBS; + } + /// Since we can return parts in Deleting state, we have to use a wrapper that restricts access to such parts. 
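The block above constructs an empty part whose info covers every part being dropped, so a restart does not find unexpected parts on disk. A rough standalone illustration of how the covering bounds are derived (simplified PartInfo fields, not the real MergeTreePartInfo):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

struct PartInfo
{
    int64_t min_block = 0;
    int64_t max_block = 0;
    uint32_t level = 0;
    int64_t mutation = 0;
};

// Derive the info of an empty part that covers every part in parts_to_remove,
// starting from the drop range: min over min_block, max over level/mutation,
// then level + 1 so the empty part sorts above everything it covers.
PartInfo coveringEmptyPartInfo(PartInfo drop_range, const std::vector<PartInfo> & parts_to_remove)
{
    PartInfo empty_info = drop_range;
    empty_info.level = 0;
    empty_info.mutation = 0;
    if (empty_info.min_block == 0)
        empty_info.min_block = std::numeric_limits<int64_t>::max();
    for (const auto & part : parts_to_remove)
    {
        empty_info.min_block = std::min(empty_info.min_block, part.min_block);
        empty_info.level = std::max(empty_info.level, part.level);
        empty_info.mutation = std::max(empty_info.mutation, part.mutation);
    }
    empty_info.level += 1;
    return empty_info;
}

int main()
{
    PartInfo drop_range{0, 100, 0, 0};
    std::vector<PartInfo> parts{{1, 10, 2, 5}, {11, 40, 1, 7}, {41, 100, 3, 7}};
    auto info = coveringEmptyPartInfo(drop_range, parts);
    std::cout << info.min_block << " " << info.max_block << " "
              << info.level << " " << info.mutation << "\n"; // 1 100 4 7
}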
PartsToRemoveFromZooKeeper parts_to_remove_from_zookeeper; for (auto & part : parts_to_remove) @@ -6225,7 +6264,7 @@ void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part) precommitted_parts.insert(part); } -void MergeTreeData::Transaction::rollback() +void MergeTreeData::Transaction::rollback(DataPartsLock * lock) { if (!isEmpty()) { @@ -6239,7 +6278,8 @@ void MergeTreeData::Transaction::rollback() for (const auto & part : precommitted_parts) part->version.creation_csn.store(Tx::RolledBackCSN); - auto lock = data.lockParts(); + /// It would be much better with TSA... + auto our_lock = (lock) ? DataPartsLock() : data.lockParts(); if (data.data_parts_indexes.empty()) { @@ -6258,7 +6298,7 @@ void MergeTreeData::Transaction::rollback() { data.removePartsFromWorkingSet(txn, DataPartsVector(precommitted_parts.begin(), precommitted_parts.end()), - /* clear_without_timeout = */ true, &lock); + /* clear_without_timeout = */ true, &our_lock); } } @@ -6458,7 +6498,6 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( bool has_filter, const SelectQueryInfo & query_info, const DataPartsVector & parts, - DataPartsVector & normal_parts, const PartitionIdToMaxBlock * max_block_numbers_to_read, ContextPtr query_context) const { @@ -6583,11 +6622,11 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( continue; } + /// It's extremely rare that some parts have final marks while others don't. To make it + /// straightforward, disable minmax_count projection when `max(pk)' encounters any part with + /// no final mark. if (need_primary_key_max_column && !part->index_granularity.hasFinalMark()) - { - normal_parts.push_back(part); - continue; - } + return {}; real_parts.push_back(part); filter_column_data.back() = 1; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index aab04260b0e..54104849fe4 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -258,7 +258,7 @@ public: void addPart(MutableDataPartPtr & part); - void rollback(); + void rollback(DataPartsLock * lock = nullptr); /// Immediately remove parts from table's data_parts set and change part /// state to temporary. Useful for new parts which not present in table. @@ -401,17 +401,12 @@ public: /// query_info - used to filter unneeded parts /// /// parts - part set to filter - /// - /// normal_parts - collects parts that don't have all the needed values to form the block. - /// Specifically, this is when a part doesn't contain a final mark and the related max value is - /// required. Block getMinMaxCountProjectionBlock( const StorageMetadataPtr & metadata_snapshot, const Names & required_columns, bool has_filter, const SelectQueryInfo & query_info, const DataPartsVector & parts, - DataPartsVector & normal_parts, const PartitionIdToMaxBlock * max_block_numbers_to_read, ContextPtr query_context) const; @@ -649,7 +644,7 @@ public: /// It includes parts that have been just removed by these method /// and Outdated parts covered by drop_range that were removed earlier for any reason. 
PartsToRemoveFromZooKeeper removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper( - MergeTreeTransaction * txn, const MergeTreePartInfo & drop_range, DataPartsLock & lock); + MergeTreeTransaction * txn, const MergeTreePartInfo & drop_range, DataPartsLock & lock, bool create_empty_part = true); /// Restores Outdated part and adds it to working set void restoreAndActivatePart(const DataPartPtr & part, DataPartsLock * acquired_lock = nullptr); diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index be5e7c5a938..e521491c2d5 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -828,8 +828,8 @@ std::optional> MergeTreeDataSelectExecutor::filterPar } void MergeTreeDataSelectExecutor::filterPartsByPartition( - std::optional & partition_pruner, - std::optional & minmax_idx_condition, + const std::optional & partition_pruner, + const std::optional & minmax_idx_condition, MergeTreeData::DataPartsVector & parts, std::vector & alter_conversions, const std::optional> & part_values, @@ -1288,6 +1288,8 @@ MergeTreeDataSelectAnalysisResultPtr MergeTreeDataSelectExecutor::estimateNumMar selectColumnNames(column_names_to_return, data, real_column_names, virt_column_names, sample_factor_column_queried); std::optional indexes; + /// NOTE: We don't need alter_conversions because the returned analysis_result is only used for: + /// 1. estimate the number of rows to read; 2. projection reading, which doesn't have alter_conversions. return ReadFromMergeTree::selectRangesToRead( std::move(parts), /*alter_conversions=*/ {}, @@ -1824,7 +1826,7 @@ void MergeTreeDataSelectExecutor::selectPartsToRead( const std::optional> & part_values, const std::optional & minmax_idx_condition, const DataTypes & minmax_columns_types, - std::optional & partition_pruner, + const std::optional & partition_pruner, const PartitionIdToMaxBlock * max_block_numbers_to_read, PartFilterCounters & counters) { @@ -1886,7 +1888,7 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter( MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids, const std::optional & minmax_idx_condition, const DataTypes & minmax_columns_types, - std::optional & partition_pruner, + const std::optional & partition_pruner, const PartitionIdToMaxBlock * max_block_numbers_to_read, ContextPtr query_context, PartFilterCounters & counters, diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index d5d8107db48..01c2da9dd63 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -126,7 +126,7 @@ private: const std::optional> & part_values, const std::optional & minmax_idx_condition, const DataTypes & minmax_columns_types, - std::optional & partition_pruner, + const std::optional & partition_pruner, const PartitionIdToMaxBlock * max_block_numbers_to_read, PartFilterCounters & counters); @@ -138,7 +138,7 @@ private: MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids, const std::optional & minmax_idx_condition, const DataTypes & minmax_columns_types, - std::optional & partition_pruner, + const std::optional & partition_pruner, const PartitionIdToMaxBlock * max_block_numbers_to_read, ContextPtr query_context, PartFilterCounters & counters, @@ -178,8 +178,8 @@ public: /// Filter parts using minmax index and partition key. 
static void filterPartsByPartition( - std::optional & partition_pruner, - std::optional & minmax_idx_condition, + const std::optional & partition_pruner, + const std::optional & minmax_idx_condition, MergeTreeData::DataPartsVector & parts, std::vector & alter_conversions, const std::optional> & part_values, diff --git a/src/Storages/MergeTree/MergeTreeSplitPrewhereIntoReadSteps.cpp b/src/Storages/MergeTree/MergeTreeSplitPrewhereIntoReadSteps.cpp index f858cb95846..43e3b0c505a 100644 --- a/src/Storages/MergeTree/MergeTreeSplitPrewhereIntoReadSteps.cpp +++ b/src/Storages/MergeTree/MergeTreeSplitPrewhereIntoReadSteps.cpp @@ -1,9 +1,12 @@ #include #include +#include #include #include +#include #include + namespace DB { @@ -160,7 +163,7 @@ const ActionsDAG::Node & addCast( const auto * cast_type_constant_node = &dag->addColumn(std::move(column)); ActionsDAG::NodeRawConstPtrs children = {&node_to_cast, cast_type_constant_node}; - FunctionOverloadResolverPtr func_builder_cast = CastInternalOverloadResolver::createImpl(); + FunctionOverloadResolverPtr func_builder_cast = createInternalCastOverloadResolver(CastType::nonAccurate, {}); return addFunction(dag, func_builder_cast, std::move(children), node_remap); } diff --git a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp index 3834e175b78..9137dc89705 100644 --- a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp +++ b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp @@ -15,6 +15,7 @@ #include #include #include "IO/WriteBufferFromString.h" +#include #include "Storages/MergeTree/RangesInDataPart.h" #include "Storages/MergeTree/RequestResponse.h" #include @@ -78,6 +79,7 @@ public: Stats stats; size_t replicas_count{0}; size_t unavailable_replicas_count{0}; + ProgressCallback progress_callback; explicit ImplInterface(size_t replicas_count_) : stats{replicas_count_} @@ -88,6 +90,8 @@ public: virtual ParallelReadResponse handleRequest(ParallelReadRequest request) = 0; virtual void handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement) = 0; virtual void markReplicaAsUnavailable(size_t replica_number) = 0; + + void setProgressCallback(ProgressCallback callback) { progress_callback = std::move(callback); } }; using Parts = std::set; @@ -231,6 +235,20 @@ void DefaultCoordinator::finalizeReadingState() delayed_parts.pop_front(); } + // update progress with total rows + if (progress_callback) + { + size_t total_rows_to_read = 0; + for (const auto & part : all_parts_to_read) + total_rows_to_read += part.description.rows; + + Progress progress; + progress.total_rows_to_read = total_rows_to_read; + progress_callback(progress); + + LOG_DEBUG(log, "Total rows to read: {}", total_rows_to_read); + } + LOG_DEBUG(log, "Reading state is fully initialized: {}", fmt::join(all_parts_to_read, "; ")); } @@ -361,6 +379,7 @@ public: void markReplicaAsUnavailable(size_t replica_number) override; Parts all_parts_to_read; + size_t total_rows_to_read = 0; Poco::Logger * log = &Poco::Logger::get(fmt::format("{}{}", magic_enum::enum_name(mode), "Coordinator")); }; @@ -381,6 +400,8 @@ void InOrderCoordinator::handleInitialAllRangesAnnouncement(InitialAllRang { LOG_TRACE(log, "Received an announcement {}", announcement.describe()); + size_t new_rows_to_read = 0; + /// To get rid of duplicates for (auto && part: announcement.description) { @@ -401,10 +422,23 @@ void InOrderCoordinator::handleInitialAllRangesAnnouncement(InitialAllRang if 
(covering_or_the_same_it != all_parts_to_read.end()) continue; + new_rows_to_read += part.rows; + auto [inserted_it, _] = all_parts_to_read.emplace(Part{.description = std::move(part), .replicas = {announcement.replica_num}}); auto & ranges = inserted_it->description.ranges; std::sort(ranges.begin(), ranges.end()); } + + if (new_rows_to_read > 0) + { + Progress progress; + progress.total_rows_to_read = new_rows_to_read; + progress_callback(progress); + + total_rows_to_read += new_rows_to_read; + + LOG_DEBUG(log, "Updated total rows to read: added {} rows, total {} rows", new_rows_to_read, total_rows_to_read); + } } template @@ -508,7 +542,6 @@ void ParallelReplicasReadingCoordinator::handleInitialAllRangesAnnouncement(Init initialize(); } - return pimpl->handleInitialAllRangesAnnouncement(std::move(announcement)); } @@ -543,18 +576,28 @@ void ParallelReplicasReadingCoordinator::initialize() { case CoordinationMode::Default: pimpl = std::make_unique(replicas_count); - return; + break; case CoordinationMode::WithOrder: pimpl = std::make_unique>(replicas_count); - return; + break; case CoordinationMode::ReverseOrder: pimpl = std::make_unique>(replicas_count); - return; + break; } + if (progress_callback) + pimpl->setProgressCallback(std::move(progress_callback)); } ParallelReplicasReadingCoordinator::ParallelReplicasReadingCoordinator(size_t replicas_count_) : replicas_count(replicas_count_) {} ParallelReplicasReadingCoordinator::~ParallelReplicasReadingCoordinator() = default; +void ParallelReplicasReadingCoordinator::setProgressCallback(ProgressCallback callback) +{ + // store callback since pimpl can be not instantiated yet + progress_callback = std::move(callback); + if (pimpl) + pimpl->setProgressCallback(std::move(progress_callback)); +} + } diff --git a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h index ad8229be2d0..449421797ce 100644 --- a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h +++ b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h @@ -6,10 +6,10 @@ namespace DB { +struct Progress; +using ProgressCallback = std::function; /// The main class to spread mark ranges across replicas dynamically -/// The reason why it uses pimpl - this header file is included in -/// multiple other files like Context or RemoteQueryExecutor class ParallelReplicasReadingCoordinator { public: @@ -27,6 +27,9 @@ public: /// "pending" state waiting for the unavailable replica to send the announcement. 
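The coordinator changes in this diff keep the progress callback on the outer object because the pimpl may not exist yet, forward it once the implementation is created, and have the implementations sum the announced per-part row counts into a single total-rows report. A stripped-down standalone model of that deferral (Coordinator/CoordinatorImpl are illustrative, not the real classes):

#include <functional>
#include <iostream>
#include <memory>
#include <vector>

using ProgressCallback = std::function<void(size_t /*total_rows*/)>;

struct CoordinatorImpl
{
    ProgressCallback progress_callback;
    void setProgressCallback(ProgressCallback cb) { progress_callback = std::move(cb); }

    // Called once the reading state is fully known: report the total a single time.
    void finalize(const std::vector<size_t> & part_rows)
    {
        size_t total = 0;
        for (size_t rows : part_rows)
            total += rows;
        if (progress_callback)
            progress_callback(total);
    }
};

class Coordinator
{
public:
    void setProgressCallback(ProgressCallback cb)
    {
        // Keep a copy: the implementation is created lazily on the first announcement.
        progress_callback = std::move(cb);
        if (impl)
            impl->setProgressCallback(progress_callback);
    }

    void initialize()
    {
        impl = std::make_unique<CoordinatorImpl>();
        if (progress_callback)
            impl->setProgressCallback(progress_callback);
    }

    CoordinatorImpl * get() { return impl.get(); }

private:
    ProgressCallback progress_callback;
    std::unique_ptr<CoordinatorImpl> impl;
};

int main()
{
    Coordinator coordinator;
    coordinator.setProgressCallback([](size_t total) { std::cout << "total rows to read: " << total << "\n"; });
    coordinator.initialize();
    coordinator.get()->finalize({100, 250, 42});
}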
void markReplicaAsUnavailable(size_t replica_number); + /// needed to report total rows to read + void setProgressCallback(ProgressCallback callback); + private: void initialize(); @@ -35,6 +38,7 @@ private: CoordinationMode mode{CoordinationMode::Default}; std::atomic initialized{false}; std::unique_ptr pimpl; + ProgressCallback progress_callback; // store the callback only to bypass it to coordinator implementation }; using ParallelReplicasReadingCoordinatorPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/PartitionPruner.cpp b/src/Storages/MergeTree/PartitionPruner.cpp index 97bb9f3b4d4..a5df08e3df9 100644 --- a/src/Storages/MergeTree/PartitionPruner.cpp +++ b/src/Storages/MergeTree/PartitionPruner.cpp @@ -31,7 +31,7 @@ PartitionPruner::PartitionPruner(const StorageMetadataPtr & metadata, ActionsDAG { } -bool PartitionPruner::canBePruned(const IMergeTreeDataPart & part) +bool PartitionPruner::canBePruned(const IMergeTreeDataPart & part) const { if (part.isEmpty()) return true; diff --git a/src/Storages/MergeTree/PartitionPruner.h b/src/Storages/MergeTree/PartitionPruner.h index 7f1b74795c4..e8a740b1524 100644 --- a/src/Storages/MergeTree/PartitionPruner.h +++ b/src/Storages/MergeTree/PartitionPruner.h @@ -16,14 +16,15 @@ public: PartitionPruner(const StorageMetadataPtr & metadata, const SelectQueryInfo & query_info, ContextPtr context, bool strict); PartitionPruner(const StorageMetadataPtr & metadata, ActionsDAGPtr filter_actions_dag, ContextPtr context, bool strict); - bool canBePruned(const IMergeTreeDataPart & part); + bool canBePruned(const IMergeTreeDataPart & part) const; bool isUseless() const { return useless; } const KeyCondition & getKeyCondition() const { return partition_condition; } private: - std::unordered_map partition_filter_map; + /// Cache already analyzed partitions. + mutable std::unordered_map partition_filter_map; /// partition_key is adjusted here (with substitution from modulo to moduloLegacy). 
KeyDescription partition_key; diff --git a/src/Storages/MergeTree/RangesInDataPart.cpp b/src/Storages/MergeTree/RangesInDataPart.cpp index e64e9ab0b2a..c46385e84ef 100644 --- a/src/Storages/MergeTree/RangesInDataPart.cpp +++ b/src/Storages/MergeTree/RangesInDataPart.cpp @@ -32,6 +32,7 @@ void RangesInDataPartDescription::serialize(WriteBuffer & out) const { info.serialize(out); ranges.serialize(out); + writeVarUInt(rows, out); } String RangesInDataPartDescription::describe() const @@ -45,6 +46,7 @@ void RangesInDataPartDescription::deserialize(ReadBuffer & in) { info.deserialize(in); ranges.deserialize(in); + readVarUInt(rows, in); } void RangesInDataPartsDescription::serialize(WriteBuffer & out) const @@ -82,6 +84,7 @@ RangesInDataPartDescription RangesInDataPart::getDescription() const return RangesInDataPartDescription{ .info = data_part->info, .ranges = ranges, + .rows = getRowsCount(), }; } diff --git a/src/Storages/MergeTree/RangesInDataPart.h b/src/Storages/MergeTree/RangesInDataPart.h index afb3ad33762..82152ee3906 100644 --- a/src/Storages/MergeTree/RangesInDataPart.h +++ b/src/Storages/MergeTree/RangesInDataPart.h @@ -21,6 +21,7 @@ struct RangesInDataPartDescription { MergeTreePartInfo info; MarkRanges ranges; + size_t rows = 0; void serialize(WriteBuffer & out) const; String describe() const; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 13ce882a525..7de5d46c66b 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -131,8 +131,7 @@ ReplicatedMergeTreeSinkImpl::ReplicatedMergeTreeSinkImpl( bool majority_quorum, ContextPtr context_, bool is_attach_) - : SinkToStorage(metadata_snapshot_->getSampleBlock(), - IOutputChunkGenerator::createCopyRanges(context_->getSettingsRef().deduplicate_blocks_in_dependent_materialized_views)) + : SinkToStorage(metadata_snapshot_->getSampleBlock()) , storage(storage_) , metadata_snapshot(metadata_snapshot_) , required_quorum_size(majority_quorum ? std::nullopt : std::make_optional(quorum_size)) @@ -402,7 +401,13 @@ void ReplicatedMergeTreeSinkImpl::consume(Chunk chunk) finishDelayedChunk(zookeeper); delayed_chunk = std::make_unique(); delayed_chunk->partitions = std::move(partitions); - finishDelayedChunk(zookeeper); + + /// If deduplicated data should not be inserted into MV, we need to set proper + /// value for `last_block_is_duplicate`, which is possible only after the part is committed. + /// Otherwise we can delay commit. + /// TODO: we can also delay commit if there are no MVs.
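The ReplicatedMergeTreeSink hunk continuing below commits the delayed chunk inside consume() only when deduplicate_blocks_in_dependent_materialized_views is disabled; otherwise the commit is deferred to the next consume() call or to onFinish(), and lastBlockIsDuplicate() reports false because the views deduplicate on their own. A schematic standalone model of that control flow (SinkModel is a made-up stand-in, not the real sink API):

#include <iostream>
#include <optional>

struct Chunk { bool will_be_deduplicated = false; };

class SinkModel
{
public:
    explicit SinkModel(bool dedup_in_mv) : dedup_in_mv_enabled(dedup_in_mv) {}

    void consume(Chunk chunk)
    {
        finishDelayedChunk(); // commit whatever was left over from the previous call
        delayed = chunk;
        // When MV deduplication is off we can commit right away; otherwise the
        // commit (and the duplicate flag) is resolved later.
        if (!dedup_in_mv_enabled)
            finishDelayedChunk();
    }

    void onFinish() { finishDelayedChunk(); }

    bool lastBlockIsDuplicate() const
    {
        // If the views themselves deduplicate, never report the block as a duplicate.
        if (dedup_in_mv_enabled)
            return false;
        return last_block_is_duplicate;
    }

private:
    void finishDelayedChunk()
    {
        if (!delayed)
            return;
        last_block_is_duplicate = delayed->will_be_deduplicated; // stands in for commitPart(...)
        std::cout << "committed, duplicate=" << last_block_is_duplicate << "\n";
        delayed.reset();
    }

    bool dedup_in_mv_enabled = false;
    bool last_block_is_duplicate = false;
    std::optional<Chunk> delayed;
};

int main()
{
    SinkModel sink(/*dedup_in_mv=*/true);
    sink.consume(Chunk{true}); // nothing committed yet
    sink.onFinish();           // committed here
    std::cout << "lastBlockIsDuplicate=" << sink.lastBlockIsDuplicate() << "\n";
}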
+ if (!settings.deduplicate_blocks_in_dependent_materialized_views) + finishDelayedChunk(zookeeper); ++num_blocks_processed; } @@ -413,6 +418,8 @@ void ReplicatedMergeTreeSinkImpl::finishDelayedChunk(const ZooKeeperWithF if (!delayed_chunk) return; + last_block_is_duplicate = false; + for (auto & partition : delayed_chunk->partitions) { ProfileEventsScope scoped_attach(&partition.part_counters); @@ -423,10 +430,9 @@ void ReplicatedMergeTreeSinkImpl::finishDelayedChunk(const ZooKeeperWithF try { - const size_t rowsCount = partition.temp_part.part->rows_count; - const bool deduplicated = commitPart(zookeeper, part, partition.block_id, delayed_chunk->replicas_num, false).second; + bool deduplicated = commitPart(zookeeper, part, partition.block_id, delayed_chunk->replicas_num, false).second; - getOutputGenerator().onRowsProcessed(rowsCount, !deduplicated); + last_block_is_duplicate = last_block_is_duplicate || deduplicated; /// Set a special error code if the block is duplicate int error = (deduplicate && deduplicated) ? ErrorCodes::INSERT_WAS_DEDUPLICATED : 0; @@ -1085,6 +1091,13 @@ void ReplicatedMergeTreeSinkImpl::onStart() storage.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event, context, true); } +template +void ReplicatedMergeTreeSinkImpl::onFinish() +{ + auto zookeeper = storage.getZooKeeper(); + finishDelayedChunk(std::make_shared(zookeeper)); +} + template void ReplicatedMergeTreeSinkImpl::waitForQuorum( const ZooKeeperWithFaultInjectionPtr & zookeeper, diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index b208154631c..4a192a822f5 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -51,12 +51,23 @@ public: void onStart() override; void consume(Chunk chunk) override; + void onFinish() override; String getName() const override { return "ReplicatedMergeTreeSink"; } /// For ATTACHing existing data on filesystem. bool writeExistingPart(MergeTreeData::MutableDataPartPtr & part); + /// For proper deduplication in MaterializedViews + bool lastBlockIsDuplicate() const override + { + /// If MV is responsible for deduplication, block is not considered duplicating. + if (context->getSettingsRef().deduplicate_blocks_in_dependent_materialized_views) + return false; + + return last_block_is_duplicate; + } + struct DelayedChunk; private: using BlockIDsType = std::conditional_t, String>; @@ -111,6 +122,7 @@ private: bool is_attach = false; bool quorum_parallel = false; const bool deduplicate = true; + bool last_block_is_duplicate = false; UInt64 num_blocks_processed = 0; using Logger = Poco::Logger; diff --git a/src/Storages/NATS/StorageNATS.cpp b/src/Storages/NATS/StorageNATS.cpp index f3ed8ed7825..cd7e99a6d18 100644 --- a/src/Storages/NATS/StorageNATS.cpp +++ b/src/Storages/NATS/StorageNATS.cpp @@ -419,7 +419,7 @@ void StorageNATS::startup() } -void StorageNATS::shutdown() +void StorageNATS::shutdown(bool /* is_drop */) { shutdown_called = true; diff --git a/src/Storages/NATS/StorageNATS.h b/src/Storages/NATS/StorageNATS.h index cc7b0d88be5..16a162b8500 100644 --- a/src/Storages/NATS/StorageNATS.h +++ b/src/Storages/NATS/StorageNATS.h @@ -31,7 +31,7 @@ public: bool noPushingToViews() const override { return true; } void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; /// This is a bad way to let storage know in shutdown() that table is going to be dropped. 
There are some actions which need /// to be done only when table is dropped (not when detached). Also connection must be closed only in shutdown, but those diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index 18442a8691f..71c1dd7ab69 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -2,8 +2,6 @@ #include -#include - #include #include #include @@ -34,7 +32,7 @@ PartitionedSink::PartitionedSink( , sample_block(sample_block_) { ASTs arguments(1, partition_by); - ASTPtr partition_by_string = makeASTFunction(FunctionToString::name, std::move(arguments)); + ASTPtr partition_by_string = makeASTFunction("toString", std::move(arguments)); auto syntax_result = TreeRewriter(context).analyze(partition_by_string, sample_block.getNamesAndTypesList()); partition_by_expr = ExpressionAnalyzer(partition_by_string, syntax_result, context).getActions(false); diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp index d83722dba6c..a7650983db8 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp @@ -228,7 +228,7 @@ void StorageMaterializedPostgreSQL::set(StoragePtr nested_storage) } -void StorageMaterializedPostgreSQL::shutdown() +void StorageMaterializedPostgreSQL::shutdown(bool) { if (replication_handler) replication_handler->shutdown(); diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h index af0adb10f9f..ca7b801cb7c 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h @@ -81,7 +81,7 @@ public: String getName() const override { return "MaterializedPostgreSQL"; } - void shutdown() override; + void shutdown(bool is_drop) override; /// Used only for single MaterializedPostgreSQL storage. void dropInnerTableIfAny(bool sync, ContextPtr local_context) override; diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 3fca458310c..ec2e002b285 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -801,7 +801,7 @@ void StorageRabbitMQ::startup() } -void StorageRabbitMQ::shutdown() +void StorageRabbitMQ::shutdown(bool) { shutdown_called = true; diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.h b/src/Storages/RabbitMQ/StorageRabbitMQ.h index a5ff60f0c6e..120930cf01d 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.h +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.h @@ -34,7 +34,7 @@ public: bool noPushingToViews() const override { return true; } void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; /// This is a bad way to let storage know in shutdown() that table is going to be dropped. There are some actions which need /// to be done only when table is dropped (not when detached). 
Also connection must be closed only in shutdown, but those diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 676a4b6dc17..42519c84f35 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -20,6 +20,9 @@ #include #include +#include +#include +#include #include #include @@ -440,7 +443,46 @@ void StorageEmbeddedRocksDB::initDB() } } -Pipe StorageEmbeddedRocksDB::read( +class ReadFromEmbeddedRocksDB : public SourceStepWithFilter +{ +public: + std::string getName() const override { return "ReadFromEmbeddedRocksDB"; } + void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; + void applyFilters() override; + + ReadFromEmbeddedRocksDB( + Block sample_block, + StorageSnapshotPtr storage_snapshot_, + const StorageEmbeddedRocksDB & storage_, + SelectQueryInfo query_info_, + ContextPtr context_, + size_t max_block_size_, + size_t num_streams_) + : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}) + , storage_snapshot(std::move(storage_snapshot_)) + , storage(storage_) + , query_info(std::move(query_info_)) + , context(std::move(context_)) + , max_block_size(max_block_size_) + , num_streams(num_streams_) + { + } + +private: + StorageSnapshotPtr storage_snapshot; + const StorageEmbeddedRocksDB & storage; + SelectQueryInfo query_info; + ContextPtr context; + + size_t max_block_size; + size_t num_streams; + + FieldVectorPtr keys; + bool all_scan = false; +}; + +void StorageEmbeddedRocksDB::read( + QueryPlan & query_plan, const Names & column_names, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, @@ -450,23 +492,39 @@ Pipe StorageEmbeddedRocksDB::read( size_t num_streams) { storage_snapshot->check(column_names); - - FieldVectorPtr keys; - bool all_scan = false; - Block sample_block = storage_snapshot->metadata->getSampleBlock(); - auto primary_key_data_type = sample_block.getByName(primary_key).type; - std::tie(keys, all_scan) = getFilterKeys(primary_key, primary_key_data_type, query_info, context_); + + auto reading = std::make_unique( + std::move(sample_block), + storage_snapshot, + *this, + query_info, + context_, + max_block_size, + num_streams); + + query_plan.addStep(std::move(reading)); +} + +void ReadFromEmbeddedRocksDB::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) +{ + const auto & sample_block = getOutputStream().header; + if (all_scan) { - auto iterator = std::unique_ptr(rocksdb_ptr->NewIterator(rocksdb::ReadOptions())); + auto iterator = std::unique_ptr(storage.rocksdb_ptr->NewIterator(rocksdb::ReadOptions())); iterator->SeekToFirst(); - return Pipe(std::make_shared(*this, sample_block, std::move(iterator), max_block_size)); + auto source = std::make_shared(storage, sample_block, std::move(iterator), max_block_size); + source->setStorageLimits(query_info.storage_limits); + pipeline.init(Pipe(std::move(source))); } else { if (keys->empty()) - return {}; + { + pipeline.init(Pipe(std::make_shared(sample_block))); + return; + } ::sort(keys->begin(), keys->end()); keys->erase(std::unique(keys->begin(), keys->end()), keys->end()); @@ -484,13 +542,22 @@ Pipe StorageEmbeddedRocksDB::read( size_t begin = num_keys * thread_idx / num_threads; size_t end = num_keys * (thread_idx + 1) / num_threads; - pipes.emplace_back(std::make_shared( - *this, sample_block, keys, keys->begin() + begin, keys->begin() + end, max_block_size)); + auto source = 
std::make_shared( + storage, sample_block, keys, keys->begin() + begin, keys->begin() + end, max_block_size); + source->setStorageLimits(query_info.storage_limits); + pipes.emplace_back(std::move(source)); } - return Pipe::unitePipes(std::move(pipes)); + pipeline.init(Pipe::unitePipes(std::move(pipes))); } } +void ReadFromEmbeddedRocksDB::applyFilters() +{ + const auto & sample_block = getOutputStream().header; + auto primary_key_data_type = sample_block.getByName(storage.primary_key).type; + std::tie(keys, all_scan) = getFilterKeys(storage.primary_key, primary_key_data_type, filter_nodes, context); +} + SinkToStoragePtr StorageEmbeddedRocksDB::write( const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/, bool /*async_insert*/) { diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index 336f6a8abe3..11eba607c3a 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -26,6 +26,7 @@ class Context; class StorageEmbeddedRocksDB final : public IStorage, public IKeyValueEntity, WithContext { friend class EmbeddedRocksDBSink; + friend class ReadFromEmbeddedRocksDB; public: StorageEmbeddedRocksDB(const StorageID & table_id_, const String & relative_data_path_, @@ -39,7 +40,8 @@ public: std::string getName() const override { return "EmbeddedRocksDB"; } - Pipe read( + void read( + QueryPlan & query_plan, const Names & column_names, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, diff --git a/src/Storages/S3Queue/S3QueueFilesMetadata.cpp b/src/Storages/S3Queue/S3QueueFilesMetadata.cpp index d952160491c..f49e1d6f25c 100644 --- a/src/Storages/S3Queue/S3QueueFilesMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueFilesMetadata.cpp @@ -197,8 +197,7 @@ S3QueueFilesMetadata::NodeMetadata S3QueueFilesMetadata::createNodeMetadata( return metadata; } -std::pair S3QueueFilesMetadata::trySetFileAsProcessing(const std::string & path) +S3QueueFilesMetadata::ProcessingNodeHolderPtr S3QueueFilesMetadata::trySetFileAsProcessing(const std::string & path) { auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::S3QueueSetFileProcessingMicroseconds); auto file_status = local_file_statuses.get(path, /* create */true); @@ -261,12 +260,12 @@ std::pair S3QueueFilesMetadata::trySetFileAsProcessingForUnorderedMode(const std::string & path) + S3QueueFilesMetadata::ProcessingNodeHolderPtr> S3QueueFilesMetadata::trySetFileAsProcessingForUnorderedMode(const std::string & path, const FileStatusPtr & file_status) { /// In one zookeeper transaction do the following: /// 1. check that corresponding persistent nodes do not exist in processed/ and failed/; @@ -340,7 +339,7 @@ std::pair(node_metadata.processing_id, path, zookeeper_processing_path / node_name, zk_client); + auto holder = std::make_unique(node_metadata.processing_id, path, zookeeper_processing_path / node_name, file_status, zk_client); return std::pair{SetFileProcessingResult::Success, std::move(holder)}; } @@ -363,7 +362,7 @@ std::pair S3QueueFilesMetadata::trySetFileAsProcessingForOrderedMode(const std::string & path) + S3QueueFilesMetadata::ProcessingNodeHolderPtr> S3QueueFilesMetadata::trySetFileAsProcessingForOrderedMode(const std::string & path, const FileStatusPtr & file_status) { /// Same as for Unordered mode. /// The only difference is the check if the file is already processed. 
@@ -394,7 +393,10 @@ std::pairtryMulti(requests, responses); if (code == Coordination::Error::ZOK) { - auto holder = std::make_unique(node_metadata.processing_id, path, zookeeper_processing_path / node_name, zk_client); + auto holder = std::make_unique(node_metadata.processing_id, path, zookeeper_processing_path / node_name, file_status, zk_client); return std::pair{SetFileProcessingResult::Success, std::move(holder)}; } @@ -423,7 +425,7 @@ std::pairpath; - - auto file_status = local_file_statuses.get(path, /* create */false); + auto file_status = holder->getFileStatus(); { std::lock_guard lock(file_status->metadata_lock); file_status->state = FileStatus::State::Processed; @@ -559,7 +559,7 @@ void S3QueueFilesMetadata::setFileFailed(ProcessingNodeHolderPtr holder, const S auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::S3QueueSetFileFailedMicroseconds); const auto & path = holder->path; - auto file_status = local_file_statuses.get(path, /* create */false); + auto file_status = holder->getFileStatus(); { std::lock_guard lock(file_status->metadata_lock); file_status->state = FileStatus::State::Failed; @@ -682,8 +682,10 @@ S3QueueFilesMetadata::ProcessingNodeHolder::ProcessingNodeHolder( const std::string & processing_id_, const std::string & path_, const std::string & zk_node_path_, + FileStatusPtr file_status_, zkutil::ZooKeeperPtr zk_client_) : zk_client(zk_client_) + , file_status(file_status_) , path(path_) , zk_node_path(zk_node_path_) , processing_id(processing_id_) @@ -790,7 +792,18 @@ void S3QueueFilesMetadata::cleanupThreadFuncImpl() const bool check_nodes_ttl = max_set_age_sec > 0; const auto zk_client = getZooKeeper(); - auto nodes = zk_client->getChildren(zookeeper_processed_path); + Strings nodes; + auto code = zk_client->tryGetChildren(zookeeper_processed_path, nodes); + if (code != Coordination::Error::ZOK) + { + if (code == Coordination::Error::ZNONODE) + { + LOG_TEST(log, "A `processed` node is not yet created"); + return; + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected error: {}", magic_enum::enum_name(code)); + } + if (nodes.empty()) { LOG_TEST(log, "A set of nodes is empty"); @@ -869,7 +882,7 @@ void S3QueueFilesMetadata::cleanupThreadFuncImpl() local_file_statuses.remove(node.metadata.file_path, /* if_exists */true); - auto code = zk_client->tryRemove(path); + code = zk_client->tryRemove(path); if (code == Coordination::Error::ZOK) --nodes_to_remove; else @@ -886,7 +899,7 @@ void S3QueueFilesMetadata::cleanupThreadFuncImpl() local_file_statuses.remove(node.metadata.file_path, /* if_exists */true); - auto code = zk_client->tryRemove(path); + code = zk_client->tryRemove(path); if (code != Coordination::Error::ZOK) LOG_ERROR(log, "Failed to remove a node `{}` (code: {})", path.string(), code); } diff --git a/src/Storages/S3Queue/S3QueueFilesMetadata.h b/src/Storages/S3Queue/S3QueueFilesMetadata.h index df9db87a621..f3be7c5c3a0 100644 --- a/src/Storages/S3Queue/S3QueueFilesMetadata.h +++ b/src/Storages/S3Queue/S3QueueFilesMetadata.h @@ -70,7 +70,7 @@ public: using FileStatuses = std::unordered_map; /// Set file as processing, if it is not already processed, failed or processing.
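The cleanupThreadFuncImpl() hunk above switches from getChildren() to tryGetChildren() so that a still-missing `processed` node is treated as "nothing to clean up yet", while any other error is still raised. A small standalone sketch of the same pattern (KeeperClient and its error codes are hypothetical, not the real zkutil interface):

#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

enum class KeeperError { Ok, NoNode, Other };

// Stand-in for a ZooKeeper-like client.
struct KeeperClient
{
    KeeperError tryGetChildren(const std::string & path, std::vector<std::string> & out) const
    {
        if (path != "/processed")
            return KeeperError::NoNode; // node has not been created yet
        out = {"node-0", "node-1"};
        return KeeperError::Ok;
    }
};

void cleanupProcessedNodes(const KeeperClient & client, const std::string & path)
{
    std::vector<std::string> nodes;
    auto code = client.tryGetChildren(path, nodes);
    if (code != KeeperError::Ok)
    {
        // A missing node simply means no file was processed yet: nothing to clean up.
        if (code == KeeperError::NoNode)
        {
            std::cout << "processed node not yet created, skipping cleanup\n";
            return;
        }
        // Everything else is unexpected and should be reported loudly.
        throw std::runtime_error("unexpected Keeper error while listing " + path);
    }
    std::cout << "found " << nodes.size() << " nodes to inspect\n";
}

int main()
{
    KeeperClient client;
    cleanupProcessedNodes(client, "/not-created-yet");
    cleanupProcessedNodes(client, "/processed");
}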
- std::pair trySetFileAsProcessing(const std::string & path); + ProcessingNodeHolderPtr trySetFileAsProcessing(const std::string & path); FileStatusPtr getFileStatus(const std::string & path); @@ -112,8 +112,8 @@ private: AlreadyProcessed, AlreadyFailed, }; - std::pair trySetFileAsProcessingForOrderedMode(const std::string & path); - std::pair trySetFileAsProcessingForUnorderedMode(const std::string & path); + std::pair trySetFileAsProcessingForOrderedMode(const std::string & path, const FileStatusPtr & file_status); + std::pair trySetFileAsProcessingForUnorderedMode(const std::string & path, const FileStatusPtr & file_status); struct NodeMetadata { @@ -153,14 +153,18 @@ public: const std::string & processing_id_, const std::string & path_, const std::string & zk_node_path_, + FileStatusPtr file_status_, zkutil::ZooKeeperPtr zk_client_); ~ProcessingNodeHolder(); + FileStatusPtr getFileStatus() { return file_status; } + private: bool remove(Coordination::Requests * requests = nullptr, Coordination::Responses * responses = nullptr); zkutil::ZooKeeperPtr zk_client; + FileStatusPtr file_status; std::string path; std::string zk_node_path; std::string processing_id; diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 5d957d885f5..1afd17edbe1 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -33,11 +33,9 @@ namespace ErrorCodes StorageS3QueueSource::S3QueueKeyWithInfo::S3QueueKeyWithInfo( const std::string & key_, std::optional info_, - Metadata::ProcessingNodeHolderPtr processing_holder_, - FileStatusPtr file_status_) + Metadata::ProcessingNodeHolderPtr processing_holder_) : StorageS3Source::KeyWithInfo(key_, info_) , processing_holder(processing_holder_) - , file_status(file_status_) { } @@ -57,13 +55,19 @@ StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next() { KeyWithInfoPtr val = glob_iterator->next(); - if (!val || shutdown_called) + if (!val) return {}; - if (auto [processing_holder, processing_file_status] = metadata->trySetFileAsProcessing(val->key); + if (shutdown_called) + { + LOG_TEST(&Poco::Logger::get("StorageS3QueueSource"), "Shutdown was called, stopping file iterator"); + return {}; + } + + if (auto processing_holder = metadata->trySetFileAsProcessing(val->key); processing_holder && !shutdown_called) { - return std::make_shared(val->key, val->info, processing_holder, processing_file_status); + return std::make_shared(val->key, val->info, processing_holder); } } return {}; @@ -84,8 +88,10 @@ StorageS3QueueSource::StorageS3QueueSource( const NamesAndTypesList & requested_virtual_columns_, ContextPtr context_, const std::atomic & shutdown_called_, + const std::atomic & table_is_being_dropped_, std::shared_ptr s3_queue_log_, - const StorageID & storage_id_) + const StorageID & storage_id_, + Poco::Logger * log_) : ISource(header_) , WithContext(context_) , name(std::move(name_)) @@ -94,10 +100,11 @@ StorageS3QueueSource::StorageS3QueueSource( , internal_source(std::move(internal_source_)) , requested_virtual_columns(requested_virtual_columns_) , shutdown_called(shutdown_called_) + , table_is_being_dropped(table_is_being_dropped_) , s3_queue_log(s3_queue_log_) , storage_id(storage_id_) , remove_file_func(remove_file_func_) - , log(&Poco::Logger::get("StorageS3QueueSource")) + , log(log_) { } @@ -132,29 +139,60 @@ Chunk StorageS3QueueSource::generate() if (!reader) break; + const auto * key_with_info = dynamic_cast(&reader.getKeyWithInfo()); + auto file_status = 
key_with_info->processing_holder->getFileStatus(); + if (isCancelled()) { reader->cancel(); + + if (processed_rows_from_file) + { + try + { + files_metadata->setFileFailed(key_with_info->processing_holder, "Cancelled"); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + appendLogElement(reader.getFile(), *file_status, processed_rows_from_file, false); + } + break; } if (shutdown_called) { - if (processed_rows_from_file) - { - /// We could delay shutdown until files, which already started processing before the shutdown, finished. - /// But if files are big and `s3queue_processing_threads_num` is not small, it can take a significant time. - /// Anyway we cannot do anything in case of SIGTERM, so destination table must anyway support deduplication, - /// so here we will rely on it here as well. - LOG_WARNING( - log, "Shutdown called, {} rows are already processed, but file is not fully processed", - processed_rows_from_file); - } - break; - } + if (processed_rows_from_file == 0) + break; - const auto * key_with_info = dynamic_cast(&reader.getKeyWithInfo()); - auto file_status = key_with_info->file_status; + if (table_is_being_dropped) + { + LOG_DEBUG( + log, "Table is being dropped, {} rows are already processed from {}, but file is not fully processed", + processed_rows_from_file, reader.getFile()); + + try + { + files_metadata->setFileFailed(key_with_info->processing_holder, "Table is dropped"); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + appendLogElement(reader.getFile(), *file_status, processed_rows_from_file, false); + + /// Leave the file half processed. Table is being dropped, so we do not care. + break; + } + + LOG_DEBUG(log, "Shutdown called, but file {} is partially processed ({} rows). 
" + "Will process the file fully and then shutdown", + reader.getFile(), processed_rows_from_file); + } auto * prev_scope = CurrentThread::get().attachProfileCountersScope(&file_status->profile_counters); SCOPE_EXIT({ CurrentThread::get().attachProfileCountersScope(prev_scope); }); diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 8af5256899a..542f8e8fd8c 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -30,11 +30,9 @@ public: S3QueueKeyWithInfo( const std::string & key_, std::optional info_, - Metadata::ProcessingNodeHolderPtr processing_holder_, - FileStatusPtr file_status_); + Metadata::ProcessingNodeHolderPtr processing_holder_); Metadata::ProcessingNodeHolderPtr processing_holder; - FileStatusPtr file_status; }; class FileIterator : public IIterator @@ -66,8 +64,10 @@ public: const NamesAndTypesList & requested_virtual_columns_, ContextPtr context_, const std::atomic & shutdown_called_, + const std::atomic & table_is_being_dropped_, std::shared_ptr s3_queue_log_, - const StorageID & storage_id_); + const StorageID & storage_id_, + Poco::Logger * log_); ~StorageS3QueueSource() override; @@ -84,6 +84,7 @@ private: const std::shared_ptr internal_source; const NamesAndTypesList requested_virtual_columns; const std::atomic & shutdown_called; + const std::atomic & table_is_being_dropped; const std::shared_ptr s3_queue_log; const StorageID storage_id; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 72e74d3c2a0..99699aab709 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -161,8 +161,9 @@ void StorageS3Queue::startup() task->activateAndSchedule(); } -void StorageS3Queue::shutdown() +void StorageS3Queue::shutdown(bool is_drop) { + table_is_being_dropped = is_drop; shutdown_called = true; if (task) @@ -257,7 +258,7 @@ std::shared_ptr StorageS3Queue::createSource( return std::make_shared( getName(), read_from_format_info.source_header, std::move(internal_source), files_metadata, after_processing, file_deleter, read_from_format_info.requested_virtual_columns, - local_context, shutdown_called, s3_queue_log, getStorageID()); + local_context, shutdown_called, table_is_being_dropped, s3_queue_log, getStorageID(), log); } bool StorageS3Queue::hasDependencies(const StorageID & table_id) diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index 000015951ea..e594ddcce3e 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -73,10 +73,11 @@ private: std::atomic mv_attached = false; std::atomic shutdown_called = false; + std::atomic table_is_being_dropped = false; Poco::Logger * log; void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; void drop() override; bool supportsSubsetOfColumns(const ContextPtr & context_) const; bool supportsSubcolumns() const override { return true; } diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index d0115bf84df..2e0703a8df3 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -843,8 +843,10 @@ StorageAzureBlobSource::GlobIterator::GlobIterator( /// We don't have to list bucket, because there is no asterisks. 
if (key_prefix.size() == blob_path_with_globs.size()) { - ObjectMetadata object_metadata = object_storage->getObjectMetadata(blob_path_with_globs); - blobs_with_metadata.emplace_back(blob_path_with_globs, object_metadata); + auto object_metadata = object_storage->getObjectMetadata(blob_path_with_globs); + blobs_with_metadata.emplace_back( + blob_path_with_globs, + object_metadata); if (outer_blobs) outer_blobs->emplace_back(blobs_with_metadata.back()); if (file_progress_callback) @@ -923,8 +925,10 @@ RelativePathWithMetadata StorageAzureBlobSource::GlobIterator::next() blobs_with_metadata = std::move(new_batch); if (file_progress_callback) { - for (const auto & [_, info] : blobs_with_metadata) + for (const auto & [relative_path, info] : blobs_with_metadata) + { file_progress_callback(FileProgress(0, info.size_bytes)); + } } } @@ -970,7 +974,7 @@ StorageAzureBlobSource::KeysIterator::KeysIterator( ObjectMetadata object_metadata = object_storage->getObjectMetadata(key); if (file_progress_callback) file_progress_callback(FileProgress(0, object_metadata.size_bytes)); - keys.emplace_back(RelativePathWithMetadata{key, object_metadata}); + keys.emplace_back(key, object_metadata); } if (outer_blobs) @@ -1114,7 +1118,8 @@ StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader() QueryPipelineBuilder builder; std::shared_ptr source; std::unique_ptr read_buf; - std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? tryGetNumRowsFromCache(path_with_metadata) : std::nullopt; + std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files + ? tryGetNumRowsFromCache(path_with_metadata) : std::nullopt; if (num_rows_from_cache) { /// We should not return single chunk with all number of rows, diff --git a/src/Storages/StorageDictionary.cpp b/src/Storages/StorageDictionary.cpp index 592195a918d..4c354371574 100644 --- a/src/Storages/StorageDictionary.cpp +++ b/src/Storages/StorageDictionary.cpp @@ -170,7 +170,7 @@ std::shared_ptr StorageDictionary::getDictionary() const return getContext()->getExternalDictionariesLoader().getDictionary(registered_dictionary_name, getContext()); } -void StorageDictionary::shutdown() +void StorageDictionary::shutdown(bool) { removeDictionaryConfigurationFromRepository(); } diff --git a/src/Storages/StorageDictionary.h b/src/Storages/StorageDictionary.h index aab704305e1..995a0192269 100644 --- a/src/Storages/StorageDictionary.h +++ b/src/Storages/StorageDictionary.h @@ -83,7 +83,7 @@ public: static NamesAndTypesList getNamesAndTypes(const DictionaryStructure & dictionary_structure); bool isDictionary() const override { return true; } - void shutdown() override; + void shutdown(bool is_drop) override; void startup() override; void renameInMemory(const StorageID & new_table_id) override; diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 7705d0f193f..7d6f8aa5812 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -1248,7 +1248,7 @@ void StorageDistributed::initializeFromDisk() } -void StorageDistributed::shutdown() +void StorageDistributed::shutdown(bool) { async_insert_blocker.cancelForever(); @@ -1269,7 +1269,7 @@ void StorageDistributed::drop() // And second time shutdown() should be fast, since none of // DirectoryMonitor should not do anything, because ActionBlocker is // canceled (in shutdown()). 
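The StorageDistributed changes around this point show the pattern this diff applies across storages: drop() now calls shutdown(true), ordinary shutdown paths keep the default false, and storages that need it (for example StorageS3Queue) remember the flag. A condensed standalone sketch of the interface shape (StorageBase/QueueLikeStorage are simplified stand-ins for IStorage and the concrete storages):

#include <atomic>
#include <iostream>

class StorageBase
{
public:
    virtual ~StorageBase() = default;

    void flushAndShutdown(bool is_drop = false)
    {
        flushAndPrepareForShutdown();
        shutdown(is_drop);
    }

    virtual void flushAndPrepareForShutdown() {}
    virtual void shutdown(bool /*is_drop*/ = false) {}
};

class QueueLikeStorage : public StorageBase
{
public:
    void shutdown(bool is_drop) override
    {
        table_is_being_dropped = is_drop;
        shutdown_called = true;
        std::cout << "shutdown, is_drop=" << is_drop << "\n";
    }

    void drop()
    {
        // DROP TABLE: tell shutdown() the table is going away for good,
        // so half-processed work can be abandoned instead of resumed later.
        shutdown(true);
    }

    std::atomic<bool> shutdown_called{false};
    std::atomic<bool> table_is_being_dropped{false};
};

int main()
{
    QueueLikeStorage storage;
    storage.flushAndShutdown(); // DETACH / server stop: is_drop stays false
    storage.drop();             // DROP: is_drop is true
}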
- shutdown(); + shutdown(true); // Distributed table without sharding_key does not allows INSERTs if (relative_data_path.empty()) diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 520e1445d09..a9e5e93cc92 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -137,7 +137,7 @@ public: void alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & table_lock_holder) override; void initializeFromDisk(); - void shutdown() override; + void shutdown(bool is_drop) override; void flushAndPrepareForShutdown() override; void drop() override; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index d54ab988565..97cfd550769 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -398,7 +398,7 @@ void StorageMaterializedView::startup() DatabaseCatalog::instance().addViewDependency(select_query.select_table_id, getStorageID()); } -void StorageMaterializedView::shutdown() +void StorageMaterializedView::shutdown(bool) { auto metadata_snapshot = getInMemoryMetadataPtr(); const auto & select_query = metadata_snapshot->getSelectQuery(); @@ -435,7 +435,13 @@ void StorageMaterializedView::backupData(BackupEntriesCollector & backup_entries { /// We backup the target table's data only if it's inner. if (hasInnerTable()) - getTargetTable()->backupData(backup_entries_collector, data_path_in_backup, partitions); + { + if (auto table = tryGetTargetTable()) + table->backupData(backup_entries_collector, data_path_in_backup, partitions); + else + LOG_WARNING(&Poco::Logger::get("StorageMaterializedView"), + "Inner table does not exist, will not backup any data"); + } } void StorageMaterializedView::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index 0f6a6fd3db7..ae38cfb7e59 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -72,7 +72,7 @@ public: void renameInMemory(const StorageID & new_table_id) override; void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index b6253fa6daf..7958cdf9f50 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1,48 +1,55 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include -#include "DataTypes/IDataType.h" -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include #include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include - +#include +#include +#include 
+#include namespace { @@ -398,6 +405,7 @@ ReadFromMerge::ReadFromMerge( , context(std::move(context_)) , common_processed_stage(processed_stage) { + createChildPlans(); } void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) @@ -408,6 +416,65 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu return; } + QueryPlanResourceHolder resources; + std::vector> pipelines; + + chassert(selected_tables.size() == child_plans.size()); + chassert(selected_tables.size() == table_aliases.size()); + auto table_it = selected_tables.begin(); + for (size_t i = 0; i < selected_tables.size(); ++i, ++table_it) + { + auto & plan = child_plans.at(i); + const auto & table = *table_it; + + const auto storage = std::get<1>(table); + const auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); + const auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, context); + + auto modified_query_info = getModifiedQueryInfo(query_info, context, table, nested_storage_snaphsot); + + auto source_pipeline = createSources( + plan, nested_storage_snaphsot, modified_query_info, common_processed_stage, common_header, table_aliases.at(i), table, context); + + if (source_pipeline && source_pipeline->initialized()) + { + resources.storage_holders.push_back(std::get<1>(table)); + resources.table_locks.push_back(std::get<2>(table)); + + pipelines.emplace_back(std::move(source_pipeline)); + } + } + + if (pipelines.empty()) + { + pipeline.init(Pipe(std::make_shared(output_stream->header))); + return; + } + + pipeline = QueryPipelineBuilder::unitePipelines(std::move(pipelines)); + + if (!query_info.input_order_info) + { + size_t tables_count = selected_tables.size(); + Float64 num_streams_multiplier = std::min( + static_cast(tables_count), + std::max(1UL, static_cast(context->getSettingsRef().max_streams_multiplier_for_merge_tables))); + size_t num_streams = static_cast(requested_num_streams * num_streams_multiplier); + + // It's possible to have many tables read from merge, resize(num_streams) might open too many files at the same time. + // Using narrowPipe instead. But in case of reading in order of primary key, we cannot do it, + // because narrowPipe doesn't preserve order. + pipeline.narrow(num_streams); + } + + pipeline.addResources(std::move(resources)); +} + +void ReadFromMerge::createChildPlans() +{ + if (selected_tables.empty()) + return; + size_t tables_count = selected_tables.size(); Float64 num_streams_multiplier = std::min(static_cast(tables_count), std::max(1UL, static_cast(context->getSettingsRef().max_streams_multiplier_for_merge_tables))); @@ -438,11 +505,6 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu query_info.input_order_info = input_sorting_info; } - auto sample_block = merge_storage_snapshot->getMetadataForQuery()->getSampleBlock(); - - std::vector> pipelines; - QueryPlanResourceHolder resources; - for (const auto & table : selected_tables) { size_t current_need_streams = tables_count >= num_streams ? 
1 : (num_streams / tables_count); @@ -460,7 +522,7 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu if (sampling_requested && !storage->supportsSampling()) throw Exception(ErrorCodes::SAMPLING_NOT_SUPPORTED, "Illegal SAMPLE: table {} doesn't support sampling", storage->getStorageID().getNameForLogs()); - Aliases aliases; + auto & aliases = table_aliases.emplace_back(); auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, context); @@ -479,6 +541,8 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu ASTPtr required_columns_expr_list = std::make_shared(); ASTPtr column_expr; + auto sample_block = merge_storage_snapshot->getMetadataForQuery()->getSampleBlock(); + for (const auto & column : column_names) { const auto column_default = storage_columns.getDefault(column); @@ -515,42 +579,16 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu } } - auto source_pipeline = createSources( + child_plans.emplace_back(createPlanForTable( nested_storage_snaphsot, modified_query_info, common_processed_stage, required_max_block_size, - common_header, - aliases, table, column_names_as_aliases.empty() ? column_names : column_names_as_aliases, context, - current_streams); - - if (source_pipeline && source_pipeline->initialized()) - { - resources.storage_holders.push_back(std::get<1>(table)); - resources.table_locks.push_back(std::get<2>(table)); - - pipelines.emplace_back(std::move(source_pipeline)); - } + current_streams)); } - - if (pipelines.empty()) - { - pipeline.init(Pipe(std::make_shared(output_stream->header))); - return; - } - - pipeline = QueryPipelineBuilder::unitePipelines(std::move(pipelines)); - - if (!query_info.input_order_info) - // It's possible to have many tables read from merge, resize(num_streams) might open too many files at the same time. - // Using narrowPipe instead. But in case of reading in order of primary key, we cannot do it, - // because narrowPipe doesn't preserve order. - pipeline.narrow(num_streams); - - pipeline.addResources(std::move(resources)); } SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & query_info, @@ -616,23 +654,121 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & quer return modified_query_info; } +bool recursivelyApplyToReadingSteps(QueryPlan::Node * node, const std::function & func) +{ + bool ok = true; + for (auto * child : node->children) + ok &= recursivelyApplyToReadingSteps(child, func); + + // This code is mainly meant to be used to call `requestReadingInOrder` on child steps. + // In this case it is ok if one child will read in order and other will not (though I don't know when it is possible), + // the only important part is to acknowledge this at the parent and don't rely on any particular ordering of input data. 
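
recursivelyApplyToReadingSteps() introduced above walks a query-plan subtree, applies the supplied callback to every ReadFromMergeTree step it finds, and AND-folds the results, so callers such as requestReadingInOrder() and applyFilters() learn whether the request reached every reading step. A simplified, self-contained Python sketch of that traversal (the plan nodes here are stand-ins, not the real classes):

    from dataclasses import dataclass, field
    from typing import Callable, List, Optional

    @dataclass
    class PlanNode:
        step: Optional[str] = None                    # e.g. "ReadFromMergeTree" (illustrative)
        children: List["PlanNode"] = field(default_factory=list)

    def apply_to_reading_steps(node: PlanNode, func: Callable[[PlanNode], bool]) -> bool:
        ok = True
        for child in node.children:
            ok &= apply_to_reading_steps(child, func)
        if not ok:
            # Some child refused; report failure to the parent instead of half-applying.
            return False
        if node.step == "ReadFromMergeTree":
            ok &= func(node)
        return ok

    plan = PlanNode(step="Union", children=[
        PlanNode(step="ReadFromMergeTree"),
        PlanNode(step="Expression", children=[PlanNode(step="ReadFromMergeTree")]),
    ])
    print(apply_to_reading_steps(plan, lambda n: True))  # True only if every reading step accepts
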
+ if (!ok) + return false; + + if (auto * read_from_merge_tree = typeid_cast(node->step.get())) + ok &= func(*read_from_merge_tree); + + return ok; +} + QueryPipelineBuilderPtr ReadFromMerge::createSources( + QueryPlan & plan, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & modified_query_info, const QueryProcessingStage::Enum & processed_stage, - const UInt64 max_block_size, const Block & header, const Aliases & aliases, const StorageWithLockAndName & storage_with_lock, + ContextMutablePtr modified_context, + bool concat_streams) const +{ + if (!plan.isInitialized()) + return std::make_unique(); + + QueryPipelineBuilderPtr builder; + + const auto & [database_name, storage, _, table_name] = storage_with_lock; + bool allow_experimental_analyzer = modified_context->getSettingsRef().allow_experimental_analyzer; + auto storage_stage + = storage->getQueryProcessingStage(modified_context, QueryProcessingStage::Complete, storage_snapshot, modified_query_info); + + builder = plan.buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(modified_context), BuildQueryPipelineSettings::fromContext(modified_context)); + + if (processed_stage > storage_stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) + { + /** Materialization is needed, since from distributed storage the constants come materialized. + * If you do not do this, different types (Const and non-Const) columns will be produced in different threads, + * And this is not allowed, since all code is based on the assumption that in the block stream all types are the same. + */ + builder->addSimpleTransform([](const Block & stream_header) { return std::make_shared(stream_header); }); + } + + if (builder->initialized()) + { + if (concat_streams && builder->getNumStreams() > 1) + { + // It's possible to have many tables read from merge, resize(1) might open too many files at the same time. + // Using concat instead. + builder->addTransform(std::make_shared(builder->getHeader(), builder->getNumStreams())); + } + + /// Add virtual columns if we don't already have them. 
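
The comment in the code above explains the stream-merging choice: with many source tables, resizing all per-table streams into one output would keep every source (and its files) open at once, so the streams are concatenated and drained one source at a time. A rough, purely illustrative Python picture of that sequential consumption with lazy generators:

    import itertools

    def table_rows(table_name):
        # Stand-in for one per-table source; imagine it keeps files open while iterated.
        for i in range(2):
            yield f"{table_name}: row {i}"

    streams = [table_rows(t) for t in ("t1", "t2", "t3")]

    # Concatenation consumes the sources strictly one after another, so only one
    # source is being read at a time -- unlike resizing them into parallel outputs.
    for row in itertools.chain.from_iterable(streams):
        print(row)
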
+ + Block pipe_header = builder->getHeader(); + + if (has_database_virtual_column && !pipe_header.has("_database")) + { + ColumnWithTypeAndName column; + column.name = "_database"; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(database_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto adding_column_actions = std::make_shared( + std::move(adding_column_dag), ExpressionActionsSettings::fromContext(modified_context, CompileExpressions::yes)); + + builder->addSimpleTransform([&](const Block & stream_header) + { return std::make_shared(stream_header, adding_column_actions); }); + } + + if (has_table_virtual_column && !pipe_header.has("_table")) + { + ColumnWithTypeAndName column; + column.name = "_table"; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(table_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto adding_column_actions = std::make_shared( + std::move(adding_column_dag), ExpressionActionsSettings::fromContext(modified_context, CompileExpressions::yes)); + + builder->addSimpleTransform([&](const Block & stream_header) + { return std::make_shared(stream_header, adding_column_actions); }); + } + + /// Subordinary tables could have different but convertible types, like numeric types of different width. + /// We must return streams with structure equals to structure of Merge table. + convertingSourceStream(header, storage_snapshot->metadata, aliases, modified_context, *builder, processed_stage); + } + + return builder; +} + +QueryPlan ReadFromMerge::createPlanForTable( + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & modified_query_info, + const QueryProcessingStage::Enum & processed_stage, + UInt64 max_block_size, + const StorageWithLockAndName & storage_with_lock, Names real_column_names, ContextMutablePtr modified_context, - size_t streams_num, - bool concat_streams) + size_t streams_num) { const auto & [database_name, storage, _, table_name] = storage_with_lock; auto & modified_select = modified_query_info.query->as(); - QueryPipelineBuilderPtr builder; if (!InterpreterSelectQuery::isQueryWithFinal(modified_query_info) && storage->needRewriteQueryWithFinal(real_column_names)) { /// NOTE: It may not work correctly in some cases, because query was analyzed without final. @@ -647,14 +783,14 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( storage_snapshot, modified_query_info); + QueryPlan plan; + if (processed_stage <= storage_stage || (allow_experimental_analyzer && processed_stage == QueryProcessingStage::FetchColumns)) { /// If there are only virtual columns in query, you must request at least one other column. 
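
The preceding block adds the _database and _table virtual columns as constants carrying the source table's coordinates, and only when the pipe header does not already contain them; in the real code this is done with an ActionsDAG that appends a constant column to every block. A much-simplified Python sketch of the same idea with rows modelled as dicts:

    def add_virtual_columns(rows, header, database, table):
        """Append _database/_table constants unless the source already produced them."""
        add_db = "_database" not in header
        add_tbl = "_table" not in header
        for row in rows:
            if add_db:
                row["_database"] = database
            if add_tbl:
                row["_table"] = table
            yield row

    header = ["id", "value"]
    rows = [{"id": 1, "value": "a"}]
    print(list(add_virtual_columns(rows, header, "default", "t1")))
    # [{'id': 1, 'value': 'a', '_database': 'default', '_table': 't1'}]
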
if (real_column_names.empty()) real_column_names.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name); - QueryPlan & plan = child_plans.emplace_back(); - StorageView * view = dynamic_cast(storage.get()); if (!view || allow_experimental_analyzer) { @@ -688,16 +824,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( if (!plan.isInitialized()) return {}; - if (auto * read_from_merge_tree = typeid_cast(plan.getRootNode()->step.get())) - { - size_t filters_dags_size = filter_dags.size(); - for (size_t i = 0; i < filters_dags_size; ++i) - read_from_merge_tree->addFilter(filter_dags[i], filter_nodes.nodes[i]); - } - - builder = plan.buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(modified_context), - BuildQueryPipelineSettings::fromContext(modified_context)); + applyFilters(plan); } else if (processed_stage > storage_stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) { @@ -705,15 +832,14 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( modified_context->setSetting("max_threads", streams_num); modified_context->setSetting("max_streams_to_max_threads_ratio", 1); - QueryPlan & plan = child_plans.emplace_back(); - if (allow_experimental_analyzer) { InterpreterSelectQueryAnalyzer interpreter(modified_query_info.query_tree, modified_context, SelectQueryOptions(processed_stage).ignoreProjections()); - builder = std::make_unique(interpreter.buildQueryPipeline()); - plan = std::move(interpreter.getPlanner()).extractQueryPlan(); + auto & planner = interpreter.getPlanner(); + planner.buildQueryPlanIfNeeded(); + plan = std::move(planner).extractQueryPlan(); } else { @@ -722,71 +848,11 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( InterpreterSelectQuery interpreter{modified_query_info.query, modified_context, SelectQueryOptions(processed_stage).ignoreProjections()}; - builder = std::make_unique(interpreter.buildQueryPipeline(plan)); + interpreter.buildQueryPlan(plan); } - - /** Materialization is needed, since from distributed storage the constants come materialized. - * If you do not do this, different types (Const and non-Const) columns will be produced in different threads, - * And this is not allowed, since all code is based on the assumption that in the block stream all types are the same. - */ - builder->addSimpleTransform([](const Block & stream_header) { return std::make_shared(stream_header); }); } - if (builder->initialized()) - { - if (concat_streams && builder->getNumStreams() > 1) - { - // It's possible to have many tables read from merge, resize(1) might open too many files at the same time. - // Using concat instead. - builder->addTransform(std::make_shared(builder->getHeader(), builder->getNumStreams())); - } - - /// Add virtual columns if we don't already have them. 
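
Taken together, these hunks split the old createSources() into two phases: createChildPlans() builds and stores one QueryPlan per selected table (so later requests such as reading-in-order or filter push-down can still modify them), and initializePipeline() only afterwards turns each stored plan into a pipeline and unites them. A high-level Python sketch of that two-phase shape, with invented placeholder structures:

    class ReadFromMergeSketch:
        """Rough outline of the two-phase flow; names and structures are illustrative."""

        def __init__(self, selected_tables):
            # Phase 1 (constructor): build and keep one child plan per table.
            self.child_plans = [{"table": t, "steps": ["read"]} for t in selected_tables]

        def request_reading_in_order(self, order_info):
            # Optimizations mutate the stored plans before any pipeline exists.
            for plan in self.child_plans:
                plan["steps"].append(("read_in_order", order_info))
            return True

        def initialize_pipeline(self):
            # Phase 2: only now convert every plan into a pipeline and unite them.
            pipelines = [{"from": p["table"], "steps": list(p["steps"])}
                         for p in self.child_plans]
            return {"united": pipelines}

    reader = ReadFromMergeSketch(["t1", "t2"])
    reader.request_reading_in_order("ORDER BY key")
    print(reader.initialize_pipeline())
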
- - Block pipe_header = builder->getHeader(); - - if (has_database_virtual_column && !pipe_header.has("_database")) - { - ColumnWithTypeAndName column; - column.name = "_database"; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(database_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), - ExpressionActionsSettings::fromContext(modified_context, CompileExpressions::yes)); - - builder->addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, adding_column_actions); - }); - } - - if (has_table_virtual_column && !pipe_header.has("_table")) - { - ColumnWithTypeAndName column; - column.name = "_table"; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(table_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), - ExpressionActionsSettings::fromContext(modified_context, CompileExpressions::yes)); - - builder->addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, adding_column_actions); - }); - } - - /// Subordinary tables could have different but convertible types, like numeric types of different width. - /// We must return streams with structure equals to structure of Merge table. - convertingSourceStream(header, storage_snapshot->metadata, aliases, modified_context, *builder, processed_stage); - } - - return builder; + return plan; } StorageMerge::StorageListWithLocks StorageMerge::getSelectedTables( @@ -1014,10 +1080,47 @@ bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_) if (order_info_->direction != 1 && InterpreterSelectQuery::isQueryWithFinal(query_info)) return false; + auto request_read_in_order = [order_info_](ReadFromMergeTree & read_from_merge_tree) + { + return read_from_merge_tree.requestReadingInOrder( + order_info_->used_prefix_of_sorting_key_size, order_info_->direction, order_info_->limit); + }; + + bool ok = true; + for (const auto & plan : child_plans) + if (plan.isInitialized()) + ok &= recursivelyApplyToReadingSteps(plan.getRootNode(), request_read_in_order); + + if (!ok) + return false; + order_info = order_info_; + query_info.input_order_info = order_info; return true; } +void ReadFromMerge::applyFilters(const QueryPlan & plan) const +{ + auto apply_filters = [this](ReadFromMergeTree & read_from_merge_tree) + { + size_t filters_dags_size = filter_dags.size(); + for (size_t i = 0; i < filters_dags_size; ++i) + read_from_merge_tree.addFilter(filter_dags[i], filter_nodes.nodes[i]); + + read_from_merge_tree.applyFilters(); + return true; + }; + + recursivelyApplyToReadingSteps(plan.getRootNode(), apply_filters); +} + +void ReadFromMerge::applyFilters() +{ + for (const auto & plan : child_plans) + if (plan.isInitialized()) + applyFilters(plan); +} + IStorage::ColumnSizeByName StorageMerge::getColumnSizes() const { ColumnSizeByName column_sizes; diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index babf0dd92e8..80a5fa335f7 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -1,9 +1,10 @@ #pragma once -#include -#include -#include +#include #include +#include +#include +#include namespace DB @@ -146,6 +147,8 @@ public: /// Returns `false` if requested reading cannot 
be performed. bool requestReadingInOrder(InputOrderInfoPtr order_info_); + void applyFilters() override; + private: const size_t required_max_block_size; const size_t requested_num_streams; @@ -177,23 +180,37 @@ private: using Aliases = std::vector; - static SelectQueryInfo getModifiedQueryInfo(const SelectQueryInfo & query_info, - const ContextPtr & modified_context, - const StorageWithLockAndName & storage_with_lock_and_name, - const StorageSnapshotPtr & storage_snapshot); + std::vector table_aliases; - QueryPipelineBuilderPtr createSources( + void createChildPlans(); + + void applyFilters(const QueryPlan & plan) const; + + QueryPlan createPlanForTable( const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, const QueryProcessingStage::Enum & processed_stage, UInt64 max_block_size, - const Block & header, - const Aliases & aliases, const StorageWithLockAndName & storage_with_lock, Names real_column_names, ContextMutablePtr modified_context, - size_t streams_num, - bool concat_streams = false); + size_t streams_num); + + QueryPipelineBuilderPtr createSources( + QueryPlan & plan, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & modified_query_info, + const QueryProcessingStage::Enum & processed_stage, + const Block & header, + const Aliases & aliases, + const StorageWithLockAndName & storage_with_lock, + ContextMutablePtr modified_context, + bool concat_streams = false) const; + + static SelectQueryInfo getModifiedQueryInfo(const SelectQueryInfo & query_info, + const ContextPtr & modified_context, + const StorageWithLockAndName & storage_with_lock_and_name, + const StorageSnapshotPtr & storage_snapshot); static void convertingSourceStream( const Block & header, diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 474171ba1b1..e9a0dd5fbf3 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -158,7 +158,7 @@ void StorageMergeTree::startup() /// It means that failed "startup" must not create any background tasks that we will have to wait. try { - shutdown(); + shutdown(false); } catch (...) { @@ -170,7 +170,7 @@ void StorageMergeTree::startup() } } -void StorageMergeTree::shutdown() +void StorageMergeTree::shutdown(bool) { if (shutdown_called.exchange(true)) return; @@ -196,7 +196,7 @@ void StorageMergeTree::shutdown() StorageMergeTree::~StorageMergeTree() { - shutdown(); + shutdown(false); } void StorageMergeTree::read( @@ -290,7 +290,7 @@ void StorageMergeTree::checkTableCanBeDropped([[ maybe_unused ]] ContextPtr quer void StorageMergeTree::drop() { - shutdown(); + shutdown(true); /// In case there is read-only disk we cannot allow to call dropAllData(), but dropping tables is allowed. if (isStaticStorage()) return; @@ -341,6 +341,8 @@ void StorageMergeTree::alter( prev_mutation = it->first; } + /// Always wait previous mutations synchronously, because alters + /// should be executed in sequential order. if (prev_mutation != 0) { LOG_DEBUG(log, "Cannot change metadata with barrier alter query, will wait for mutation {}", prev_mutation); @@ -368,9 +370,7 @@ void StorageMergeTree::alter( resetObjectColumnsFromActiveParts(parts_lock); } - /// Always execute required mutations synchronously, because alters - /// should be executed in sequential order. 
- if (!maybe_mutation_commands.empty()) + if (!maybe_mutation_commands.empty() && local_context->getSettingsRef().alter_sync > 0) waitForMutation(mutation_version, false); } diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index bd992bd09ac..539037a90ae 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -45,7 +45,7 @@ public: bool has_force_restore_data_flag); void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; ~StorageMergeTree() override; diff --git a/src/Storages/StorageProxy.h b/src/Storages/StorageProxy.h index 991d37b0b35..5d57f75a620 100644 --- a/src/Storages/StorageProxy.h +++ b/src/Storages/StorageProxy.h @@ -138,7 +138,7 @@ public: CancellationCode killMutation(const String & mutation_id) override { return getNested()->killMutation(mutation_id); } void startup() override { getNested()->startup(); } - void shutdown() override { getNested()->shutdown(); } + void shutdown(bool is_drop) override { getNested()->shutdown(is_drop); } void flushAndPrepareForShutdown() override { getNested()->flushAndPrepareForShutdown(); } ActionLock getActionLock(StorageActionBlockType action_type) override { return getNested()->getActionLock(action_type); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a496e382a09..74821a9186c 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -133,6 +133,7 @@ namespace ProfileEvents extern const Event CreatedLogEntryForMutation; extern const Event NotCreatedLogEntryForMutation; extern const Event ReplicaPartialShutdown; + extern const Event ReplicatedCoveredPartsInZooKeeperOnStart; } namespace CurrentMetrics @@ -186,6 +187,7 @@ namespace ErrorCodes extern const int NOT_INITIALIZED; extern const int TOO_LARGE_DISTRIBUTED_DEPTH; extern const int TABLE_IS_DROPPED; + extern const int CANNOT_BACKUP_TABLE; } namespace ActionLocks @@ -1318,6 +1320,7 @@ void StorageReplicatedMergeTree::paranoidCheckForCoveredPartsInZooKeeperOnStart( { LOG_WARNING(log, "Part {} exists in ZooKeeper and covered by another part in ZooKeeper ({}), but doesn't exist on any disk. " "It may cause false-positive 'part is lost forever' messages", part_name, covering_part); + ProfileEvents::increment(ProfileEvents::ReplicatedCoveredPartsInZooKeeperOnStart); chassert(false); } } @@ -1364,6 +1367,18 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) paranoidCheckForCoveredPartsInZooKeeperOnStart(expected_parts_vec, parts_to_fetch); + ActiveDataPartSet set_of_empty_unexpected_parts(format_version); + for (const auto & part : parts) + { + if (part->rows_count || part->getState() != MergeTreeDataPartState::Active || expected_parts.contains(part->name)) + continue; + + set_of_empty_unexpected_parts.add(part->name); + } + if (auto empty_count = set_of_empty_unexpected_parts.size()) + LOG_WARNING(log, "Found {} empty unexpected parts (probably some dropped parts were not cleaned up before restart): [{}]", + empty_count, fmt::join(set_of_empty_unexpected_parts.getParts(), ", ")); + /** To check the adequacy, for the parts that are in the FS, but not in ZK, we will only consider not the most recent parts. * Because unexpected new parts usually arise only because they did not have time to enroll in ZK with a rough restart of the server. * It also occurs from deduplicated parts that did not have time to retire. 
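
checkParts() now also gathers local parts that are active, contain zero rows and are absent from the expected set in ZooKeeper; such empty parts are usually what a DROP PARTITION/DROP PART leaves behind when cleanup did not finish before a restart. A simplified Python sketch of that filter (the part objects are illustrative):

    from dataclasses import dataclass

    @dataclass
    class Part:
        name: str
        rows_count: int
        state: str                      # "Active", "Outdated", ...

    def empty_unexpected_parts(local_parts, expected_names):
        """Active, empty parts that ZooKeeper does not know about (illustrative)."""
        return {
            p.name
            for p in local_parts
            if p.rows_count == 0 and p.state == "Active" and p.name not in expected_names
        }

    parts = [Part("all_0_5_2", 100, "Active"), Part("all_6_6_1", 0, "Active")]
    print(empty_unexpected_parts(parts, {"all_0_5_2"}))   # {'all_6_6_1'}
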
@@ -1390,6 +1405,15 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) continue; } + String covering_empty_part = set_of_empty_unexpected_parts.getContainingPart(part->name); + if (!covering_empty_part.empty()) + { + LOG_INFO(log, "Unexpected part {} is covered by empty part {}, assuming it has been dropped just before restart", + part->name, covering_empty_part); + covered_unexpected_parts.push_back(part->name); + continue; + } + auto covered_parts = local_expected_parts_set.getPartInfosCoveredBy(part->info); if (MergeTreePartInfo::areAllBlockNumbersCovered(part->info, covered_parts)) @@ -4971,7 +4995,7 @@ void StorageReplicatedMergeTree::startupImpl(bool from_attach_thread) } else { - shutdown(); + shutdown(false); } } catch (...) @@ -5046,7 +5070,7 @@ void StorageReplicatedMergeTree::partialShutdown() LOG_TRACE(log, "Threads finished"); } -void StorageReplicatedMergeTree::shutdown() +void StorageReplicatedMergeTree::shutdown(bool) { if (shutdown_called.exchange(true)) return; @@ -5100,7 +5124,7 @@ StorageReplicatedMergeTree::~StorageReplicatedMergeTree() { try { - shutdown(); + shutdown(false); } catch (...) { @@ -9099,6 +9123,14 @@ StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, co } } + if (part.rows_count == 0 && part.remove_tmp_policy == IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::REMOVE_BLOBS) + { + /// It's a non-replicated empty part that was created to avoid unexpected parts after DROP_RANGE + LOG_INFO(log, "Looks like {} is a non-replicated empty part that was created to avoid unexpected parts after DROP_RANGE, " + "blobs can be removed", part.name); + return std::make_pair(true, NameSet{}); + } + if (has_metadata_in_zookeeper.has_value() && !has_metadata_in_zookeeper) { if (zookeeper->exists(zookeeper_path)) @@ -10017,8 +10049,15 @@ void StorageReplicatedMergeTree::adjustCreateQueryForBackup(ASTPtr & create_quer applyMetadataChangesToCreateQuery(create_query, adjusted_metadata); /// Check that tryGetTableSharedIDFromCreateQuery() works for this storage. - if (tryGetTableSharedIDFromCreateQuery(*create_query, getContext()) != getTableSharedID()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Table {} has its shared ID to be different from one from the create query"); + auto actual_table_shared_id = getTableSharedID(); + auto expected_table_shared_id = tryGetTableSharedIDFromCreateQuery(*create_query, getContext()); + if (actual_table_shared_id != expected_table_shared_id) + { + throw Exception(ErrorCodes::CANNOT_BACKUP_TABLE, "Table {} has its shared ID different from one from the create query: " + "actual shared id = {}, expected shared id = {}, create query = {}", + getStorageID().getNameForLogs(), actual_table_shared_id, expected_table_shared_id.value_or("nullopt"), + create_query); + } } void StorageReplicatedMergeTree::backupData( diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index f196991ae07..8c90d0e2679 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -139,7 +139,7 @@ public: /// In shutdown we completely terminate table -- remove /// is_active node and interserver handler. Also optionally /// wait until other replicas will download some parts from our replica. 
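
The "covered by empty part" check at the top of this hunk relies on MergeTree part naming: a part name encodes a partition and a block-number range, and one part covers another when it spans the other's block range within the same partition, so an unexpected part covered by an empty one can be assumed to have been dropped just before the restart. A hedged, simplified Python sketch of such a containment test (it ignores level and mutation, which the real comparison also takes into account):

    def parse_part_name(name):
        """Very simplified: <partition>_<min_block>_<max_block>_<level>[_<mutation>]."""
        partition, min_block, max_block, level = name.split("_")[:4]
        return partition, int(min_block), int(max_block), int(level)

    def covers(covering, covered):
        """True if `covering` spans the block range of `covered` in the same partition."""
        p1, lo1, hi1, _ = parse_part_name(covering)
        p2, lo2, hi2, _ = parse_part_name(covered)
        return p1 == p2 and lo1 <= lo2 and hi2 <= hi1

    print(covers("all_6_10_3", "all_7_7_0"))    # True  -> treat the unexpected part as dropped
    print(covers("all_6_10_3", "all_11_11_0"))  # False -> keep the usual handling
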
- void shutdown() override; + void shutdown(bool is_drop) override; ~StorageReplicatedMergeTree() override; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index a33e5884bf5..63ed84680c9 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -6,8 +6,6 @@ #include -#include - #include #include #include diff --git a/src/Storages/StorageTableFunction.h b/src/Storages/StorageTableFunction.h index 8f96cb46910..9d966fb899b 100644 --- a/src/Storages/StorageTableFunction.h +++ b/src/Storages/StorageTableFunction.h @@ -72,11 +72,11 @@ public: } void startup() override { } - void shutdown() override + void shutdown(bool is_drop) override { std::lock_guard lock{nested_mutex}; if (nested) - nested->shutdown(); + nested->shutdown(is_drop); } void flushAndPrepareForShutdown() override diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp index 6fac9b04885..46ab70ff04a 100644 --- a/src/Storages/System/StorageSystemBackups.cpp +++ b/src/Storages/System/StorageSystemBackups.cpp @@ -5,9 +5,11 @@ #include #include #include +#include #include #include #include +#include namespace DB @@ -29,6 +31,7 @@ NamesAndTypesList StorageSystemBackups::getNamesAndTypes() {"compressed_size", std::make_shared()}, {"files_read", std::make_shared()}, {"bytes_read", std::make_shared()}, + {"ProfileEvents", std::make_shared(std::make_shared(), std::make_shared())}, }; return names_and_types; } @@ -50,6 +53,7 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con auto & column_compressed_size = assert_cast(*res_columns[column_index++]); auto & column_num_read_files = assert_cast(*res_columns[column_index++]); auto & column_num_read_bytes = assert_cast(*res_columns[column_index++]); + auto & column_profile_events = assert_cast(*res_columns[column_index++]); auto add_row = [&](const BackupOperationInfo & info) { @@ -66,6 +70,10 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con column_compressed_size.insertValue(info.compressed_size); column_num_read_files.insertValue(info.num_read_files); column_num_read_bytes.insertValue(info.num_read_bytes); + if (info.profile_counters) + ProfileEvents::dumpToMapColumn(*info.profile_counters, &column_profile_events, true); + else + column_profile_events.insertDefault(); }; for (const auto & entry : context->getBackupsWorker().getAllInfos()) diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index ac38c9c97b1..4bf1053a7b6 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -34,6 +34,8 @@ std::string_view getRemovalStateDescription(DB::DataPartRemovalState state) return "Part hasn't reached removal time yet"; case DB::DataPartRemovalState::HAS_SKIPPED_MUTATION_PARENT: return "Waiting mutation parent to be removed"; + case DB::DataPartRemovalState::EMPTY_PART_COVERS_OTHER_PARTS: + return "Waiting for covered parts to be removed first"; case DB::DataPartRemovalState::REMOVED: return "Part was selected to be removed"; } diff --git a/src/Storages/System/StorageSystemRemoteDataPaths.cpp b/src/Storages/System/StorageSystemRemoteDataPaths.cpp index eb514d3b3f4..b1cd90448ec 100644 --- a/src/Storages/System/StorageSystemRemoteDataPaths.cpp +++ b/src/Storages/System/StorageSystemRemoteDataPaths.cpp @@ -65,7 +65,7 @@ Pipe StorageSystemRemoteDataPaths::read( if (disk->supportsCache()) cache = 
FileCacheFactory::instance().getByName(disk->getCacheName()).cache; - for (const auto & [local_path, common_prefox_for_objects, storage_objects] : remote_paths_by_local_path) + for (const auto & [local_path, storage_objects] : remote_paths_by_local_path) { for (const auto & object : storage_objects) { @@ -78,7 +78,8 @@ Pipe StorageSystemRemoteDataPaths::read( col_local_path->insert(local_path); col_remote_path->insert(object.remote_path); col_size->insert(object.bytes_size); - col_namespace->insert(common_prefox_for_objects); + + col_namespace->insertDefault(); if (cache) { diff --git a/src/Storages/System/StorageSystemSymbols.cpp b/src/Storages/System/StorageSystemSymbols.cpp new file mode 100644 index 00000000000..56195544448 --- /dev/null +++ b/src/Storages/System/StorageSystemSymbols.cpp @@ -0,0 +1,114 @@ +#if defined(__ELF__) && !defined(OS_FREEBSD) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + + +StorageSystemSymbols::StorageSystemSymbols(const StorageID & table_id_) + : IStorage(table_id_) +{ + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(ColumnsDescription( + { + {"symbol", std::make_shared()}, + {"address_begin", std::make_shared()}, + {"address_end", std::make_shared()}, + })); + setInMemoryMetadata(storage_metadata); +} + + +namespace +{ + +class SymbolsBlockSource : public ISource +{ +private: + using Iterator = std::vector::const_iterator; + Iterator it; + const Iterator end; + std::vector columns_mask; + UInt64 max_block_size; + +public: + SymbolsBlockSource( + Iterator begin_, + Iterator end_, + std::vector columns_mask_, + Block header, + UInt64 max_block_size_) + : ISource(std::move(header)) + , it(begin_), end(end_), columns_mask(std::move(columns_mask_)), max_block_size(max_block_size_) + { + } + + String getName() const override { return "Symbols"; } + +protected: + Chunk generate() override + { + if (it == end) + return {}; + + MutableColumns res_columns = getPort().getHeader().cloneEmptyColumns(); + + size_t rows_count = 0; + while (rows_count < max_block_size && it != end) + { + size_t src_index = 0; + size_t res_index = 0; + + if (columns_mask[src_index++]) + res_columns[res_index++]->insert(it->name); + if (columns_mask[src_index++]) + res_columns[res_index++]->insert(reinterpret_cast(it->address_begin)); + if (columns_mask[src_index++]) + res_columns[res_index++]->insert(reinterpret_cast(it->address_end)); + + ++rows_count; + ++it; + } + + return Chunk(std::move(res_columns), rows_count); + } +}; + +} + + +Pipe StorageSystemSymbols::read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & /* query_info */, + ContextPtr context, + QueryProcessingStage::Enum /*processed_stage*/, + const size_t max_block_size, + const size_t /*num_streams*/) +{ + context->getAccess()->checkAccess(AccessType::INTROSPECTION); + + storage_snapshot->check(column_names); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); + auto [columns_mask, res_block] = getQueriedColumnsMaskAndHeader(sample_block, column_names); + + const auto & symbols = SymbolIndex::instance().symbols(); + + return Pipe(std::make_shared( + symbols.cbegin(), symbols.cend(), std::move(columns_mask), std::move(res_block), max_block_size)); +} + +} + +#endif diff --git a/src/Storages/System/StorageSystemSymbols.h b/src/Storages/System/StorageSystemSymbols.h new file mode 100644 index 00000000000..808c406b91d --- /dev/null +++ 
b/src/Storages/System/StorageSystemSymbols.h @@ -0,0 +1,33 @@ +#pragma once + +#include + + +namespace DB +{ + +class Context; + + +/** Implements the system table `symbols` for introspection of symbols in the ClickHouse binary. + */ +class StorageSystemSymbols final : public IStorage +{ +public: + explicit StorageSystemSymbols(const StorageID & table_id_); + + std::string getName() const override { return "SystemSymbols"; } + + Pipe read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) override; + + bool isSystemStorage() const override { return true; } +}; + +} diff --git a/src/Storages/System/attachInformationSchemaTables.cpp b/src/Storages/System/attachInformationSchemaTables.cpp index fbfceb81d77..bfe0f20fc92 100644 --- a/src/Storages/System/attachInformationSchemaTables.cpp +++ b/src/Storages/System/attachInformationSchemaTables.cpp @@ -373,6 +373,86 @@ static constexpr std::string_view referential_constraints = R"( WHERE false; -- make sure this view is always empty )"; +static constexpr std::string_view statistics = R"( + ATTACH VIEW statistics + ( + `table_catalog` String, + `table_schema` String, + `table_name` String, + `non_unique` Int32, + `index_schema` String, + `index_name` Nullable(String), + `seq_in_index` UInt32, + `column_name` Nullable(String), + `collation` Nullable(String), + `cardinality` Nullable(Int64), + `sub_part` Nullable(Int64), + `packed` Nullable(String), + `nullable` String, + `index_type` String, + `comment` String, + `index_comment` String, + `is_visible` String, + `expression` Nullable(String), + `TABLE_CATALOG` String, + `TABLE_SCHEMA` String, + `TABLE_NAME` String, + `NON_UNIQUE` Int32, + `INDEX_SCHEMA` String, + `INDEX_NAME` Nullable(String), + `SEQ_IN_INDEX` UInt32, + `COLUMN_NAME` Nullable(String), + `COLLATION` Nullable(String), + `CARDINALITY` Nullable(Int64), + `SUB_PART` Nullable(Int64), + `PACKED` Nullable(String), + `NULLABLE` String, + `INDEX_TYPE` String, + `COMMENT` String, + `INDEX_COMMENT` String, + `IS_VISIBLE` String, + `EXPRESSION` Nullable(String) + ) AS + SELECT + '' AS table_catalog, + '' AS table_schema, + '' AS table_name, + 0 AS non_unique, + '' AS index_schema, + NULL AS index_name, + 0 AS seq_in_index, + NULL AS column_name, + NULL AS collation, + NULL AS cardinality, + NULL AS sub_part, + NULL AS packed, + '' AS nullable, + '' AS index_type, + '' AS comment, + '' AS index_comment, + '' AS is_visible, + NULL AS expression, + table_catalog AS TABLE_CATALOG, + table_schema AS TABLE_SCHEMA, + table_name AS TABLE_NAME, + non_unique AS NON_UNIQUE, + index_schema AS INDEX_SCHEMA, + index_name AS INDEX_NAME, + seq_in_index AS SEQ_IN_INDEX, + column_name AS COLUMN_NAME, + collation AS COLLATION, + cardinality AS CARDINALITY, + sub_part AS SUB_PART, + packed AS PACKED, + nullable AS NULLABLE, + index_type AS INDEX_TYPE, + comment AS COMMENT, + index_comment AS INDEX_COMMENT, + is_visible AS IS_VISIBLE, + expression AS EXPRESSION + WHERE false; -- make sure this view is always empty +)"; + /// View structures are taken from http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt static void createInformationSchemaView(ContextMutablePtr context, IDatabase & database, const String & view_name, std::string_view query) @@ -424,6 +504,7 @@ void attachInformationSchema(ContextMutablePtr context, IDatabase & information_ createInformationSchemaView(context, 
information_schema_database, "columns", columns); createInformationSchemaView(context, information_schema_database, "key_column_usage", key_column_usage); createInformationSchemaView(context, information_schema_database, "referential_constraints", referential_constraints); + createInformationSchemaView(context, information_schema_database, "statistics", statistics); } } diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index ffa225fb929..6d875208fbb 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -86,6 +86,10 @@ #include #include +#if defined(__ELF__) && !defined(OS_FREEBSD) +#include +#endif + #if USE_RDKAFKA #include #endif @@ -151,6 +155,9 @@ void attachSystemTablesLocal(ContextPtr context, IDatabase & system_database) attach(context, system_database, "schema_inference_cache"); attach(context, system_database, "dropped_tables"); attach(context, system_database, "scheduler"); +#if defined(__ELF__) && !defined(OS_FREEBSD) + attach(context, system_database, "symbols"); +#endif #if USE_RDKAFKA attach(context, system_database, "kafka_consumers"); #endif diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index e3fcd6249d1..46c38ffa129 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -461,7 +461,7 @@ void StorageWindowView::alter( modifying_query = false; }); - shutdown(); + shutdown(false); auto inner_query = initInnerQuery(new_select_query->as(), local_context); @@ -1586,7 +1586,7 @@ void StorageWindowView::startup() fire_task->schedule(); } -void StorageWindowView::shutdown() +void StorageWindowView::shutdown(bool) { shutdown_called = true; diff --git a/src/Storages/WindowView/StorageWindowView.h b/src/Storages/WindowView/StorageWindowView.h index 231616ff820..de8f880c602 100644 --- a/src/Storages/WindowView/StorageWindowView.h +++ b/src/Storages/WindowView/StorageWindowView.h @@ -142,7 +142,7 @@ public: void checkAlterIsPossible(const AlterCommands & commands, ContextPtr context) const override; void startup() override; - void shutdown() override; + void shutdown(bool is_drop) override; void read( QueryPlan & query_plan, diff --git a/src/TableFunctions/TableFunctionMySQL.cpp b/src/TableFunctions/TableFunctionMySQL.cpp index 03bd2264551..a5c6962697d 100644 --- a/src/TableFunctions/TableFunctionMySQL.cpp +++ b/src/TableFunctions/TableFunctionMySQL.cpp @@ -66,7 +66,7 @@ StoragePtr TableFunctionMySQL::executeImpl( const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, - ColumnsDescription /*cached_columns*/, + ColumnsDescription cached_columns, bool /*is_insert_query*/) const { auto res = std::make_shared( @@ -76,7 +76,7 @@ StoragePtr TableFunctionMySQL::executeImpl( configuration->table, configuration->replace_query, configuration->on_duplicate_clause, - ColumnsDescription{}, + cached_columns, ConstraintsDescription{}, String{}, context, diff --git a/src/TableFunctions/TableFunctionPostgreSQL.cpp b/src/TableFunctions/TableFunctionPostgreSQL.cpp index 322e0df7c15..6ed5883c3cc 100644 --- a/src/TableFunctions/TableFunctionPostgreSQL.cpp +++ b/src/TableFunctions/TableFunctionPostgreSQL.cpp @@ -20,13 +20,13 @@ namespace ErrorCodes StoragePtr TableFunctionPostgreSQL::executeImpl(const ASTPtr & /*ast_function*/, - ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const + 
ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool /*is_insert_query*/) const { auto result = std::make_shared( StorageID(getDatabaseName(), table_name), connection_pool, configuration->table, - ColumnsDescription{}, + cached_columns, ConstraintsDescription{}, String{}, context, diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index bdc7b234c5e..eea56039921 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -336,7 +336,7 @@ bool TableFunctionS3::supportsReadingSubsetOfColumns(const ContextPtr & context) return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context); } -StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const +StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool /*is_insert_query*/) const { S3::URI s3_uri (configuration.url); @@ -345,6 +345,8 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context columns = parseColumnsListFromString(configuration.structure, context); else if (!structure_hint.empty()) columns = structure_hint; + else if (!cached_columns.empty()) + columns = cached_columns; StoragePtr storage = std::make_shared( configuration, diff --git a/src/TableFunctions/TableFunctionSQLite.cpp b/src/TableFunctions/TableFunctionSQLite.cpp index 27e6fcf1fd1..dfd104ef81a 100644 --- a/src/TableFunctions/TableFunctionSQLite.cpp +++ b/src/TableFunctions/TableFunctionSQLite.cpp @@ -29,13 +29,13 @@ namespace ErrorCodes StoragePtr TableFunctionSQLite::executeImpl(const ASTPtr & /*ast_function*/, - ContextPtr context, const String & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const + ContextPtr context, const String & table_name, ColumnsDescription cached_columns, bool /*is_insert_query*/) const { auto storage = std::make_shared(StorageID(getDatabaseName(), table_name), sqlite_db, database_path, remote_table_name, - ColumnsDescription{}, ConstraintsDescription{}, context); + cached_columns, ConstraintsDescription{}, context); storage->startup(); return storage; diff --git a/tests/analyzer_integration_broken_tests.txt b/tests/analyzer_integration_broken_tests.txt index 7c87a41dae9..23f22209451 100644 --- a/tests/analyzer_integration_broken_tests.txt +++ b/tests/analyzer_integration_broken_tests.txt @@ -8,7 +8,6 @@ test_executable_table_function/test.py::test_executable_function_input_python test_mask_sensitive_info/test.py::test_encryption_functions test_merge_table_over_distributed/test.py::test_global_in test_merge_table_over_distributed/test.py::test_select_table_name_from_merge_over_distributed -test_merge_tree_s3/test.py::test_heavy_insert_select_check_memory[node] test_mutations_with_merge_tree/test.py::test_mutations_with_merge_background_task test_mysql_database_engine/test.py::test_mysql_ddl_for_mysql_database test_passing_max_partitions_to_read_remotely/test.py::test_default_database_on_cluster diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 06fd4cc80c3..bbb9abda079 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -2,7 +2,6 @@ 00593_union_all_assert_columns_removed 00717_merge_and_distributed 00725_memory_tracking 
-00940_order_by_read_in_order_query_plan 01062_pm_all_join_with_block_continuation 01064_incremental_streaming_from_2_src_with_feedback 01083_expressions_in_engine_arguments @@ -14,7 +13,6 @@ 01268_shard_avgweighted 01455_shard_leaf_max_rows_bytes_to_read 01495_subqueries_in_with_statement -01504_rocksdb 01560_merge_distributed_join 01584_distributed_buffer_cannot_find_column 01586_columns_pruning @@ -30,26 +28,19 @@ 01925_test_storage_merge_aliases 01947_mv_subquery 01952_optimize_distributed_group_by_sharding_key -02131_used_row_policies_in_query_log 02139_MV_with_scalar_subquery 02174_cte_scalar_cache_mv 02302_s3_file_pruning -02341_global_join_cte 02345_implicit_transaction 02352_grouby_shadows_arg 02354_annoy -02375_rocksdb_with_filters 02402_merge_engine_with_view -02404_memory_bound_merging -02426_orc_bug 02428_parameterized_view 02458_use_structure_from_insertion_table 02479_race_condition_between_insert_and_droppin_mv 02493_inconsistent_hex_and_binary_number -02521_aggregation_by_partitions 02554_fix_grouping_sets_predicate_push_down 02575_merge_prewhere_different_default_kind -02713_array_low_cardinality_string 02003_WithMergeableStateAfterAggregationAndLimit_LIMIT_BY_LIMIT_OFFSET 01009_global_array_join_names 00917_multiple_joins_denny_crane @@ -66,4 +57,3 @@ 01940_custom_tld_sharding_key 02815_range_dict_no_direct_join 02845_threads_count_in_distributed_queries -02861_join_on_nullsafe_compare diff --git a/tests/ci/autoscale_runners_lambda/app.py b/tests/ci/autoscale_runners_lambda/app.py index d3f26732df6..fcbf0977cdf 100644 --- a/tests/ci/autoscale_runners_lambda/app.py +++ b/tests/ci/autoscale_runners_lambda/app.py @@ -57,7 +57,7 @@ def get_scales(runner_type: str) -> Tuple[int, int]: # Scaling down is quicker on the lack of running jobs than scaling up on # queue scale_down = 2 - scale_up = 5 + scale_up = 3 if runner_type == "style-checker": # The ASG should deflate almost instantly scale_down = 1 @@ -65,8 +65,9 @@ def get_scales(runner_type: str) -> Tuple[int, int]: # The 5 was too quick, there are complainings regarding too slow with # 10. I am trying 7 now. 
# 7 still looks a bit slow, so I try 6 + # Let's have it the same as the other ASG # UPDATE THE COMMENT ON CHANGES - scale_up = 6 + ## scale_down = 3 elif runner_type == "limited-tester": # The limited runners should inflate and deflate faster scale_down = 1 diff --git a/tests/ci/autoscale_runners_lambda/autoscale_runners_lambda_test.py b/tests/ci/autoscale_runners_lambda/test_autoscale.py similarity index 91% rename from tests/ci/autoscale_runners_lambda/autoscale_runners_lambda_test.py rename to tests/ci/autoscale_runners_lambda/test_autoscale.py index 6a6451cbd2a..d9cda6272c5 100644 --- a/tests/ci/autoscale_runners_lambda/autoscale_runners_lambda_test.py +++ b/tests/ci/autoscale_runners_lambda/test_autoscale.py @@ -68,14 +68,16 @@ class TestSetCapacity(unittest.TestCase): test_cases = ( # Do not change capacity TestCase("noqueue", 1, 13, 20, [Queue("in_progress", 155, "noqueue")], -1), - TestCase("w/reserve", 1, 13, 20, [Queue("queued", 17, "w/reserve")], -1), - # Increase capacity - TestCase("increase", 1, 13, 20, [Queue("queued", 23, "increase")], 15), TestCase( - "style-checker", 1, 13, 20, [Queue("queued", 33, "style-checker")], 16 + "w/reserve-1", 1, 13, 20, [Queue("queued", 15, "w/reserve-1")], -1 ), - TestCase("increase", 1, 13, 20, [Queue("queued", 18, "increase")], 14), - TestCase("increase", 1, 13, 20, [Queue("queued", 183, "increase")], 20), + # Increase capacity + TestCase("increase-1", 1, 13, 20, [Queue("queued", 23, "increase-1")], 16), + TestCase( + "style-checker", 1, 13, 20, [Queue("queued", 33, "style-checker")], 19 + ), + TestCase("increase-2", 1, 13, 20, [Queue("queued", 18, "increase-2")], 14), + TestCase("increase-3", 1, 13, 20, [Queue("queued", 183, "increase-3")], 20), TestCase( "increase-w/o reserve", 1, @@ -85,7 +87,7 @@ class TestSetCapacity(unittest.TestCase): Queue("in_progress", 11, "increase-w/o reserve"), Queue("queued", 12, "increase-w/o reserve"), ], - 15, + 16, ), TestCase("lower-min", 10, 5, 20, [Queue("queued", 5, "lower-min")], 10), # Decrease capacity diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 0d8f86c82fe..3a20ca846a1 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -11,7 +11,7 @@ from ci_config import CI_CONFIG, BuildConfig from ccache_utils import CargoCache from docker_pull_helper import get_image_with_version from env_helper import ( - GITHUB_JOB, + GITHUB_JOB_API_URL, IMAGES_PATH, REPO_COPY, S3_BUILDS_BUCKET, @@ -163,9 +163,14 @@ def check_for_success_run( version.describe, SUCCESS if success else FAILURE, 0, - GITHUB_JOB, + GITHUB_JOB_API_URL(), + ) + result_json_path = build_result.write_json(Path(TEMP_PATH)) + logging.info( + "Build result file %s is written, content:\n %s", + result_json_path, + result_json_path.read_text(encoding="utf-8"), ) - build_result.write_json(Path(TEMP_PATH)) # Fail build job if not successeded if not success: sys.exit(1) @@ -207,10 +212,17 @@ def upload_master_static_binaries( elif pr_info.base_ref != "master": return - s3_path = "/".join((pr_info.base_ref, static_binary_name, "clickhouse")) - binary = build_output_path / "clickhouse" - url = s3_helper.upload_build_file_to_s3(binary, s3_path) - print(f"::notice ::Binary static URL: {url}") + # Full binary with debug info: + s3_path_full = "/".join((pr_info.base_ref, static_binary_name, "clickhouse-full")) + binary_full = build_output_path / "clickhouse" + url_full = s3_helper.upload_build_file_to_s3(binary_full, s3_path_full) + print(f"::notice ::Binary static URL (with debug info): {url_full}") + + # Stripped binary 
without debug info: + s3_path_compact = "/".join((pr_info.base_ref, static_binary_name, "clickhouse")) + binary_compact = build_output_path / "clickhouse-stripped" + url_compact = s3_helper.upload_build_file_to_s3(binary_compact, s3_path_compact) + print(f"::notice ::Binary static URL (compact): {url_compact}") def main(): @@ -348,7 +360,7 @@ def main(): version.describe, build_status, elapsed, - GITHUB_JOB, + GITHUB_JOB_API_URL(), ) result_json_path = build_result.write_json(temp_path) logging.info( @@ -421,9 +433,10 @@ FORMAT JSONCompactEachRow""" ) profile_data_file = temp_path / "profile.json" with open(profile_data_file, "wb") as profile_fd: - for profile_sourse in profiles_dir.iterdir(): - with open(profiles_dir / profile_sourse, "rb") as ps_fd: - profile_fd.write(ps_fd.read()) + for profile_source in profiles_dir.iterdir(): + if profile_source.name != "binary_sizes.txt": + with open(profiles_dir / profile_source, "rb") as ps_fd: + profile_fd.write(ps_fd.read()) logging.info( "::notice ::Log Uploading profile data, path: %s, size: %s, query: %s", @@ -433,6 +446,32 @@ FORMAT JSONCompactEachRow""" ) ch_helper.insert_file(url, auth, query, profile_data_file) + query = f"""INSERT INTO binary_sizes +( + pull_request_number, + commit_sha, + check_start_time, + check_name, + instance_type, + instance_id, + file, + size +) +SELECT {pr_info.number}, '{pr_info.sha}', '{stopwatch.start_time_str}', '{build_name}', '{instance_type}', '{instance_id}', file, size +FROM input('size UInt64, file String') +SETTINGS format_regexp = '^\\s*(\\d+) (.+)$' +FORMAT Regexp""" + + binary_sizes_file = profiles_dir / "binary_sizes.txt" + + logging.info( + "::notice ::Log Uploading binary sizes data, path: %s, size: %s, query: %s", + binary_sizes_file, + binary_sizes_file.stat().st_size, + query, + ) + ch_helper.insert_file(url, auth, query, binary_sizes_file) + # Upload statistics to CI database prepared_events = prepare_tests_results_for_clickhouse( pr_info, diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index ba4d8411193..62224fbcef9 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -94,7 +94,7 @@ def main(): missing_job_names = [ name for name in needs_data - if not any(1 for build_result in build_results if build_result.job_name == name) + if not any(1 for br in build_results if br.job_name.startswith(name)) ] missing_builds = len(missing_job_names) for job_name in reversed(missing_job_names): diff --git a/tests/ci/ccache_utils.py b/tests/ci/ccache_utils.py index 75a026d2524..6ccaa8c80e0 100644 --- a/tests/ci/ccache_utils.py +++ b/tests/ci/ccache_utils.py @@ -3,13 +3,13 @@ import logging import os import shutil -from hashlib import md5 from pathlib import Path import requests # type: ignore from build_download_helper import download_build_with_progress, DownloadException from compress_files import decompress_fast, compress_fast +from digest_helper import digest_path from env_helper import S3_DOWNLOAD, S3_BUILDS_BUCKET from git_helper import git_runner from s3_helper import S3Helper @@ -108,7 +108,7 @@ class CargoCache: s3_helper: S3Helper, ): self._cargo_lock_file = Path(git_runner.cwd) / "rust" / "Cargo.lock" - self.lock_hash = md5(self._cargo_lock_file.read_bytes()).hexdigest() + self.lock_hash = digest_path(self._cargo_lock_file).hexdigest() self.directory = directory self.archive_name = f"Cargo_cache_{self.lock_hash}.tar.zst" self.temp_path = temp_path diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 54ce7d2e908..dc22babb907 
100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -2,21 +2,34 @@ import logging +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser from dataclasses import dataclass -from typing import Callable, Dict, List, Literal +from typing import Callable, Dict, List, Literal, Union @dataclass class BuildConfig: + name: str compiler: str package_type: Literal["deb", "binary", "fuzzers"] additional_pkgs: bool = False debug_build: bool = False sanitizer: str = "" tidy: bool = False + sparse_checkout: bool = False comment: str = "" static_binary_name: str = "" + def export_env(self, export: bool = False) -> str: + def process(field_name: str, field: Union[bool, str]) -> str: + if isinstance(field, bool): + field = str(field).lower() + if export: + return f"export BUILD_{field_name.upper()}={repr(field)}" + return f"BUILD_{field_name.upper()}={field}" + + return "\n".join(process(k, v) for k, v in self.__dict__.items()) + @dataclass class TestConfig: @@ -87,48 +100,57 @@ class CiConfig: CI_CONFIG = CiConfig( build_config={ "package_release": BuildConfig( + name="package_release", compiler="clang-17", package_type="deb", static_binary_name="amd64", additional_pkgs=True, ), "package_aarch64": BuildConfig( + name="package_aarch64", compiler="clang-17-aarch64", package_type="deb", static_binary_name="aarch64", additional_pkgs=True, ), "package_asan": BuildConfig( + name="package_asan", compiler="clang-17", sanitizer="address", package_type="deb", ), "package_ubsan": BuildConfig( + name="package_ubsan", compiler="clang-17", sanitizer="undefined", package_type="deb", ), "package_tsan": BuildConfig( + name="package_tsan", compiler="clang-17", sanitizer="thread", package_type="deb", ), "package_msan": BuildConfig( + name="package_msan", compiler="clang-17", sanitizer="memory", package_type="deb", ), "package_debug": BuildConfig( + name="package_debug", compiler="clang-17", debug_build=True, package_type="deb", - comment="Note: sparse checkout was used", + sparse_checkout=True, ), "binary_release": BuildConfig( + name="binary_release", compiler="clang-17", package_type="binary", ), "binary_tidy": BuildConfig( + name="binary_tidy", compiler="clang-17", debug_build=True, package_type="binary", @@ -137,52 +159,63 @@ CI_CONFIG = CiConfig( comment="clang-tidy is used for static analysis", ), "binary_darwin": BuildConfig( + name="binary_darwin", compiler="clang-17-darwin", package_type="binary", static_binary_name="macos", + sparse_checkout=True, ), "binary_aarch64": BuildConfig( + name="binary_aarch64", compiler="clang-17-aarch64", package_type="binary", ), "binary_aarch64_v80compat": BuildConfig( + name="binary_aarch64_v80compat", compiler="clang-17-aarch64-v80compat", package_type="binary", static_binary_name="aarch64v80compat", comment="For ARMv8.1 and older", ), "binary_freebsd": BuildConfig( + name="binary_freebsd", compiler="clang-17-freebsd", package_type="binary", static_binary_name="freebsd", ), "binary_darwin_aarch64": BuildConfig( + name="binary_darwin_aarch64", compiler="clang-17-darwin-aarch64", package_type="binary", static_binary_name="macos-aarch64", ), "binary_ppc64le": BuildConfig( + name="binary_ppc64le", compiler="clang-17-ppc64le", package_type="binary", static_binary_name="powerpc64le", ), "binary_amd64_compat": BuildConfig( + name="binary_amd64_compat", compiler="clang-17-amd64-compat", package_type="binary", static_binary_name="amd64compat", comment="SSE2-only build", ), "binary_riscv64": BuildConfig( + name="binary_riscv64", compiler="clang-17-riscv64", 
package_type="binary", static_binary_name="riscv64", ), "binary_s390x": BuildConfig( + name="binary_s390x", compiler="clang-17-s390x", package_type="binary", static_binary_name="s390x", ), "fuzzers": BuildConfig( + name="fuzzers", compiler="clang-16", package_type="fuzzers", ), @@ -467,3 +500,24 @@ CHECK_DESCRIPTIONS = [ lambda x: True, ), ] + + +def main() -> None: + parser = ArgumentParser( + formatter_class=ArgumentDefaultsHelpFormatter, + description="The script provides build config for GITHUB_ENV or shell export", + ) + parser.add_argument("--build-name", help="the build config to export") + parser.add_argument( + "--export", + action="store_true", + help="if set, the ENV parameters are provided for shell export", + ) + args = parser.parse_args() + build_config = CI_CONFIG.build_config.get(args.build_name) + if build_config: + print(build_config.export_env(args.export)) + + +if __name__ == "__main__": + main() diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 3ae20bd871d..f338a1e14c3 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -297,8 +297,8 @@ class CiLogsCredentials: return "" extra_columns = ( f"CAST({pr_info.number} AS UInt32) AS pull_request_number, '{pr_info.sha}' AS commit_sha, " - f"toDateTime('{check_start_time}', 'UTC') AS check_start_time, '{check_name}' AS check_name, " - f"'{get_instance_type()}' AS instance_type, '{get_instance_id()}' AS instance_id" + f"toDateTime('{check_start_time}', 'UTC') AS check_start_time, toLowCardinality('{check_name}') AS check_name, " + f"toLowCardinality('{get_instance_type()}') AS instance_type, '{get_instance_id()}' AS instance_id" ) return ( f'-e EXTRA_COLUMNS_EXPRESSION="{extra_columns}" ' diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index aeeb8531aac..09e3478b3fc 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -242,12 +242,12 @@ def generate_status_comment(pr_info: PRInfo, statuses: CommitStatuses) -> str: for desc, gs in grouped_statuses.items(): state = get_worst_state(gs) state_text = f"{STATUS_ICON_MAP[state]} {state}" - # take the first target_url - target_url = next( - (status.target_url for status in gs if status.target_url), None - ) - if target_url: - state_text = f'{state_text}' + # take the first target_url with the worst state + for status in gs: + if status.target_url and status.state == state: + state_text = f'{state_text}' + break + table_row = ( f"{desc.name}{desc.description}" f"{state_text}\n" diff --git a/tests/ci/digest_helper.py b/tests/ci/digest_helper.py new file mode 100644 index 00000000000..543de51e46b --- /dev/null +++ b/tests/ci/digest_helper.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 + +from hashlib import md5 +from logging import getLogger +from pathlib import Path +from typing import TYPE_CHECKING, Iterable, Optional +from sys import modules + +if TYPE_CHECKING: + from hashlib import ( # pylint:disable=no-name-in-module,ungrouped-imports + _Hash as HASH, + ) +else: + HASH = "_Hash" + +logger = getLogger(__name__) + + +def _digest_file(file: Path, hash_object: HASH) -> None: + assert file.is_file() + with open(file, "rb") as fd: + for chunk in iter(lambda: fd.read(4096), b""): + hash_object.update(chunk) + + +def _digest_directory(directory: Path, hash_object: HASH) -> None: + assert directory.is_dir() + for p in sorted(directory.rglob("*")): + if p.is_symlink() and p.is_dir(): + # The symlink directory is not listed recursively, so we process it manually + 
(_digest_directory(p, hash_object)) + if p.is_file(): + (_digest_file(p, hash_object)) + + +def digest_path(path: Path, hash_object: Optional[HASH] = None) -> HASH: + """Calculates md5 (or updates existing hash_object) hash of the path, either it's + directory or file""" + hash_object = hash_object or md5() + if path.is_dir(): + _digest_directory(path, hash_object) + elif path.is_file(): + _digest_file(path, hash_object) + return hash_object + + +def digest_paths(paths: Iterable[Path], hash_object: Optional[HASH] = None) -> HASH: + """Calculates aggregated md5 (or updates existing hash_object) hash of passed paths. + The order is processed as given""" + hash_object = hash_object or md5() + for path in paths: + if path.exists(): + digest_path(path, hash_object) + return hash_object + + +def digest_consistent_paths( + paths: Iterable[Path], hash_object: Optional[HASH] = None +) -> HASH: + """Calculates aggregated md5 (or updates existing hash_object) hash of passed paths. + The order doesn't matter, paths are converted to `absolute` and ordered before + calculation""" + return digest_paths(sorted(p.absolute() for p in paths), hash_object) + + +def digest_script(path_str: str) -> HASH: + """Accepts value of the __file__ executed script and calculates the md5 hash for it""" + path = Path(path_str) + parent = path.parent + md5_hash = md5() + try: + for script in modules.values(): + script_path = getattr(script, "__file__", "") + if parent.absolute().as_posix() in script_path: + logger.debug("Updating the hash with %s", script_path) + _digest_file(Path(script_path), md5_hash) + except RuntimeError: + logger.warning("The modules size has changed, retry calculating digest") + return digest_script(path_str) + return md5_hash diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index 04532ea3b96..2d867e62228 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -16,7 +16,7 @@ TEMP_PATH = os.getenv("TEMP_PATH", p.abspath(p.join(module_dir, "./tmp"))) CACHES_PATH = os.getenv("CACHES_PATH", TEMP_PATH) CLOUDFLARE_TOKEN = os.getenv("CLOUDFLARE_TOKEN") GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH", "") -GITHUB_JOB = os.getenv("GITHUB_JOB", "local") +GITHUB_JOB = os.getenv("GITHUB_JOB_OVERRIDDEN", "") or os.getenv("GITHUB_JOB", "local") GITHUB_REPOSITORY = os.getenv("GITHUB_REPOSITORY", "ClickHouse/ClickHouse") GITHUB_RUN_ID = os.getenv("GITHUB_RUN_ID", "0") GITHUB_SERVER_URL = os.getenv("GITHUB_SERVER_URL", "https://github.com") @@ -38,14 +38,16 @@ S3_ARTIFACT_DOWNLOAD_TEMPLATE = ( # These parameters are set only on demand, and only once _GITHUB_JOB_ID = "" _GITHUB_JOB_URL = "" +_GITHUB_JOB_API_URL = "" def GITHUB_JOB_ID() -> str: global _GITHUB_JOB_ID global _GITHUB_JOB_URL + global _GITHUB_JOB_API_URL if _GITHUB_JOB_ID: return _GITHUB_JOB_ID - _GITHUB_JOB_ID, _GITHUB_JOB_URL = get_job_id_url(GITHUB_JOB) + _GITHUB_JOB_ID, _GITHUB_JOB_URL, _GITHUB_JOB_API_URL = get_job_id_url(GITHUB_JOB) return _GITHUB_JOB_ID @@ -54,13 +56,19 @@ def GITHUB_JOB_URL() -> str: return _GITHUB_JOB_URL -def get_job_id_url(job_name: str) -> Tuple[str, str]: +def GITHUB_JOB_API_URL() -> str: + GITHUB_JOB_ID() + return _GITHUB_JOB_API_URL + + +def get_job_id_url(job_name: str) -> Tuple[str, str, str]: job_id = "" job_url = "" + job_api_url = "" if GITHUB_RUN_ID == "0": job_id = "0" if job_id: - return job_id, job_url + return job_id, job_url, job_api_url jobs = [] page = 1 while not job_id: @@ -76,7 +84,8 @@ def get_job_id_url(job_name: str) -> Tuple[str, str]: continue job_id = job["id"] job_url = 
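A short sketch of the difference between the two aggregate helpers above, assuming the tests/ci/tests/digests fixture added later in this diff and a working directory of tests/ci.

from pathlib import Path
from digest_helper import digest_paths, digest_consistent_paths

dirs = [Path("tests/digests/dir1"), Path("tests/digests/dir2")]

# digest_paths folds contents in the given order, so reversing the list changes the hash
assert digest_paths(dirs).digest() != digest_paths(reversed(dirs)).digest()
# digest_consistent_paths sorts the absolute paths first, so the order is irrelevant
assert (digest_consistent_paths(dirs).digest()
        == digest_consistent_paths(reversed(dirs)).digest())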
job["html_url"] - return job_id, job_url + job_api_url = job["url"] + return job_id, job_url, job_api_url if ( len(jobs) >= data["total_count"] # just in case of inconsistency or len(data["jobs"]) == 0 # if we excided pages @@ -100,7 +109,8 @@ def get_job_id_url(job_name: str) -> Tuple[str, str]: # The best case scenario job_id = matched_jobs[0]["id"] job_url = matched_jobs[0]["html_url"] - return job_id, job_url + job_api_url = matched_jobs[0]["url"] + return job_id, job_url, job_api_url if matched_jobs: logging.error( "We could not get the ID and URL for the current job name %s, there " @@ -109,4 +119,4 @@ def get_job_id_url(job_name: str) -> Tuple[str, str]: job_name, ) - return job_id, job_url + return job_id, job_url, job_api_url diff --git a/tests/ci/report.py b/tests/ci/report.py index 808d18ff18c..ba2377faa36 100644 --- a/tests/ci/report.py +++ b/tests/ci/report.py @@ -10,8 +10,8 @@ import json import logging import os +from build_download_helper import get_gh_api from ci_config import BuildConfig, CI_CONFIG -from env_helper import get_job_id_url logger = logging.getLogger(__name__) @@ -298,8 +298,10 @@ class BuildResult: version: str status: StatusType elapsed_seconds: int - job_name: str - _job_link: Optional[str] = None + job_api_url: str + _job_name: Optional[str] = None + _job_html_url: Optional[str] = None + _job_html_link: Optional[str] = None _grouped_urls: Optional[List[List[str]]] = None @property @@ -387,11 +389,39 @@ class BuildResult: @property def job_link(self) -> str: - if self._job_link is not None: - return self._job_link - _, job_url = get_job_id_url(self.job_name) - self._job_link = f'{self.job_name}' - return self._job_link + if self._job_html_link is not None: + return self._job_html_link + self._job_html_link = f'{self.job_name}' + return self._job_html_link + + @property + def job_html_url(self) -> str: + if self._job_html_url is not None: + return self._job_html_url + self._set_properties() + return self._job_html_url or "" + + @property + def job_name(self) -> str: + if self._job_name is not None: + return self._job_name + self._set_properties() + return self._job_name or "" + + @job_name.setter + def job_name(self, job_name: str) -> None: + self._job_name = job_name + + def _set_properties(self) -> None: + if all(p is not None for p in (self._job_name, self._job_html_url)): + return + try: + job_data = get_gh_api(self.job_api_url).json() + except Exception: + job_data = {} + # job_name can be set manually + self._job_name = self._job_name or job_data.get("name", "unknown") + self._job_html_url = job_data.get("html_url", "") @staticmethod def get_report_name(name: str) -> Path: @@ -416,7 +446,7 @@ class BuildResult: data.get("version", ""), data.get("status", ERROR), data.get("elapsed_seconds", 0), - data.get("job_name", ""), + data.get("job_api_url", ""), ) @staticmethod @@ -434,7 +464,7 @@ class BuildResult: "version": self.version, "status": self.status, "elapsed_seconds": self.elapsed_seconds, - "job_name": self.job_name, + "job_api_url": self.job_api_url, } ), encoding="utf-8", @@ -706,7 +736,13 @@ def create_build_html_report( ) row.append(f"{link_separator.join(links)}") - row.append(f"{build_result.comment}") + comment = build_result.comment + if ( + build_result.build_config is not None + and build_result.build_config.sparse_checkout + ): + comment += " (note: sparse checkout is used)" + row.append(f"{comment}") row.append("") rows.append("".join(row)) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index bb047b4f4ef..f94f7f60bb6 100644 
--- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -import hashlib import logging import re import shutil @@ -22,15 +21,6 @@ from env_helper import ( from compress_files import compress_file_fast -def _md5(fname): - hash_md5 = hashlib.md5() - with open(fname, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - logging.debug("MD5 for %s is %s", fname, hash_md5.hexdigest()) - return hash_md5.hexdigest() - - def _flatten_list(lst): result = [] for elem in lst: diff --git a/tests/ci/stress.py b/tests/ci/stress.py index ef54191620d..ae918363df7 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -137,7 +137,10 @@ def prepare_for_hung_check(drop_databases: bool) -> bool: # However, it obstructs checking for hung queries. logging.info("Will terminate gdb (if any)") call_with_retry("kill -TERM $(pidof gdb)") - call_with_retry("tail --pid=$(pidof gdb) -f /dev/null") + call_with_retry( + "timeout 50s tail --pid=$(pidof gdb) -f /dev/null || kill -9 $(pidof gdb) ||:", + timeout=60, + ) # Sometimes there is a message `Child process was stopped by signal 19` in logs after stopping gdb call_with_retry( "kill -CONT $(cat /var/run/clickhouse-server/clickhouse-server.pid) && clickhouse client -q 'SELECT 1 FORMAT Null'" @@ -359,7 +362,7 @@ def main(): ) hung_check_log = args.output_folder / "hung_check.log" # type: Path tee = Popen(["/usr/bin/tee", hung_check_log], stdin=PIPE) - res = call(cmd, shell=True, stdout=tee.stdin, stderr=STDOUT) + res = call(cmd, shell=True, stdout=tee.stdin, stderr=STDOUT, timeout=600) if tee.stdin is not None: tee.stdin.close() if res != 0 and have_long_running_queries and not suppress_hung_check: diff --git a/tests/ci/test_digest.py b/tests/ci/test_digest.py new file mode 100644 index 00000000000..246a3226721 --- /dev/null +++ b/tests/ci/test_digest.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python + +import unittest +from hashlib import md5 +from pathlib import Path + +import digest_helper as dh + +_12 = b"12\n" +_13 = b"13\n" + + +# pylint:disable=protected-access +class TestDigests(unittest.TestCase): + tests_dir = Path("tests/digests") + broken_link = tests_dir / "broken-symlink" + empty_digest = "d41d8cd98f00b204e9800998ecf8427e" + + def test__digest_file(self): + hash_tested = md5() + with self.assertRaises( + AssertionError, msg="_digest_file shouldn't work with dirs" + ): + dh._digest_file(self.tests_dir, hash_tested) + with self.assertRaises( + AssertionError, msg="_digest_file shouldn't work with broken links" + ): + dh._digest_file(self.broken_link, hash_tested) + + # file with content '12\n' + hash_expected = md5() + hash_expected.update(_12) + dh._digest_file(self.tests_dir / "12", hash_tested) + self.assertEqual(hash_expected.digest(), hash_tested.digest()) + # symlink to '12\n' + hash_tested = md5() + dh._digest_file(self.tests_dir / "symlink-12", hash_tested) + self.assertEqual(hash_expected.digest(), hash_tested.digest()) + + def test__digest_directory(self): + hash_tested = md5() + with self.assertRaises( + AssertionError, msg="_digest_directory shouldn't work with files" + ): + dh._digest_directory(self.tests_dir / "12", hash_tested) + with self.assertRaises( + AssertionError, msg="_digest_directory shouldn't work with broken links" + ): + dh._digest_file(self.broken_link, hash_tested) + + # dir1 + hash_expected = md5() + hash_expected.update(_12) + dh._digest_directory(self.tests_dir / "dir1", hash_tested) + self.assertEqual(hash_expected.digest(), hash_tested.digest()) + + # 
dir2 contains 12 and 13 + hash_expected.update(_13) + hash_tested = md5() + dh._digest_directory(self.tests_dir / "dir2", hash_tested) + self.assertEqual(hash_expected.digest(), hash_tested.digest()) + + # dir3 is symlink to dir2 + hash_tested = md5() + dh._digest_directory(self.tests_dir / "dir3", hash_tested) + self.assertEqual(hash_expected.digest(), hash_tested.digest()) + + def test_digest_path(self): + # test broken link does nothing + self.assertEqual( + self.empty_digest, dh.digest_path(self.broken_link).hexdigest() + ) + # Test file works fine + hash_expected = md5() + hash_expected.update(_12) + self.assertEqual( + hash_expected.digest(), dh.digest_path(self.tests_dir / "12").digest() + ) + # Test directory works fine + hash_expected = md5() + hash_expected.update(_12) + self.assertEqual( + hash_expected.digest(), dh.digest_path(self.tests_dir / "dir1").digest() + ) + # Test existed hash is updated from symlink dir3 + hash_tested = hash_expected.copy() + dh.digest_path(self.tests_dir / "dir3", hash_tested) + hash_expected = md5() + hash_expected.update(_12 + _12 + _13) + self.assertEqual(hash_expected.digest(), hash_tested.digest()) + # Test the full content of the following structure + # tests/digests + # ├── 12 + # ├── dir1 + # │   └── 12 + # ├── dir2 + # │   ├── 12 + # │   └── 13 + # ├── dir3 -> dir2 + # └── symlink-12 -> 12 + hash_expected = md5() + hash_expected.update(_12 * 3 + (_13 + _12) * 2) + self.assertEqual( + hash_expected.digest(), dh.digest_path(self.tests_dir).digest() + ) + + def test_digest_paths(self): + # test paths order matters + hash_ordered = dh.digest_paths( + (self.tests_dir / d for d in ("dir1", "dir2", "dir3")) + ) + hash_reversed = dh.digest_paths( + (self.tests_dir / d for d in ("dir3", "dir2", "dir1")) + ) + hash_unordered = dh.digest_paths( + (self.tests_dir / d for d in ("dir3", "dir1", "dir2")) + ) + self.assertNotEqual(hash_ordered.digest(), hash_unordered.digest()) + self.assertNotEqual(hash_ordered.digest(), hash_reversed.digest()) + self.assertNotEqual(hash_unordered.digest(), hash_reversed.digest()) + + def test_digest_consistent_paths(self): + # test paths order does not matter + hash_ordered = dh.digest_consistent_paths( + (self.tests_dir / d for d in ("dir1", "dir2", "dir3")) + ) + hash_reversed = dh.digest_consistent_paths( + (self.tests_dir / d for d in ("dir3", "dir2", "dir1")) + ) + self.assertEqual(hash_ordered.digest(), hash_reversed.digest()) + + @classmethod + def setUpClass(cls): + # create a broken symlink + (TestDigests.broken_link).symlink_to("non-existent-link") + + @classmethod + def tearDownClass(cls): + (TestDigests.broken_link).unlink() diff --git a/tests/ci/docker_test.py b/tests/ci/test_docker.py similarity index 100% rename from tests/ci/docker_test.py rename to tests/ci/test_docker.py diff --git a/tests/ci/git_test.py b/tests/ci/test_git.py similarity index 100% rename from tests/ci/git_test.py rename to tests/ci/test_git.py diff --git a/tests/ci/version_test.py b/tests/ci/test_version.py similarity index 100% rename from tests/ci/version_test.py rename to tests/ci/test_version.py diff --git a/tests/ci/tests/digests/12 b/tests/ci/tests/digests/12 new file mode 100644 index 00000000000..48082f72f08 --- /dev/null +++ b/tests/ci/tests/digests/12 @@ -0,0 +1 @@ +12 diff --git a/tests/ci/tests/digests/dir1/12 b/tests/ci/tests/digests/dir1/12 new file mode 100644 index 00000000000..48082f72f08 --- /dev/null +++ b/tests/ci/tests/digests/dir1/12 @@ -0,0 +1 @@ +12 diff --git a/tests/ci/tests/digests/dir2/12 
b/tests/ci/tests/digests/dir2/12 new file mode 100644 index 00000000000..48082f72f08 --- /dev/null +++ b/tests/ci/tests/digests/dir2/12 @@ -0,0 +1 @@ +12 diff --git a/tests/ci/tests/digests/dir2/13 b/tests/ci/tests/digests/dir2/13 new file mode 100644 index 00000000000..b1bd38b62a0 --- /dev/null +++ b/tests/ci/tests/digests/dir2/13 @@ -0,0 +1 @@ +13 diff --git a/tests/ci/tests/digests/dir3 b/tests/ci/tests/digests/dir3 new file mode 120000 index 00000000000..1e039be9000 --- /dev/null +++ b/tests/ci/tests/digests/dir3 @@ -0,0 +1 @@ +dir2 \ No newline at end of file diff --git a/tests/ci/tests/digests/symlink-12 b/tests/ci/tests/digests/symlink-12 new file mode 120000 index 00000000000..3cacc0b93c9 --- /dev/null +++ b/tests/ci/tests/digests/symlink-12 @@ -0,0 +1 @@ +12 \ No newline at end of file diff --git a/tests/config/config.d/zookeeper.xml b/tests/config/config.d/zookeeper.xml index 07142b1a55e..a54149e6617 100644 --- a/tests/config/config.d/zookeeper.xml +++ b/tests/config/config.d/zookeeper.xml @@ -2,6 +2,7 @@ random + true 127.0.0.1 9181 diff --git a/tests/config/install.sh b/tests/config/install.sh index a5037bfb64e..c31275cdcf2 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -82,6 +82,7 @@ ln -sf $SRC_PATH/users.d/marks.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/insert_keeper_retries.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/prefetch_settings.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/nonconst_timezone.xml $DEST_SERVER_PATH/users.d/ +ln -sf $SRC_PATH/users.d/allow_introspection_functions.yaml $DEST_SERVER_PATH/users.d/ if [[ -n "$USE_NEW_ANALYZER" ]] && [[ "$USE_NEW_ANALYZER" -eq 1 ]]; then ln -sf $SRC_PATH/users.d/analyzer.xml $DEST_SERVER_PATH/users.d/ diff --git a/tests/config/users.d/allow_introspection_functions.yaml b/tests/config/users.d/allow_introspection_functions.yaml new file mode 100644 index 00000000000..24806bbe235 --- /dev/null +++ b/tests/config/users.d/allow_introspection_functions.yaml @@ -0,0 +1,3 @@ +profiles: + default: + allow_introspection_functions: 1 diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 390362cfceb..729b30ba934 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -575,7 +575,6 @@ class ClickHouseCluster: # available when with_ldap == True self.ldap_host = "openldap" - self.ldap_ip = None self.ldap_container = None self.ldap_port = 1389 self.ldap_id = self.get_instance_docker_id(self.ldap_host) @@ -2619,20 +2618,17 @@ class ClickHouseCluster: raise Exception("Can't wait Cassandra to start") def wait_ldap_to_start(self, timeout=180): - self.ldap_ip = self.get_instance_ip(self.ldap_host) self.ldap_container = self.get_docker_handle(self.ldap_id) start = time.time() while time.time() - start < timeout: try: - logging.info( - f"Check LDAP Online {self.ldap_id} {self.ldap_ip} {self.ldap_port}" - ) + logging.info(f"Check LDAP Online {self.ldap_host} {self.ldap_port}") self.exec_in_container( self.ldap_id, [ "bash", "-c", - f"/opt/bitnami/openldap/bin/ldapsearch -x -H ldap://{self.ldap_ip}:{self.ldap_port} -D cn=admin,dc=example,dc=org -w clickhouse -b dc=example,dc=org", + f"/opt/bitnami/openldap/bin/ldapsearch -x -H ldap://{self.ldap_host}:{self.ldap_port} -D cn=admin,dc=example,dc=org -w clickhouse -b dc=example,dc=org", ], user="root", ) @@ -2970,7 +2966,8 @@ class ClickHouseCluster: self.wait_cassandra_to_start() if self.with_ldap and self.base_ldap_cmd: - subprocess_check_call(self.base_ldap_cmd + 
["up", "-d"]) + ldap_start_cmd = self.base_ldap_cmd + common_opts + subprocess_check_call(ldap_start_cmd) self.up_called = True self.wait_ldap_to_start() diff --git a/tests/integration/helpers/keeper_utils.py b/tests/integration/helpers/keeper_utils.py index 79d498b909f..83d0f2969b7 100644 --- a/tests/integration/helpers/keeper_utils.py +++ b/tests/integration/helpers/keeper_utils.py @@ -248,6 +248,11 @@ def is_leader(cluster, node, port=9181): return "Mode: leader" in stat +def is_follower(cluster, node, port=9181): + stat = send_4lw_cmd(cluster, node, "stat", port) + return "Mode: follower" in stat + + def get_leader(cluster, nodes): for node in nodes: if is_leader(cluster, node): diff --git a/tests/queries/0_stateless/02725_keeper_fault_inject_sequential_cleanup.reference b/tests/integration/test_alter_comment_on_cluster/__init__.py similarity index 100% rename from tests/queries/0_stateless/02725_keeper_fault_inject_sequential_cleanup.reference rename to tests/integration/test_alter_comment_on_cluster/__init__.py diff --git a/tests/integration/test_alter_comment_on_cluster/configs/clusters.xml b/tests/integration/test_alter_comment_on_cluster/configs/clusters.xml new file mode 100644 index 00000000000..d5293291c18 --- /dev/null +++ b/tests/integration/test_alter_comment_on_cluster/configs/clusters.xml @@ -0,0 +1,16 @@ + + + + + + node_1 + 9000 + + + node_2 + 9000 + + + + + \ No newline at end of file diff --git a/tests/integration/test_alter_comment_on_cluster/test.py b/tests/integration/test_alter_comment_on_cluster/test.py new file mode 100644 index 00000000000..e6767e35c1b --- /dev/null +++ b/tests/integration/test_alter_comment_on_cluster/test.py @@ -0,0 +1,61 @@ +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import assert_eq_with_retry + +cluster = ClickHouseCluster(__file__) + +node_1 = cluster.add_instance( + "node_1", + main_configs=["configs/clusters.xml"], + with_zookeeper=True, + macros={"shard": 1, "replica": 1}, +) + +node_2 = cluster.add_instance( + "node_2", + main_configs=["configs/clusters.xml"], + with_zookeeper=True, + macros={"shard": 1, "replica": 2}, +) + + +def assert_create_query(nodes, database_name, table_name, expected): + query = "SELECT create_table_query FROM system.tables WHERE database='{}' AND table='{}'".format( + database_name, table_name + ) + for node in nodes: + assert_eq_with_retry(node, query, expected) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_comment(started_cluster): + node_1.query( + "CREATE TABLE test_table ON CLUSTER 'cluster' (id Int64) ENGINE=ReplicatedMergeTree() ORDER BY id" + ) + node_1.query( + "ALTER TABLE test_table ON CLUSTER 'cluster' COMMENT COLUMN id 'column_comment_1'" + ) + node_1.query( + "ALTER TABLE test_table ON CLUSTER 'cluster' MODIFY COMMENT 'table_comment_1';" + ) + + expected = "CREATE TABLE default.test_table (`id` Int64 COMMENT \\'column_comment_1\\') ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/{uuid}/{shard}\\', \\'{replica}\\') ORDER BY id SETTINGS index_granularity = 8192 COMMENT \\'table_comment_1\\'" + assert_create_query([node_1, node_2], "default", "test_table", expected) + + node_1.query( + "ALTER TABLE test_table ON CLUSTER 'cluster' COMMENT COLUMN id 'column_comment_2'" + ) + node_1.query( + "ALTER TABLE test_table ON CLUSTER 'cluster' MODIFY COMMENT 'table_comment_2';" + ) + + expected = "CREATE TABLE default.test_table (`id` Int64 COMMENT 
\\'column_comment_2\\') ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/{uuid}/{shard}\\', \\'{replica}\\') ORDER BY id SETTINGS index_granularity = 8192 COMMENT \\'table_comment_2\\'" + assert_create_query([node_1, node_2], "default", "test_table", expected) diff --git a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf_l.xml b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf_l.xml deleted file mode 100644 index 7f866e9beed..00000000000 --- a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf_l.xml +++ /dev/null @@ -1,52 +0,0 @@ - - - - - 0 - - - s3 - https://vdimir-test2.s3.amazonaws.com/ttt/ - AKIAZURMN3FVQCQT6Y5U - pTfhdJgl4HOSIgL+aIE/pnGTZ7IAXMMcYvGhiDnb - eu-central-1 - /var/lib/clickhouse/gcs/ - false - - - cache - s3 - /var/lib/clickhouse/s3_cache/ - 10Gi - - - - - - - default - 10000000 - -
- s3_cache -
-
- 0.99 -
- - - - - default - - - s3 - - - - -
-
- - true -
diff --git a/tests/integration/test_attach_without_fetching/test.py b/tests/integration/test_attach_without_fetching/test.py index 60500380b31..b430387e0f1 100644 --- a/tests/integration/test_attach_without_fetching/test.py +++ b/tests/integration/test_attach_without_fetching/test.py @@ -13,7 +13,7 @@ def fill_node(node): """ CREATE TABLE IF NOT EXISTS test(n UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test', '{replica}') - ORDER BY n PARTITION BY n % 10; + ORDER BY n PARTITION BY n % 10 SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; """.format( replica=node.name ) diff --git a/tests/integration/test_external_cluster/__init__.py b/tests/integration/test_external_cluster/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_external_cluster/configs/clusters.xml b/tests/integration/test_external_cluster/configs/clusters.xml new file mode 100644 index 00000000000..62ef9e9bf36 --- /dev/null +++ b/tests/integration/test_external_cluster/configs/clusters.xml @@ -0,0 +1,12 @@ + + + + + + data_node + 9000 + + + + + diff --git a/tests/integration/test_external_cluster/test.py b/tests/integration/test_external_cluster/test.py new file mode 100644 index 00000000000..2ed8ada3df4 --- /dev/null +++ b/tests/integration/test_external_cluster/test.py @@ -0,0 +1,69 @@ +import re +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import assert_eq_with_retry + +cluster = ClickHouseCluster(__file__) + +control_node = cluster.add_instance( + "control_node", + main_configs=["configs/clusters.xml"], + with_zookeeper=True, +) + +data_node = cluster.add_instance( + "data_node", + main_configs=["configs/clusters.xml"], + with_zookeeper=True, + macros={"shard": 1, "replica": 1}, +) + +uuid_regex = re.compile("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}") + + +def assert_create_query(node, database_name, table_name, expected): + replace_uuid = lambda x: re.sub(uuid_regex, "uuid", x) + query = "SELECT create_table_query FROM system.tables WHERE database='{}' AND table='{}'".format( + database_name, table_name + ) + assert_eq_with_retry(node, query, expected, get_result=replace_uuid) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_ddl(started_cluster): + control_node.query("CREATE DATABASE test_db ON CLUSTER 'external' ENGINE=Atomic") + control_node.query( + "CREATE TABLE test_db.test_table ON CLUSTER 'external' (id Int64) Engine=MergeTree ORDER BY id" + ) + control_node.query( + "ALTER TABLE test_db.test_table ON CLUSTER 'external' add column data String" + ) + + expected = "CREATE TABLE test_db.test_table (`id` Int64, `data` String) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192" + assert_create_query(data_node, "test_db", "test_table", expected) + + control_node.query("DROP TABLE test_db.test_table ON CLUSTER 'external'") + control_node.query("DROP DATABASE test_db ON CLUSTER 'external'") + + expected = "" + assert_create_query(data_node, "test_db", "test_table", expected) + + +def test_ddl_replicated(started_cluster): + control_node.query( + "CREATE DATABASE test_db ON CLUSTER 'external' ENGINE=Replicated('/replicated')", + settings={"allow_experimental_database_replicated": 1}, + ) + # Exception is expected + assert "It's not initial query" in control_node.query_and_get_error( + "CREATE TABLE test_db.test_table ON CLUSTER 'external' (id 
Int64) Engine=MergeTree ORDER BY id" + ) + control_node.query("DROP DATABASE test_db ON CLUSTER 'external'") diff --git a/tests/integration/test_keeper_compression/__init__.py b/tests/integration/test_keeper_compression/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_keeper_compression/configs/keeper.xml b/tests/integration/test_keeper_compression/configs/keeper.xml new file mode 100644 index 00000000000..322938c3dc8 --- /dev/null +++ b/tests/integration/test_keeper_compression/configs/keeper.xml @@ -0,0 +1,29 @@ + + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + 5000 + trace + 10000 + + + + true + node1 + 1 + 2888 + 1 + + + + + + + /clickhouse/access + + + diff --git a/tests/integration/test_keeper_compression/configs/keeper_with_compression.xml b/tests/integration/test_keeper_compression/configs/keeper_with_compression.xml new file mode 100644 index 00000000000..566eb93af36 --- /dev/null +++ b/tests/integration/test_keeper_compression/configs/keeper_with_compression.xml @@ -0,0 +1,9 @@ + + + true + + node1 + 9181 + + + diff --git a/tests/integration/test_keeper_compression/configs/keeper_without_compression.xml b/tests/integration/test_keeper_compression/configs/keeper_without_compression.xml new file mode 100644 index 00000000000..e328dd43b83 --- /dev/null +++ b/tests/integration/test_keeper_compression/configs/keeper_without_compression.xml @@ -0,0 +1,9 @@ + + + false + + node1 + 9181 + + + diff --git a/tests/integration/test_keeper_compression/test_with_compression.py b/tests/integration/test_keeper_compression/test_with_compression.py new file mode 100644 index 00000000000..1c0697ebcbb --- /dev/null +++ b/tests/integration/test_keeper_compression/test_with_compression.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance( + "node1", + main_configs=["configs/keeper.xml", "configs/keeper_with_compression.xml"], + stay_alive=True, +) + + +# test that server is able to start +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + finally: + cluster.shutdown() + + +def test_select(started_cluster): + assert node1.query("SELECT 1") == "1\n" diff --git a/tests/integration/test_keeper_compression/test_without_compression.py b/tests/integration/test_keeper_compression/test_without_compression.py new file mode 100644 index 00000000000..eb7936cfd42 --- /dev/null +++ b/tests/integration/test_keeper_compression/test_without_compression.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance( + "node1", + main_configs=["configs/keeper.xml", "configs/keeper_without_compression.xml"], + stay_alive=True, +) + + +# test that server is able to start +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + finally: + cluster.shutdown() + + +def test_select(started_cluster): + assert node1.query("SELECT 1") == "1\n" diff --git a/tests/integration/test_keeper_four_word_command/test.py b/tests/integration/test_keeper_four_word_command/test.py index 5419d2334c7..71501133ae7 100644 --- a/tests/integration/test_keeper_four_word_command/test.py +++ b/tests/integration/test_keeper_four_word_command/test.py @@ -460,12 +460,19 @@ def test_cmd_crst(started_cluster): 
print("cons output(after crst) -------------------------------------") print(data) - # 2 connections, 1 for 'cons' command, 1 for zk + # 2 or 3 connections, 1 for 'crst', 1 for 'cons' command, 1 for zk + # there can be a case when 'crst' connection is not cleaned before the cons call + print("cons output(after crst) -------------------------------------") + print(data) cons = [n for n in data.split("\n") if len(n) > 0] - assert len(cons) == 2 + assert len(cons) == 2 or len(cons) == 3 # connection for zk - zk_conn = [n for n in cons if not n.__contains__("sid=0xffffffffffffffff")][0] + zk_conns = [n for n in cons if not n.__contains__("sid=0xffffffffffffffff")] + + # there can only be one + assert len(zk_conns) == 1 + zk_conn = zk_conns[0] conn_stat = re.match(r"(.*?)[:].*[(](.*?)[)].*", zk_conn.strip(), re.S).group(2) assert conn_stat is not None @@ -718,3 +725,30 @@ def test_cmd_clrs(started_cluster): finally: destroy_zk_client(zk) + + +def test_cmd_ydld(started_cluster): + wait_nodes() + for node in [node1, node3]: + data = keeper_utils.send_4lw_cmd(cluster, node, cmd="ydld") + assert data == "Sent yield leadership request to leader." + + print("ydld output -------------------------------------") + print(data) + + # Whenever there is a leader switch, there is a brief amount of time when any + # of the 4 letter commands will return empty result. Thus, we need to test for + # negative condition. So we can't use keeper_utils.is_leader() here and likewise + # in the while loop below. + if not keeper_utils.is_follower(cluster, node): + # wait for it to yield leadership + retry = 0 + while not keeper_utils.is_follower(cluster, node) and retry < 30: + time.sleep(1) + retry += 1 + if retry == 30: + print( + node.name + + " did not become follower after 30s of yielding leadership, maybe there is something wrong." 
+ ) + assert keeper_utils.is_follower(cluster, node) diff --git a/tests/integration/test_keeper_reconfig_replace_leader/test.py b/tests/integration/test_keeper_reconfig_replace_leader/test.py index 76a8eb092e2..4cdd48fcf7c 100644 --- a/tests/integration/test_keeper_reconfig_replace_leader/test.py +++ b/tests/integration/test_keeper_reconfig_replace_leader/test.py @@ -83,7 +83,8 @@ def test_reconfig_replace_leader(started_cluster): assert "node3" in config assert "node4" not in config - ku.wait_configs_equal(config, zk2) + # additional 20s wait before removing leader + ku.wait_configs_equal(config, zk2, timeout=50) node4.start_clickhouse() config = zk2.reconfig(joining="server.4=node4:9234", leaving=None, new_members=None) diff --git a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/__init__.py b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper1.xml b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper1.xml new file mode 100644 index 00000000000..71f3403aca3 --- /dev/null +++ b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper1.xml @@ -0,0 +1,35 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + true + + + 5000 + 10000 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + true + + + 3 + node3 + 9234 + true + + + + diff --git a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper2.xml b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper2.xml new file mode 100644 index 00000000000..faefb4d1102 --- /dev/null +++ b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper2.xml @@ -0,0 +1,35 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + true + + + 5000 + 10000 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + true + + + 3 + node3 + 9234 + true + + + + diff --git a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper3.xml b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper3.xml new file mode 100644 index 00000000000..80a9caa92c2 --- /dev/null +++ b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper3.xml @@ -0,0 +1,35 @@ + + + 9181 + 3 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + true + + + 5000 + 10000 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + true + + + 3 + node3 + 9234 + true + + + + diff --git a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper4.xml b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper4.xml new file mode 100644 index 00000000000..9fd88fe5d63 --- /dev/null +++ b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/configs/keeper4.xml @@ -0,0 +1,21 @@ + + + 9181 + 4 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + true + + + 5000 + 10000 + trace + + + + 2 node2 9234 + 3 node3 9234 + 4 node4 9234 + + + diff --git a/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py new file mode 100644 index 00000000000..95639edf2d0 --- /dev/null 
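The retry loop in test_cmd_ydld above could be read as the following small sketch; wait_until_follower is hypothetical and not a helper that exists in keeper_utils, it only consolidates the pattern of sending "ydld" and polling is_follower.

import time
import helpers.keeper_utils as ku

def wait_until_follower(cluster, node, timeout=30):
    # poll the 4lw "stat" output (via ku.is_follower) until the node reports "Mode: follower"
    deadline = time.time() + timeout
    while time.time() < deadline:
        if ku.is_follower(cluster, node):
            return True
        time.sleep(1)
    return False

# usage mirroring the test: ask the current leader to yield, then wait for it to step down
# ku.send_4lw_cmd(cluster, node, cmd="ydld")
# assert wait_until_follower(cluster, node, timeout=30)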
+++ b/tests/integration/test_keeper_reconfig_replace_leader_in_one_command/test.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +import pytest +from helpers.cluster import ClickHouseCluster, ClickHouseInstance +from os.path import join, dirname, realpath +import time +import helpers.keeper_utils as ku +import typing as tp + +cluster = ClickHouseCluster(__file__) +CONFIG_DIR = join(dirname(realpath(__file__)), "configs") + +node1 = cluster.add_instance("node1", main_configs=["configs/keeper1.xml"]) +node2 = cluster.add_instance("node2", main_configs=["configs/keeper2.xml"]) +node3 = cluster.add_instance("node3", main_configs=["configs/keeper3.xml"]) +node4 = cluster.add_instance("node4", stay_alive=True) +zk1, zk2, zk3, zk4 = None, None, None, None + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node4.stop_clickhouse() + node4.copy_file_to_container( + join(CONFIG_DIR, "keeper4.xml"), + "/etc/clickhouse-server/config.d/keeper.xml", + ) + + yield cluster + + finally: + conn: tp.Optional[ku.KeeperClient] + for conn in [zk1, zk2, zk3, zk4]: + if conn: + conn.stop() + conn.close() + + cluster.shutdown() + + +# can't use create_client as clickhouse-keeper-client 's reconfig doesn't support +# joining and adding in single reconfig command, thus duplication +# TODO myrrc this should be removed once keeper-client is updated + + +def get_fake_zk(node): + return ku.get_fake_zk(cluster, node) + + +def get_config_str(zk): + return ku.get_config_str(zk)[0].decode("utf-8") + + +def wait_configs_equal( + left_config: str, right_zk: ku.KeeperClient, timeout: float = 30.0 +): + """ + Check whether get /keeper/config result in left_config is equal + to get /keeper/config on right_zk ZK connection. + """ + elapsed: float = 0.0 + while sorted(left_config.split("\n")) != sorted( + get_config_str(right_zk).split("\n") + ): + time.sleep(1) + elapsed += 1 + if elapsed >= timeout: + raise Exception( + f"timeout while checking nodes configs to get equal. 
" + f"Left: {left_config}, right: {get_config_str(right_zk)}" + ) + + +def test_reconfig_replace_leader_in_one_command(started_cluster): + """ + Remove leader from a cluster of 3 and add a new node to this cluster in a single command + """ + + zk1 = get_fake_zk(node1) + config = get_config_str(zk1) + + assert len(config.split("\n")) == 3 + assert "node1" in config + assert "node2" in config + assert "node3" in config + assert "node4" not in config + + for i in range(100): + zk1.create(f"/test_four_{i}", b"somedata") + + zk2 = get_fake_zk(node2) + zk2.sync("/test_four_0") + wait_configs_equal(config, zk2) + + zk3 = get_fake_zk(node3) + zk3.sync("/test_four_0") + wait_configs_equal(config, zk3) + + for i in range(100): + assert zk2.exists(f"/test_four_{i}") is not None + assert zk3.exists(f"/test_four_{i}") is not None + + assert ku.is_leader(cluster, node1) + node4.start_clickhouse() + config, _ = zk2.reconfig( + joining="server.4=node4:9234", leaving="1", new_members=None + ) + config = config.decode("utf-8") + + print("After removing 1 and adding 4", config) + assert len(config.split("\n")) == 3 + assert "node1" not in config + assert "node2" in config + assert "node3" in config + assert "node4" in config + + ku.wait_until_connected(cluster, node4) + time.sleep(1) + + zk4 = get_fake_zk(node4) + zk4.sync("/test_four_0") + # we have an additional 20s timeout for removing leader + wait_configs_equal(config, zk4, timeout=50) + + for i in range(100): + assert zk4.exists(f"test_four_{i}") is not None + zk4.create(f"/test_four_{100 + i}", b"somedata") + + with pytest.raises(Exception): + zk1.stop() + zk1.close() + zk1 = get_fake_zk(node1) + zk1.sync("/test_four_0") + + zk2.stop() + zk2.close() + zk2 = get_fake_zk(node2) + zk2.sync("/test_four_0") + wait_configs_equal(config, zk2) + + zk3.stop() + zk3.close() + zk3 = get_fake_zk(node3) + zk3.sync("/test_four_0") + wait_configs_equal(config, zk3) + + for i in range(200): + assert zk2.exists(f"test_four_{i}") is not None + assert zk3.exists(f"test_four_{i}") is not None diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 507f25209a4..3b2f1c0f6a6 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -941,215 +941,3 @@ def test_s3_disk_heavy_write_check_mem(cluster, broken_s3, node_name): assert int(result) > 0.8 * memory check_no_objects_after_drop(cluster, node_name=node_name) - - -def get_memory_usage(node, query_id): - node.query("SYSTEM FLUSH LOGS") - memory_usage = node.query( - "SELECT memory_usage" - " FROM system.query_log" - f" WHERE query_id='{query_id}'" - " AND type='QueryFinish'" - ) - return int(memory_usage) - - -def get_memory_usages(node, query_ids): - node.query("SYSTEM FLUSH LOGS") - result = [] - for query_id in query_ids: - memory_usage = node.query( - "SELECT memory_usage" - " FROM system.query_log" - f" WHERE query_id='{query_id}'" - " AND type='QueryFinish'" - ) - result.append(int(memory_usage)) - return result - - -@pytest.mark.parametrize("node_name", ["node"]) -def test_heavy_insert_select_check_memory(cluster, broken_s3, node_name): - node = cluster.instances[node_name] - - node.query( - """ - CREATE TABLE central_query_log - ( - control_plane_id UUID, - pod_id LowCardinality(String), - scrape_ts_microseconds DateTime64(6) CODEC(Delta(8), LZ4), - event_date Date, - event_time DateTime, - payload Array(String), - payload_01 String, - payload_02 String, - payload_03 String, - payload_04 String, - payload_05 
String, - payload_06 String, - payload_07 String, - payload_08 String, - payload_09 String, - payload_10 String, - payload_11 String, - payload_12 String, - payload_13 String, - payload_14 String, - payload_15 String, - payload_16 String, - payload_17 String, - payload_18 String, - payload_19 String - ) - ENGINE=MergeTree() - PARTITION BY toYYYYMM(event_date) - ORDER BY (control_plane_id, event_date, pod_id) - SETTINGS - storage_policy='s3' - """ - ) - - node.query("SYSTEM STOP MERGES central_query_log") - - write_count = 2 - write_query_ids = [] - for x in range(write_count): - query_id = f"INSERT_INTO_TABLE_RANDOM_DATA_QUERY_ID_{x}" - write_query_ids.append(query_id) - node.query( - """ - INSERT INTO central_query_log - SELECT - control_plane_id, - pod_id, - toStartOfHour(event_time) + toIntervalSecond(randUniform(0,60)) as scrape_ts_microseconds, - toDate(event_time) as event_date, - event_time, - payload, - payload[1] as payload_01, - payload[2] as payload_02, - payload[3] as payload_03, - payload[4] as payload_04, - payload[5] as payload_05, - payload[6] as payload_06, - payload[7] as payload_07, - payload[8] as payload_08, - payload[9] as payload_09, - payload[10] as payload_10, - payload[11] as payload_11, - payload[12] as payload_12, - payload[13] as payload_13, - payload[14] as payload_14, - payload[15] as payload_15, - payload[16] as payload_16, - payload[17] as payload_17, - payload[18] as payload_18, - payload[19] as payload_19 - FROM - ( - SELECT - control_plane_id, - substring(payload[1], 1, 5) as pod_id, - toDateTime('2022-12-12 00:00:00') - + toIntervalDay(floor(randUniform(0,3))) - + toIntervalHour(floor(randUniform(0,24))) - + toIntervalSecond(floor(randUniform(0,60))) - as event_time, - payload - FROM - generateRandom( - 'control_plane_id UUID, payload Array(String)', - NULL, - 100, - 100 - ) - LIMIT 10000 - ) - SETTINGS - max_insert_block_size=256000000, - min_insert_block_size_rows=1000000, - min_insert_block_size_bytes=256000000 - """, - query_id=query_id, - ) - - memory = 845346116 - for memory_usage, query_id in zip( - get_memory_usages(node, write_query_ids), write_query_ids - ): - assert int(memory_usage) < 1.2 * memory, f"{memory_usage} : {query_id}" - assert int(memory_usage) > 0.8 * memory, f"{memory_usage} : {query_id}" - - broken_s3.setup_slow_answers(minimal_length=1000, timeout=5, count=20) - broken_s3.setup_fake_multpartuploads() - - insert_query_id = f"INSERT_INTO_S3_FUNCTION_QUERY_ID" - node.query( - """ - INSERT INTO - TABLE FUNCTION s3( - 'http://resolver:8083/root/data/test-upload_{_partition_id}.csv.gz', - 'minio', 'minio123', - 'CSV', auto, 'gzip' - ) - PARTITION BY formatDateTime(subtractHours(toDateTime('2022-12-13 00:00:00'), 1),'%Y-%m-%d_%H:00') - WITH toDateTime('2022-12-13 00:00:00') as time_point - SELECT - * - FROM central_query_log - WHERE - event_date >= subtractDays(toDate(time_point), 1) - AND scrape_ts_microseconds >= subtractHours(toStartOfHour(time_point), 12) - AND scrape_ts_microseconds < toStartOfDay(time_point) - SETTINGS - s3_max_inflight_parts_for_one_file=1 - """, - query_id=insert_query_id, - ) - - query_id = f"SELECT_QUERY_ID" - total = node.query( - """ - SELECT - count() - FROM central_query_log - """, - query_id=query_id, - ) - assert int(total) == 10000 * write_count - - query_id = f"SELECT_WHERE_QUERY_ID" - selected = node.query( - """ - WITH toDateTime('2022-12-13 00:00:00') as time_point - SELECT - count() - FROM central_query_log - WHERE - event_date >= subtractDays(toDate(time_point), 1) - AND scrape_ts_microseconds 
>= subtractHours(toStartOfHour(time_point), 12) - AND scrape_ts_microseconds < toStartOfDay(time_point) - """, - query_id=query_id, - ) - assert int(selected) < 4500, selected - assert int(selected) > 2500, selected - - node.query("SYSTEM FLUSH LOGS") - profile_events = node.query( - f""" - SELECT ProfileEvents - FROM system.query_log - WHERE query_id='{insert_query_id}' - AND type='QueryFinish' - """ - ) - - memory_usage = get_memory_usage(node, insert_query_id) - memory = 123507857 - assert int(memory_usage) < 1.2 * memory, f"{memory_usage} {profile_events}" - assert int(memory_usage) > 0.8 * memory, f"{memory_usage} {profile_events}" - - node.query(f"DROP TABLE IF EXISTS central_query_log SYNC") - remove_all_s3_objects(cluster) diff --git a/tests/integration/test_parts_delete_zookeeper/test.py b/tests/integration/test_parts_delete_zookeeper/test.py index 9fd07e7b65d..d7b5fe1cb57 100644 --- a/tests/integration/test_parts_delete_zookeeper/test.py +++ b/tests/integration/test_parts_delete_zookeeper/test.py @@ -61,11 +61,21 @@ def test_merge_doesnt_work_without_zookeeper(start_cluster): node1.query("TRUNCATE TABLE test_table") + total_parts = node1.query( + "SELECT count(*) from system.parts where table = 'test_table'" + ) + assert total_parts == "0\n" or total_parts == "1\n" + assert ( - node1.query("SELECT count(*) from system.parts where table = 'test_table'") + node1.query( + "SELECT count(*) from system.parts where table = 'test_table' and active = 1" + ) == "0\n" ) + node1.query("DETACH TABLE test_table SYNC") + node1.query("ATTACH TABLE test_table") + node1.query( "INSERT INTO test_table VALUES ('2018-10-01', 1), ('2018-10-02', 2), ('2018-10-03', 3)" ) diff --git a/tests/integration/test_remote_blobs_naming/__init__.py b/tests/integration/test_remote_blobs_naming/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_remote_blobs_naming/configs/new_node.xml b/tests/integration/test_remote_blobs_naming/configs/new_node.xml new file mode 100644 index 00000000000..2bb8d49ec4b --- /dev/null +++ b/tests/integration/test_remote_blobs_naming/configs/new_node.xml @@ -0,0 +1,5 @@ + + + + 1 + diff --git a/tests/integration/test_remote_blobs_naming/configs/old_node.xml b/tests/integration/test_remote_blobs_naming/configs/old_node.xml new file mode 100644 index 00000000000..6195d53a2af --- /dev/null +++ b/tests/integration/test_remote_blobs_naming/configs/old_node.xml @@ -0,0 +1,5 @@ + + + + 0 + diff --git a/tests/integration/test_remote_blobs_naming/configs/settings.xml b/tests/integration/test_remote_blobs_naming/configs/settings.xml new file mode 100644 index 00000000000..8e8d870d1ba --- /dev/null +++ b/tests/integration/test_remote_blobs_naming/configs/settings.xml @@ -0,0 +1,10 @@ + + + + + + 1 + 1 + + + diff --git a/tests/integration/test_remote_blobs_naming/configs/storage_conf.xml b/tests/integration/test_remote_blobs_naming/configs/storage_conf.xml new file mode 100644 index 00000000000..31c6a3bf968 --- /dev/null +++ b/tests/integration/test_remote_blobs_naming/configs/storage_conf.xml @@ -0,0 +1,47 @@ + + + + + test + + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + s3_plain + http://minio1:9001/root/data/s3_pain_key_prefix + minio + minio123 + true + + + + + + +
+ s3 +
+
+
+ + + +
+ s3_plain +
+
+
+
+
+ + + s3 + +
diff --git a/tests/integration/test_remote_blobs_naming/configs/switching_node.xml b/tests/integration/test_remote_blobs_naming/configs/switching_node.xml new file mode 100644 index 00000000000..6195d53a2af --- /dev/null +++ b/tests/integration/test_remote_blobs_naming/configs/switching_node.xml @@ -0,0 +1,5 @@ + + + + 0 + diff --git a/tests/integration/test_remote_blobs_naming/test_backward_compatibility.py b/tests/integration/test_remote_blobs_naming/test_backward_compatibility.py new file mode 100644 index 00000000000..485bf73dad1 --- /dev/null +++ b/tests/integration/test_remote_blobs_naming/test_backward_compatibility.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 + +import logging +import pytest + +import os +from helpers.cluster import ClickHouseCluster + + +@pytest.fixture(scope="module") +def cluster(): + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=[ + "configs/old_node.xml", + "configs/storage_conf.xml", + ], + user_configs=[ + "configs/settings.xml", + ], + with_minio=True, + macros={"replica": "1"}, + with_zookeeper=True, + ) + cluster.add_instance( + "new_node", + main_configs=[ + "configs/new_node.xml", + "configs/storage_conf.xml", + ], + user_configs=[ + "configs/settings.xml", + ], + with_minio=True, + macros={"replica": "2"}, + with_zookeeper=True, + ) + cluster.add_instance( + "switching_node", + main_configs=[ + "configs/switching_node.xml", + "configs/storage_conf.xml", + ], + user_configs=[ + "configs/settings.xml", + ], + with_minio=True, + with_zookeeper=True, + stay_alive=True, + ) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + + # Actually, try/finally section is excess in pytest.fixtures + cluster.shutdown() + + +def get_part_path(node, table, part_name): + part_path = node.query( + f"SELECT path FROM system.parts WHERE table = '{table}' and name = '{part_name}'" + ).strip() + + return os.path.normpath(part_path) + + +def get_first_part_name(node, table): + part_name = node.query( + f"SELECT name FROM system.parts WHERE table = '{table}' and active LIMIT 1" + ).strip() + return part_name + + +def read_file(node, file_path): + return node.exec_in_container(["bash", "-c", f"cat {file_path}"]) + + +def write_file(node, file_path, data): + node.exec_in_container(["bash", "-c", f"echo '{data}' > {file_path}"]) + + +def find_keys_for_local_path(node, local_path): + remote = node.query( + f""" + SELECT + remote_path + FROM + system.remote_data_paths + WHERE + concat(path, local_path) = '{local_path}' + """ + ).split("\n") + return [x for x in remote if x] + + +def test_read_new_format(cluster): + node = cluster.instances["node"] + + node.query( + """ + CREATE TABLE test_read_new_format ( + id Int64, + data String + ) ENGINE=MergeTree() + ORDER BY id + """ + ) + + node.query("INSERT INTO test_read_new_format VALUES (1, 'Hello')") + + part_name = get_first_part_name(node, "test_read_new_format") + part_path = get_part_path(node, "test_read_new_format", part_name) + primary_idx = os.path.join(part_path, "primary.cidx") + + remote = find_keys_for_local_path(node, primary_idx) + assert len(remote) == 1 + remote = remote[0] + + node.query(f"ALTER TABLE test_read_new_format DETACH PART '{part_name}'") + + detached_primary_idx = os.path.join( + os.path.dirname(part_path), "detached", part_name, "primary.cidx" + ) + + # manually change the metadata format and see that CH reads it correctly + meta_data = read_file(node, detached_primary_idx) + lines = meta_data.split("\n") + 
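# What the next few lines rely on (a sketch; the metadata layout beyond the fields
# touched here is not shown in this test): lines[0] carries the metadata format
# version, and lines[2] holds "<object_size>\t<object_key>" for the blob backing
# primary.cidx. Rewriting lines[2] with the full remote key and setting lines[0]
# to "5" (presumably the format version for full object keys) emulates the new
# on-disk format, which the server is then expected to read back after ATTACH PART.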
object_size, object_key = lines[2].split("\t")
+    assert remote.endswith(object_key), object_key
+    assert remote != object_key
+    lines[2] = f"{object_size}\t{remote}"
+    lines[0] = "5"
+
+    write_file(node, detached_primary_idx, "\n".join(lines))
+
+    active_count = node.query(
+        f"SELECT count() FROM system.parts WHERE table = 'test_read_new_format' and active"
+    ).strip()
+    assert active_count == "0", active_count
+
+    node.query(f"ALTER TABLE test_read_new_format ATTACH PART '{part_name}'")
+
+    active_count = node.query(
+        f"SELECT count() FROM system.parts WHERE table = 'test_read_new_format' and active"
+    ).strip()
+    assert active_count == "1", active_count
+
+    values = node.query(f"SELECT * FROM test_read_new_format").split("\n")
+    values = [x for x in values if x]
+    assert values == ["1\tHello"], values
+
+    # part name has changed after attach
+    part_name = get_first_part_name(node, "test_read_new_format")
+    part_path = get_part_path(node, "test_read_new_format", part_name)
+    primary_idx = os.path.join(part_path, "primary.cidx")
+
+    new_remote = find_keys_for_local_path(node, primary_idx)
+    assert len(new_remote) == 1
+    new_remote = new_remote[0]
+    assert remote == new_remote
+
+
+def test_write_new_format(cluster):
+    node = cluster.instances["new_node"]
+
+    node.query(
+        """
+        CREATE TABLE test_read_new_format (
+            id Int64,
+            data String
+        ) ENGINE=MergeTree()
+        ORDER BY id
+        """
+    )
+
+    node.query("INSERT INTO test_read_new_format VALUES (1, 'Hello')")
+
+    part_name = get_first_part_name(node, "test_read_new_format")
+    part_path = get_part_path(node, "test_read_new_format", part_name)
+    primary_idx = os.path.join(part_path, "primary.cidx")
+
+    remote = find_keys_for_local_path(node, primary_idx)
+    assert len(remote) == 1
+    remote = remote[0]
+
+    node.query(f"ALTER TABLE test_read_new_format DETACH PART '{part_name}'")
+
+    detached_primary_idx = os.path.join(
+        os.path.dirname(part_path), "detached", part_name, "primary.cidx"
+    )
+
+    # manually change the metadata format and see that CH reads it correctly
+    meta_data = read_file(node, detached_primary_idx)
+    lines = meta_data.split("\n")
+    object_size, object_key = lines[2].split("\t")
+    assert remote.endswith(object_key), object_key
+    assert remote == object_key
+
+
+@pytest.mark.parametrize("storage_policy", ["s3", "s3_plain"])
+def test_replicated_merge_tree(cluster, storage_policy):
+    if storage_policy == "s3_plain":
+        # MergeTree table doesn't work on s3_plain. Rename operation is not implemented
+        return
+
+    node_old = cluster.instances["node"]
+    node_new = cluster.instances["new_node"]
+
+    create_table_statement = f"""
+        CREATE TABLE test_replicated_merge_tree (
+            id Int64,
+            val String
+        ) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test_replicated_merge_tree_{storage_policy}', '{{replica}}')
+        PARTITION BY id
+        ORDER BY (id, val)
+        SETTINGS
+            storage_policy='{storage_policy}'
+        """
+
+    node_old.query(create_table_statement)
+    node_new.query(create_table_statement)
+
+    node_old.query("INSERT INTO test_replicated_merge_tree VALUES (0, 'a')")
+    node_new.query("INSERT INTO test_replicated_merge_tree VALUES (1, 'b')")
+
+    # node_old has to fetch metadata from node_new and vice versa
+    node_old.query("SYSTEM SYNC REPLICA test_replicated_merge_tree")
+    node_new.query("SYSTEM SYNC REPLICA test_replicated_merge_tree")
+
+    count_old = node_old.query("SELECT count() FROM test_replicated_merge_tree").strip()
+    count_new = node_new.query("SELECT count() FROM test_replicated_merge_tree").strip()
+
+    assert count_old == "2"
+    assert count_new == "2"
+
+    node_old.query("DROP TABLE test_replicated_merge_tree SYNC")
+    node_new.query("DROP TABLE test_replicated_merge_tree SYNC")
+
+
+def switch_config_write_full_object_key(node, enable):
+    setting_path = "/etc/clickhouse-server/config.d/switching_node.xml"
+
+    is_on = "1<"
+    is_off = "0<"
+
+    data = read_file(node, setting_path)
+
+    assert data != ""
+    assert is_on in data or is_off in data
+
+    if enable:
+        node.replace_in_config(setting_path, is_off, is_on)
+    else:
+        node.replace_in_config(setting_path, is_on, is_off)
+
+    node.restart_clickhouse()
+
+
+@pytest.mark.parametrize("storage_policy", ["s3", "s3_plain"])
+def test_log_table(cluster, storage_policy):
+    if storage_policy == "s3_plain":
+        # Log table doesn't work on s3_plain. Rename operation is not implemented
+        return
+
+    node = cluster.instances["switching_node"]
+
+    create_table_statement = f"""
+        CREATE TABLE test_log_table (
+            id Int64,
+            val String
+        ) ENGINE=Log
+        SETTINGS
+            storage_policy='{storage_policy}'
+        """
+
+    node.query(create_table_statement)
+
+    node.query("INSERT INTO test_log_table VALUES (0, 'a')")
+    assert "1" == node.query("SELECT count() FROM test_log_table").strip()
+
+    switch_config_write_full_object_key(node, True)
+    node.query("INSERT INTO test_log_table VALUES (0, 'a')")
+    assert "2" == node.query("SELECT count() FROM test_log_table").strip()
+
+    switch_config_write_full_object_key(node, False)
+    node.query("INSERT INTO test_log_table VALUES (1, 'b')")
+    assert "3" == node.query("SELECT count() FROM test_log_table").strip()
+
+    switch_config_write_full_object_key(node, True)
+    node.query("INSERT INTO test_log_table VALUES (2, 'c')")
+    assert "4" == node.query("SELECT count() FROM test_log_table").strip()
+
+    node.query("DROP TABLE test_log_table SYNC")
diff --git a/tests/integration/test_replicated_zero_copy_projection_mutation/test.py b/tests/integration/test_replicated_zero_copy_projection_mutation/test.py
index 1b68aac08a7..4839919e23d 100644
--- a/tests/integration/test_replicated_zero_copy_projection_mutation/test.py
+++ b/tests/integration/test_replicated_zero_copy_projection_mutation/test.py
@@ -174,20 +174,13 @@ def test_hardlinks_preserved_when_projection_dropped(
     ) ENGINE ReplicatedMergeTree('/clickhouse/tables/test_projection', '{instance}')
     ORDER BY a
+    SETTINGS cleanup_delay_period=1, max_cleanup_delay_period=3
     """
-    first_node_settings = """
-    SETTINGS
-        storage_policy='s3',
-        old_parts_lifetime=0
-    """
+    first_node_settings = ", storage_policy='s3', old_parts_lifetime=0"
     # big old_parts_lifetime value makes second node to hold outdated part for us, we make it as broken_on_start
-    second_node_settings = """
-    SETTINGS
-        storage_policy='s3',
-        old_parts_lifetime=10000
-    """
+    second_node_settings = ", storage_policy='s3', old_parts_lifetime=10000"
     first_cluster_node.query(create_query + first_node_settings)
     second_cluster_node.query(create_query + second_node_settings)
diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py
index d659126f49a..9f41cfd176d 100644
--- a/tests/integration/test_storage_s3_queue/test.py
+++ b/tests/integration/test_storage_s3_queue/test.py
@@ -164,7 +164,6 @@ def generate_random_files(
         values_csv = (
             "\n".join((",".join(map(str, row)) for row in rand_values)) + "\n"
         ).encode()
-        print(f"File {filename}, content: {rand_values}")
         put_s3_file_content(started_cluster, filename, values_csv)
     return total_values
@@ -626,7 +625,8 @@ def test_multiple_tables_meta_mismatch(started_cluster):
     )
-@pytest.mark.parametrize("mode", AVAILABLE_MODES)
+# TODO: Update the modes for this test to include "ordered" once PR #55795 is finished.
+@pytest.mark.parametrize("mode", ["unordered"]) def test_multiple_tables_streaming_sync(started_cluster, mode): node = started_cluster.instances["instance"] table_name = f"multiple_tables_streaming_sync_{mode}" @@ -672,7 +672,7 @@ def test_multiple_tables_streaming_sync(started_cluster, mode): + get_count(f"{dst_table_name}_3") ) != files_to_generate: info = node.query( - f"SELECT * FROM system.s3queue_log WHERE zookeeper_path like '%{table_name}' ORDER BY file_name FORMAT Vertical" + f"SELECT * FROM system.s3queue WHERE zookeeper_path like '%{table_name}' ORDER BY file_name FORMAT Vertical" ) logging.debug(info) assert False @@ -751,7 +751,7 @@ def test_multiple_tables_streaming_sync_distributed(started_cluster, mode): get_count(node, dst_table_name) + get_count(node_2, dst_table_name) ) != files_to_generate: info = node.query( - f"SELECT * FROM system.s3queue_log WHERE zookeeper_path like '%{table_name}' ORDER BY file_name FORMAT Vertical" + f"SELECT * FROM system.s3queue WHERE zookeeper_path like '%{table_name}' ORDER BY file_name FORMAT Vertical" ) logging.debug(info) assert False @@ -888,3 +888,33 @@ def test_max_set_size(started_cluster): time.sleep(10) res1 = [list(map(int, l.split())) for l in run_query(node, get_query).splitlines()] assert res1 == [total_values[1]] + + +def test_drop_table(started_cluster): + node = started_cluster.instances["instance"] + table_name = f"test_drop" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 300 + + create_table( + started_cluster, + node, + table_name, + "unordered", + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": 5, + }, + ) + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, start_ind=0, row_num=100000 + ) + create_mv(node, table_name, dst_table_name) + node.wait_for_log_line(f"Reading from file: test_drop_data") + node.query(f"DROP TABLE {table_name} SYNC") + assert node.contains_in_log( + f"StorageS3Queue ({table_name}): Table is being dropped" + ) diff --git a/tests/queries/0_stateless/00408_http_keep_alive.reference b/tests/queries/0_stateless/00408_http_keep_alive.reference index 5f9cc1079a7..17a7fd690a8 100644 --- a/tests/queries/0_stateless/00408_http_keep_alive.reference +++ b/tests/queries/0_stateless/00408_http_keep_alive.reference @@ -1,6 +1,6 @@ < Connection: Keep-Alive -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < Connection: Keep-Alive -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 < Connection: Keep-Alive -< Keep-Alive: timeout=3 +< Keep-Alive: timeout=10 diff --git a/tests/queries/0_stateless/00501_http_head.reference b/tests/queries/0_stateless/00501_http_head.reference index 9727c63b2d8..8351327b356 100644 --- a/tests/queries/0_stateless/00501_http_head.reference +++ b/tests/queries/0_stateless/00501_http_head.reference @@ -2,11 +2,11 @@ HTTP/1.1 200 OK Connection: Keep-Alive Content-Type: text/tab-separated-values; charset=UTF-8 Transfer-Encoding: chunked -Keep-Alive: timeout=3 +Keep-Alive: timeout=10 HTTP/1.1 200 OK Connection: Keep-Alive Content-Type: text/tab-separated-values; charset=UTF-8 Transfer-Encoding: chunked -Keep-Alive: timeout=3 +Keep-Alive: timeout=10 diff --git a/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.sql b/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.sql index 8e59e5af254..532539206f7 100644 --- 
a/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.sql +++ b/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.sql @@ -1,4 +1,4 @@ -SET optimize_read_in_order = 1, query_plan_read_in_order=1; +SET optimize_read_in_order = 1, query_plan_read_in_order = 1, allow_experimental_analyzer = 0; drop table if exists tab; drop table if exists tab2; diff --git a/tests/queries/0_stateless/01161_all_system_tables.sh b/tests/queries/0_stateless/01161_all_system_tables.sh index 3ba59f9a424..739df782a39 100755 --- a/tests/queries/0_stateless/01161_all_system_tables.sh +++ b/tests/queries/0_stateless/01161_all_system_tables.sh @@ -23,7 +23,7 @@ function run_selects() for t in "${tables_arr[@]}" do - ${CLICKHOUSE_CLIENT} -q "SELECT * FROM $t LIMIT $LIMIT FORMAT Null" # Suppress style check: database=$CLICKHOUSE_DATABASEs + ${CLICKHOUSE_CLIENT} -q "SELECT * FROM $t LIMIT $LIMIT SETTINGS allow_introspection_functions = 1 FORMAT Null" # Suppress style check: database=$CLICKHOUSE_DATABASEs done } diff --git a/tests/queries/0_stateless/01161_information_schema.reference b/tests/queries/0_stateless/01161_information_schema.reference index bc09eeac091..40e98ca8342 100644 --- a/tests/queries/0_stateless/01161_information_schema.reference +++ b/tests/queries/0_stateless/01161_information_schema.reference @@ -2,24 +2,28 @@ COLUMNS KEY_COLUMN_USAGE REFERENTIAL_CONSTRAINTS SCHEMATA +STATISTICS TABLES VIEWS columns key_column_usage referential_constraints schemata +statistics tables views COLUMNS KEY_COLUMN_USAGE REFERENTIAL_CONSTRAINTS SCHEMATA +STATISTICS TABLES VIEWS columns key_column_usage referential_constraints schemata +statistics tables views -- information_schema.schemata @@ -53,5 +57,6 @@ def default PRIMARY def default kcu1 i 1 \N \N \N \N def default PRIMARY def def def default PRIMARY def default kcu2 d 1 \N \N \N \N def default PRIMARY def default kcu2 d 1 \N \N \N \N def default PRIMARY def default kcu2 u 1 \N \N \N \N def default PRIMARY def default kcu2 u 1 \N \N \N \N -- information_schema.referential_constraints +-- information_schema.statistics 1 1 diff --git a/tests/queries/0_stateless/01161_information_schema.sql b/tests/queries/0_stateless/01161_information_schema.sql index c6a5ff4bd33..d35767c018c 100644 --- a/tests/queries/0_stateless/01161_information_schema.sql +++ b/tests/queries/0_stateless/01161_information_schema.sql @@ -39,6 +39,9 @@ SELECT * FROM information_schema.key_column_usage WHERE table_schema = currentDa SELECT '-- information_schema.referential_constraints'; SELECT * FROM information_schema.referential_constraints; + +SELECT '-- information_schema.statistics'; +SELECT * FROM information_schema.statistics; -- -- mixed upper/lowercase schema and table name: SELECT count() FROM information_schema.TABLES WHERE table_schema = currentDatabase() AND table_name = 't'; diff --git a/tests/queries/0_stateless/01232_untuple.reference b/tests/queries/0_stateless/01232_untuple.reference index 7bb80bf618f..0358cde1354 100644 --- a/tests/queries/0_stateless/01232_untuple.reference +++ b/tests/queries/0_stateless/01232_untuple.reference @@ -2,7 +2,7 @@ hello 1 3 world 9 9 (0,1) -key tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), \'1\') tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), \'2\') tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), \'3\') tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), \'4\') tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), \'5\') +key tupleElement(argMax((v1, v2, v3, v4, v5), v1), \'1\') tupleElement(argMax((v1, v2, v3, 
v4, v5), v1), \'2\') tupleElement(argMax((v1, v2, v3, v4, v5), v1), \'3\') tupleElement(argMax((v1, v2, v3, v4, v5), v1), \'4\') tupleElement(argMax((v1, v2, v3, v4, v5), v1), \'5\') 1 20 20 10 20 30 2 11 20 10 20 30 3 70 20 10 20 30 diff --git a/tests/queries/0_stateless/01277_toUnixTimestamp64.sql b/tests/queries/0_stateless/01277_toUnixTimestamp64.sql index 42de53beb66..14ee57da5df 100644 --- a/tests/queries/0_stateless/01277_toUnixTimestamp64.sql +++ b/tests/queries/0_stateless/01277_toUnixTimestamp64.sql @@ -1,15 +1,15 @@ -- Error cases -SELECT toUnixTimestamp64Milli(); -- {serverError 42} -SELECT toUnixTimestamp64Micro(); -- {serverError 42} -SELECT toUnixTimestamp64Nano(); -- {serverError 42} +SELECT toUnixTimestamp64Milli(); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT toUnixTimestamp64Micro(); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT toUnixTimestamp64Nano(); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} -SELECT toUnixTimestamp64Milli('abc'); -- {serverError 43} -SELECT toUnixTimestamp64Micro('abc'); -- {serverError 43} -SELECT toUnixTimestamp64Nano('abc'); -- {serverError 43} +SELECT toUnixTimestamp64Milli('abc'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT toUnixTimestamp64Micro('abc'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT toUnixTimestamp64Nano('abc'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} -SELECT toUnixTimestamp64Milli('abc', 123); -- {serverError 42} -SELECT toUnixTimestamp64Micro('abc', 123); -- {serverError 42} -SELECT toUnixTimestamp64Nano('abc', 123); -- {serverError 42} +SELECT toUnixTimestamp64Milli('abc', 123); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT toUnixTimestamp64Micro('abc', 123); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT toUnixTimestamp64Nano('abc', 123); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} SELECT 'const column'; WITH toDateTime64('2019-09-16 19:20:12.345678910', 3, 'Asia/Istanbul') AS dt64 diff --git a/tests/queries/0_stateless/01555_system_distribution_queue_mask.sql b/tests/queries/0_stateless/01555_system_distribution_queue_mask.sql index 472e042a18b..3a90765226a 100644 --- a/tests/queries/0_stateless/01555_system_distribution_queue_mask.sql +++ b/tests/queries/0_stateless/01555_system_distribution_queue_mask.sql @@ -18,7 +18,7 @@ system stop distributed sends dist_01555; insert into dist_01555 values (1)(2); -- since test_cluster_with_incorrect_pw contains incorrect password ignore error system flush distributed dist_01555; -- { serverError 516 } -select length(splitByChar('*', data_path)), replaceRegexpOne(data_path, '^.*/([^/]*)/' , '\\1'), extract(last_exception, 'AUTHENTICATION_FAILED'), dateDiff('s', last_exception_time, now()) < 5 from system.distribution_queue where database = currentDatabase() and table = 'dist_01555' format CSV; +select length(splitByChar('*', data_path)), replaceRegexpOne(data_path, '^.*/([^/]*)/' , '\\1'), extract(last_exception, 'AUTHENTICATION_FAILED'), dateDiff('s', last_exception_time, now()) < 3600 from system.distribution_queue where database = currentDatabase() and table = 'dist_01555' format CSV; drop table dist_01555; @@ -31,7 +31,7 @@ create table dist_01555 (key Int) Engine=Distributed(test_cluster_with_incorrect insert into dist_01555 values (1)(2); -- since test_cluster_with_incorrect_pw contains incorrect password ignore error system flush distributed dist_01555; -- { serverError 516 } -select length(splitByChar('*', data_path)), replaceRegexpOne(data_path, '^.*/([^/]*)/' , '\\1'), extract(last_exception, 
'AUTHENTICATION_FAILED'), dateDiff('s', last_exception_time, now()) < 5 from system.distribution_queue where database = currentDatabase() and table = 'dist_01555' format CSV; +select length(splitByChar('*', data_path)), replaceRegexpOne(data_path, '^.*/([^/]*)/' , '\\1'), extract(last_exception, 'AUTHENTICATION_FAILED'), dateDiff('s', last_exception_time, now()) < 3600 from system.distribution_queue where database = currentDatabase() and table = 'dist_01555' format CSV; drop table dist_01555; diff --git a/tests/queries/0_stateless/01600_parts_types_metrics_long.reference b/tests/queries/0_stateless/01600_parts_types_metrics_long.reference deleted file mode 100644 index e8183f05f5d..00000000000 --- a/tests/queries/0_stateless/01600_parts_types_metrics_long.reference +++ /dev/null @@ -1,3 +0,0 @@ -1 -1 -1 diff --git a/tests/queries/0_stateless/01600_parts_types_metrics_long.sh b/tests/queries/0_stateless/01600_parts_types_metrics_long.sh deleted file mode 100755 index 3ffac772aa7..00000000000 --- a/tests/queries/0_stateless/01600_parts_types_metrics_long.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash -# Tags: no-s3-storage, no-asan - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CURDIR"/../shell_config.sh - -set -e -set -o pipefail - -# NOTE: database = $CLICKHOUSE_DATABASE is unwanted -verify_sql="SELECT - (SELECT sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) = - (SELECT countIf(part_type = 'Compact'), countIf(part_type = 'Wide') - FROM (SELECT part_type FROM system.parts UNION ALL SELECT part_type FROM system.projection_parts))" - -# The query is not atomic - it can compare states between system.parts and system.metrics from different points in time. -# So, there is inherent race condition (especially in fasttest that runs tests in parallel). -# -# But it should get the expected result eventually. -# In case of test failure, this code will do infinite loop and timeout. 
-verify() -{ - while true; do - result=$( $CLICKHOUSE_CLIENT -m --query="$verify_sql" ) - if [ "$result" = "1" ]; then - echo 1 - return - fi - sleep 0.1 - done -} - -$CLICKHOUSE_CLIENT --database_atomic_wait_for_drop_and_detach_synchronously=1 --query="DROP TABLE IF EXISTS data_01600" -# Compact - (5..10] -# Wide - >10 -$CLICKHOUSE_CLIENT --query="CREATE TABLE data_01600 (part_type String, key Int) ENGINE = MergeTree PARTITION BY part_type ORDER BY key SETTINGS min_bytes_for_wide_part=0, min_rows_for_wide_part=10, index_granularity = 8192, index_granularity_bytes = '10Mi'" - -# Compact -$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Compact', number FROM system.numbers LIMIT 6" -verify - -# Wide -$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Wide', number FROM system.numbers LIMIT 11 OFFSET 6" -verify - -# DROP and check -$CLICKHOUSE_CLIENT --database_atomic_wait_for_drop_and_detach_synchronously=1 --query="DROP TABLE data_01600" -verify diff --git a/tests/queries/0_stateless/01660_system_parts_smoke.reference b/tests/queries/0_stateless/01660_system_parts_smoke.reference index b38d699c2b9..3c134f02d0b 100644 --- a/tests/queries/0_stateless/01660_system_parts_smoke.reference +++ b/tests/queries/0_stateless/01660_system_parts_smoke.reference @@ -9,6 +9,5 @@ all_2_2_0 1 1 Active 2 Outdated # truncate -HAVE PARTS Active HAVE PARTS Outdated # drop diff --git a/tests/queries/0_stateless/01710_projection_analysis_reuse_partition.reference b/tests/queries/0_stateless/01710_projection_analysis_reuse_partition.reference new file mode 100644 index 00000000000..47b07da250f --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_analysis_reuse_partition.reference @@ -0,0 +1 @@ +Selected 2/2 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges diff --git a/tests/queries/0_stateless/01710_projection_analysis_reuse_partition.sh b/tests/queries/0_stateless/01710_projection_analysis_reuse_partition.sh new file mode 100755 index 00000000000..ba8b3818ba3 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_analysis_reuse_partition.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "drop table if exists t" +${CLICKHOUSE_CLIENT} -q "create table t(s LowCardinality(String), e DateTime64(3), projection p1 (select * order by s, e)) engine MergeTree partition by toYYYYMM(e) order by tuple() settings index_granularity = 8192, index_granularity_bytes = '100M'" +${CLICKHOUSE_CLIENT} -q "insert into t select 'AAP', toDateTime('2023-07-01') + 360 * number from numbers(50000)" +${CLICKHOUSE_CLIENT} -q "insert into t select 'AAPL', toDateTime('2023-07-01') + 360 * number from numbers(50000)" + +CLICKHOUSE_CLIENT_DEBUG_LOG=$(echo ${CLICKHOUSE_CLIENT} | sed 's/'"--send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL}"'/--send_logs_level=debug/g') + +${CLICKHOUSE_CLIENT_DEBUG_LOG} -q "select count() from t where e >= '2023-11-08 00:00:00.000' and e < '2023-11-09 00:00:00.000' and s in ('AAPL') format Null" 2>&1 | grep -oh "Selected .* parts by partition key, *. 
parts by primary key, .* marks by primary key, .* marks to read from .* ranges.*$" + +${CLICKHOUSE_CLIENT} -q "drop table t" diff --git a/tests/queries/0_stateless/01710_projection_with_alter_conversions.reference b/tests/queries/0_stateless/01710_projection_with_alter_conversions.reference new file mode 100644 index 00000000000..9874d6464ab --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_with_alter_conversions.reference @@ -0,0 +1 @@ +1 2 diff --git a/tests/queries/0_stateless/01710_projection_with_alter_conversions.sql b/tests/queries/0_stateless/01710_projection_with_alter_conversions.sql new file mode 100644 index 00000000000..649a07b9b5f --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_with_alter_conversions.sql @@ -0,0 +1,15 @@ +drop table if exists t; + +create table t (i int, j int, projection p (select i order by i)) engine MergeTree order by tuple(); + +insert into t values (1, 2); + +system stop merges t; + +set alter_sync = 0; + +alter table t rename column j to k; + +select * from t; + +drop table t; diff --git a/tests/queries/0_stateless/01710_projections.sql b/tests/queries/0_stateless/01710_projections.sql index a96339e30fa..7c45792847e 100644 --- a/tests/queries/0_stateless/01710_projections.sql +++ b/tests/queries/0_stateless/01710_projections.sql @@ -1,6 +1,6 @@ drop table if exists projection_test; -create table projection_test (`sum(block_count)` UInt64, domain_alias UInt64 alias length(domain), datetime DateTime, domain LowCardinality(String), x_id String, y_id String, block_count Int64, retry_count Int64, duration Int64, kbytes Int64, buffer_time Int64, first_time Int64, total_bytes Nullable(UInt64), valid_bytes Nullable(UInt64), completed_bytes Nullable(UInt64), fixed_bytes Nullable(UInt64), force_bytes Nullable(UInt64), projection p (select toStartOfMinute(datetime) dt_m, countIf(first_time = 0) / count(), avg((kbytes * 8) / duration), count(), sum(block_count) / sum(duration), avg(block_count / duration), sum(buffer_time) / sum(duration), avg(buffer_time / duration), sum(valid_bytes) / sum(total_bytes), sum(completed_bytes) / sum(total_bytes), sum(fixed_bytes) / sum(total_bytes), sum(force_bytes) / sum(total_bytes), sum(valid_bytes) / sum(total_bytes), sum(retry_count) / sum(duration), avg(retry_count / duration), countIf(block_count > 0) / count(), countIf(first_time = 0) / count(), uniqHLL12(x_id), uniqHLL12(y_id) group by dt_m, domain)) engine MergeTree partition by toDate(datetime) order by (toStartOfTenMinutes(datetime), domain) settings index_granularity_bytes = 10000000; +create table projection_test (`sum(block_count)` UInt64, domain_alias UInt64 alias length(domain), datetime DateTime, domain LowCardinality(String), x_id String, y_id String, block_count Int64, retry_count Int64, duration Int64, kbytes Int64, buffer_time Int64, first_time Int64, total_bytes Nullable(UInt64), valid_bytes Nullable(UInt64), completed_bytes Nullable(UInt64), fixed_bytes Nullable(UInt64), force_bytes Nullable(UInt64), projection p (select toStartOfMinute(datetime) dt_m, countIf(first_time = 0) / count(), avg((kbytes * 8) / duration), count(), sum(block_count) / sum(duration), avg(block_count / duration), sum(buffer_time) / sum(duration), avg(buffer_time / duration), sum(valid_bytes) / sum(total_bytes), sum(completed_bytes) / sum(total_bytes), sum(fixed_bytes) / sum(total_bytes), sum(force_bytes) / sum(total_bytes), sum(valid_bytes) / sum(total_bytes), sum(retry_count) / sum(duration), avg(retry_count / duration), countIf(block_count > 0) / count(), 
countIf(first_time = 0) / count(), uniqHLL12(x_id), uniqHLL12(y_id) group by dt_m, domain)) engine MergeTree partition by toDate(datetime) order by toStartOfTenMinutes(datetime) settings index_granularity_bytes = 10000000; insert into projection_test with rowNumberInAllBlocks() as id select 1, toDateTime('2020-10-24 00:00:00') + (id / 20), toString(id % 100), * from generateRandom('x_id String, y_id String, block_count Int64, retry_count Int64, duration Int64, kbytes Int64, buffer_time Int64, first_time Int64, total_bytes Nullable(UInt64), valid_bytes Nullable(UInt64), completed_bytes Nullable(UInt64), fixed_bytes Nullable(UInt64), force_bytes Nullable(UInt64)', 10, 10, 1) limit 1000 settings max_threads = 1; diff --git a/tests/queries/0_stateless/01825_type_json_1.reference b/tests/queries/0_stateless/01825_type_json_1.reference index 3f0eaf3854a..857c624fb9b 100644 --- a/tests/queries/0_stateless/01825_type_json_1.reference +++ b/tests/queries/0_stateless/01825_type_json_1.reference @@ -6,26 +6,22 @@ all_2_2_0 data Tuple(k5 String) all_1_2_1 data Tuple(k1 String, k2 Tuple(k3 String, k4 String), k5 String) ============ 1 ['aaa','ddd'] [['bbb','ccc'],['eee','fff']] -all_1_2_2 data Tuple(_dummy UInt8) all_3_3_0 data Tuple(k1 Nested(k2 String, k3 Nested(k4 String))) ============ 1 a 42 2 b 4200 4242 -all_1_2_3 data Tuple(_dummy UInt8) all_4_4_0 data Tuple(name String, value Int16) 1 a 42 2 b 4200 3 a 42.123 -all_1_2_3 data Tuple(_dummy UInt8) all_4_4_0 data Tuple(name String, value Int16) all_5_5_0 data Tuple(name String, value Float64) 1 a 42 2 b 4200 3 a 42.123 4 a some -all_1_2_3 data Tuple(_dummy UInt8) all_4_4_0 data Tuple(name String, value Int16) all_5_5_0 data Tuple(name String, value Float64) all_6_6_0 data Tuple(name String, value String) -all_1_6_4 data Tuple(name String, value String) +all_4_6_1 data Tuple(name String, value String) diff --git a/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 b/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 index 87c76c042a6..d787ba9b163 100644 --- a/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 +++ b/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 @@ -1,4 +1,5 @@ --- Tags: no-fasttest +-- Tags: no-fasttest, disabled +-- Disabled, because ClickHouse server may crash. https://github.com/ClickHouse/ClickHouse/pull/56307 {% for storage in ["MergeTree", "ReplicatedMergeTree('/clickhouse/tables/{database}/test_01825_add_column/', 'r1')"] -%} diff --git a/tests/queries/0_stateless/01945_system_warnings.reference b/tests/queries/0_stateless/01945_system_warnings.reference index cfec2f63816..f77cdd275c9 100644 --- a/tests/queries/0_stateless/01945_system_warnings.reference +++ b/tests/queries/0_stateless/01945_system_warnings.reference @@ -1,5 +1,5 @@ Server was built in debug mode. It will work slowly. 0 -Obsolete setting [\'multiple_joins_rewriter_version\'] is changed. Please check \'select * from system.settings where changed and is_obsolete\' and read the changelog. +Obsolete setting [\'multiple_joins_rewriter_version\'] is changed. 
Please check \'SELECT * FROM system.settings WHERE changed AND is_obsolete\' and read the changelog at https://github.com/ClickHouse/ClickHouse/blob/master/CHANGELOG.md 1 1 diff --git a/tests/queries/0_stateless/02006_test_positional_arguments_on_cluster.reference b/tests/queries/0_stateless/02006_test_positional_arguments_on_cluster.reference new file mode 100644 index 00000000000..05dd41748d1 --- /dev/null +++ b/tests/queries/0_stateless/02006_test_positional_arguments_on_cluster.reference @@ -0,0 +1,2 @@ +d Date +f UInt64 diff --git a/tests/queries/0_stateless/02006_test_positional_arguments_on_cluster.sql b/tests/queries/0_stateless/02006_test_positional_arguments_on_cluster.sql new file mode 100644 index 00000000000..1d8a4e4f8d7 --- /dev/null +++ b/tests/queries/0_stateless/02006_test_positional_arguments_on_cluster.sql @@ -0,0 +1,19 @@ +-- Tags: no-ordinary-database, no-replicated-database, distributed, zookeeper + +DROP TABLE IF EXISTS t02006 on cluster test_shard_localhost format Null; +DROP TABLE IF EXISTS m02006 on cluster test_shard_localhost format Null; + +CREATE TABLE t02006 on cluster test_shard_localhost (d Date) +ENGINE = MergeTree ORDER BY d +format Null; + +CREATE MATERIALIZED VIEW m02006 on cluster test_shard_localhost +Engine = MergeTree ORDER BY tuple() AS SELECT d, 0 AS i FROM t02006 GROUP BY d, i +format Null; + +ALTER TABLE t02006 on cluster test_shard_localhost ADD COLUMN IF NOT EXISTS f UInt64 format Null; + +DESC t02006; + +DROP TABLE IF EXISTS t02006 on cluster test_shard_localhost format Null; +DROP TABLE IF EXISTS m02006 on cluster test_shard_localhost format Null; diff --git a/tests/queries/0_stateless/02113_hdfs_assert.sh b/tests/queries/0_stateless/02113_hdfs_assert.sh index e4f97b1fbfd..069092c03e0 100755 --- a/tests/queries/0_stateless/02113_hdfs_assert.sh +++ b/tests/queries/0_stateless/02113_hdfs_assert.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) TCP_PORT=$($CLICKHOUSE_CLIENT -q "SELECT tcpPort()") -$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://localhost:$TCP_PORT/data.csv', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "NETWORK_ERROR" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://localhost:$TCP_PORT/data.csv', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "HDFS_ERROR" && echo 'OK' || echo 'FAIL'; diff --git a/tests/queries/0_stateless/02114_hdfs_bad_url.sh b/tests/queries/0_stateless/02114_hdfs_bad_url.sh index be48a987f45..22975dddf6f 100755 --- a/tests/queries/0_stateless/02114_hdfs_bad_url.sh +++ b/tests/queries/0_stateless/02114_hdfs_bad_url.sh @@ -20,7 +20,7 @@ $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('://abcd:9000/data', 'CSV', 'x UInt32' $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('abcd/', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://abcd', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs1:9000/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "NETWORK_ERROR" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "HDFS_ERROR" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('http://hdfs1:9000/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; 
-$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1@nameservice/abcd/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "NETWORK_ERROR" && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1@nameservice/abcd/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "HDFS_ERROR" && echo 'OK' || echo 'FAIL'; diff --git a/tests/queries/0_stateless/02338_analyzer_constants_basic.reference b/tests/queries/0_stateless/02338_analyzer_constants_basic.reference index f3a69e4d835..32f8a5eb124 100644 --- a/tests/queries/0_stateless/02338_analyzer_constants_basic.reference +++ b/tests/queries/0_stateless/02338_analyzer_constants_basic.reference @@ -25,11 +25,11 @@ NULL Nullable(Nothing) (1, 1) Tuple(UInt8, UInt8) (1,1) -- -array((1, 1)) Array(Tuple(UInt8, UInt8)) +[(1, 1)] Array(Tuple(UInt8, UInt8)) [(1,1)] NULL Nullable(Nothing) 1 UInt8 \'test\' String [1, 2, 3] Array(UInt8) -array((1, 1), (1, 1)) Array(Tuple(UInt8, UInt8)) +[(1, 1), (1, 1)] Array(Tuple(UInt8, UInt8)) \N 1 test [1,2,3] [(1,1),(1,1)] diff --git a/tests/queries/0_stateless/02341_global_join_cte.reference b/tests/queries/0_stateless/02341_global_join_cte.reference index 8b3cd68232a..f2cfe994ffa 100644 --- a/tests/queries/0_stateless/02341_global_join_cte.reference +++ b/tests/queries/0_stateless/02341_global_join_cte.reference @@ -1,5 +1,10 @@ -- { echo } -with rhs as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs using (d1) order by rhs.d2; -- { serverError ALIAS_REQUIRED } +with rhs as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs using (d1) order by rhs.d2 settings allow_experimental_analyzer=0; -- { serverError ALIAS_REQUIRED } +with rhs as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs using (d1) order by rhs.d2 settings allow_experimental_analyzer=1; -- It works with analyzer; rhs is an alias itself. 
+0 +0 +0 +0 with rhs as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs using (d1) order by rhs.d2 settings joined_subquery_requires_alias=0; 0 0 diff --git a/tests/queries/0_stateless/02341_global_join_cte.sql b/tests/queries/0_stateless/02341_global_join_cte.sql index b77e5b0b688..b9b906afd70 100644 --- a/tests/queries/0_stateless/02341_global_join_cte.sql +++ b/tests/queries/0_stateless/02341_global_join_cte.sql @@ -1,4 +1,5 @@ -- { echo } -with rhs as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs using (d1) order by rhs.d2; -- { serverError ALIAS_REQUIRED } +with rhs as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs using (d1) order by rhs.d2 settings allow_experimental_analyzer=0; -- { serverError ALIAS_REQUIRED } +with rhs as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs using (d1) order by rhs.d2 settings allow_experimental_analyzer=1; -- It works with analyzer; rhs is an alias itself. with rhs as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs using (d1) order by rhs.d2 settings joined_subquery_requires_alias=0; with rhs_ as (select * from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one))) select lhs.d2 from remote('127.{1,2}', view(select dummy d1, dummy d2 from system.one)) lhs global join rhs_ rhs using (d1) order by rhs.d2 settings joined_subquery_requires_alias=0; diff --git a/tests/queries/0_stateless/02378_analyzer_projection_names.reference b/tests/queries/0_stateless/02378_analyzer_projection_names.reference index a82d4d4c5d2..f8b18e6df15 100644 --- a/tests/queries/0_stateless/02378_analyzer_projection_names.reference +++ b/tests/queries/0_stateless/02378_analyzer_projection_names.reference @@ -13,7 +13,7 @@ concat(\'Value_1\', \'Value_2\') String SELECT '--'; -- DESCRIBE (SELECT cast(tuple(1, 'Value'), 'Tuple (id UInt64, value String)')); -CAST(tuple(1, \'Value\'), \'Tuple (id UInt64, value String)\') Tuple(id UInt64, value String) +CAST((1, \'Value\'), \'Tuple (id UInt64, value String)\') Tuple(id UInt64, value String) SELECT 'Columns'; Columns DESCRIBE (SELECT test_table.id, test_table.id, id FROM test_table); @@ -737,3 +737,18 @@ t2.id UInt64 t2.value String t3.id UInt64 t3.value String +SELECT 'Special functions array, tuple'; +Special functions array, tuple +DESCRIBE (SELECT [], array(), [1], array(1), [1, 2], array(1, 2), tuple(1), (1, 2), [[], []], [([], [])], ([], []), ([([], []), ([], [])])); +[] Array(Nothing) +[] Array(Nothing) +[1] Array(UInt8) +[1] Array(UInt8) +[1, 2] Array(UInt8) +[1, 2] Array(UInt8) +(1) Tuple(UInt8) +(1, 2) Tuple(UInt8, UInt8) +[[], []] Array(Array(Nothing)) +[([], [])] Array(Tuple(Array(Nothing), Array(Nothing))) +([], []) Tuple(Array(Nothing), Array(Nothing)) +[([], []), ([], [])] Array(Tuple(Array(Nothing), Array(Nothing))) diff --git a/tests/queries/0_stateless/02378_analyzer_projection_names.sql 
b/tests/queries/0_stateless/02378_analyzer_projection_names.sql index c69a1c1ad26..f5ac5f7476f 100644 --- a/tests/queries/0_stateless/02378_analyzer_projection_names.sql +++ b/tests/queries/0_stateless/02378_analyzer_projection_names.sql @@ -533,6 +533,9 @@ SELECT '--'; DESCRIBE (SELECT id, value, t1.id, t1.value, t2.id, t2.value, t3.id, t3.value FROM test_table_join_1 AS t1 INNER JOIN test_table_join_2 AS t2 USING (id, value) INNER JOIN test_table_join_3 AS t3 USING (id, value)); +SELECT 'Special functions array, tuple'; +DESCRIBE (SELECT [], array(), [1], array(1), [1, 2], array(1, 2), tuple(1), (1, 2), [[], []], [([], [])], ([], []), ([([], []), ([], [])])); + -- { echoOff } DROP TABLE test_table_join_1; diff --git a/tests/queries/0_stateless/02483_cuturlparameter_with_arrays.reference b/tests/queries/0_stateless/02483_cuturlparameter_with_arrays.reference index 348408a15cc..89c4ab58b21 100644 --- a/tests/queries/0_stateless/02483_cuturlparameter_with_arrays.reference +++ b/tests/queries/0_stateless/02483_cuturlparameter_with_arrays.reference @@ -31,7 +31,7 @@ SELECT FORMAT Vertical; Row 1: ────── -cutURLParameter('http://bigmir.net/?a=b&c=d', array()): http://bigmir.net/?a=b&c=d +cutURLParameter('http://bigmir.net/?a=b&c=d', []): http://bigmir.net/?a=b&c=d cutURLParameter('http://bigmir.net/?a=b&c=d', ['a']): http://bigmir.net/?c=d cutURLParameter('http://bigmir.net/?a=b&c=d', ['a', 'c']): http://bigmir.net/? cutURLParameter('http://bigmir.net/?a=b&c=d', ['c']): http://bigmir.net/?a=b @@ -44,7 +44,7 @@ cutURLParameter('http://bigmir.net/?a=b&c=d#e&g=h', ['c', 'g']): http: cutURLParameter('http://bigmir.net/?a=b&c=d#e&g=h', ['e', 'g']): http://bigmir.net/?a=b&c=d#e cutURLParameter('http://bigmir.net/?a=b&c=d#test?e=f&g=h', ['test', 'e']): http://bigmir.net/?a=b&c=d#test?g=h cutURLParameter('http://bigmir.net/?a=b&c=d#test?e=f&g=h', ['test', 'g']): http://bigmir.net/?a=b&c=d#test?e=f -cutURLParameter('//bigmir.net/?a=b&c=d', array()): //bigmir.net/?a=b&c=d +cutURLParameter('//bigmir.net/?a=b&c=d', []): //bigmir.net/?a=b&c=d cutURLParameter('//bigmir.net/?a=b&c=d', ['a']): //bigmir.net/?c=d cutURLParameter('//bigmir.net/?a=b&c=d', ['a', 'c']): //bigmir.net/? cutURLParameter('//bigmir.net/?a=b&c=d#e=f', ['a', 'e']): //bigmir.net/?c=d# @@ -89,7 +89,7 @@ SELECT FORMAT Vertical; Row 1: ────── -cutURLParameter(materialize('http://bigmir.net/?a=b&c=d'), array()): http://bigmir.net/?a=b&c=d +cutURLParameter(materialize('http://bigmir.net/?a=b&c=d'), []): http://bigmir.net/?a=b&c=d cutURLParameter(materialize('http://bigmir.net/?a=b&c=d'), ['a']): http://bigmir.net/?c=d cutURLParameter(materialize('http://bigmir.net/?a=b&c=d'), ['a', 'c']): http://bigmir.net/? 
cutURLParameter(materialize('http://bigmir.net/?a=b&c=d'), ['c']): http://bigmir.net/?a=b @@ -102,7 +102,7 @@ cutURLParameter(materialize('http://bigmir.net/?a=b&c=d#e&g=h'), ['c', 'g']): cutURLParameter(materialize('http://bigmir.net/?a=b&c=d#e&g=h'), ['e', 'g']): http://bigmir.net/?a=b&c=d#e cutURLParameter(materialize('http://bigmir.net/?a=b&c=d#test?e=f&g=h'), ['test', 'e']): http://bigmir.net/?a=b&c=d#test?g=h cutURLParameter(materialize('http://bigmir.net/?a=b&c=d#test?e=f&g=h'), ['test', 'g']): http://bigmir.net/?a=b&c=d#test?e=f -cutURLParameter(materialize('//bigmir.net/?a=b&c=d'), array()): //bigmir.net/?a=b&c=d +cutURLParameter(materialize('//bigmir.net/?a=b&c=d'), []): //bigmir.net/?a=b&c=d cutURLParameter(materialize('//bigmir.net/?a=b&c=d'), ['a']): //bigmir.net/?c=d cutURLParameter(materialize('//bigmir.net/?a=b&c=d'), ['a', 'c']): //bigmir.net/? cutURLParameter(materialize('//bigmir.net/?a=b&c=d#e=f'), ['a', 'e']): //bigmir.net/?c=d# diff --git a/tests/queries/0_stateless/02486_truncate_and_unexpected_parts.reference b/tests/queries/0_stateless/02486_truncate_and_unexpected_parts.reference new file mode 100644 index 00000000000..1f991703c7b --- /dev/null +++ b/tests/queries/0_stateless/02486_truncate_and_unexpected_parts.reference @@ -0,0 +1,4 @@ +1 rmt +1 rmt1 +2 rmt +2 rmt1 diff --git a/tests/queries/0_stateless/02486_truncate_and_unexpected_parts.sql b/tests/queries/0_stateless/02486_truncate_and_unexpected_parts.sql new file mode 100644 index 00000000000..fbd90d8ab0f --- /dev/null +++ b/tests/queries/0_stateless/02486_truncate_and_unexpected_parts.sql @@ -0,0 +1,27 @@ + +create table rmt (n int) engine=ReplicatedMergeTree('/test/02468/{database}', '1') order by tuple() partition by n % 2 settings replicated_max_ratio_of_wrong_parts=0, max_suspicious_broken_parts=0, max_suspicious_broken_parts_bytes=0; +create table rmt1 (n int) engine=ReplicatedMergeTree('/test/02468/{database}', '2') order by tuple() partition by n % 2 settings replicated_max_ratio_of_wrong_parts=0, max_suspicious_broken_parts=0, max_suspicious_broken_parts_bytes=0; + +system stop cleanup rmt; +system stop merges rmt1; + +insert into rmt select * from numbers(10) settings max_block_size=1; + +alter table rmt drop partition id '0'; +truncate table rmt1; + +system sync replica rmt; +system sync replica rmt1; + +detach table rmt sync; +detach table rmt1 sync; + +attach table rmt; +attach table rmt1; + +insert into rmt values (1); +insert into rmt1 values (2); +system sync replica rmt; +system sync replica rmt1; + +select *, _table from merge(currentDatabase(), '') order by (*,), _table; diff --git a/tests/queries/0_stateless/02493_analyzer_uniq_injective_functions_elimination.reference b/tests/queries/0_stateless/02493_analyzer_uniq_injective_functions_elimination.reference index c42b9ce0cc4..5b808310f0e 100644 --- a/tests/queries/0_stateless/02493_analyzer_uniq_injective_functions_elimination.reference +++ b/tests/queries/0_stateless/02493_analyzer_uniq_injective_functions_elimination.reference @@ -1,6 +1,6 @@ QUERY id: 0 PROJECTION COLUMNS - uniqCombined(tuple(\'\')) UInt64 + uniqCombined((\'\')) UInt64 PROJECTION LIST id: 1, nodes: 1 FUNCTION id: 2, function_name: uniqCombined, function_type: aggregate, result_type: UInt64 diff --git a/tests/queries/0_stateless/02494_query_cache_bugs.reference b/tests/queries/0_stateless/02494_query_cache_bugs.reference new file mode 100644 index 00000000000..448e1366ea7 --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_bugs.reference @@ -0,0 +1,24 @@ +-- 
Bug 56258: Check literals (ASTLiteral) +Row 1: +────── +10: 10 +Row 1: +────── +x: 10 +2 +-- Bug 56258: Check functions (ASTFunction) +Row 1: +────── +toUInt64(42): 42 +Row 1: +────── +x: 42 +2 +-- Bug 56258: Check identifiers (ASTIdentifier) +Row 1: +────── +c: 1 +Row 1: +────── +x: 1 +2 diff --git a/tests/queries/0_stateless/02494_query_cache_bugs.sql b/tests/queries/0_stateless/02494_query_cache_bugs.sql new file mode 100644 index 00000000000..74496e0f77a --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_bugs.sql @@ -0,0 +1,39 @@ +-- Tags: no-parallel +-- Tag no-parallel: Messes with internal cache + +-- Test for Bug 56258 + +SYSTEM DROP QUERY CACHE; + +SELECT '-- Bug 56258: Check literals (ASTLiteral)'; + +SELECT 10 FORMAT Vertical SETTINGS use_query_cache = 1; +SELECT 10 AS x FORMAT Vertical SETTINGS use_query_cache = 1; + +SELECT count(*) FROM system.query_cache; + +SYSTEM DROP QUERY CACHE; + +SELECT '-- Bug 56258: Check functions (ASTFunction)'; + +SELECT toUInt64(42) FORMAT Vertical SETTINGS use_query_cache = 1; +SELECT toUInt64(42) AS x FORMAT Vertical SETTINGS use_query_cache = 1; + +SELECT count(*) FROM system.query_cache; + +SYSTEM DROP QUERY CACHE; + +SELECT '-- Bug 56258: Check identifiers (ASTIdentifier)'; + +DROP TABLE IF EXISTS tab; + +CREATE TABLE tab(c UInt64) ENGINE = Memory AS SELECT 1; + +SELECT c FROM tab FORMAT Vertical SETTINGS use_query_cache = 1; +SELECT c AS x FROM tab FORMAT Vertical SETTINGS use_query_cache = 1; + +SELECT count(*) FROM system.query_cache; + +DROP TABLE tab; + +SYSTEM DROP QUERY CACHE; diff --git a/tests/queries/0_stateless/02713_array_low_cardinality_string.reference b/tests/queries/0_stateless/02713_array_low_cardinality_string.reference index aea0fd62732..f444d1d7e58 100644 --- a/tests/queries/0_stateless/02713_array_low_cardinality_string.reference +++ b/tests/queries/0_stateless/02713_array_low_cardinality_string.reference @@ -1,9 +1,9 @@ --- tab idx bloom_filter --- -Expression ((Projection + Before ORDER BY)) - Filter (WHERE) - ReadFromMergeTree (default.tab) +Expression + Filter + ReadFromMergeTree Indexes: Skip Name: idx diff --git a/tests/queries/0_stateless/02713_array_low_cardinality_string.sql b/tests/queries/0_stateless/02713_array_low_cardinality_string.sql index 4ecd3bf17c1..964e82da963 100644 --- a/tests/queries/0_stateless/02713_array_low_cardinality_string.sql +++ b/tests/queries/0_stateless/02713_array_low_cardinality_string.sql @@ -18,6 +18,6 @@ WHERE database = currentDatabase() AND table = 'tab'; SELECT '---'; -EXPLAIN indexes = 1 SELECT * FROM tab WHERE has(foo, 'b'); +EXPLAIN indexes = 1, description = 0 SELECT * FROM tab WHERE has(foo, 'b'); DROP TABLE tab; diff --git a/tests/queries/0_stateless/02725_database_hdfs.reference b/tests/queries/0_stateless/02725_database_hdfs.reference index dfc5b63647d..f84adfb214f 100644 --- a/tests/queries/0_stateless/02725_database_hdfs.reference +++ b/tests/queries/0_stateless/02725_database_hdfs.reference @@ -1,8 +1,8 @@ Test 1: select from hdfs database 1 2 3 -test1 +test_hdfs_1 1 2 3 -test2 +test_hdfs_2 Test 2: check exceptions BAD_ARGUMENTS OK diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index f6089cfa18a..b4e081f6de0 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -9,55 +9,65 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # Prepare data ${CLICKHOUSE_CLIENT} -q "insert into table function 
hdfs('hdfs://localhost:12222/test_02725_1.tsv', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') select 1, 2, 3 settings hdfs_truncate_on_insert=1;" +ret=$? +if [ $ret -ne 0 ]; then + echo "Insert failed!" + exit 1 +fi ${CLICKHOUSE_CLIENT} -q "insert into table function hdfs('hdfs://localhost:12222/test_02725_2.tsv', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') select 4, 5, 6 settings hdfs_truncate_on_insert=1;" +ret=$? +if [ $ret -ne 0 ]; then + echo "Insert failed!" + exit 1 +fi ################# echo "Test 1: select from hdfs database" # Database without specific host ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ -DROP DATABASE IF EXISTS test1; -CREATE DATABASE test1 ENGINE = HDFS; -USE test1; +DROP DATABASE IF EXISTS test_hdfs_1; +CREATE DATABASE test_hdfs_1 ENGINE = HDFS; +USE test_hdfs_1; SELECT * FROM \"hdfs://localhost:12222/test_02725_1.tsv\" """ -${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test1 +${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test_hdfs_1 # Database with host ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ -DROP DATABASE IF EXISTS test2; -CREATE DATABASE test2 ENGINE = HDFS('hdfs://localhost:12222'); -USE test2; +DROP DATABASE IF EXISTS test_hdfs_2; +CREATE DATABASE test_hdfs_2 ENGINE = HDFS('hdfs://localhost:12222'); +USE test_hdfs_2; SELECT * FROM \"test_02725_1.tsv\" """ -${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test2 +${CLICKHOUSE_CLIENT} -q "SHOW DATABASES;" | grep test_hdfs_2 ################# echo "Test 2: check exceptions" ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ -DROP DATABASE IF EXISTS test3; -CREATE DATABASE test3 ENGINE = HDFS('abacaba'); +DROP DATABASE IF EXISTS test_hdfs_3; +CREATE DATABASE test_hdfs_3 ENGINE = HDFS('abacaba'); """ 2>&1 | tr '\n' ' ' | grep -oF "BAD_ARGUMENTS" ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ -DROP DATABASE IF EXISTS test4; -CREATE DATABASE test4 ENGINE = HDFS; -USE test4; +DROP DATABASE IF EXISTS test_hdfs_4; +CREATE DATABASE test_hdfs_4 ENGINE = HDFS; +USE test_hdfs_4; SELECT * FROM \"abacaba/file.tsv\" """ 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE" -${CLICKHOUSE_CLIENT} -q "SELECT * FROM test4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: +${CLICKHOUSE_CLIENT} -q "SELECT * FROM test_hdfs_4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" 
> /dev/null && echo "OK" || echo 'FAIL' ||: +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: # Cleanup ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ -DROP DATABASE IF EXISTS test1; -DROP DATABASE IF EXISTS test2; -DROP DATABASE IF EXISTS test3; -DROP DATABASE IF EXISTS test4; +DROP DATABASE IF EXISTS test_hdfs_1; +DROP DATABASE IF EXISTS test_hdfs_2; +DROP DATABASE IF EXISTS test_hdfs_3; +DROP DATABASE IF EXISTS test_hdfs_4; """ diff --git a/tests/queries/0_stateless/02725_keeper_fault_inject_sequential_cleanup.sql b/tests/queries/0_stateless/02725_keeper_fault_inject_sequential_cleanup.sql deleted file mode 100644 index e1db4ba2fa6..00000000000 --- a/tests/queries/0_stateless/02725_keeper_fault_inject_sequential_cleanup.sql +++ /dev/null @@ -1,10 +0,0 @@ -DROP TABLE IF EXISTS keeper_fault_inject_sequential_cleanup; - -CREATE TABLE keeper_fault_inject_sequential_cleanup (d Int8) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_02725/tables/keeper_fault_inject_sequential_cleanup', '1') ORDER BY d; - -INSERT INTO keeper_fault_inject_sequential_cleanup VALUES (1); -INSERT INTO keeper_fault_inject_sequential_cleanup SETTINGS insert_deduplicate = 0 VALUES (1); -INSERT INTO keeper_fault_inject_sequential_cleanup SETTINGS insert_deduplicate = 0, insert_keeper_fault_injection_probability = 0.4, insert_keeper_fault_injection_seed = 5619964844601345291 VALUES (1); - --- with database ordinary it produced a warning -DROP TABLE keeper_fault_inject_sequential_cleanup; diff --git a/tests/queries/0_stateless/02735_system_zookeeper_connection.sql b/tests/queries/0_stateless/02735_system_zookeeper_connection.sql index c98134e2f68..48ada633225 100644 --- a/tests/queries/0_stateless/02735_system_zookeeper_connection.sql +++ b/tests/queries/0_stateless/02735_system_zookeeper_connection.sql @@ -10,14 +10,15 @@ ORDER BY tuple(); SET session_timezone = 'UTC'; -select name, host, port, index, is_expired, keeper_api_version, (connected_time between yesterday() and now()), +-- NOTE: Durind the query execution, now() can be evaluated a bit earlier than connected_time +select name, host, port, index, is_expired, keeper_api_version, (connected_time between yesterday() and now() + interval 3 seconds), (abs(session_uptime_elapsed_seconds - zookeeperSessionUptime()) < 10), enabled_feature_flags from system.zookeeper_connection where name='default'; -- keeper_api_version will by 0 for auxiliary_zookeeper2, because we fail to get /api_version due to chroot -- I'm not sure if it's a bug or a useful trick to fallback to basic api -- Also, auxiliary zookeeper is created lazily -select name, host, port, index, is_expired, keeper_api_version, (connected_time between yesterday() and now()) +select name, host, port, index, is_expired, keeper_api_version, (connected_time between yesterday() and now() + interval 3 seconds) from system.zookeeper_connection where name!='default'; DROP TABLE IF EXISTS test_zk_connection_table; diff --git a/tests/queries/0_stateless/02775_show_columns_called_from_clickhouse.reference b/tests/queries/0_stateless/02775_show_columns_called_from_clickhouse.reference new file mode 100644 index 00000000000..de0f151db7d --- /dev/null +++ b/tests/queries/0_stateless/02775_show_columns_called_from_clickhouse.reference @@ -0,0 +1,60 @@ +a Array(String) NO \N +agg AggregateFunction(uniq, UInt64) NO \N +b Bool NO \N +d Date NO \N +d32 Date32 NO \N 
+dec128 Decimal(38, 2) NO \N +dec128_native Decimal(35, 30) NO \N +dec128_text Decimal(35, 31) NO \N +dec256 Decimal(76, 2) NO \N +dec256_native Decimal(65, 2) NO \N +dec256_text Decimal(66, 2) NO \N +dec32 Decimal(9, 2) NO \N +dec64 Decimal(18, 2) NO \N +dt DateTime NO \N +dt64 DateTime64(3) NO \N +dt64_3_tz1 DateTime64(3, \'UTC\') NO \N +dt64_3_tz2 DateTime64(3, \'Asia/Shanghai\') NO \N +dt64_6 DateTime64(6, \'UTC\') NO \N +dt64_9 DateTime64(9, \'UTC\') NO \N +dt_tz1 DateTime(\'UTC\') NO \N +dt_tz2 DateTime(\'Europe/Amsterdam\') NO \N +enm Enum8(\'hallo\' = 1, \'welt\' = 2) NO \N +f32 Float32 NO \N +f64 Float64 NO \N +fs FixedString(3) NO \N +i128 Int128 NO \N +i16 Int16 NO \N +i256 Int256 NO \N +i32 Int32 NO \N +i64 Int64 NO \N +i8 Int8 NO \N +ip4 IPv4 NO \N +ip6 IPv6 NO \N +lfs LowCardinality(FixedString(3)) NO \N +lnfs LowCardinality(Nullable(FixedString(3))) YES \N +lns LowCardinality(Nullable(String)) YES \N +ls LowCardinality(String) NO \N +m Map(Int32, String) NO \N +m_complex Map(Int32, Map(Int32, LowCardinality(Nullable(String)))) NO \N +mpg MultiPolygon NO \N +ndt64 Nullable(DateTime64(3)) YES \N +ndt64_tz Nullable(DateTime64(3, \'Asia/Shanghai\')) YES \N +nested.col1 Array(String) NO \N +nested.col2 Array(UInt32) NO \N +nfs Nullable(FixedString(3)) YES \N +ns Nullable(String) YES \N +o Object(\'json\') NO \N +p Point NO \N +pg Polygon NO \N +r Ring NO \N +s String NO \N +sagg SimpleAggregateFunction(sum, Float64) NO \N +t Tuple(Int32, String, Nullable(String), LowCardinality(String), LowCardinality(Nullable(String)), Tuple(Int32, String)) NO \N +ui128 UInt128 NO \N +ui16 UInt16 NO \N +ui256 UInt256 NO \N +ui32 UInt32 NO \N +ui64 UInt64 NO \N +ui8 UInt8 NO \N +uuid UUID NO \N diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sql b/tests/queries/0_stateless/02775_show_columns_called_from_clickhouse.sql similarity index 67% rename from tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sql rename to tests/queries/0_stateless/02775_show_columns_called_from_clickhouse.sql index e447dee47ed..89073bd2943 100644 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.sql +++ b/tests/queries/0_stateless/02775_show_columns_called_from_clickhouse.sql @@ -2,7 +2,11 @@ -- no-fasttest: json type needs rapidjson library, geo types need s2 geometry -- no-parallel: can't provide currentDatabase() to SHOW COLUMNS --- Tests setting 'use_mysql_types_in_show_columns' in SHOW COLUMNS and SELECTs on system.columns +-- Tests the output of SHOW COLUMNS when called through the ClickHouse protocol. 
+ +-- ----------------------------------------------------------------------------------- +-- Please keep this test in-sync with 02775_show_columns_called_through_mysql.sql +-- ----------------------------------------------------------------------------------- DROP TABLE IF EXISTS tab; @@ -72,22 +76,6 @@ CREATE TABLE tab lnfs LowCardinality(Nullable(FixedString(3))), ) ENGINE Memory; -SELECT '-- SHOW COLUMNS with use_mysql_types_in_show_columns = 0'; -SHOW COLUMNS FROM tab SETTINGS use_mysql_types_in_show_columns = 0; - -SELECT '-- SHOW COLUMNS with use_mysql_types_in_show_columns = 1'; -SHOW COLUMNS FROM tab SETTINGS use_mysql_types_in_show_columns = 1; - -SELECT '-- SHOW COLUMNS with mysql_map_string_to_text_in_show_columns = 1'; -SHOW COLUMNS FROM tab SETTINGS use_mysql_types_in_show_columns = 1, mysql_map_string_to_text_in_show_columns=1; - -SELECT '-- SHOW COLUMNS with mysql_map_fixed_string_to_text_in_show_columns = 1'; -SHOW COLUMNS FROM tab SETTINGS use_mysql_types_in_show_columns = 1, mysql_map_fixed_string_to_text_in_show_columns=1; - -SELECT '-- SHOW COLUMNS with mysql_map_string_to_text_in_show_columns = 1 and without use_mysql_types_in_show_columns'; -SHOW COLUMNS FROM tab SETTINGS use_mysql_types_in_show_columns = 0, mysql_map_string_to_text_in_show_columns=1; - -SELECT '-- SHOW COLUMNS with mysql_map_fixed_string_to_text_in_show_columns = 1 and without use_mysql_types_in_show_columns'; -SHOW COLUMNS FROM tab SETTINGS use_mysql_types_in_show_columns = 0, mysql_map_fixed_string_to_text_in_show_columns=1; +SHOW COLUMNS FROM tab; DROP TABLE tab; diff --git a/tests/queries/0_stateless/02775_show_columns_called_from_mysql.expect b/tests/queries/0_stateless/02775_show_columns_called_from_mysql.expect new file mode 100755 index 00000000000..bef5bd10ff3 --- /dev/null +++ b/tests/queries/0_stateless/02775_show_columns_called_from_mysql.expect @@ -0,0 +1,299 @@ +#!/usr/bin/expect -f +# Tags: no-fasttest, no-parallel +# no-fasttest: requires mysql client, rapidjson and s2geometry +# no-parallel: can't provide currentDatabase() to SHOW COLUMNS + +# Tests the output of SHOW COLUMNS when called through the MySQL protocol. 
+ +# ----------------------------------------------------------------------------------- +# Please keep this test in-sync with 02775_show_columns_called_through_clickhouse.sql +# ----------------------------------------------------------------------------------- + +set basedir [file dirname $argv0] +set basename [file tail $argv0] +exp_internal -f $env(CLICKHOUSE_TMP)/$basename.debuglog 0 +log_user 0 +set timeout 60 +match_max 100000 +expect_after { + # Do not ignore eof from expect + -i $any_spawn_id eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + -i $any_spawn_id timeout { exit 1 } +} +spawn bash -c "source $basedir/../shell_config.sh ; \$MYSQL_CLIENT_BINARY \$MYSQL_CLIENT_OPT" +expect -nocase -re "mysql.*> " + +send -- "DROP TABLE IF EXISTS tab;\r" +expect "Query OK, 0 rows affected" + +send -- "SET allow_suspicious_low_cardinality_types=1;\r" +send -- "SET allow_experimental_object_type=1;\r" + +send -- " +CREATE TABLE tab +( + i8 Int8, + i16 Int16, + i32 Int32, + i64 Int64, + i128 Int128, + i256 Int256, + ui8 UInt8, + ui16 UInt16, + ui32 UInt32, + ui64 UInt64, + ui128 UInt128, + ui256 UInt256, + f32 Float32, + f64 Float64, + dec32 Decimal32(2), + dec64 Decimal64(2), + dec128 Decimal128(2), + dec128_native Decimal(35, 30), + dec128_text Decimal(35, 31), + dec256 Decimal256(2), + dec256_native Decimal(65, 2), + dec256_text Decimal(66, 2), + p Point, + r Ring, + pg Polygon, + mpg MultiPolygon, + b Bool, + s String, + fs FixedString(3), + uuid UUID, + d Date, + d32 Date32, + dt DateTime, + dt_tz1 DateTime('UTC'), + dt_tz2 DateTime('Europe/Amsterdam'), + dt64 DateTime64(3), + dt64_3_tz1 DateTime64(3, 'UTC'), + dt64_3_tz2 DateTime64(3, 'Asia/Shanghai'), + dt64_6 DateTime64(6, 'UTC'), + dt64_9 DateTime64(9, 'UTC'), + enm Enum('hallo' = 1, 'welt' = 2), + agg AggregateFunction(uniq, UInt64), + sagg SimpleAggregateFunction(sum, Double), + a Array(String), + o JSON, + t Tuple(Int32, String, Nullable(String), LowCardinality(String), LowCardinality(Nullable(String)), Tuple(Int32, String)), + m Map(Int32, String), + m_complex Map(Int32, Map(Int32, LowCardinality(Nullable(String)))), + nested Nested (col1 String, col2 UInt32), + ip4 IPv4, + ip6 IPv6, + ns Nullable(String), + nfs Nullable(FixedString(3)), + ndt64 Nullable(DateTime64(3)), + ndt64_tz Nullable(DateTime64(3, 'Asia/Shanghai')), + ls LowCardinality(String), + lfs LowCardinality(FixedString(3)), + lns LowCardinality(Nullable(String)), + lnfs LowCardinality(Nullable(FixedString(3))), +) ENGINE Memory;\r +" + +send -- "SHOW COLUMNS FROM tab;\r" +expect -- "+---------------+-------------------+------+------+---------+-------+" +expect -- "| field | type | null | key | default | extra |" +expect -- "+---------------+-------------------+------+------+---------+-------+" +expect -- "| a | TEXT | NO | | NULL | |" +expect -- "| agg | TEXT | NO | | NULL | |" +expect -- "| b | TINYINT | NO | | NULL | |" +expect -- "| d | DATE | NO | | NULL | |" +expect -- "| d32 | DATE | NO | | NULL | |" +expect -- "| dec128 | DECIMAL(38, 2) | NO | | NULL | |" +expect -- "| dec128_native | DECIMAL(35, 30) | NO | | NULL | |" +expect -- "| dec128_text | TEXT | NO | | NULL | |" +expect -- "| dec256 | TEXT | NO | | NULL | |" +expect -- "| dec256_native | DECIMAL(65, 2) | NO | | NULL | |" +expect -- "| dec256_text | TEXT | NO | | NULL | |" +expect -- "| dec32 | DECIMAL(9, 2) | NO | | NULL | |" +expect -- "| dec64 | DECIMAL(18, 2) | NO | | NULL | |" +expect -- "| dt | DATETIME | NO | | NULL | |" +expect -- "| dt64 | DATETIME | NO | | 
NULL | |" +expect -- "| dt64_3_tz1 | DATETIME | NO | | NULL | |" +expect -- "| dt64_3_tz2 | DATETIME | NO | | NULL | |" +expect -- "| dt64_6 | DATETIME | NO | | NULL | |" +expect -- "| dt64_9 | DATETIME | NO | | NULL | |" +expect -- "| dt_tz1 | DATETIME | NO | | NULL | |" +expect -- "| dt_tz2 | DATETIME | NO | | NULL | |" +expect -- "| enm | TEXT | NO | | NULL | |" +expect -- "| f32 | FLOAT | NO | | NULL | |" +expect -- "| f64 | DOUBLE | NO | | NULL | |" +expect -- "| fs | BLOB | NO | | NULL | |" +expect -- "| i128 | TEXT | NO | | NULL | |" +expect -- "| i16 | SMALLINT | NO | | NULL | |" +expect -- "| i256 | TEXT | NO | | NULL | |" +expect -- "| i32 | INTEGER | NO | | NULL | |" +expect -- "| i64 | BIGINT | NO | | NULL | |" +expect -- "| i8 | TINYINT | NO | | NULL | |" +expect -- "| ip4 | TEXT | NO | | NULL | |" +expect -- "| ip6 | TEXT | NO | | NULL | |" +expect -- "| lfs | BLOB | NO | | NULL | |" +expect -- "| lnfs | BLOB | YES | | NULL | |" +expect -- "| lns | BLOB | YES | | NULL | |" +expect -- "| ls | BLOB | NO | | NULL | |" +expect -- "| m | JSON | NO | | NULL | |" +expect -- "| m_complex | JSON | NO | | NULL | |" +expect -- "| mpg | TEXT | NO | | NULL | |" +expect -- "| ndt64 | DATETIME | YES | | NULL | |" +expect -- "| ndt64_tz | DATETIME | YES | | NULL | |" +expect -- "| nested.col1 | TEXT | NO | | NULL | |" +expect -- "| nested.col2 | TEXT | NO | | NULL | |" +expect -- "| nfs | BLOB | YES | | NULL | |" +expect -- "| ns | BLOB | YES | | NULL | |" +expect -- "| o | JSON | NO | | NULL | |" +expect -- "| p | TEXT | NO | | NULL | |" +expect -- "| pg | TEXT | NO | | NULL | |" +expect -- "| r | TEXT | NO | | NULL | |" +expect -- "| s | BLOB | NO | | NULL | |" +expect -- "| sagg | TEXT | NO | | NULL | |" +expect -- "| t | JSON | NO | | NULL | |" +expect -- "| ui128 | TEXT | NO | | NULL | |" +expect -- "| ui16 | SMALLINT UNSIGNED | NO | | NULL | |" +expect -- "| ui256 | TEXT | NO | | NULL | |" +expect -- "| ui32 | INTEGER UNSIGNED | NO | | NULL | |" +expect -- "| ui64 | BIGINT UNSIGNED | NO | | NULL | |" +expect -- "| ui8 | TINYINT UNSIGNED | NO | | NULL | |" +expect -- "| uuid | CHAR | NO | | NULL | |" +expect -- "+---------------+-------------------+------+------+---------+-------+" + +send -- "SHOW COLUMNS FROM tab SETTINGS mysql_map_string_to_text_in_show_columns=1;\r" +expect -- "+---------------+-------------------+------+------+---------+-------+" +expect -- "| field | type | null | key | default | extra |" +expect -- "+---------------+-------------------+------+------+---------+-------+" +expect -- "| a | TEXT | NO | | NULL | |" +expect -- "| agg | TEXT | NO | | NULL | |" +expect -- "| b | TINYINT | NO | | NULL | |" +expect -- "| d | DATE | NO | | NULL | |" +expect -- "| d32 | DATE | NO | | NULL | |" +expect -- "| dec128 | DECIMAL(38, 2) | NO | | NULL | |" +expect -- "| dec128_native | DECIMAL(35, 30) | NO | | NULL | |" +expect -- "| dec128_text | TEXT | NO | | NULL | |" +expect -- "| dec256 | TEXT | NO | | NULL | |" +expect -- "| dec256_native | DECIMAL(65, 2) | NO | | NULL | |" +expect -- "| dec256_text | TEXT | NO | | NULL | |" +expect -- "| dec32 | DECIMAL(9, 2) | NO | | NULL | |" +expect -- "| dec64 | DECIMAL(18, 2) | NO | | NULL | |" +expect -- "| dt | DATETIME | NO | | NULL | |" +expect -- "| dt64 | DATETIME | NO | | NULL | |" +expect -- "| dt64_3_tz1 | DATETIME | NO | | NULL | |" +expect -- "| dt64_3_tz2 | DATETIME | NO | | NULL | |" +expect -- "| dt64_6 | DATETIME | NO | | NULL | |" +expect -- "| dt64_9 | DATETIME | NO | | NULL | |" +expect -- "| dt_tz1 | DATETIME | NO | | 
NULL | |" +expect -- "| dt_tz2 | DATETIME | NO | | NULL | |" +expect -- "| enm | TEXT | NO | | NULL | |" +expect -- "| f32 | FLOAT | NO | | NULL | |" +expect -- "| f64 | DOUBLE | NO | | NULL | |" +expect -- "| fs | BLOB | NO | | NULL | |" +expect -- "| i128 | TEXT | NO | | NULL | |" +expect -- "| i16 | SMALLINT | NO | | NULL | |" +expect -- "| i256 | TEXT | NO | | NULL | |" +expect -- "| i32 | INTEGER | NO | | NULL | |" +expect -- "| i64 | BIGINT | NO | | NULL | |" +expect -- "| i8 | TINYINT | NO | | NULL | |" +expect -- "| ip4 | TEXT | NO | | NULL | |" +expect -- "| ip6 | TEXT | NO | | NULL | |" +expect -- "| lfs | BLOB | NO | | NULL | |" +expect -- "| lnfs | BLOB | YES | | NULL | |" +expect -- "| lns | TEXT | YES | | NULL | |" +expect -- "| ls | TEXT | NO | | NULL | |" +expect -- "| m | JSON | NO | | NULL | |" +expect -- "| m_complex | JSON | NO | | NULL | |" +expect -- "| mpg | TEXT | NO | | NULL | |" +expect -- "| ndt64 | DATETIME | YES | | NULL | |" +expect -- "| ndt64_tz | DATETIME | YES | | NULL | |" +expect -- "| nested.col1 | TEXT | NO | | NULL | |" +expect -- "| nested.col2 | TEXT | NO | | NULL | |" +expect -- "| nfs | BLOB | YES | | NULL | |" +expect -- "| ns | TEXT | YES | | NULL | |" +expect -- "| o | JSON | NO | | NULL | |" +expect -- "| p | TEXT | NO | | NULL | |" +expect -- "| pg | TEXT | NO | | NULL | |" +expect -- "| r | TEXT | NO | | NULL | |" +expect -- "| s | TEXT | NO | | NULL | |" +expect -- "| sagg | TEXT | NO | | NULL | |" +expect -- "| t | JSON | NO | | NULL | |" +expect -- "| ui128 | TEXT | NO | | NULL | |" +expect -- "| ui16 | SMALLINT UNSIGNED | NO | | NULL | |" +expect -- "| ui256 | TEXT | NO | | NULL | |" +expect -- "| ui32 | INTEGER UNSIGNED | NO | | NULL | |" +expect -- "| ui64 | BIGINT UNSIGNED | NO | | NULL | |" +expect -- "| ui8 | TINYINT UNSIGNED | NO | | NULL | |" +expect -- "| uuid | CHAR | NO | | NULL | |" +expect -- "+---------------+-------------------+------+------+---------+-------+" + +send -- "SHOW COLUMNS FROM tab SETTINGS mysql_map_fixed_string_to_text_in_show_columns=1;\r" +expect -- "+---------------+-------------------+------+------+---------+-------+" +expect -- "| field | type | null | key | default | extra |" +expect -- "+---------------+-------------------+------+------+---------+-------+" +expect -- "| a | TEXT | NO | | NULL | |" +expect -- "| agg | TEXT | NO | | NULL | |" +expect -- "| b | TINYINT | NO | | NULL | |" +expect -- "| d | DATE | NO | | NULL | |" +expect -- "| d32 | DATE | NO | | NULL | |" +expect -- "| dec128 | DECIMAL(38, 2) | NO | | NULL | |" +expect -- "| dec128_native | DECIMAL(35, 30) | NO | | NULL | |" +expect -- "| dec128_text | TEXT | NO | | NULL | |" +expect -- "| dec256 | TEXT | NO | | NULL | |" +expect -- "| dec256_native | DECIMAL(65, 2) | NO | | NULL | |" +expect -- "| dec256_text | TEXT | NO | | NULL | |" +expect -- "| dec32 | DECIMAL(9, 2) | NO | | NULL | |" +expect -- "| dec64 | DECIMAL(18, 2) | NO | | NULL | |" +expect -- "| dt | DATETIME | NO | | NULL | |" +expect -- "| dt64 | DATETIME | NO | | NULL | |" +expect -- "| dt64_3_tz1 | DATETIME | NO | | NULL | |" +expect -- "| dt64_3_tz2 | DATETIME | NO | | NULL | |" +expect -- "| dt64_6 | DATETIME | NO | | NULL | |" +expect -- "| dt64_9 | DATETIME | NO | | NULL | |" +expect -- "| dt_tz1 | DATETIME | NO | | NULL | |" +expect -- "| dt_tz2 | DATETIME | NO | | NULL | |" +expect -- "| enm | TEXT | NO | | NULL | |" +expect -- "| f32 | FLOAT | NO | | NULL | |" +expect -- "| f64 | DOUBLE | NO | | NULL | |" +expect -- "| fs | TEXT | NO | | NULL | |" +expect -- "| 
i128 | TEXT | NO | | NULL | |" +expect -- "| i16 | SMALLINT | NO | | NULL | |" +expect -- "| i256 | TEXT | NO | | NULL | |" +expect -- "| i32 | INTEGER | NO | | NULL | |" +expect -- "| i64 | BIGINT | NO | | NULL | |" +expect -- "| i8 | TINYINT | NO | | NULL | |" +expect -- "| ip4 | TEXT | NO | | NULL | |" +expect -- "| ip6 | TEXT | NO | | NULL | |" +expect -- "| lfs | TEXT | NO | | NULL | |" +expect -- "| lnfs | TEXT | YES | | NULL | |" +expect -- "| lns | BLOB | YES | | NULL | |" +expect -- "| ls | BLOB | NO | | NULL | |" +expect -- "| m | JSON | NO | | NULL | |" +expect -- "| m_complex | JSON | NO | | NULL | |" +expect -- "| mpg | TEXT | NO | | NULL | |" +expect -- "| ndt64 | DATETIME | YES | | NULL | |" +expect -- "| ndt64_tz | DATETIME | YES | | NULL | |" +expect -- "| nested.col1 | TEXT | NO | | NULL | |" +expect -- "| nested.col2 | TEXT | NO | | NULL | |" +expect -- "| nfs | TEXT | YES | | NULL | |" +expect -- "| ns | BLOB | YES | | NULL | |" +expect -- "| o | JSON | NO | | NULL | |" +expect -- "| p | TEXT | NO | | NULL | |" +expect -- "| pg | TEXT | NO | | NULL | |" +expect -- "| r | TEXT | NO | | NULL | |" +expect -- "| s | BLOB | NO | | NULL | |" +expect -- "| sagg | TEXT | NO | | NULL | |" +expect -- "| t | JSON | NO | | NULL | |" +expect -- "| ui128 | TEXT | NO | | NULL | |" +expect -- "| ui16 | SMALLINT UNSIGNED | NO | | NULL | |" +expect -- "| ui256 | TEXT | NO | | NULL | |" +expect -- "| ui32 | INTEGER UNSIGNED | NO | | NULL | |" +expect -- "| ui64 | BIGINT UNSIGNED | NO | | NULL | |" +expect -- "| ui8 | TINYINT UNSIGNED | NO | | NULL | |" +expect -- "| uuid | CHAR | NO | | NULL | |" +expect -- "+---------------+-------------------+------+------+---------+-------+" + +send -- "DROP TABLE tab;" + +send -- "quit;\r" +expect eof diff --git a/tests/queries/0_stateless/02775_show_columns_called_from_mysql.reference b/tests/queries/0_stateless/02775_show_columns_called_from_mysql.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference b/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference deleted file mode 100644 index e038a3362df..00000000000 --- a/tests/queries/0_stateless/02775_show_columns_mysql_compatibility.reference +++ /dev/null @@ -1,366 +0,0 @@ --- SHOW COLUMNS with use_mysql_types_in_show_columns = 0 -a Array(String) NO \N -agg AggregateFunction(uniq, UInt64) NO \N -b Bool NO \N -d Date NO \N -d32 Date32 NO \N -dec128 Decimal(38, 2) NO \N -dec128_native Decimal(35, 30) NO \N -dec128_text Decimal(35, 31) NO \N -dec256 Decimal(76, 2) NO \N -dec256_native Decimal(65, 2) NO \N -dec256_text Decimal(66, 2) NO \N -dec32 Decimal(9, 2) NO \N -dec64 Decimal(18, 2) NO \N -dt DateTime NO \N -dt64 DateTime64(3) NO \N -dt64_3_tz1 DateTime64(3, \'UTC\') NO \N -dt64_3_tz2 DateTime64(3, \'Asia/Shanghai\') NO \N -dt64_6 DateTime64(6, \'UTC\') NO \N -dt64_9 DateTime64(9, \'UTC\') NO \N -dt_tz1 DateTime(\'UTC\') NO \N -dt_tz2 DateTime(\'Europe/Amsterdam\') NO \N -enm Enum8(\'hallo\' = 1, \'welt\' = 2) NO \N -f32 Float32 NO \N -f64 Float64 NO \N -fs FixedString(3) NO \N -i128 Int128 NO \N -i16 Int16 NO \N -i256 Int256 NO \N -i32 Int32 NO \N -i64 Int64 NO \N -i8 Int8 NO \N -ip4 IPv4 NO \N -ip6 IPv6 NO \N -lfs LowCardinality(FixedString(3)) NO \N -lnfs LowCardinality(Nullable(FixedString(3))) YES \N -lns LowCardinality(Nullable(String)) YES \N -ls LowCardinality(String) NO \N -m Map(Int32, String) NO \N -m_complex Map(Int32, Map(Int32, LowCardinality(Nullable(String)))) NO \N 
-mpg MultiPolygon NO \N -ndt64 Nullable(DateTime64(3)) YES \N -ndt64_tz Nullable(DateTime64(3, \'Asia/Shanghai\')) YES \N -nested.col1 Array(String) NO \N -nested.col2 Array(UInt32) NO \N -nfs Nullable(FixedString(3)) YES \N -ns Nullable(String) YES \N -o Object(\'json\') NO \N -p Point NO \N -pg Polygon NO \N -r Ring NO \N -s String NO \N -sagg SimpleAggregateFunction(sum, Float64) NO \N -t Tuple(Int32, String, Nullable(String), LowCardinality(String), LowCardinality(Nullable(String)), Tuple(Int32, String)) NO \N -ui128 UInt128 NO \N -ui16 UInt16 NO \N -ui256 UInt256 NO \N -ui32 UInt32 NO \N -ui64 UInt64 NO \N -ui8 UInt8 NO \N -uuid UUID NO \N --- SHOW COLUMNS with use_mysql_types_in_show_columns = 1 -a TEXT NO \N -agg TEXT NO \N -b TINYINT NO \N -d DATE NO \N -d32 DATE NO \N -dec128 DECIMAL(38, 2) NO \N -dec128_native DECIMAL(35, 30) NO \N -dec128_text TEXT NO \N -dec256 TEXT NO \N -dec256_native DECIMAL(65, 2) NO \N -dec256_text TEXT NO \N -dec32 DECIMAL(9, 2) NO \N -dec64 DECIMAL(18, 2) NO \N -dt DATETIME NO \N -dt64 DATETIME NO \N -dt64_3_tz1 DATETIME NO \N -dt64_3_tz2 DATETIME NO \N -dt64_6 DATETIME NO \N -dt64_9 DATETIME NO \N -dt_tz1 DATETIME NO \N -dt_tz2 DATETIME NO \N -enm TEXT NO \N -f32 FLOAT NO \N -f64 DOUBLE NO \N -fs BLOB NO \N -i128 TEXT NO \N -i16 SMALLINT NO \N -i256 TEXT NO \N -i32 INTEGER NO \N -i64 BIGINT NO \N -i8 TINYINT NO \N -ip4 TEXT NO \N -ip6 TEXT NO \N -lfs BLOB NO \N -lnfs BLOB YES \N -lns BLOB YES \N -ls BLOB NO \N -m JSON NO \N -m_complex JSON NO \N -mpg TEXT NO \N -ndt64 DATETIME YES \N -ndt64_tz DATETIME YES \N -nested.col1 TEXT NO \N -nested.col2 TEXT NO \N -nfs BLOB YES \N -ns BLOB YES \N -o JSON NO \N -p TEXT NO \N -pg TEXT NO \N -r TEXT NO \N -s BLOB NO \N -sagg TEXT NO \N -t JSON NO \N -ui128 TEXT NO \N -ui16 SMALLINT UNSIGNED NO \N -ui256 TEXT NO \N -ui32 INTEGER UNSIGNED NO \N -ui64 BIGINT UNSIGNED NO \N -ui8 TINYINT UNSIGNED NO \N -uuid CHAR NO \N --- SHOW COLUMNS with mysql_map_string_to_text_in_show_columns = 1 -a TEXT NO \N -agg TEXT NO \N -b TINYINT NO \N -d DATE NO \N -d32 DATE NO \N -dec128 DECIMAL(38, 2) NO \N -dec128_native DECIMAL(35, 30) NO \N -dec128_text TEXT NO \N -dec256 TEXT NO \N -dec256_native DECIMAL(65, 2) NO \N -dec256_text TEXT NO \N -dec32 DECIMAL(9, 2) NO \N -dec64 DECIMAL(18, 2) NO \N -dt DATETIME NO \N -dt64 DATETIME NO \N -dt64_3_tz1 DATETIME NO \N -dt64_3_tz2 DATETIME NO \N -dt64_6 DATETIME NO \N -dt64_9 DATETIME NO \N -dt_tz1 DATETIME NO \N -dt_tz2 DATETIME NO \N -enm TEXT NO \N -f32 FLOAT NO \N -f64 DOUBLE NO \N -fs BLOB NO \N -i128 TEXT NO \N -i16 SMALLINT NO \N -i256 TEXT NO \N -i32 INTEGER NO \N -i64 BIGINT NO \N -i8 TINYINT NO \N -ip4 TEXT NO \N -ip6 TEXT NO \N -lfs BLOB NO \N -lnfs BLOB YES \N -lns TEXT YES \N -ls TEXT NO \N -m JSON NO \N -m_complex JSON NO \N -mpg TEXT NO \N -ndt64 DATETIME YES \N -ndt64_tz DATETIME YES \N -nested.col1 TEXT NO \N -nested.col2 TEXT NO \N -nfs BLOB YES \N -ns TEXT YES \N -o JSON NO \N -p TEXT NO \N -pg TEXT NO \N -r TEXT NO \N -s TEXT NO \N -sagg TEXT NO \N -t JSON NO \N -ui128 TEXT NO \N -ui16 SMALLINT UNSIGNED NO \N -ui256 TEXT NO \N -ui32 INTEGER UNSIGNED NO \N -ui64 BIGINT UNSIGNED NO \N -ui8 TINYINT UNSIGNED NO \N -uuid CHAR NO \N --- SHOW COLUMNS with mysql_map_fixed_string_to_text_in_show_columns = 1 -a TEXT NO \N -agg TEXT NO \N -b TINYINT NO \N -d DATE NO \N -d32 DATE NO \N -dec128 DECIMAL(38, 2) NO \N -dec128_native DECIMAL(35, 30) NO \N -dec128_text TEXT NO \N -dec256 TEXT NO \N -dec256_native DECIMAL(65, 2) NO \N -dec256_text TEXT NO \N -dec32 DECIMAL(9, 2) NO \N -dec64 
DECIMAL(18, 2) NO \N -dt DATETIME NO \N -dt64 DATETIME NO \N -dt64_3_tz1 DATETIME NO \N -dt64_3_tz2 DATETIME NO \N -dt64_6 DATETIME NO \N -dt64_9 DATETIME NO \N -dt_tz1 DATETIME NO \N -dt_tz2 DATETIME NO \N -enm TEXT NO \N -f32 FLOAT NO \N -f64 DOUBLE NO \N -fs TEXT NO \N -i128 TEXT NO \N -i16 SMALLINT NO \N -i256 TEXT NO \N -i32 INTEGER NO \N -i64 BIGINT NO \N -i8 TINYINT NO \N -ip4 TEXT NO \N -ip6 TEXT NO \N -lfs TEXT NO \N -lnfs TEXT YES \N -lns BLOB YES \N -ls BLOB NO \N -m JSON NO \N -m_complex JSON NO \N -mpg TEXT NO \N -ndt64 DATETIME YES \N -ndt64_tz DATETIME YES \N -nested.col1 TEXT NO \N -nested.col2 TEXT NO \N -nfs TEXT YES \N -ns BLOB YES \N -o JSON NO \N -p TEXT NO \N -pg TEXT NO \N -r TEXT NO \N -s BLOB NO \N -sagg TEXT NO \N -t JSON NO \N -ui128 TEXT NO \N -ui16 SMALLINT UNSIGNED NO \N -ui256 TEXT NO \N -ui32 INTEGER UNSIGNED NO \N -ui64 BIGINT UNSIGNED NO \N -ui8 TINYINT UNSIGNED NO \N -uuid CHAR NO \N --- SHOW COLUMNS with mysql_map_string_to_text_in_show_columns = 1 and without use_mysql_types_in_show_columns -a Array(String) NO \N -agg AggregateFunction(uniq, UInt64) NO \N -b Bool NO \N -d Date NO \N -d32 Date32 NO \N -dec128 Decimal(38, 2) NO \N -dec128_native Decimal(35, 30) NO \N -dec128_text Decimal(35, 31) NO \N -dec256 Decimal(76, 2) NO \N -dec256_native Decimal(65, 2) NO \N -dec256_text Decimal(66, 2) NO \N -dec32 Decimal(9, 2) NO \N -dec64 Decimal(18, 2) NO \N -dt DateTime NO \N -dt64 DateTime64(3) NO \N -dt64_3_tz1 DateTime64(3, \'UTC\') NO \N -dt64_3_tz2 DateTime64(3, \'Asia/Shanghai\') NO \N -dt64_6 DateTime64(6, \'UTC\') NO \N -dt64_9 DateTime64(9, \'UTC\') NO \N -dt_tz1 DateTime(\'UTC\') NO \N -dt_tz2 DateTime(\'Europe/Amsterdam\') NO \N -enm Enum8(\'hallo\' = 1, \'welt\' = 2) NO \N -f32 Float32 NO \N -f64 Float64 NO \N -fs FixedString(3) NO \N -i128 Int128 NO \N -i16 Int16 NO \N -i256 Int256 NO \N -i32 Int32 NO \N -i64 Int64 NO \N -i8 Int8 NO \N -ip4 IPv4 NO \N -ip6 IPv6 NO \N -lfs LowCardinality(FixedString(3)) NO \N -lnfs LowCardinality(Nullable(FixedString(3))) YES \N -lns LowCardinality(Nullable(String)) YES \N -ls LowCardinality(String) NO \N -m Map(Int32, String) NO \N -m_complex Map(Int32, Map(Int32, LowCardinality(Nullable(String)))) NO \N -mpg MultiPolygon NO \N -ndt64 Nullable(DateTime64(3)) YES \N -ndt64_tz Nullable(DateTime64(3, \'Asia/Shanghai\')) YES \N -nested.col1 Array(String) NO \N -nested.col2 Array(UInt32) NO \N -nfs Nullable(FixedString(3)) YES \N -ns Nullable(String) YES \N -o Object(\'json\') NO \N -p Point NO \N -pg Polygon NO \N -r Ring NO \N -s String NO \N -sagg SimpleAggregateFunction(sum, Float64) NO \N -t Tuple(Int32, String, Nullable(String), LowCardinality(String), LowCardinality(Nullable(String)), Tuple(Int32, String)) NO \N -ui128 UInt128 NO \N -ui16 UInt16 NO \N -ui256 UInt256 NO \N -ui32 UInt32 NO \N -ui64 UInt64 NO \N -ui8 UInt8 NO \N -uuid UUID NO \N --- SHOW COLUMNS with mysql_map_fixed_string_to_text_in_show_columns = 1 and without use_mysql_types_in_show_columns -a Array(String) NO \N -agg AggregateFunction(uniq, UInt64) NO \N -b Bool NO \N -d Date NO \N -d32 Date32 NO \N -dec128 Decimal(38, 2) NO \N -dec128_native Decimal(35, 30) NO \N -dec128_text Decimal(35, 31) NO \N -dec256 Decimal(76, 2) NO \N -dec256_native Decimal(65, 2) NO \N -dec256_text Decimal(66, 2) NO \N -dec32 Decimal(9, 2) NO \N -dec64 Decimal(18, 2) NO \N -dt DateTime NO \N -dt64 DateTime64(3) NO \N -dt64_3_tz1 DateTime64(3, \'UTC\') NO \N -dt64_3_tz2 DateTime64(3, \'Asia/Shanghai\') NO \N -dt64_6 DateTime64(6, \'UTC\') NO \N -dt64_9 DateTime64(9, 
\'UTC\') NO \N -dt_tz1 DateTime(\'UTC\') NO \N -dt_tz2 DateTime(\'Europe/Amsterdam\') NO \N -enm Enum8(\'hallo\' = 1, \'welt\' = 2) NO \N -f32 Float32 NO \N -f64 Float64 NO \N -fs FixedString(3) NO \N -i128 Int128 NO \N -i16 Int16 NO \N -i256 Int256 NO \N -i32 Int32 NO \N -i64 Int64 NO \N -i8 Int8 NO \N -ip4 IPv4 NO \N -ip6 IPv6 NO \N -lfs LowCardinality(FixedString(3)) NO \N -lnfs LowCardinality(Nullable(FixedString(3))) YES \N -lns LowCardinality(Nullable(String)) YES \N -ls LowCardinality(String) NO \N -m Map(Int32, String) NO \N -m_complex Map(Int32, Map(Int32, LowCardinality(Nullable(String)))) NO \N -mpg MultiPolygon NO \N -ndt64 Nullable(DateTime64(3)) YES \N -ndt64_tz Nullable(DateTime64(3, \'Asia/Shanghai\')) YES \N -nested.col1 Array(String) NO \N -nested.col2 Array(UInt32) NO \N -nfs Nullable(FixedString(3)) YES \N -ns Nullable(String) YES \N -o Object(\'json\') NO \N -p Point NO \N -pg Polygon NO \N -r Ring NO \N -s String NO \N -sagg SimpleAggregateFunction(sum, Float64) NO \N -t Tuple(Int32, String, Nullable(String), LowCardinality(String), LowCardinality(Nullable(String)), Tuple(Int32, String)) NO \N -ui128 UInt128 NO \N -ui16 UInt16 NO \N -ui256 UInt256 NO \N -ui32 UInt32 NO \N -ui64 UInt64 NO \N -ui8 UInt8 NO \N -uuid UUID NO \N diff --git a/tests/queries/0_stateless/02843_backup_use_same_s3_credentials_for_base_backup.reference b/tests/queries/0_stateless/02843_backup_use_same_s3_credentials_for_base_backup.reference index 2862c459ae1..f9ade26cd0c 100644 --- a/tests/queries/0_stateless/02843_backup_use_same_s3_credentials_for_base_backup.reference +++ b/tests/queries/0_stateless/02843_backup_use_same_s3_credentials_for_base_backup.reference @@ -1,14 +1,27 @@ use_same_s3_credentials_for_base_backup for S3 +base BACKUP_CREATED +inc_1 BACKUP_CREATED +inc_2 +BACKUP_CREATED +inc_3_bad The request signature we calculated does not match the signature you provided. Check your key and signing method. (S3_ERROR) +inc_4 BACKUP_CREATED +restore inc_1 The request signature we calculated does not match the signature you provided. Check your key and signing method. (S3_ERROR) +restore inc_1 RESTORED +restore inc_2 RESTORED use_same_s3_credentials_for_base_backup for S3 (invalid arguments) +inc_5_bad BACKUP_CREATED +inc_6_bad NUMBER_OF_ARGUMENTS_DOESNT_MATCH use_same_s3_credentials_for_base_backup for Disk +backup_1 BACKUP_CREATED +backup_2 BAD_ARGUMENTS diff --git a/tests/queries/0_stateless/02843_backup_use_same_s3_credentials_for_base_backup.sh b/tests/queries/0_stateless/02843_backup_use_same_s3_credentials_for_base_backup.sh index 939179baa26..16ac095312c 100755 --- a/tests/queries/0_stateless/02843_backup_use_same_s3_credentials_for_base_backup.sh +++ b/tests/queries/0_stateless/02843_backup_use_same_s3_credentials_for_base_backup.sh @@ -22,24 +22,41 @@ function write_invalid_password_to_base_backup() # Returns the arguments for the BACKUP TO S3() function, i.e. 
(url, access_key_id, secret_access_key) function s3_location() { echo "'http://localhost:11111/test/backups/$CLICKHOUSE_DATABASE/use_same_s3_credentials_for_base_backup_base_$*', 'test', 'testtest'"; } +function s3_location_with_invalid_password() { echo "'http://localhost:11111/test/backups/$CLICKHOUSE_DATABASE/use_same_s3_credentials_for_base_backup_base_$*', 'test', 'INVALID_PASSWORD'"; } echo 'use_same_s3_credentials_for_base_backup for S3' +echo "base" $CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location base))" | cut -f2 + +echo "inc_1" $CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location inc_1)) SETTINGS base_backup=S3($(s3_location base))" | cut -f2 write_invalid_password_to_base_backup inc_1 -$CLICKHOUSE_CLIENT --format Null -q "BACKUP TABLE data TO S3($(s3_location inc_2)) SETTINGS base_backup=S3($(s3_location inc_1))" |& grep -m1 -o 'The request signature we calculated does not match the signature you provided. Check your key and signing method. (S3_ERROR)' -$CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location inc_3)) SETTINGS base_backup=S3($(s3_location inc_1)), use_same_s3_credentials_for_base_backup=1" | cut -f2 +echo "inc_2" +$CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location inc_2)) SETTINGS base_backup=S3($(s3_location inc_1))" | cut -f2 + +echo "inc_3_bad" +$CLICKHOUSE_CLIENT --format Null -q "BACKUP TABLE data TO S3($(s3_location inc_3_bad)) SETTINGS base_backup=S3($(s3_location_with_invalid_password inc_1))" |& grep -m1 -o 'The request signature we calculated does not match the signature you provided. Check your key and signing method. (S3_ERROR)' +echo "inc_4" +$CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location inc_4)) SETTINGS base_backup=S3($(s3_location_with_invalid_password inc_1)), use_same_s3_credentials_for_base_backup=1" | cut -f2 + +echo "restore inc_1" $CLICKHOUSE_CLIENT --format Null -q "RESTORE TABLE data AS data FROM S3($(s3_location inc_1))" |& grep -m1 -o 'The request signature we calculated does not match the signature you provided. Check your key and signing method. 
(S3_ERROR)' +echo "restore inc_1" $CLICKHOUSE_CLIENT -q "RESTORE TABLE data AS data_1 FROM S3($(s3_location inc_1)) SETTINGS use_same_s3_credentials_for_base_backup=1" | cut -f2 -$CLICKHOUSE_CLIENT -q "RESTORE TABLE data AS data_2 FROM S3($(s3_location inc_3)) SETTINGS use_same_s3_credentials_for_base_backup=1" | cut -f2 +echo "restore inc_2" +$CLICKHOUSE_CLIENT -q "RESTORE TABLE data AS data_2 FROM S3($(s3_location inc_2)) SETTINGS use_same_s3_credentials_for_base_backup=1" | cut -f2 echo 'use_same_s3_credentials_for_base_backup for S3 (invalid arguments)' -$CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location inc_4_bad)) SETTINGS base_backup=S3($(s3_location inc_1), 'foo'), use_same_s3_credentials_for_base_backup=1" |& cut -f2 -$CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location inc_5_bad), 'foo') SETTINGS base_backup=S3($(s3_location inc_1)), use_same_s3_credentials_for_base_backup=1" |& grep -o -m1 NUMBER_OF_ARGUMENTS_DOESNT_MATCH +echo "inc_5_bad" +$CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location inc_5_bad)) SETTINGS base_backup=S3($(s3_location inc_1), 'foo'), use_same_s3_credentials_for_base_backup=1" |& cut -f2 +echo "inc_6_bad" +$CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO S3($(s3_location inc_6_bad), 'foo') SETTINGS base_backup=S3($(s3_location inc_1)), use_same_s3_credentials_for_base_backup=1" |& grep -o -m1 NUMBER_OF_ARGUMENTS_DOESNT_MATCH echo 'use_same_s3_credentials_for_base_backup for Disk' +echo "backup_1" $CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO Disk('backups', '$CLICKHOUSE_DATABASE/backup_1') SETTINGS use_same_s3_credentials_for_base_backup=1" | cut -f2 +echo "backup_2" $CLICKHOUSE_CLIENT -q "BACKUP TABLE data TO Disk('backups', '$CLICKHOUSE_DATABASE/backup_2') SETTINGS use_same_s3_credentials_for_base_backup=1, base_backup=Disk('backups', '$CLICKHOUSE_DATABASE/backup_1')" |& grep -o -m1 BAD_ARGUMENTS exit 0 diff --git a/tests/queries/0_stateless/02861_join_on_nullsafe_compare.reference.j2 b/tests/queries/0_stateless/02861_join_on_nullsafe_compare.reference.j2 index d97d6c2b314..c0e35d7ae87 100644 --- a/tests/queries/0_stateless/02861_join_on_nullsafe_compare.reference.j2 +++ b/tests/queries/0_stateless/02861_join_on_nullsafe_compare.reference.j2 @@ -647,6 +647,37 @@ join_algorithm = default, join_use_nulls = 0, t1 JOIN t4 19 19 19 19 \N 20 \N 0 -- +\N 0 2 2 +\N 0 \N 4 +\N 0 6 6 +\N 0 \N 8 +\N 0 10 10 +\N 0 \N 12 +\N 0 14 14 +\N 0 \N 16 +\N 0 18 18 +\N 0 \N 20 +1 1 1 1 +\N 2 \N 0 +3 3 3 3 +\N 4 \N 0 +5 5 5 5 +\N 6 \N 0 +7 7 7 7 +\N 8 \N 0 +9 9 9 9 +\N 10 \N 0 +11 11 11 11 +\N 12 \N 0 +13 13 13 13 +\N 14 \N 0 +15 15 15 15 +\N 16 \N 0 +17 17 17 17 +\N 18 \N 0 +19 19 19 19 +\N 20 \N 0 +-- 1 42 420 1 1 43 430 1 \N 42 420 2 \N 43 430 4 \N 42 420 2 \N 43 430 8 diff --git a/tests/queries/0_stateless/02861_join_on_nullsafe_compare.sql.j2 b/tests/queries/0_stateless/02861_join_on_nullsafe_compare.sql.j2 index 64960d2b2e5..2ae18d3b8a9 100644 --- a/tests/queries/0_stateless/02861_join_on_nullsafe_compare.sql.j2 +++ b/tests/queries/0_stateless/02861_join_on_nullsafe_compare.sql.j2 @@ -30,28 +30,28 @@ SELECT 'join_algorithm = {{ join_algorithm }}, join_use_nulls = {{ join_use_null SELECT '--'; SELECT {{ t1 }}.a, {{ t1 }}.val, {{ t2 }}.a, {{ t2 }}.val FROM {{ t1 }} FULL JOIN {{ t2 }} -ON isNotDistinctFrom({{ t1 }}.a, {{ t2 }}.a) +ON {{ t1 }}.a <=> {{ t2 }}.a ORDER BY {{ t1 }}.val NULLS FIRST, {{ t2 }}.val NULLS FIRST ; SELECT '--'; SELECT * FROM {{ t1 }} FULL JOIN {{ t2 }} -ON isNotDistinctFrom({{ t1 }}.a, {{ t2 }}.a) AND isNotDistinctFrom({{ t1 }}.b, {{ 
t2 }}.b) +ON {{ t1 }}.a <=> {{ t2 }}.a AND {{ t1 }}.b <=> {{ t2 }}.b ORDER BY {{ t1 }}.val NULLS FIRST, {{ t2 }}.val NULLS FIRST ; SELECT '--'; SELECT * FROM {{ t1 }} FULL JOIN {{ t2 }} -ON {{ t1 }}.a == {{ t2 }}.a AND isNotDistinctFrom({{ t1 }}.b, {{ t2 }}.b) +ON {{ t1 }}.a == {{ t2 }}.a AND {{ t1 }}.b <=> {{ t2 }}.b ORDER BY {{ t1 }}.val NULLS FIRST, {{ t2 }}.val NULLS FIRST ; SELECT '--'; SELECT * FROM {{ t1 }} FULL JOIN {{ t2 }} -ON isNotDistinctFrom({{ t1 }}.a, {{ t2 }}.a) AND {{ t1 }}.b == {{ t2 }}.b +ON {{ t1 }}.a <=> {{ t2 }}.a AND {{ t1 }}.b == {{ t2 }}.b ORDER BY {{ t1 }}.val NULLS FIRST, {{ t2 }}.val NULLS FIRST ; @@ -62,7 +62,14 @@ SELECT '--'; SET join_use_nulls = 0; SET join_algorithm = 'hash'; SELECT t1.a, t1.val, t2.a, t2.val FROM t1 FULL JOIN t2 -ON isNotDistinctFrom(t1.a, t2.a) AND t1.b < 2 OR t1.a == t2.a +ON t1.a <=> t2.a AND t1.b < 2 OR t1.a == t2.a +ORDER BY t1.val NULLS FIRST, t2.val NULLS FIRST +; + +SELECT '--'; + +SELECT t1.a, t1.val, t2.a, t2.val FROM t1 FULL JOIN t2 +ON t1.a IS NOT DISTINCT FROM t2.a AND t1.b < 2 OR t1.a == t2.a ORDER BY t1.val NULLS FIRST, t2.val NULLS FIRST ; @@ -76,7 +83,7 @@ SET join_use_nulls = 1; SELECT * FROM (SELECT a, 42 as `__wrapNullsafe(a)`, 420 as `tuple(a)`, val FROM t1) t1 JOIN (SELECT a, 43 as `__wrapNullsafe(t2.a)`, 430 as `tuple(t2.a)`, val FROM t2) t2 -ON isNotDistinctFrom(t1.a, t2.a) +ON t1.a <=> t2.a ORDER BY t1.val NULLS FIRST, t2.val NULLS FIRST LIMIT 10; @@ -85,7 +92,7 @@ SELECT '--'; SELECT a, 42 as `__wrapNullsafe(a)`, 420 as `tuple(a)`, val, t2.a, 43 as `__wrapNullsafe(t2.a)`, 430 as `tuple(t2.a)`, t2.val FROM (SELECT a, val, 111 as `__wrapNullsafe(a)_0` FROM t1) t1 JOIN (SELECT a, val, 111 as `__wrapNullsafe(t2.a)_0` FROM t2) t2 -ON isNotDistinctFrom(t1.a, t2.a) +ON t1.a <=> t2.a ORDER BY t1.val NULLS FIRST, t2.val NULLS FIRST LIMIT 10; @@ -99,3 +106,5 @@ SELECT * FROM t1 JOIN t2 ON isNotDistinctFrom(t1.a, t2.a, t2.b); -- { serverErro SELECT isNotDistinctFrom(a) from t1; -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } SELECT isNotDistinctFrom(a, b) from t1; -- { serverError NOT_IMPLEMENTED } +SELECT a <=> b from t1; -- { serverError NOT_IMPLEMENTED } +SELECT a IS NOT DISTINCT FROM b from t1; -- { serverError NOT_IMPLEMENTED } diff --git a/tests/queries/0_stateless/02869_insert_materialized_views_duplicated_parts.reference b/tests/queries/0_stateless/02869_insert_materialized_views_duplicated_parts.reference deleted file mode 100644 index 325f639813a..00000000000 --- a/tests/queries/0_stateless/02869_insert_materialized_views_duplicated_parts.reference +++ /dev/null @@ -1,14 +0,0 @@ -Initial -2020-01-01 13:00:00 24 -Last block is duplicate -2020-01-01 13:00:00 24 -2021-09-01 11:00:00 24 -One block is duplicate (default setting) -2020-01-01 13:00:00 24 -2021-09-01 11:00:00 24 -2022-01-01 12:00:00 24 -One block is duplicate (changed setting) -2020-01-01 13:00:00 24 -2021-09-01 11:00:00 24 -2022-01-01 12:00:00 24 -2023-01-01 12:00:00 24 diff --git a/tests/queries/0_stateless/02869_insert_materialized_views_duplicated_parts.sql b/tests/queries/0_stateless/02869_insert_materialized_views_duplicated_parts.sql deleted file mode 100644 index c087e826a13..00000000000 --- a/tests/queries/0_stateless/02869_insert_materialized_views_duplicated_parts.sql +++ /dev/null @@ -1,44 +0,0 @@ --- Tags: zookeeper - -DROP TABLE IF EXISTS landing SYNC; -DROP TABLE IF EXISTS mv SYNC; - -CREATE TABLE landing -( - `time` DateTime, - `number` Int64 -) -ENGINE = ReplicatedReplacingMergeTree('/clickhouse/{database}/tables/landing/', 'r1') 
-PARTITION BY toYYYYMMDD(time) -ORDER BY time; - -CREATE MATERIALIZED VIEW mv -ENGINE = ReplicatedSummingMergeTree('/clickhouse/{database}/tables/mv', 'r1') -PARTITION BY toYYYYMMDD(hour) ORDER BY hour -AS SELECT - toStartOfHour(time) AS hour, - sum(number) AS sum_amount - FROM landing GROUP BY hour; - -SELECT 'Initial'; -INSERT INTO landing VALUES ('2020-01-01 13:23:34', 24); -SELECT * FROM mv ORDER BY hour; - -SELECT 'Last block is duplicate'; -INSERT INTO landing VALUES ('2021-09-01 11:00:00', 24), ('2020-01-01 13:23:34', 24); -SELECT * FROM mv ORDER BY hour; - -SELECT 'One block is duplicate (default setting)'; -SET max_insert_delayed_streams_for_parallel_write = 0; -INSERT INTO landing VALUES ('2021-09-01 11:00:00', 24), ('2022-01-01 12:03:00', 24); -SELECT * FROM mv ORDER BY hour; - -SELECT 'One block is duplicate (changed setting)'; -SET max_insert_delayed_streams_for_parallel_write = 5; -INSERT INTO landing VALUES ('2021-09-01 11:00:00', 24), ('2023-01-01 12:03:00', 24); - -SELECT * FROM mv ORDER BY hour; - -DROP TABLE mv; -DROP TABLE landing; - diff --git a/tests/queries/0_stateless/02875_merge_engine_set_index.reference b/tests/queries/0_stateless/02875_merge_engine_set_index.reference new file mode 100644 index 00000000000..00750edc07d --- /dev/null +++ b/tests/queries/0_stateless/02875_merge_engine_set_index.reference @@ -0,0 +1 @@ +3 diff --git a/tests/queries/0_stateless/02875_merge_engine_set_index.sh b/tests/queries/0_stateless/02875_merge_engine_set_index.sh new file mode 100755 index 00000000000..57b5db374c1 --- /dev/null +++ b/tests/queries/0_stateless/02875_merge_engine_set_index.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +# shellcheck disable=SC2154 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -nq " + CREATE TABLE t1 + ( + a UInt32, + b UInt32 + ) + ENGINE = MergeTree + ORDER BY (a, b) + SETTINGS index_granularity = 8192; + + INSERT INTO t1 SELECT number, number FROM numbers_mt(1e6); + + CREATE TABLE t2 + ( + a UInt32, + b UInt32 + ) + ENGINE = MergeTree + ORDER BY (a, b) + SETTINGS index_granularity = 8192; + + INSERT INTO t2 VALUES (1, 1) (2, 2) (3, 3); + + CREATE TABLE t + ( + a UInt32, + b UInt32 + ) + ENGINE = Merge(currentDatabase(), 't*');" + +query_id="${CLICKHOUSE_DATABASE}_merge_engine_set_index_$RANDOM$RANDOM" +$CLICKHOUSE_CLIENT --query_id="$query_id" --multiquery -q " +SELECT + a, + b +FROM t +WHERE (a, b) IN ( + SELECT DISTINCT + a, + b + FROM t2 +) +GROUP BY + a, + b +ORDER BY + a ASC, + b DESC +FORMAT Null;" + +$CLICKHOUSE_CLIENT -nq " +SYSTEM FLUSH LOGS; + +SELECT ProfileEvents['SelectedMarks'] +FROM system.query_log +WHERE event_date >= yesterday() AND current_database = currentDatabase() AND (query_id = '$query_id') AND (type = 'QueryFinish');" diff --git a/tests/queries/0_stateless/02882_formatQuery.reference b/tests/queries/0_stateless/02882_formatQuery.reference index fd84a9505b1..7907362a881 100644 --- a/tests/queries/0_stateless/02882_formatQuery.reference +++ b/tests/queries/0_stateless/02882_formatQuery.reference @@ -1,19 +1,44 @@ +-- formatQuery SELECT 1 SELECT 1 SELECT 1 -SELECT 1 +1 1 INSERT INTO tab FORMAT Values CREATE TABLE default.no_prop_table\n(\n `some_column` UInt64\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 EXPLAIN SYNTAX\nSELECT\n CAST(1, \'INT\'),\n CEIL(1),\n CEILING(1),\n CHAR(49),\n CHAR_LENGTH(\'1\'),\n CHARACTER_LENGTH(\'1\'),\n COALESCE(1),\n CONCAT(\'1\', \'1\'),\n CORR(1, 1),\n COS(1),\n COUNT(1),\n COVAR_POP(1, 1),\n COVAR_SAMP(1, 1),\n DATABASE(),\n SCHEMA(),\n dateDiff(\'DAY\', toDate(\'2020-10-24\'), toDate(\'2019-10-24\')),\n EXP(1),\n FLATTEN([[1]]),\n FLOOR(1),\n FQDN(),\n GREATEST(1),\n IF(1, 1, 1),\n IFNULL(1, 1),\n LCASE(\'A\'),\n LEAST(1),\n LENGTH(\'1\'),\n LN(1),\n LOCATE(\'1\', \'1\'),\n LOG(1),\n LOG10(1),\n LOG2(1),\n LOWER(\'A\'),\n MAX(1),\n MID(\'123\', 1, 1),\n MIN(1),\n MOD(1, 1),\n NOT 1,\n NOW(),\n NOW64(),\n NULLIF(1, 1),\n PI(),\n position(\'123\', \'2\'),\n POW(1, 1),\n POWER(1, 1),\n RAND(),\n REPLACE(\'1\', \'1\', \'2\'),\n REVERSE(\'123\'),\n ROUND(1),\n SIN(1),\n SQRT(1),\n STDDEV_POP(1),\n STDDEV_SAMP(1),\n SUBSTR(\'123\', 2),\n substring(\'123\', 2),\n SUM(1),\n TAN(1),\n TANH(1),\n TRUNC(1),\n TRUNCATE(1),\n UCASE(\'A\'),\n UPPER(\'A\'),\n USER(),\n VAR_POP(1),\n VAR_SAMP(1),\n WEEK(toDate(\'2020-10-24\')),\n YEARWEEK(toDate(\'2020-10-24\'))\nFORMAT TSVRaw +1 SELECT 1 SELECT 1 +2 SeLeCt 22 SELECT 22 +3 InSerT into TAB values (\'\') INSERT INTO TAB FORMAT Values +1 SELECT 1 SELECT 1 +2 SeLeCt 22 SELECT 22 +3 InSerT into TAB values (\'\') INSERT INTO TAB FORMAT Values +1 SELECT 1 SELECT 1 +2 SeLeCt 2 SELECT 2 +3 bad 3 \N +4 select 4 SELECT 4 +5 bad 5 \N +6 \N +7 SELECT 7 SELECT 7 +-- formatQuerySingleLine +SELECT 1 +SELECT 1 +SELECT 1 1 -formatQuerySingleLine -SELECT 1 -SELECT 1 -SELECT 1 -SELECT 1 1 INSERT INTO tab FORMAT Values CREATE TABLE default.no_prop_table (`some_column` UInt64) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192 EXPLAIN SYNTAX SELECT CAST(1, \'INT\'), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH(\'1\'), CHARACTER_LENGTH(\'1\'), COALESCE(1), CONCAT(\'1\', \'1\'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), SCHEMA(), dateDiff(\'DAY\', 
toDate(\'2020-10-24\'), toDate(\'2019-10-24\')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE(\'A\'), LEAST(1), LENGTH(\'1\'), LN(1), LOCATE(\'1\', \'1\'), LOG(1), LOG10(1), LOG2(1), LOWER(\'A\'), MAX(1), MID(\'123\', 1, 1), MIN(1), MOD(1, 1), NOT 1, NOW(), NOW64(), NULLIF(1, 1), PI(), position(\'123\', \'2\'), POW(1, 1), POWER(1, 1), RAND(), REPLACE(\'1\', \'1\', \'2\'), REVERSE(\'123\'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), SUBSTR(\'123\', 2), substring(\'123\', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE(\'A\'), UPPER(\'A\'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate(\'2020-10-24\')), YEARWEEK(toDate(\'2020-10-24\')) FORMAT TSVRaw -1 +1 SELECT 1 SELECT 1 +2 SeLeCt 22 SELECT 22 +3 InSerT into TAB values (\'\') INSERT INTO TAB FORMAT Values +1 SELECT 1 SELECT 1 +2 SeLeCt 22 SELECT 22 +3 InSerT into TAB values (\'\') INSERT INTO TAB FORMAT Values +1 SELECT 1 SELECT 1 +2 SeLeCt 2 SELECT 2 +3 bad 3 \N +4 select 4 SELECT 4 +5 bad 5 \N +6 \N +7 SELECT 7 SELECT 7 diff --git a/tests/queries/0_stateless/02882_formatQuery.sql b/tests/queries/0_stateless/02882_formatQuery.sql index 767283552d5..c3b3f202c9c 100644 --- a/tests/queries/0_stateless/02882_formatQuery.sql +++ b/tests/queries/0_stateless/02882_formatQuery.sql @@ -1,21 +1,51 @@ -SELECT formatQuery('select 1;'); -SELECT formatQuery('select 1'); +DROP TABLE IF EXISTS all_valid; +CREATE TABLE all_valid (id UInt64, query String) ENGINE=MergeTree ORDER BY id; +INSERT INTO all_valid VALUES (1, 'SELECT 1') (2, 'SeLeCt 22') (3, 'InSerT into TAB values (\'\')'); + +DROP TABLE IF EXISTS some_invalid; +CREATE TABLE some_invalid (id UInt64, query String) ENGINE=MergeTree ORDER BY id; +INSERT INTO some_invalid VALUES (1, 'SELECT 1') (2, 'SeLeCt 2') (3, 'bad 3') (4, 'select 4') (5, 'bad 5') (6, '') (7, 'SELECT 7'); + +SELECT '-- formatQuery'; + SELECT formatQuery('SELECT 1;'); SELECT formatQuery('SELECT 1'); -SELECT formatQuery('select 1;') == formatQuery('SeLecT 1'); +SELECT formatQuery('SeLeCt 1;'); +SELECT formatQuery('select 1;') == formatQuery('SeLeCt 1'); +SELECT normalizedQueryHash(formatQuery('select 1')) = normalizedQueryHash(formatQuery('SELECT 1')); + SELECT formatQuery('INSERT INTO tab VALUES (\'\') (\'test\')'); SELECT formatQuery('CREATE TABLE default.no_prop_table(`some_column` UInt64) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192'); SELECT formatQuery('EXPLAIN SYNTAX SELECT CAST(1 AS INT), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH(\'1\'), CHARACTER_LENGTH(\'1\'), COALESCE(1), CONCAT(\'1\', \'1\'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), SCHEMA(), DATEDIFF(\'DAY\', toDate(\'2020-10-24\'), toDate(\'2019-10-24\')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE(\'A\'), LEAST(1), LENGTH(\'1\'), LN(1), LOCATE(\'1\', \'1\'), LOG(1), LOG10(1), LOG2(1), LOWER(\'A\'), MAX(1), MID(\'123\', 1, 1), MIN(1), MOD(1, 1), NOT(1), NOW(), NOW64(), NULLIF(1, 1), PI(), POSITION(\'123\', \'2\'), POW(1, 1), POWER(1, 1), RAND(), REPLACE(\'1\', \'1\', \'2\'), REVERSE(\'123\'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), SUBSTR(\'123\', 2), SUBSTRING(\'123\', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE(\'A\'), UPPER(\'A\'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate(\'2020-10-24\')), YEARWEEK(toDate(\'2020-10-24\')) format TSVRaw;'); -SELECT normalizedQueryHash(formatQuery('select 1')) = normalizedQueryHash(formatQuery('SELECT 1')); + +SELECT 
formatQuery(''); -- { serverError SYNTAX_ERROR } SELECT formatQuery('SEECTwrong'); -- { serverError SYNTAX_ERROR } -SELECT 'formatQuerySingleLine'; -SELECT formatQuerySingleLine('select 1;'); -SELECT formatQuerySingleLine('select 1'); + +SELECT id, query, formatQuery(query) FROM all_valid ORDER BY id; +SELECT id, query, formatQuery(query) FROM some_invalid ORDER BY id; -- { serverError SYNTAX_ERROR } +SELECT id, query, formatQueryOrNull(query) FROM all_valid ORDER BY id; +SELECT id, query, formatQueryOrNull(query) FROM some_invalid ORDER BY id; + +SELECT '-- formatQuerySingleLine'; + SELECT formatQuerySingleLine('SELECT 1;'); SELECT formatQuerySingleLine('SELECT 1'); -SELECT formatQuerySingleLine('select 1;') == formatQuerySingleLine('SeLecT 1'); +SELECT formatQuerySingleLine('SeLeCt 1;'); +SELECT formatQuerySingleLine('select 1;') == formatQuerySingleLine('SeLeCt 1'); +SELECT normalizedQueryHash(formatQuerySingleLine('select 1')) = normalizedQueryHash(formatQuerySingleLine('SELECT 1')); + SELECT formatQuerySingleLine('INSERT INTO tab VALUES (\'\') (\'test\')'); + SELECT formatQuerySingleLine('CREATE TABLE default.no_prop_table(`some_column` UInt64) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192'); SELECT formatQuerySingleLine('EXPLAIN SYNTAX SELECT CAST(1 AS INT), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH(\'1\'), CHARACTER_LENGTH(\'1\'), COALESCE(1), CONCAT(\'1\', \'1\'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), SCHEMA(), DATEDIFF(\'DAY\', toDate(\'2020-10-24\'), toDate(\'2019-10-24\')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE(\'A\'), LEAST(1), LENGTH(\'1\'), LN(1), LOCATE(\'1\', \'1\'), LOG(1), LOG10(1), LOG2(1), LOWER(\'A\'), MAX(1), MID(\'123\', 1, 1), MIN(1), MOD(1, 1), NOT(1), NOW(), NOW64(), NULLIF(1, 1), PI(), POSITION(\'123\', \'2\'), POW(1, 1), POWER(1, 1), RAND(), REPLACE(\'1\', \'1\', \'2\'), REVERSE(\'123\'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), SUBSTR(\'123\', 2), SUBSTRING(\'123\', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE(\'A\'), UPPER(\'A\'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate(\'2020-10-24\')), YEARWEEK(toDate(\'2020-10-24\')) format TSVRaw;'); -SELECT normalizedQueryHash(formatQuerySingleLine('select 1')) = normalizedQueryHash(formatQuerySingleLine('SELECT 1')); + +SELECT formatQuerySingleLine(''); -- { serverError SYNTAX_ERROR } SELECT formatQuerySingleLine('SEECTwrong'); -- { serverError SYNTAX_ERROR } + +SELECT id, query, formatQuerySingleLine(query) FROM all_valid ORDER BY id; +SELECT id, query, formatQuerySingleLine(query) FROM some_invalid ORDER BY id; -- { serverError SYNTAX_ERROR } +SELECT id, query, formatQuerySingleLineOrNull(query) FROM all_valid ORDER BY id; +SELECT id, query, formatQuerySingleLineOrNull(query) FROM some_invalid ORDER BY id; + +DROP TABLE all_valid; +DROP TABLE some_invalid; diff --git a/tests/queries/0_stateless/02884_string_distance_function.reference b/tests/queries/0_stateless/02884_string_distance_function.reference index cedc23cc84d..3ac30825fd0 100644 --- a/tests/queries/0_stateless/02884_string_distance_function.reference +++ b/tests/queries/0_stateless/02884_string_distance_function.reference @@ -2,41 +2,58 @@ const arguments byteHammingDistance 0 const arguments editDistance 6 +const arguments stringJaccardIndex +0.4 byteHammingDistance -1 7 +1 7 10 byteHammingDistance(const, non const) 3 -6 3 +6 10 byteHammingDistance(non const, const) -6 -6 3 +6 +6 10 mismatches(alias) +7 1 7 -7 
+10 +3 +3 +6 10 3 6 -3 -10 6 -6 -3 10 +stringJaccardIndex +0 +0.8571428571428571 +0.8571428571428571 +0.4 +0 +0.8571428571428571 +0.8571428571428571 +0.4 +0.4 +0 +0 +0 +0 +0.25 0.625 editDistance -1 -1 7 +1 +1 6 levenshteinDistance -1 -1 7 +1 +1 6 diff --git a/tests/queries/0_stateless/02884_string_distance_function.sql b/tests/queries/0_stateless/02884_string_distance_function.sql index 1ddb9bfbafd..e3d9051ce5b 100644 --- a/tests/queries/0_stateless/02884_string_distance_function.sql +++ b/tests/queries/0_stateless/02884_string_distance_function.sql @@ -2,8 +2,9 @@ select 'const arguments byteHammingDistance'; select byteHammingDistance('abcd', 'abcd'); select 'const arguments editDistance'; select editDistance('clickhouse', 'mouse'); -/*select 'const arguments jaccardIndex'; -select jaccardIndex('clickhouse', 'mouse');*/ + +select 'const arguments stringJaccardIndex'; +select stringJaccardIndex('clickhouse', 'mouse'); drop table if exists t; create table t @@ -14,23 +15,40 @@ create table t insert into t values ('abcdefg', 'abcdef') ('abcdefg', 'bcdefg') ('abcdefg', '') ('mouse', 'clickhouse'); select 'byteHammingDistance'; -select byteHammingDistance(s1, s2) from t; +select byteHammingDistance(s1, s2) FROM t ORDER BY s1, s2; select 'byteHammingDistance(const, non const)'; -select byteHammingDistance('abc', s2) from t; +select byteHammingDistance('abc', s2) FROM t ORDER BY s1, s2; select 'byteHammingDistance(non const, const)'; -select byteHammingDistance(s2, 'def') from t; +select byteHammingDistance(s2, 'def') FROM t ORDER BY s1, s2; select 'mismatches(alias)'; -select mismatches(s1, s2) from t; -select mismatches('abc', s2) from t; -select mismatches(s2, 'def') from t; +select mismatches(s1, s2) FROM t ORDER BY s1, s2; +select mismatches('abc', s2) FROM t ORDER BY s1, s2; +select mismatches(s2, 'def') FROM t ORDER BY s1, s2; + +select 'stringJaccardIndex'; +select stringJaccardIndex(s1, s2) FROM t ORDER BY s1, s2; +select stringJaccardIndexUTF8(s1, s2) FROM t ORDER BY s1, s2; + +-- we do not perform full UTF8 validation, so sometimes it just returns some result +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\x48\x65\x6C')); +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF\xFF\xFF\xFF')); +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\x41\xE2\x82\xAC')); +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x9F\x99\x82')); +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF')); +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC2\x01')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC1\x81')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x80\x80\x41')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC0\x80')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xD8\x00 ')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xDC\x00')); -- { serverError BAD_ARGUMENTS } + +SELECT stringJaccardIndexUTF8('😃🌍', '🙃😃🌑'), stringJaccardIndex('😃🌍', '🙃😃🌑'); -/*select 'byteJaccardIndex'; -select byteJaccardIndex(s1, s2) from t;*/ select 'editDistance'; -select editDistance(s1, s2) from t; +select editDistance(s1, s2) FROM t ORDER BY s1, s2; select 'levenshteinDistance'; -select levenshteinDistance(s1, s2) from t; +select 
levenshteinDistance(s1, s2) FROM t ORDER BY s1, s2; SELECT editDistance(randomString(power(2, 17)), 'abc'); -- { serverError TOO_LARGE_STRING_SIZE} diff --git a/tests/queries/0_stateless/02888_obsolete_settings.reference b/tests/queries/0_stateless/02888_obsolete_settings.reference index e95b1c7aaed..6ee5216cd73 100644 --- a/tests/queries/0_stateless/02888_obsolete_settings.reference +++ b/tests/queries/0_stateless/02888_obsolete_settings.reference @@ -4,6 +4,7 @@ allow_experimental_bigint_types allow_experimental_database_atomic allow_experimental_geo_types allow_experimental_map_type +allow_experimental_query_cache allow_experimental_window_functions async_insert_cleanup_timeout_ms async_insert_stale_timeout_ms @@ -45,6 +46,7 @@ query_plan_optimize_projection replication_alter_columns_timeout restore_threads temporary_live_view_timeout +use_mysql_types_in_show_columns -- Obsolete merge tree settings check_delay_period in_memory_parts_enable_wal diff --git a/tests/queries/0_stateless/02888_system_tables_with_inaccsessible_table_function.reference b/tests/queries/0_stateless/02888_system_tables_with_inaccsessible_table_function.reference new file mode 100644 index 00000000000..5efe10177dd --- /dev/null +++ b/tests/queries/0_stateless/02888_system_tables_with_inaccsessible_table_function.reference @@ -0,0 +1,12 @@ +tablefunc01 StorageProxy CREATE TABLE default.tablefunc01 (`x` Int32) AS postgresql(\'127.121.0.1:5432\', \'postgres_db\', \'postgres_table\', \'postgres_user\', \'[HIDDEN]\') [] 1 1 +tablefunc02 StorageProxy CREATE TABLE default.tablefunc02 (`x` Int32) AS mysql(\'127.123.0.1:3306\', \'mysql_db\', \'mysql_table\', \'mysql_user\', \'[HIDDEN]\') [] 1 1 +tablefunc03 StorageProxy CREATE TABLE default.tablefunc03 (`a` Int32) AS sqlite(\'db_path\', \'table_name\') [] 1 1 +tablefunc04 StorageProxy CREATE TABLE default.tablefunc04 (`a` Int32) AS mongodb(\'127.0.0.1:27017\', \'test\', \'my_collection\', \'test_user\', \'[HIDDEN]\', \'a Int\') [] 1 1 +tablefunc05 StorageProxy CREATE TABLE default.tablefunc05 (`a` Int32) AS redis(\'127.0.0.1:6379\', \'key\', \'key UInt32\') [] 1 1 +tablefunc06 StorageProxy CREATE TABLE default.tablefunc06 (`a` Int32) AS s3(\'http://some_addr:9000/cloud-storage-01/data.tsv\', \'M9O7o0SX5I4udXhWxI12\', \'[HIDDEN]\', \'TSV\') [] 1 1 +tablefunc01 StorageProxy CREATE TABLE default.tablefunc01 (`x` Int32) AS postgresql(\'127.121.0.1:5432\', \'postgres_db\', \'postgres_table\', \'postgres_user\', \'[HIDDEN]\') [] 1 1 +tablefunc02 StorageProxy CREATE TABLE default.tablefunc02 (`x` Int32) AS mysql(\'127.123.0.1:3306\', \'mysql_db\', \'mysql_table\', \'mysql_user\', \'[HIDDEN]\') [] 1 1 +tablefunc03 StorageProxy CREATE TABLE default.tablefunc03 (`a` Int32) AS sqlite(\'db_path\', \'table_name\') [] 1 1 +tablefunc04 StorageProxy CREATE TABLE default.tablefunc04 (`a` Int32) AS mongodb(\'127.0.0.1:27017\', \'test\', \'my_collection\', \'test_user\', \'[HIDDEN]\', \'a Int\') [] 1 1 +tablefunc05 StorageProxy CREATE TABLE default.tablefunc05 (`a` Int32) AS redis(\'127.0.0.1:6379\', \'key\', \'key UInt32\') [] 1 1 +tablefunc06 StorageProxy CREATE TABLE default.tablefunc06 (`a` Int32) AS s3(\'http://some_addr:9000/cloud-storage-01/data.tsv\', \'M9O7o0SX5I4udXhWxI12\', \'[HIDDEN]\', \'TSV\') [] 1 1 diff --git a/tests/queries/0_stateless/02888_system_tables_with_inaccsessible_table_function.sql b/tests/queries/0_stateless/02888_system_tables_with_inaccsessible_table_function.sql new file mode 100644 index 00000000000..adcdeecb9e1 --- /dev/null +++ 
b/tests/queries/0_stateless/02888_system_tables_with_inaccsessible_table_function.sql @@ -0,0 +1,43 @@ +-- Tags: no-fasttest + +DROP DATABASE IF EXISTS {CLICKHOUSE_DATABASE:Identifier}; + +CREATE DATABASE {CLICKHOUSE_DATABASE:Identifier}; + + +CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc01 (x int) AS postgresql('127.121.0.1:5432', 'postgres_db', 'postgres_table', 'postgres_user', '124444'); +CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc02 (x int) AS mysql('127.123.0.1:3306', 'mysql_db', 'mysql_table', 'mysql_user','123123'); +CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc03 (a int) AS sqlite('db_path', 'table_name'); +CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc04 (a int) AS mongodb('127.0.0.1:27017','test', 'my_collection', 'test_user', 'password', 'a Int'); +CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc05 (a int) AS redis('127.0.0.1:6379', 'key', 'key UInt32'); +CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc06 (a int) AS s3('http://some_addr:9000/cloud-storage-01/data.tsv', 'M9O7o0SX5I4udXhWxI12', '9ijqzmVN83fzD9XDkEAAAAAAAA', 'TSV'); + + +CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc01_without_schema AS postgresql('127.121.0.1:5432', 'postgres_db', 'postgres_table', 'postgres_user', '124444'); -- { serverError 614 } +CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc02_without_schema AS mysql('127.123.0.1:3306', 'mysql_db', 'mysql_table', 'mysql_user','123123'); -- {serverError 279 } + +SELECT name, engine, engine_full, create_table_query, data_paths, notEmpty([metadata_path]), notEmpty([uuid]) + FROM system.tables + WHERE name like '%tablefunc%' and database=currentDatabase() + ORDER BY name; + +DETACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc01; +DETACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc02; +DETACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc03; +DETACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc04; +DETACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc05; +DETACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc06; + +ATTACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc01; +ATTACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc02; +ATTACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc03; +ATTACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc04; +ATTACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc05; +ATTACH TABLE {CLICKHOUSE_DATABASE:Identifier}.tablefunc06; + +SELECT name, engine, engine_full, create_table_query, data_paths, notEmpty([metadata_path]), notEmpty([uuid]) + FROM system.tables + WHERE name like '%tablefunc%' and database=currentDatabase() + ORDER BY name; + +DROP DATABASE IF EXISTS {CLICKHOUSE_DATABASE:Identifier}; diff --git a/tests/queries/0_stateless/02889_datetime64_from_string.reference b/tests/queries/0_stateless/02889_datetime64_from_string.reference index e6e2208ed4c..825ed2b7ff4 100644 --- a/tests/queries/0_stateless/02889_datetime64_from_string.reference +++ b/tests/queries/0_stateless/02889_datetime64_from_string.reference @@ -1,3 +1,5 @@ 1969-12-31 23:57:57.000 1970-01-01 00:00:23.900 1969-12-31 23:59:36.100 +\N +\N diff --git a/tests/queries/0_stateless/02889_datetime64_from_string.sql b/tests/queries/0_stateless/02889_datetime64_from_string.sql index 50c29de19bd..99ace8a6ea4 100644 --- a/tests/queries/0_stateless/02889_datetime64_from_string.sql +++ b/tests/queries/0_stateless/02889_datetime64_from_string.sql @@ -2,4 +2,9 @@ SELECT toDateTime64('-123', 3, 'UTC'); -- Allowed: no year starts with '-' SELECT toDateTime64('23.9', 3, 'UTC'); -- 
Allowed: no year has a dot in notation SELECT toDateTime64('-23.9', 3, 'UTC'); -- Allowed -SELECT toDateTime64('1234', 3, 'UTC'); -- { serverError CANNOT_PARSE_DATETIME } +SELECT toDateTime64OrNull('0', 3, 'UTC'); +SELECT cast('0' as Nullable(DateTime64(3, 'UTC'))); + +SELECT toDateTime64('1234', 3, 'UTC'); -- { serverError CANNOT_PARSE_DATETIME } +SELECT toDateTime64('0', 3, 'UTC'); -- { serverError CANNOT_PARSE_DATETIME } +SELECT cast('0' as DateTime64(3, 'UTC')); -- { serverError CANNOT_PARSE_DATETIME } diff --git a/tests/queries/0_stateless/02890_untuple_column_names.reference b/tests/queries/0_stateless/02890_untuple_column_names.reference index 831d5dac39a..388f974c45f 100644 --- a/tests/queries/0_stateless/02890_untuple_column_names.reference +++ b/tests/queries/0_stateless/02890_untuple_column_names.reference @@ -5,12 +5,12 @@ tupleElement(CAST(tuple(1), 'Tuple(a Int)'), 1): 1 tupleElement(CAST(tuple('s'), 'Tuple(a String)'), 1): s Row 1: ────── -tupleElement(CAST(tuple(1), 'Tuple(a Int)'), 'a'): 1 -tupleElement(CAST(tuple('s'), 'Tuple(a String)'), 'a'): s +tupleElement(CAST((1), 'Tuple(a Int)'), 'a'): 1 +tupleElement(CAST(('s'), 'Tuple(a String)'), 'a'): s Row 1: ────── -tupleElement(CAST(tuple(1), 'Tuple(a Int)'), 'a'): 1 -tupleElement(CAST(tuple(1), 'Tuple(a Int)'), 'a'): 1 +tupleElement(CAST((1), 'Tuple(a Int)'), 'a'): 1 +tupleElement(CAST((1), 'Tuple(a Int)'), 'a'): 1 -- tuple element alias + untuple() alias Row 1: ────── @@ -44,12 +44,12 @@ tupleElement(CAST(tuple(1), 'Tuple(Int)'), 1): 1 tupleElement(CAST(tuple('s'), 'Tuple(String)'), 1): s Row 1: ────── -tupleElement(CAST(tuple(1), 'Tuple(Int)'), '1'): 1 -tupleElement(CAST(tuple('s'), 'Tuple(String)'), '1'): s +tupleElement(CAST((1), 'Tuple(Int)'), '1'): 1 +tupleElement(CAST(('s'), 'Tuple(String)'), '1'): s Row 1: ────── -tupleElement(CAST(tuple(1), 'Tuple(Int)'), '1'): 1 -tupleElement(CAST(tuple(1), 'Tuple(Int)'), '1'): 1 +tupleElement(CAST((1), 'Tuple(Int)'), '1'): 1 +tupleElement(CAST((1), 'Tuple(Int)'), '1'): 1 -- tuple() loses the column names (would be good to fix, see #36773) Row 1: ────── diff --git a/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.reference b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.reference new file mode 100644 index 00000000000..9a9f63dc0a5 --- /dev/null +++ b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.reference @@ -0,0 +1,3 @@ +A 110 208819249 +B 112 208819248 +C 123 783434434 diff --git a/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh new file mode 100755 index 00000000000..1e2f647fae3 --- /dev/null +++ b/tests/queries/0_stateless/02891_input_csv_cr_end_of_line.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# NOTE: this sh wrapper is required because of shell_config + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_tbl" +$CLICKHOUSE_CLIENT -q "create table test_tbl (a String, b String, c String) engine=MergeTree order by a" +cat $CURDIR/data_csv/csv_with_cr_end_of_line.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_tbl SETTINGS input_format_csv_allow_cr_end_of_line=true FORMAT CSV" +$CLICKHOUSE_CLIENT -q "select * from test_tbl" +$CLICKHOUSE_CLIENT -q "drop table test_tbl" \ No newline at end of file diff --git a/tests/queries/0_stateless/02898_parallel_replicas_progress_bar.reference b/tests/queries/0_stateless/02898_parallel_replicas_progress_bar.reference new file mode 100644 index 00000000000..c66597436f3 --- /dev/null +++ b/tests/queries/0_stateless/02898_parallel_replicas_progress_bar.reference @@ -0,0 +1,8 @@ +3000 1000 3999 2499.5 +1 +1998 2944475297004403859 +1999 254596732598015005 +2000 6863370867519437063 +2001 17844331710293705251 +2002 1587587338113897332 +1 diff --git a/tests/queries/0_stateless/02898_parallel_replicas_progress_bar.sql b/tests/queries/0_stateless/02898_parallel_replicas_progress_bar.sql new file mode 100644 index 00000000000..70a1cedf663 --- /dev/null +++ b/tests/queries/0_stateless/02898_parallel_replicas_progress_bar.sql @@ -0,0 +1,38 @@ +DROP TABLE IF EXISTS t1 SYNC; +DROP TABLE IF EXISTS t2 SYNC; +DROP TABLE IF EXISTS t3 SYNC; + +CREATE TABLE t1(k UInt32, v String) ENGINE ReplicatedMergeTree('/parallel_replicas/{database}/test_tbl', 'r1') ORDER BY k; +CREATE TABLE t2(k UInt32, v String) ENGINE ReplicatedMergeTree('/parallel_replicas/{database}/test_tbl', 'r2') ORDER BY k; +CREATE TABLE t3(k UInt32, v String) ENGINE ReplicatedMergeTree('/parallel_replicas/{database}/test_tbl', 'r3') ORDER BY k; + +insert into t1 select number, toString(number) from numbers(1000, 1000); +insert into t2 select number, toString(number) from numbers(2000, 1000); +insert into t3 select number, toString(number) from numbers(3000, 1000); + +system sync replica t1; +system sync replica t2; +system sync replica t3; + +SET allow_experimental_parallel_reading_from_replicas=1, max_parallel_replicas=3, use_hedged_requests=0, parallel_replicas_for_non_replicated_merge_tree=1, cluster_for_parallel_replicas='test_cluster_one_shard_three_replicas_localhost'; + +-- default coordinator +SELECT count(), min(k), max(k), avg(k) FROM t1 SETTINGS log_comment='02898_default_190aed82-2423-413b-ad4c-24dcca50f65b'; + +-- check logs +SYSTEM FLUSH LOGS; +SELECT count() > 0 FROM system.text_log +WHERE query_id in (select query_id from system.query_log where current_database = currentDatabase() AND log_comment='02898_default_190aed82-2423-413b-ad4c-24dcca50f65b') + AND message LIKE '%Total rows to read: 3000%' SETTINGS allow_experimental_parallel_reading_from_replicas=0; + +-- reading in order coordinator +SELECT k, sipHash64(v) FROM t1 order by k limit 5 offset 998 SETTINGS optimize_read_in_order=1, log_comment='02898_inorder_190aed82-2423-413b-ad4c-24dcca50f65b'; + +SYSTEM FLUSH LOGS; +SELECT count() > 0 FROM system.text_log +WHERE query_id in (select query_id from system.query_log where current_database = currentDatabase() AND log_comment='02898_inorder_190aed82-2423-413b-ad4c-24dcca50f65b') + AND message LIKE '%Updated total rows to read: added % rows, total 3000 rows%' SETTINGS allow_experimental_parallel_reading_from_replicas=0; + +DROP TABLE t1 SYNC; +DROP TABLE t2 SYNC; +DROP TABLE t3 SYNC; diff --git a/tests/queries/0_stateless/02907_backup_mv_with_no_inner_table.reference 
b/tests/queries/0_stateless/02907_backup_mv_with_no_inner_table.reference new file mode 100644 index 00000000000..4c0c7fb5bd1 --- /dev/null +++ b/tests/queries/0_stateless/02907_backup_mv_with_no_inner_table.reference @@ -0,0 +1,3 @@ +BACKUP_CREATED +RESTORED +0 diff --git a/tests/queries/0_stateless/02907_backup_mv_with_no_inner_table.sh b/tests/queries/0_stateless/02907_backup_mv_with_no_inner_table.sh new file mode 100755 index 00000000000..30ec50fa20f --- /dev/null +++ b/tests/queries/0_stateless/02907_backup_mv_with_no_inner_table.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Tags: no-ordinary-database + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -nm --query " +drop table if exists src; +create table src (a Int32) engine = MergeTree() order by tuple(); + +drop table if exists mv; +create materialized view mv (a Int32) engine = MergeTree() order by tuple() as select * from src; +" + +uuid=$(${CLICKHOUSE_CLIENT} --query "select uuid from system.tables where table='mv' and database == currentDatabase()") +inner_table=".inner_id.${uuid}" +${CLICKHOUSE_CLIENT} -nm --query "drop table \`$inner_table\` sync" + +${CLICKHOUSE_CLIENT} -nm --query " +set send_logs_level = 'error'; +backup table ${CLICKHOUSE_DATABASE}.\`mv\` to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); +" | grep -o "BACKUP_CREATED" + +${CLICKHOUSE_CLIENT} -nm --query " +drop table mv; +restore table ${CLICKHOUSE_DATABASE}.\`mv\` from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); +" | grep -o "RESTORED" + +${CLICKHOUSE_CLIENT} --query "select count() from mv;" diff --git a/tests/queries/0_stateless/02907_backup_restore_default_nullable.reference b/tests/queries/0_stateless/02907_backup_restore_default_nullable.reference new file mode 100644 index 00000000000..0aed0444667 --- /dev/null +++ b/tests/queries/0_stateless/02907_backup_restore_default_nullable.reference @@ -0,0 +1,4 @@ +BACKUP_CREATED +CREATE TABLE default.test\n(\n `test` String\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +RESTORED +CREATE TABLE default.test\n(\n `test` String\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 diff --git a/tests/queries/0_stateless/02907_backup_restore_default_nullable.sh b/tests/queries/0_stateless/02907_backup_restore_default_nullable.sh new file mode 100755 index 00000000000..8ed36a7edd7 --- /dev/null +++ b/tests/queries/0_stateless/02907_backup_restore_default_nullable.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -nm --query " +drop table if exists test; +set data_type_default_nullable = 0; +create table test (test String) ENGINE = MergeTree() ORDER BY tuple(); +backup table ${CLICKHOUSE_DATABASE}.test on cluster test_shard_localhost to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); +" | grep -o "BACKUP_CREATED" + +${CLICKHOUSE_CLIENT} --query "show create table test" + +${CLICKHOUSE_CLIENT} -nm --query " +drop table test sync; +set data_type_default_nullable = 1; +restore table ${CLICKHOUSE_DATABASE}.test on cluster test_shard_localhost from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); +" | grep -o "RESTORED" + +${CLICKHOUSE_CLIENT} --query "show create table test" diff --git a/tests/queries/0_stateless/02907_backup_restore_flatten_nested.reference b/tests/queries/0_stateless/02907_backup_restore_flatten_nested.reference new file mode 100644 index 00000000000..aa8f22f590a --- /dev/null +++ b/tests/queries/0_stateless/02907_backup_restore_flatten_nested.reference @@ -0,0 +1,8 @@ +BACKUP_CREATED +CREATE TABLE default.test\n(\n `test` Array(Tuple(foo String, bar Float64))\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +BACKUP_CREATED +CREATE TABLE default.test2\n(\n `test` Nested(foo String, bar Float64)\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +RESTORED +CREATE TABLE default.test\n(\n `test` Array(Tuple(foo String, bar Float64))\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +RESTORED +CREATE TABLE default.test2\n(\n `test` Nested(foo String, bar Float64)\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 diff --git a/tests/queries/0_stateless/02907_backup_restore_flatten_nested.sh b/tests/queries/0_stateless/02907_backup_restore_flatten_nested.sh new file mode 100755 index 00000000000..742d24a97eb --- /dev/null +++ b/tests/queries/0_stateless/02907_backup_restore_flatten_nested.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -nm --query " +drop table if exists test; +set flatten_nested = 0; +create table test (test Array(Tuple(foo String, bar Float64))) ENGINE = MergeTree() ORDER BY tuple(); +backup table ${CLICKHOUSE_DATABASE}.test on cluster test_shard_localhost to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); +" | grep -o "BACKUP_CREATED" + +${CLICKHOUSE_CLIENT} --query "show create table test" + +${CLICKHOUSE_CLIENT} -nm --query " +drop table if exists test2; +set flatten_nested = 0; +create table test2 (test Nested(foo String, bar Float64)) ENGINE = MergeTree() ORDER BY tuple(); +backup table ${CLICKHOUSE_DATABASE}.test2 on cluster test_shard_localhost to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}2'); +" | grep -o "BACKUP_CREATED" + +${CLICKHOUSE_CLIENT} --query "show create table test2" + +${CLICKHOUSE_CLIENT} -nm --query " +drop table test sync; +set flatten_nested = 1; +restore table ${CLICKHOUSE_DATABASE}.test on cluster test_shard_localhost from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); +" | grep -o "RESTORED" + +${CLICKHOUSE_CLIENT} --query "show create table test" + +${CLICKHOUSE_CLIENT} -nm --query " +drop table test2 sync; +set flatten_nested = 1; +restore table ${CLICKHOUSE_DATABASE}.test2 on cluster test_shard_localhost from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}2'); +" | grep -o "RESTORED" + +${CLICKHOUSE_CLIENT} --query "show create table test2" diff --git a/tests/queries/0_stateless/02907_clickhouse_dictionary_bug.reference b/tests/queries/0_stateless/02907_clickhouse_dictionary_bug.reference new file mode 100644 index 00000000000..61780798228 --- /dev/null +++ b/tests/queries/0_stateless/02907_clickhouse_dictionary_bug.reference @@ -0,0 +1 @@ +b diff --git a/tests/queries/0_stateless/02907_clickhouse_dictionary_bug.sh b/tests/queries/0_stateless/02907_clickhouse_dictionary_bug.sh new file mode 100755 index 00000000000..57182050534 --- /dev/null +++ b/tests/queries/0_stateless/02907_clickhouse_dictionary_bug.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# Tags: zookeeper + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -n -q " + DROP DICTIONARY IF EXISTS 02907_dictionary; + DROP TABLE IF EXISTS 02907_table; + + CREATE TABLE 02907_table (A String, B String) ENGINE=Memory AS SELECT 'a', 'b'; + CREATE DICTIONARY 02907_dictionary(A String, B String) PRIMARY KEY A + SOURCE(CLICKHOUSE(QUERY \$\$ SELECT A, B FROM ${CLICKHOUSE_DATABASE}.02907_table ORDER BY A DESC LIMIT 1 BY A \$\$)) + LAYOUT(complex_key_direct()); + + SELECT dictGet('02907_dictionary','B','a'); + + DROP DICTIONARY 02907_dictionary; + DROP TABLE 02907_table;" diff --git a/tests/queries/0_stateless/02907_filter_pushdown_crash.reference b/tests/queries/0_stateless/02907_filter_pushdown_crash.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02907_filter_pushdown_crash.sql b/tests/queries/0_stateless/02907_filter_pushdown_crash.sql new file mode 100644 index 00000000000..eb881823f27 --- /dev/null +++ b/tests/queries/0_stateless/02907_filter_pushdown_crash.sql @@ -0,0 +1,18 @@ +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t1 (key UInt8) ENGINE = MergeTree ORDER BY key; +INSERT INTO t1 VALUES (1),(2); + +CREATE TABLE t2 (key UInt32) ENGINE = MergeTree ORDER BY key; +INSERT INTO t2 VALUES (1),(2); + +SELECT a FROM ( SELECT key + 1 as a, key FROM t1 GROUP BY key ) WHERE key FORMAT Null; + +SET join_algorithm = 'full_sorting_merge'; +SET max_rows_in_set_to_optimize_join = 0; + +SELECT key FROM ( SELECT key FROM t1 GROUP BY key ) t1 JOIN (SELECT key FROM t2) t2 ON t1.key = t2.key WHERE key FORMAT Null; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; diff --git a/tests/queries/0_stateless/02907_preferred_optimize_projection_name.reference b/tests/queries/0_stateless/02907_preferred_optimize_projection_name.reference new file mode 100644 index 00000000000..8e2a7d8af18 --- /dev/null +++ b/tests/queries/0_stateless/02907_preferred_optimize_projection_name.reference @@ -0,0 +1,8 @@ +test +projection_test_by_string +Executing query with setting +test +projection_test_by_more +Executing query with wrong projection +test +projection_test_by_string diff --git a/tests/queries/0_stateless/02907_preferred_optimize_projection_name.sh b/tests/queries/0_stateless/02907_preferred_optimize_projection_name.sh new file mode 100755 index 00000000000..5a109605c3d --- /dev/null +++ b/tests/queries/0_stateless/02907_preferred_optimize_projection_name.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_opt_proj;" + +$CLICKHOUSE_CLIENT -q " +CREATE TABLE test_opt_proj ( + test_id UInt64, + test_name String, + test_count Nullable(Float64), + test_string String, + PROJECTION projection_test_by_string ( + SELECT test_string, + sum(test_count) + GROUP BY test_id, + test_string, + test_name + ), + PROJECTION projection_test_by_more ( + SELECT test_string, + test_name, + sum(test_count) + GROUP BY test_id, + test_string, + test_name + ) +) ENGINE = MergeTree +ORDER BY test_string;" + +$CLICKHOUSE_CLIENT -q " +INSERT INTO test_opt_proj +SELECT number, + 'test', + 1.* (number / 2), + 'test' +FROM numbers(100, 500);" + +$CLICKHOUSE_CLIENT --query_id 02907_test_$CLICKHOUSE_DATABASE -q " +SELECT test_string +FROM test_opt_proj +WHERE (test_id > 50) + AND (test_id < 150) +GROUP BY test_string;" + +$CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS;" + +$CLICKHOUSE_CLIENT -q " +SELECT projections +FROM system.query_log +WHERE query_id = '02907_test_$CLICKHOUSE_DATABASE' AND current_database=currentDatabase() +LIMIT 1;" | grep -o "projection_test_by_string" || true + +$CLICKHOUSE_CLIENT -q " +SELECT projections +FROM system.query_log +WHERE query_id = '02907_test_$CLICKHOUSE_DATABASE' AND current_database=currentDatabase() +LIMIT 1;" | grep -o "projection_test_by_more" || true + +echo "Executing query with setting" + +$CLICKHOUSE_CLIENT --query_id 02907_test_1_$CLICKHOUSE_DATABASE --preferred_optimize_projection_name 'projection_test_by_more' -q " +SELECT test_string +FROM test_opt_proj +WHERE (test_id > 50) + AND (test_id < 150) +GROUP BY test_string;" + +$CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS;" + +$CLICKHOUSE_CLIENT -q " +SELECT projections +FROM system.query_log +WHERE query_id = '02907_test_1_$CLICKHOUSE_DATABASE' AND current_database=currentDatabase() +LIMIT 1;" | grep -o "projection_test_by_more" || true + +$CLICKHOUSE_CLIENT -q " +SELECT projections +FROM system.query_log +WHERE query_id = '02907_test_1_$CLICKHOUSE_DATABASE' AND current_database=currentDatabase() +LIMIT 1" | grep -o "projection_test_by_string" || true + +echo "Executing query with wrong projection" + +$CLICKHOUSE_CLIENT --query_id 02907_test_2_$CLICKHOUSE_DATABASE --preferred_optimize_projection_name 'non_existing_projection' -q " +SELECT test_string +FROM test_opt_proj +WHERE (test_id > 50) + AND (test_id < 150) +GROUP BY test_string;" + +$CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS;" + +$CLICKHOUSE_CLIENT -q " +SELECT projections +FROM system.query_log +WHERE query_id = '02907_test_2_$CLICKHOUSE_DATABASE' AND current_database=currentDatabase() +LIMIT 1;" | grep -o "projection_test_by_string" || true + +$CLICKHOUSE_CLIENT -q " +SELECT projections +FROM system.query_log +WHERE query_id = '02907_test_2_$CLICKHOUSE_DATABASE' AND current_database=currentDatabase() +LIMIT 1;" | grep -o "projection_test_by_more" || true diff --git a/tests/queries/0_stateless/02907_system_backups_profile_events.reference b/tests/queries/0_stateless/02907_system_backups_profile_events.reference new file mode 100644 index 00000000000..46581c6a12b --- /dev/null +++ b/tests/queries/0_stateless/02907_system_backups_profile_events.reference @@ -0,0 +1,2 @@ +BACKUP_CREATED +1 diff --git a/tests/queries/0_stateless/02907_system_backups_profile_events.sh b/tests/queries/0_stateless/02907_system_backups_profile_events.sh new file mode 100755 index 00000000000..801056a2844 --- /dev/null +++ b/tests/queries/0_stateless/02907_system_backups_profile_events.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash 
+ +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -nm --query " +drop table if exists test; +create table test (a Int32) engine = MergeTree() order by tuple(); +" + +backup_id=${CLICKHOUSE_TEST_UNIQUE_NAME} +backup_name="Disk('backups', '$backup_id')"; + +${CLICKHOUSE_CLIENT} -nm --query " +backup table ${CLICKHOUSE_DATABASE}.test to $backup_name; +" | grep -o "BACKUP_CREATED" + +${CLICKHOUSE_CLIENT} -nm --query " +select ProfileEvents['BackupEntriesCollectorMicroseconds'] > 10 from system.backups where name='Disk(\'backups\', \'$backup_id\')' +" diff --git a/tests/queries/0_stateless/02910_nullable_enum_cast.reference b/tests/queries/0_stateless/02910_nullable_enum_cast.reference new file mode 100644 index 00000000000..6ce48f2ea6b --- /dev/null +++ b/tests/queries/0_stateless/02910_nullable_enum_cast.reference @@ -0,0 +1,4 @@ +\N +\N +A +A diff --git a/tests/queries/0_stateless/02910_nullable_enum_cast.sql b/tests/queries/0_stateless/02910_nullable_enum_cast.sql new file mode 100644 index 00000000000..09189539c5a --- /dev/null +++ b/tests/queries/0_stateless/02910_nullable_enum_cast.sql @@ -0,0 +1,4 @@ +SELECT CAST(materialize(CAST(NULL, 'Nullable(Enum(\'A\' = 1, \'B\' = 2))')), 'Nullable(String)'); +SELECT CAST(CAST(NULL, 'Nullable(Enum(\'A\' = 1, \'B\' = 2))'), 'Nullable(String)'); +SELECT CAST(materialize(CAST(1, 'Nullable(Enum(\'A\' = 1, \'B\' = 2))')), 'Nullable(String)'); +SELECT CAST(CAST(1, 'Nullable(Enum(\'A\' = 1, \'B\' = 2))'), 'Nullable(String)'); diff --git a/tests/queries/0_stateless/02910_object-json-crash-add-column.reference b/tests/queries/0_stateless/02910_object-json-crash-add-column.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02910_object-json-crash-add-column.sql b/tests/queries/0_stateless/02910_object-json-crash-add-column.sql new file mode 100644 index 00000000000..b2d64be1676 --- /dev/null +++ b/tests/queries/0_stateless/02910_object-json-crash-add-column.sql @@ -0,0 +1,49 @@ +DROP TABLE IF EXISTS test02910; + +CREATE TABLE test02910 +( + i Int8, + jString String +) ENGINE = MergeTree +ORDER BY i; + +INSERT INTO test02910 (i, jString) SELECT 1, '{"a":"123"}'; + +ALTER TABLE test02910 ADD COLUMN j2 Tuple(JSON) DEFAULT jString; -- { serverError SUPPORT_IS_DISABLED } +ALTER TABLE test02910 ADD COLUMN j2 Tuple(Float64, JSON); -- { serverError SUPPORT_IS_DISABLED } +ALTER TABLE test02910 ADD COLUMN j2 Tuple(Array(Tuple(JSON))) DEFAULT jString; -- { serverError SUPPORT_IS_DISABLED } +ALTER TABLE test02910 ADD COLUMN j2 JSON default jString; -- { serverError SUPPORT_IS_DISABLED } + +-- If we would allow adding a column with dynamic subcolumns the subsequent select would crash the server. 
+-- SELECT * FROM test02910; + +DROP TABLE IF EXISTS test02910_second; + +CREATE TABLE test02910_second +( + `Id1` String, + `Id2` String, + `timestamp` DateTime64(6), + `tags` Array(String), +) +ENGINE = MergeTree +PRIMARY KEY (Id1, Id2) +ORDER BY (Id1, Id2, timestamp) +SETTINGS index_granularity = 8192, index_granularity_bytes = 0; + +INSERT INTO test02910_second SELECT number, number, '2023-10-28 11:11:11.11111', [] FROM numbers(10); +INSERT INTO test02910_second SELECT number, number, '2023-10-28 11:11:11.11111', ['a'] FROM numbers(10); +INSERT INTO test02910_second SELECT number, number, '2023-10-28 11:11:11.11111', ['b'] FROM numbers(10); +INSERT INTO test02910_second SELECT number, number, '2023-10-28 11:11:11.11111', ['c', 'd'] FROM numbers(10); +INSERT INTO test02910_second SELECT number, number, '2023-10-28 11:11:11.11111', [] FROM numbers(10); + +ALTER TABLE test02910_second ADD COLUMN `tags_json` Tuple(JSON) DEFAULT jString; -- { serverError SUPPORT_IS_DISABLED } +ALTER TABLE test02910_second ADD COLUMN `tags_json` Tuple(Float64, JSON); -- { serverError SUPPORT_IS_DISABLED } +ALTER TABLE test02910_second ADD COLUMN `tags_json` Tuple(Array(Tuple(JSON))) DEFAULT jString; -- { serverError SUPPORT_IS_DISABLED } +ALTER TABLE test02910_second ADD COLUMN `tags_json` JSON; -- { serverError SUPPORT_IS_DISABLED } + +-- If we would allow adding a column with dynamic subcolumns the subsequent select would crash the server. +-- SELECT * FROM test02910; + +DROP TABLE IF EXISTS test02910; +DROP TABLE IF EXISTS test02910_second; diff --git a/tests/queries/0_stateless/02911_analyzer_order_by_read_in_order_query_plan.reference b/tests/queries/0_stateless/02911_analyzer_order_by_read_in_order_query_plan.reference new file mode 100644 index 00000000000..5dd0d0d1820 --- /dev/null +++ b/tests/queries/0_stateless/02911_analyzer_order_by_read_in_order_query_plan.reference @@ -0,0 +1,410 @@ +-- { echoOn } + +-- Exact match, single key +select * from tab order by (a + b) * c; +0 0 0 0 +0 0 0 0 +1 1 1 1 +1 1 1 1 +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +select * from (explain plan actions = 1 select * from tab order by (a + b) * c) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC +select * from tab order by (a + b) * c desc; +4 4 4 4 +4 4 4 4 +3 3 3 3 +3 3 3 3 +2 2 2 2 +2 2 2 2 +1 1 1 1 +1 1 1 1 +0 0 0 0 +0 0 0 0 +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) DESC + Result sort description: multiply(plus(a_0, b_1), c_2) DESC +-- Exact match, full key +select * from tab order by (a + b) * c, sin(a / b); +0 0 0 0 +0 0 0 0 +1 1 1 1 +1 1 1 1 +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, sin(a / b)) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC +select * from tab order by (a + b) * c desc, sin(a / b) desc; +4 4 4 4 +4 4 4 4 +3 3 3 3 +3 3 3 3 +2 2 2 2 +2 2 2 2 +1 1 1 1 +1 1 1 1 +0 0 0 0 +0 0 0 0 +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, sin(a / b) desc) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) 
DESC, sin(divide(a_0, b_1)) DESC + Result sort description: multiply(plus(a_0, b_1), c_2) DESC, sin(divide(a_0, b_1)) DESC +-- Exact match, mixed direction +select * from tab order by (a + b) * c desc, sin(a / b); +4 4 4 4 +4 4 4 4 +3 3 3 3 +3 3 3 3 +2 2 2 2 +2 2 2 2 +1 1 1 1 +1 1 1 1 +0 0 0 0 +0 0 0 0 +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, sin(a / b)) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) DESC + Result sort description: multiply(plus(a_0, b_1), c_2) DESC, sin(divide(a_0, b_1)) ASC +select * from tab order by (a + b) * c, sin(a / b) desc; +0 0 0 0 +0 0 0 0 +1 1 1 1 +1 1 1 1 +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, sin(a / b) desc) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) DESC +-- Wrong order, full sort +select * from tab order by sin(a / b), (a + b) * c; +1 1 1 1 +1 1 1 1 +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +0 0 0 0 +0 0 0 0 +select * from (explain plan actions = 1 select * from tab order by sin(a / b), (a + b) * c) where explain ilike '%sort description%'; + Sort description: sin(divide(a_0, b_1)) ASC, multiply(plus(a_0, b_1), c_2) ASC +-- Fixed point +select * from tab where (a + b) * c = 8 order by sin(a / b); +2 2 2 2 +2 2 2 2 +select * from (explain plan actions = 1 select * from tab where (a + b) * c = 8 order by sin(a / b)) where explain ilike '%sort description%'; + Prefix sort description: sin(divide(a_0, b_1)) ASC + Result sort description: sin(divide(a_0, b_1)) ASC +select * from tab where d + 1 = 2 order by (d + 1) * 4, (a + b) * c; +1 1 1 1 +1 1 1 1 +select * from (explain plan actions = 1 select * from tab where d + 1 = 2 order by (d + 1) * 4, (a + b) * c) where explain ilike '%sort description%'; + Prefix sort description: multiply(plus(d_3, 1_UInt8), 4_UInt8) ASC, multiply(plus(a_0, b_1), c_2) ASC + Result sort description: multiply(plus(d_3, 1_UInt8), 4_UInt8) ASC, multiply(plus(a_0, b_1), c_2) ASC +select * from tab where d + 1 = 3 and (a + b) = 4 and c = 2 order by (d + 1) * 4, sin(a / b); +2 2 2 2 +2 2 2 2 +select * from (explain plan actions = 1 select * from tab where d + 1 = 3 and (a + b) = 4 and c = 2 order by (d + 1) * 4, sin(a / b)) where explain ilike '%sort description%'; + Prefix sort description: multiply(plus(d_3, 1_UInt8), 4_UInt8) ASC, sin(divide(a_0, b_1)) ASC + Result sort description: multiply(plus(d_3, 1_UInt8), 4_UInt8) ASC, sin(divide(a_0, b_1)) ASC +-- Wrong order with fixed point +select * from tab where (a + b) * c = 8 order by sin(b / a); +2 2 2 2 +2 2 2 2 +select * from (explain plan actions = 1 select * from tab where (a + b) * c = 8 order by sin(b / a)) where explain ilike '%sort description%'; + Sort description: sin(divide(b_1, a_0)) ASC +-- Monotonicity +select * from tab order by intDiv((a + b) * c, 2); +0 0 0 0 +0 0 0 0 +1 1 1 1 +1 1 1 1 +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +select * from (explain plan actions = 1 select * from tab order by intDiv((a + b) * c, 2)) where explain like '%sort description%'; + Prefix sort description: intDiv(multiply(plus(a_0, b_1), c_2), 2_UInt8) ASC + Result sort description: intDiv(multiply(plus(a_0, b_1), c_2), 2_UInt8) ASC +select * from tab order by intDiv((a + b) * c, 2), sin(a / b); +0 0 0 0 +0 0 0 0 +1 1 1 1 +1 1 1 1 +2 2 2 2 
+2 2 2 2 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +select * from (explain plan actions = 1 select * from tab order by intDiv((a + b) * c, 2), sin(a / b)) where explain like '%sort description%'; + Prefix sort description: intDiv(multiply(plus(a_0, b_1), c_2), 2_UInt8) ASC + Result sort description: intDiv(multiply(plus(a_0, b_1), c_2), 2_UInt8) ASC, sin(divide(a_0, b_1)) ASC +-- select * from tab order by (a + b) * c, intDiv(sin(a / b), 2); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, intDiv(sin(a / b), 2)) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC, intDiv(sin(divide(a_0, b_1)), 2_UInt8) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, intDiv(sin(divide(a_0, b_1)), 2_UInt8) ASC +-- select * from tab order by (a + b) * c desc , intDiv(sin(a / b), 2); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc , intDiv(sin(a / b), 2)) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) DESC + Result sort description: multiply(plus(a_0, b_1), c_2) DESC, intDiv(sin(divide(a_0, b_1)), 2_UInt8) ASC +-- select * from tab order by (a + b) * c, intDiv(sin(a / b), 2) desc; +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, intDiv(sin(a / b), 2) desc) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, intDiv(sin(divide(a_0, b_1)), 2_UInt8) DESC +-- select * from tab order by (a + b) * c desc, intDiv(sin(a / b), 2) desc; +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, intDiv(sin(a / b), 2) desc) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) DESC, intDiv(sin(divide(a_0, b_1)), 2_UInt8) DESC + Result sort description: multiply(plus(a_0, b_1), c_2) DESC, intDiv(sin(divide(a_0, b_1)), 2_UInt8) DESC +-- select * from tab order by (a + b) * c desc, intDiv(sin(a / b), -2); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, intDiv(sin(a / b), -2)) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) DESC, intDiv(sin(divide(a_0, b_1)), -2_Int8) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) DESC, intDiv(sin(divide(a_0, b_1)), -2_Int8) ASC +-- select * from tab order by (a + b) * c desc, intDiv(intDiv(sin(a / b), -2), -3); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, intDiv(intDiv(sin(a / b), -2), -3)) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) DESC + Result sort description: multiply(plus(a_0, b_1), c_2) DESC, intDiv(intDiv(sin(divide(a_0, b_1)), -2_Int8), -3_Int8) ASC +-- select * from tab order by (a + b) * c, intDiv(intDiv(sin(a / b), -2), -3); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, intDiv(intDiv(sin(a / b), -2), -3)) where explain like '%sort description%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC, intDiv(intDiv(sin(divide(a_0, b_1)), -2_Int8), -3_Int8) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, intDiv(intDiv(sin(divide(a_0, b_1)), -2_Int8), -3_Int8) ASC +-- Aliases +select * from (select *, a + b as x from tab) order by x * c; +0 0 0 0 0 +0 0 0 0 0 +1 1 1 1 2 +1 1 1 1 2 +2 2 2 2 4 +2 2 2 2 4 +3 3 3 3 6 +3 3 3 3 6 +4 4 4 4 8 +4 4 4 
4 8 +select * from (explain plan actions = 1 select * from (select *, a + b as x from tab) order by x * c) where explain like '%sort description%'; + Prefix sort description: multiply(x_4, c_2) ASC + Result sort description: multiply(x_4, c_2) ASC +select * from (select *, a + b as x, a / b as y from tab) order by x * c, sin(y); +0 0 0 0 0 nan +0 0 0 0 0 nan +1 1 1 1 2 1 +1 1 1 1 2 1 +2 2 2 2 4 1 +2 2 2 2 4 1 +3 3 3 3 6 1 +3 3 3 3 6 1 +4 4 4 4 8 1 +4 4 4 4 8 1 +select * from (explain plan actions = 1 select * from (select *, a + b as x, a / b as y from tab) order by x * c, sin(y)) where explain like '%sort description%'; + Prefix sort description: multiply(x_4, c_2) ASC, sin(y_5) ASC + Result sort description: multiply(x_4, c_2) ASC, sin(y_5) ASC +select * from (select *, a / b as y from (select *, a + b as x from tab)) order by x * c, sin(y); +0 0 0 0 0 nan +0 0 0 0 0 nan +1 1 1 1 2 1 +1 1 1 1 2 1 +2 2 2 2 4 1 +2 2 2 2 4 1 +3 3 3 3 6 1 +3 3 3 3 6 1 +4 4 4 4 8 1 +4 4 4 4 8 1 +select * from (explain plan actions = 1 select * from (select *, a / b as y from (select *, a + b as x from tab)) order by x * c, sin(y)) where explain like '%sort description%'; + Prefix sort description: multiply(x_4, c_2) ASC, sin(y_5) ASC + Result sort description: multiply(x_4, c_2) ASC, sin(y_5) ASC +-- { echoOn } + +select * from tab2 order by toTimeZone(toTimezone(x, 'UTC'), 'CET'), intDiv(intDiv(y, -2), -3); +2020-02-02 00:00:00 0 0 +2020-02-02 00:00:00 0 0 +2020-02-03 00:00:00 1 1 +2020-02-03 00:00:00 1 1 +2020-02-04 00:00:00 2 2 +2020-02-04 00:00:00 2 2 +2020-02-05 00:00:00 3 3 +2020-02-05 00:00:00 3 3 +select * from (explain plan actions = 1 select * from tab2 order by toTimeZone(toTimezone(x, 'UTC'), 'CET'), intDiv(intDiv(y, -2), -3)) where explain like '%sort description%'; + Prefix sort description: toTimezone(toTimezone(x_0, \'UTC\'_String), \'CET\'_String) ASC, intDiv(intDiv(y_1, -2_Int8), -3_Int8) ASC + Result sort description: toTimezone(toTimezone(x_0, \'UTC\'_String), \'CET\'_String) ASC, intDiv(intDiv(y_1, -2_Int8), -3_Int8) ASC +select * from tab2 order by toStartOfDay(x), intDiv(intDiv(y, -2), -3); +2020-02-02 00:00:00 0 0 +2020-02-02 00:00:00 0 0 +2020-02-03 00:00:00 1 1 +2020-02-03 00:00:00 1 1 +2020-02-04 00:00:00 2 2 +2020-02-04 00:00:00 2 2 +2020-02-05 00:00:00 3 3 +2020-02-05 00:00:00 3 3 +select * from (explain plan actions = 1 select * from tab2 order by toStartOfDay(x), intDiv(intDiv(y, -2), -3)) where explain like '%sort description%'; + Prefix sort description: toStartOfDay(x_0) ASC + Result sort description: toStartOfDay(x_0) ASC, intDiv(intDiv(y_1, -2_Int8), -3_Int8) ASC +-- select * from tab2 where toTimezone(x, 'CET') = '2020-02-03 01:00:00' order by intDiv(intDiv(y, -2), -3); +select * from (explain plan actions = 1 select * from tab2 where toTimezone(x, 'CET') = '2020-02-03 01:00:00' order by intDiv(intDiv(y, -2), -3)) where explain like '%sort description%'; + Prefix sort description: intDiv(intDiv(y_1, -2_Int8), -3_Int8) ASC + Result sort description: intDiv(intDiv(y_1, -2_Int8), -3_Int8) ASC +-- { echoOn } + +-- Union (not fully supported) +select * from (select * from tab union all select * from tab3) order by (a + b) * c, sin(a / b); +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +1 1 1 1 +1 1 1 1 +1 1 1 1 +1 1 1 1 +2 2 2 2 +2 2 2 2 +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +4 4 4 4 +4 4 4 4 +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab3) order by (a + b) * c, sin(a / b)) where explain like '%sort 
description%' or explain like '%ReadType%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + ReadType: InOrder + ReadType: InOrder +select * from (select * from tab where (a + b) * c = 8 union all select * from tab3 where (a + b) * c = 18) order by sin(a / b); +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +select * from (explain plan actions = 1 select * from (select * from tab where (a + b) * c = 8 union all select * from tab3 where (a + b) * c = 18) order by sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; + Prefix sort description: sin(divide(a_0, b_1)) ASC + Result sort description: sin(divide(a_0, b_1)) ASC + ReadType: InOrder + ReadType: InOrder +select * from (select * from tab where (a + b) * c = 8 union all select * from tab4) order by sin(a / b); +2 2 2 2 +2 2 2 2 +1 1 1 1 +2 2 2 2 +3 3 3 3 +4 4 4 4 +1 1 1 1 +2 2 2 2 +3 3 3 3 +4 4 4 4 +0 0 0 0 +0 0 0 0 +select * from (explain plan actions = 1 select * from (select * from tab where (a + b) * c = 8 union all select * from tab4) order by sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; + Prefix sort description: sin(divide(a_0, b_1)) ASC + Result sort description: sin(divide(a_0, b_1)) ASC + ReadType: InOrder + ReadType: InOrder +select * from (select * from tab union all select * from tab5) order by (a + b) * c; +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +1 1 1 1 +1 1 1 1 +1 1 1 1 +1 1 1 1 +2 2 2 2 +2 2 2 2 +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +4 4 4 4 +4 4 4 4 +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5) order by (a + b) * c) where explain like '%sort description%' or explain like '%ReadType%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC + ReadType: InOrder + ReadType: InOrder +select * from (select * from tab union all select * from tab5) order by (a + b) * c, sin(a / b); +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +1 1 1 1 +1 1 1 1 +1 1 1 1 +1 1 1 1 +2 2 2 2 +2 2 2 2 +2 2 2 2 +2 2 2 2 +3 3 3 3 +3 3 3 3 +3 3 3 3 +3 3 3 3 +4 4 4 4 +4 4 4 4 +4 4 4 4 +4 4 4 4 +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5) order by (a + b) * c, sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + ReadType: InOrder + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + ReadType: InOrder +-- Union with limit +select * from (select * from tab union all select * from tab5) order by (a + b) * c, sin(a / b) limit 3; +0 0 0 0 +0 0 0 0 +0 0 0 0 +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5) order by (a + b) * c, sin(a / b) limit 3) where explain ilike '%sort description%' or explain like '%ReadType%' or explain like '%Limit%'; + Limit (preliminary LIMIT (without OFFSET)) + Limit 3 + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + Limit 3 + ReadType: InOrder + Prefix sort description: 
multiply(plus(a_0, b_1), c_2) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + ReadType: InOrder +-- In this example, we read-in-order from tab up to ((a + b) * c, sin(a / b)) and from tab5 up to ((a + b) * c). +-- In case of tab5, there would be two finish sorting transforms: ((a + b) * c) -> ((a + b) * c, sin(a / b)) -> ((a + b) * c, sin(a / b), d). +-- It's important that ((a + b) * c) -> ((a + b) * c does not have LIMIT. We can add LIMIT WITH TIES later, when sorting alog support it. +-- In case of tab4, we do full sorting by ((a + b) * c, sin(a / b), d) with LIMIT. We can replace it to sorting by ((a + b) * c, sin(a / b)) and LIMIT WITH TIES, when sorting alog support it. +select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b), d limit 3; +0 0 0 0 +0 0 0 0 +0 0 0 0 +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b), d limit 3) where explain ilike '%sort description%' or explain like '%ReadType%' or explain like '%Limit%'; + Limit (preliminary LIMIT (without OFFSET)) + Limit 3 + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC, d_3 ASC + Limit 3 + ReadType: InOrder + Prefix sort description: multiply(plus(a_0, b_1), c_2) ASC + Result sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC + ReadType: InOrder + Sort description: multiply(plus(a_0, b_1), c_2) ASC, sin(divide(a_0, b_1)) ASC, d_3 ASC + Limit 3 + ReadType: Default +drop table if exists tab; +drop table if exists tab2; +drop table if exists tab3; +drop table if exists tab4; +drop table if exists tab5; diff --git a/tests/queries/0_stateless/02911_analyzer_order_by_read_in_order_query_plan.sql b/tests/queries/0_stateless/02911_analyzer_order_by_read_in_order_query_plan.sql new file mode 100644 index 00000000000..77a72c24f5a --- /dev/null +++ b/tests/queries/0_stateless/02911_analyzer_order_by_read_in_order_query_plan.sql @@ -0,0 +1,156 @@ +SET optimize_read_in_order = 1, query_plan_read_in_order = 1, allow_experimental_analyzer = 1; + +drop table if exists tab; +drop table if exists tab2; +drop table if exists tab3; +drop table if exists tab4; +drop table if exists tab5; + +create table tab (a UInt32, b UInt32, c UInt32, d UInt32) engine = MergeTree order by ((a + b) * c, sin(a / b)); +insert into tab select number, number, number, number from numbers(5); +insert into tab select number, number, number, number from numbers(5); + +-- { echoOn } + +-- Exact match, single key +select * from tab order by (a + b) * c; +select * from (explain plan actions = 1 select * from tab order by (a + b) * c) where explain like '%sort description%'; + +select * from tab order by (a + b) * c desc; +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc) where explain like '%sort description%'; + +-- Exact match, full key +select * from tab order by (a + b) * c, sin(a / b); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, sin(a / b)) where explain like '%sort description%'; + +select * from tab order by (a + b) * c desc, sin(a / b) desc; +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, sin(a / b) desc) where explain like '%sort description%'; + +-- Exact match, mixed direction 
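+-- With mixed ASC/DESC directions only the leading expressions whose direction matches the
+-- sorting key are read in order: in the plans below (see the reference output) the prefix sort
+-- description keeps only (a + b) * c, while sin(a / b) appears only in the result sort
+-- description and is completed by an extra sorting step.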
+select * from tab order by (a + b) * c desc, sin(a / b); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, sin(a / b)) where explain like '%sort description%'; + +select * from tab order by (a + b) * c, sin(a / b) desc; +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, sin(a / b) desc) where explain like '%sort description%'; + +-- Wrong order, full sort +select * from tab order by sin(a / b), (a + b) * c; +select * from (explain plan actions = 1 select * from tab order by sin(a / b), (a + b) * c) where explain ilike '%sort description%'; + +-- Fixed point +select * from tab where (a + b) * c = 8 order by sin(a / b); +select * from (explain plan actions = 1 select * from tab where (a + b) * c = 8 order by sin(a / b)) where explain ilike '%sort description%'; + +select * from tab where d + 1 = 2 order by (d + 1) * 4, (a + b) * c; +select * from (explain plan actions = 1 select * from tab where d + 1 = 2 order by (d + 1) * 4, (a + b) * c) where explain ilike '%sort description%'; + +select * from tab where d + 1 = 3 and (a + b) = 4 and c = 2 order by (d + 1) * 4, sin(a / b); +select * from (explain plan actions = 1 select * from tab where d + 1 = 3 and (a + b) = 4 and c = 2 order by (d + 1) * 4, sin(a / b)) where explain ilike '%sort description%'; + +-- Wrong order with fixed point +select * from tab where (a + b) * c = 8 order by sin(b / a); +select * from (explain plan actions = 1 select * from tab where (a + b) * c = 8 order by sin(b / a)) where explain ilike '%sort description%'; + +-- Monotonicity +select * from tab order by intDiv((a + b) * c, 2); +select * from (explain plan actions = 1 select * from tab order by intDiv((a + b) * c, 2)) where explain like '%sort description%'; + +select * from tab order by intDiv((a + b) * c, 2), sin(a / b); +select * from (explain plan actions = 1 select * from tab order by intDiv((a + b) * c, 2), sin(a / b)) where explain like '%sort description%'; + +-- select * from tab order by (a + b) * c, intDiv(sin(a / b), 2); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, intDiv(sin(a / b), 2)) where explain like '%sort description%'; + +-- select * from tab order by (a + b) * c desc , intDiv(sin(a / b), 2); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc , intDiv(sin(a / b), 2)) where explain like '%sort description%'; + +-- select * from tab order by (a + b) * c, intDiv(sin(a / b), 2) desc; +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, intDiv(sin(a / b), 2) desc) where explain like '%sort description%'; + +-- select * from tab order by (a + b) * c desc, intDiv(sin(a / b), 2) desc; +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, intDiv(sin(a / b), 2) desc) where explain like '%sort description%'; + +-- select * from tab order by (a + b) * c desc, intDiv(sin(a / b), -2); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, intDiv(sin(a / b), -2)) where explain like '%sort description%'; + +-- select * from tab order by (a + b) * c desc, intDiv(intDiv(sin(a / b), -2), -3); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c desc, intDiv(intDiv(sin(a / b), -2), -3)) where explain like '%sort description%'; + +-- select * from tab order by (a + b) * c, intDiv(intDiv(sin(a / b), -2), -3); +select * from (explain plan actions = 1 select * from tab order by (a + b) * c, intDiv(intDiv(sin(a / b), 
-2), -3)) where explain like '%sort description%'; + +-- Aliases +select * from (select *, a + b as x from tab) order by x * c; +select * from (explain plan actions = 1 select * from (select *, a + b as x from tab) order by x * c) where explain like '%sort description%'; + +select * from (select *, a + b as x, a / b as y from tab) order by x * c, sin(y); +select * from (explain plan actions = 1 select * from (select *, a + b as x, a / b as y from tab) order by x * c, sin(y)) where explain like '%sort description%'; + +select * from (select *, a / b as y from (select *, a + b as x from tab)) order by x * c, sin(y); +select * from (explain plan actions = 1 select * from (select *, a / b as y from (select *, a + b as x from tab)) order by x * c, sin(y)) where explain like '%sort description%'; + +-- { echoOff } + +create table tab2 (x DateTime, y UInt32, z UInt32) engine = MergeTree order by (x, y); +insert into tab2 select toDate('2020-02-02') + number, number, number from numbers(4); +insert into tab2 select toDate('2020-02-02') + number, number, number from numbers(4); + +-- { echoOn } + +select * from tab2 order by toTimeZone(toTimezone(x, 'UTC'), 'CET'), intDiv(intDiv(y, -2), -3); +select * from (explain plan actions = 1 select * from tab2 order by toTimeZone(toTimezone(x, 'UTC'), 'CET'), intDiv(intDiv(y, -2), -3)) where explain like '%sort description%'; + +select * from tab2 order by toStartOfDay(x), intDiv(intDiv(y, -2), -3); +select * from (explain plan actions = 1 select * from tab2 order by toStartOfDay(x), intDiv(intDiv(y, -2), -3)) where explain like '%sort description%'; + +-- select * from tab2 where toTimezone(x, 'CET') = '2020-02-03 01:00:00' order by intDiv(intDiv(y, -2), -3); +select * from (explain plan actions = 1 select * from tab2 where toTimezone(x, 'CET') = '2020-02-03 01:00:00' order by intDiv(intDiv(y, -2), -3)) where explain like '%sort description%'; + +-- { echoOff } + +create table tab3 (a UInt32, b UInt32, c UInt32, d UInt32) engine = MergeTree order by ((a + b) * c, sin(a / b)); +insert into tab3 select number, number, number, number from numbers(5); +insert into tab3 select number, number, number, number from numbers(5); + +create table tab4 (a UInt32, b UInt32, c UInt32, d UInt32) engine = MergeTree order by sin(a / b); +insert into tab4 select number, number, number, number from numbers(5); +insert into tab4 select number, number, number, number from numbers(5); + +create table tab5 (a UInt32, b UInt32, c UInt32, d UInt32) engine = MergeTree order by (a + b) * c; +insert into tab5 select number, number, number, number from numbers(5); +insert into tab5 select number, number, number, number from numbers(5); + +-- { echoOn } + +-- Union (not fully supported) +select * from (select * from tab union all select * from tab3) order by (a + b) * c, sin(a / b); +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab3) order by (a + b) * c, sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; + +select * from (select * from tab where (a + b) * c = 8 union all select * from tab3 where (a + b) * c = 18) order by sin(a / b); +select * from (explain plan actions = 1 select * from (select * from tab where (a + b) * c = 8 union all select * from tab3 where (a + b) * c = 18) order by sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; + +select * from (select * from tab where (a + b) * c = 8 union all select * from tab4) order by sin(a / b); +select * from (explain plan 
actions = 1 select * from (select * from tab where (a + b) * c = 8 union all select * from tab4) order by sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; + +select * from (select * from tab union all select * from tab5) order by (a + b) * c; +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5) order by (a + b) * c) where explain like '%sort description%' or explain like '%ReadType%'; + +select * from (select * from tab union all select * from tab5) order by (a + b) * c, sin(a / b); +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5) order by (a + b) * c, sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; + +-- Union with limit +select * from (select * from tab union all select * from tab5) order by (a + b) * c, sin(a / b) limit 3; +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5) order by (a + b) * c, sin(a / b) limit 3) where explain ilike '%sort description%' or explain like '%ReadType%' or explain like '%Limit%'; + +-- In this example, we read-in-order from tab up to ((a + b) * c, sin(a / b)) and from tab5 up to ((a + b) * c). +-- In case of tab5, there would be two finish sorting transforms: ((a + b) * c) -> ((a + b) * c, sin(a / b)) -> ((a + b) * c, sin(a / b), d). +-- It's important that ((a + b) * c) -> ((a + b) * c does not have LIMIT. We can add LIMIT WITH TIES later, when sorting alog support it. +-- In case of tab4, we do full sorting by ((a + b) * c, sin(a / b), d) with LIMIT. We can replace it to sorting by ((a + b) * c, sin(a / b)) and LIMIT WITH TIES, when sorting alog support it. +select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b), d limit 3; +select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b), d limit 3) where explain ilike '%sort description%' or explain like '%ReadType%' or explain like '%Limit%'; + +drop table if exists tab; +drop table if exists tab2; +drop table if exists tab3; +drop table if exists tab4; +drop table if exists tab5; diff --git a/tests/queries/0_stateless/02911_cte_invalid_query_analysis.reference b/tests/queries/0_stateless/02911_cte_invalid_query_analysis.reference new file mode 100644 index 00000000000..d05b1f927f4 --- /dev/null +++ b/tests/queries/0_stateless/02911_cte_invalid_query_analysis.reference @@ -0,0 +1 @@ +0 0 diff --git a/tests/queries/0_stateless/02911_cte_invalid_query_analysis.sql b/tests/queries/0_stateless/02911_cte_invalid_query_analysis.sql new file mode 100644 index 00000000000..dcf21831e6b --- /dev/null +++ b/tests/queries/0_stateless/02911_cte_invalid_query_analysis.sql @@ -0,0 +1,34 @@ +drop table if exists t0; +drop table if exists t1; +drop table if exists t3; + +create table t0 (pkey UInt32, c1 UInt32, primary key(pkey)) engine = MergeTree; +create table t1 (vkey UInt32, primary key(vkey)) engine = MergeTree; +create table t3 (c17 String, primary key(c17)) engine = MergeTree; +insert into t1 values (3); + +WITH +cte_1 AS (select + subq_1.c_5_c1698_16 as c_2_c1702_3, + subq_1.c_5_c1694_12 as c_2_c1703_4 + from + (select + covarPop(-0, 74) as c_5_c1686_4, + sumWithOverflow(0) as c_5_c1694_12, + covarPop(-53.64, 92.63) as c_5_c1698_16 + from + t3 as ref_8 + group by ref_8.c17) as subq_1) +select + ref_15.c_2_c1703_4 as 
c_2_c1723_6, + ref_15.c_2_c1702_3 as c_2_c1724_7 + from + t0 as ref_14 + RIGHT outer join cte_1 as ref_15 + on (ref_14.c1 = ref_15.c_2_c1702_3) + RIGHT outer join t1 as ref_16 + on (ref_14.pkey = ref_16.vkey); + +drop table t0; +drop table t1; +drop table t3; diff --git a/tests/queries/0_stateless/02911_system_symbols.reference b/tests/queries/0_stateless/02911_system_symbols.reference new file mode 100644 index 00000000000..df30df3ce57 --- /dev/null +++ b/tests/queries/0_stateless/02911_system_symbols.reference @@ -0,0 +1 @@ +DB::StorageSystemSymbols::StorageSystemSymbols(DB::StorageID const&) diff --git a/tests/queries/0_stateless/02911_system_symbols.sql b/tests/queries/0_stateless/02911_system_symbols.sql new file mode 100644 index 00000000000..d4195c1937e --- /dev/null +++ b/tests/queries/0_stateless/02911_system_symbols.sql @@ -0,0 +1 @@ +SELECT x FROM (SELECT demangle(symbol) AS x FROM system.symbols WHERE symbol LIKE '%StorageSystemSymbols%') WHERE x LIKE '%DB::StorageSystemSymbols::StorageSystemSymbols%' ORDER BY x LIMIT 1 SETTINGS allow_introspection_functions = 1; diff --git a/tests/queries/0_stateless/02912_ingestion_mv_deduplication.reference b/tests/queries/0_stateless/02912_ingestion_mv_deduplication.reference new file mode 100644 index 00000000000..946897a4fe3 --- /dev/null +++ b/tests/queries/0_stateless/02912_ingestion_mv_deduplication.reference @@ -0,0 +1,31 @@ +-- Original issue with max_insert_delayed_streams_for_parallel_write = 1 +-- Landing +2022-09-01 12:23:34 42 +2023-09-01 12:23:34 42 +-- MV +2022-09-01 12:00:00 84 +2023-09-01 12:00:00 42 +-- Original issue with deduplicate_blocks_in_dependent_materialized_views = 0 AND max_insert_delayed_streams_for_parallel_write > 1 +-- Landing +2022-09-01 12:23:34 42 +2023-09-01 12:23:34 42 +-- MV +2022-09-01 12:00:00 42 +-- Original issue with deduplicate_blocks_in_dependent_materialized_views = 1 AND max_insert_delayed_streams_for_parallel_write > 1 +-- Landing +2022-09-01 12:23:34 42 +2023-09-01 12:23:34 42 +-- MV +2022-09-01 12:00:00 42 +2023-09-01 12:00:00 42 +-- Regression introduced in https://github.com/ClickHouse/ClickHouse/pull/54184 +-- Landing (Agg/Replacing)MergeTree +org-1 prod checkout user 1 +org-1 prod login account 1 +org-1 prod login user 1 +org-1 stage login user 1 +--- MV +org-1 prod checkout user 1 +org-1 prod login account 3 +org-1 prod login user 3 +org-1 stage login user 1 diff --git a/tests/queries/0_stateless/02912_ingestion_mv_deduplication.sql b/tests/queries/0_stateless/02912_ingestion_mv_deduplication.sql new file mode 100644 index 00000000000..68901b67c91 --- /dev/null +++ b/tests/queries/0_stateless/02912_ingestion_mv_deduplication.sql @@ -0,0 +1,205 @@ +-- Tags: replica +SET session_timezone = 'UTC'; + +SELECT '-- Original issue with max_insert_delayed_streams_for_parallel_write = 1'; +/* + + This is the expected behavior when mv deduplication is set to false. 
+ + - 1st insert works for landing and mv tables + - 2nd insert gets first block 20220901 deduplicated and second one inserted in landing table + - 2nd insert gets both blocks inserted in mv table + +*/ +SET deduplicate_blocks_in_dependent_materialized_views = 0, max_insert_delayed_streams_for_parallel_write = 1; + +CREATE TABLE landing +( + time DateTime, + number Int64 +) +Engine=ReplicatedReplacingMergeTree('/clickhouse/' || currentDatabase() || '/landing/{shard}/', '{replica}') +PARTITION BY toYYYYMMDD(time) +ORDER BY time; + +CREATE MATERIALIZED VIEW mv +ENGINE = ReplicatedSummingMergeTree('/clickhouse/' || currentDatabase() || '/mv/{shard}/', '{replica}') +PARTITION BY toYYYYMMDD(hour) ORDER BY hour +AS SELECT + toStartOfHour(time) AS hour, + sum(number) AS sum_amount +FROM landing +GROUP BY hour; + +INSERT INTO landing VALUES ('2022-09-01 12:23:34', 42); +INSERT INTO landing VALUES ('2022-09-01 12:23:34', 42),('2023-09-01 12:23:34', 42); + +SELECT '-- Landing'; +SELECT * FROM landing FINAL ORDER BY time; +SELECT '-- MV'; +SELECT * FROM mv FINAL ORDER BY hour; + +DROP TABLE IF EXISTS landing SYNC; +DROP TABLE IF EXISTS mv SYNC; + +SELECT '-- Original issue with deduplicate_blocks_in_dependent_materialized_views = 0 AND max_insert_delayed_streams_for_parallel_write > 1'; +/* + + This is the unexpected behavior due to setting max_insert_delayed_streams_for_parallel_write > 1. + + This unexpected behavior was present since version 21.9 or earlier but due to this PR https://github.com/ClickHouse/ClickHouse/pull/34780 + when max_insert_delayed_streams_for_parallel_write setting it to 1 by default the issue was mitigated. + + This is what happens: + + - 1st insert works for landing and mv tables + - 2nd insert gets first block 20220901 deduplicated and second one inserted in landing table + - 2nd insert is not inserting anything in mv table due to a bug computing blocks to be discarded + +*/ +SET deduplicate_blocks_in_dependent_materialized_views = 0, max_insert_delayed_streams_for_parallel_write = 10; + +CREATE TABLE landing +( + time DateTime, + number Int64 +) +Engine=ReplicatedReplacingMergeTree('/clickhouse/' || currentDatabase() || '/landing/{shard}/', '{replica}') +PARTITION BY toYYYYMMDD(time) +ORDER BY time; + +CREATE MATERIALIZED VIEW mv +ENGINE = ReplicatedSummingMergeTree('/clickhouse/' || currentDatabase() || '/mv/{shard}/', '{replica}') +PARTITION BY toYYYYMMDD(hour) ORDER BY hour +AS SELECT + toStartOfHour(time) AS hour, + sum(number) AS sum_amount +FROM landing +GROUP BY hour; + +INSERT INTO landing VALUES ('2022-09-01 12:23:34', 42); +INSERT INTO landing VALUES ('2022-09-01 12:23:34', 42),('2023-09-01 12:23:34', 42); + +SELECT '-- Landing'; +SELECT * FROM landing FINAL ORDER BY time; +SELECT '-- MV'; +SELECT * FROM mv FINAL ORDER BY hour; + +SET max_insert_delayed_streams_for_parallel_write = 1; +DROP TABLE IF EXISTS landing SYNC; +DROP TABLE IF EXISTS mv SYNC; + +SELECT '-- Original issue with deduplicate_blocks_in_dependent_materialized_views = 1 AND max_insert_delayed_streams_for_parallel_write > 1'; +/* + + By setting deduplicate_blocks_in_dependent_materialized_views = 1 we can make the code go through a different path getting an expected + behavior again, even with max_insert_delayed_streams_for_parallel_write > 1. 
+ + This is what happens now: + + - 1st insert works for landing and mv tables + - 2nd insert gets first block 20220901 deduplicated and second one inserted for landing and mv tables + +*/ +SET deduplicate_blocks_in_dependent_materialized_views = 1, max_insert_delayed_streams_for_parallel_write = 10; + +CREATE TABLE landing +( + time DateTime, + number Int64 +) +Engine=ReplicatedReplacingMergeTree('/clickhouse/' || currentDatabase() || '/landing/{shard}/', '{replica}') +PARTITION BY toYYYYMMDD(time) +ORDER BY time; + +CREATE MATERIALIZED VIEW mv +ENGINE = ReplicatedSummingMergeTree('/clickhouse/' || currentDatabase() || '/mv/{shard}/', '{replica}') +PARTITION BY toYYYYMMDD(hour) ORDER BY hour +AS SELECT + toStartOfHour(time) AS hour, + sum(number) AS sum_amount +FROM landing +GROUP BY hour; + +INSERT INTO landing VALUES ('2022-09-01 12:23:34', 42); +INSERT INTO landing VALUES ('2022-09-01 12:23:34', 42),('2023-09-01 12:23:34', 42); + +SELECT '-- Landing'; +SELECT * FROM landing FINAL ORDER BY time; +SELECT '-- MV'; +SELECT * FROM mv FINAL ORDER BY hour; + +SET max_insert_delayed_streams_for_parallel_write = 1; +DROP TABLE IF EXISTS landing SYNC; +DROP TABLE IF EXISTS mv SYNC; + +SELECT '-- Regression introduced in https://github.com/ClickHouse/ClickHouse/pull/54184'; +/* + + This is a test to prevent regression introduced in https://github.com/ClickHouse/ClickHouse/pull/54184 from happening again. + + The PR was trying to fix the unexpected behavior when deduplicate_blocks_in_dependent_materialized_views = 0 AND + max_insert_delayed_streams_for_parallel_write > 1 but it ended up adding a new regression. + +*/ + +CREATE TABLE landing +( + `time` DateTime, + `pk1` LowCardinality(String), + `pk2` LowCardinality(String), + `pk3` LowCardinality(String), + `pk4` String +) +ENGINE = ReplicatedReplacingMergeTree('/clickhouse/' || currentDatabase() || '/landing/{shard}/', '{replica}') +ORDER BY (pk1, pk2, pk3, pk4); + +CREATE TABLE ds +( + `pk1` LowCardinality(String), + `pk2` LowCardinality(String), + `pk3` LowCardinality(String), + `pk4` LowCardinality(String), + `occurences` AggregateFunction(count) +) +ENGINE = ReplicatedAggregatingMergeTree('/clickhouse/' || currentDatabase() || '/ds/{shard}/', '{replica}') +ORDER BY (pk1, pk2, pk3, pk4); + +CREATE MATERIALIZED VIEW mv TO ds AS +SELECT + pk1, + pk2, + pk4, + pk3, + countState() AS occurences +FROM landing +GROUP BY pk1, pk2, pk4, pk3; + +INSERT INTO landing (time, pk1, pk2, pk4, pk3) +VALUES ('2023-01-01 00:00:00','org-1','prod','login','user'),('2023-01-01 00:00:00','org-1','prod','login','user'),('2023-01-01 00:00:00','org-1','prod','login','user'),('2023-02-01 00:00:00','org-1','stage','login','user'),('2023-02-01 00:00:00','org-1','prod','login','account'),('2023-02-01 00:00:00','org-1','prod','checkout','user'),('2023-03-01 00:00:00','org-1','prod','login','account'),('2023-03-01 00:00:00','org-1','prod','login','account'); + +SELECT '-- Landing (Agg/Replacing)MergeTree'; +SELECT + pk1, + pk2, + pk4, + pk3, + count() as occurences +FROM landing +GROUP BY pk1, pk2, pk4, pk3 +ORDER BY pk1, pk2, pk4, pk3; + +SELECT '--- MV'; +SELECT + pk1, + pk2, + pk4, + pk3, + countMerge(occurences) AS occurences +FROM ds +GROUP BY pk1, pk2, pk4, pk3 +ORDER BY pk1, pk2, pk4, pk3; + +DROP TABLE IF EXISTS landing SYNC; +DROP TABLE IF EXISTS ds SYNC; +DROP TABLE IF EXISTS mv SYNC; diff --git a/tests/queries/0_stateless/02914_t64_buffer_overflow.reference b/tests/queries/0_stateless/02914_t64_buffer_overflow.reference new file mode 100644 index 
00000000000..2574a09f166 --- /dev/null +++ b/tests/queries/0_stateless/02914_t64_buffer_overflow.reference @@ -0,0 +1 @@ +Exc diff --git a/tests/queries/0_stateless/02914_t64_buffer_overflow.sh b/tests/queries/0_stateless/02914_t64_buffer_overflow.sh new file mode 100755 index 00000000000..557c715d238 --- /dev/null +++ b/tests/queries/0_stateless/02914_t64_buffer_overflow.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +echo -ne 'checksumchecksum\x93\x1A\x04\x00\x00\x08\x00\x00\x00\04\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' | + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&decompress=1&http_native_compression_disable_checksumming_on_decompress=1" --data-binary @- 2>&1 | grep -oF 'Exc' diff --git a/tests/queries/0_stateless/02915_lazy_loading_of_base_backups.reference b/tests/queries/0_stateless/02915_lazy_loading_of_base_backups.reference new file mode 100644 index 00000000000..0ef77460514 --- /dev/null +++ b/tests/queries/0_stateless/02915_lazy_loading_of_base_backups.reference @@ -0,0 +1,12 @@ +BACKUP_CREATED +BACKUP_CREATED +BACKUP_CREATED +RESTORED +RESTORED +RESTORED +a 0 1 +b 1 1 +c 1 1 +r1 3 0 +r2 2 0 +r3 1 0 diff --git a/tests/queries/0_stateless/02915_lazy_loading_of_base_backups.sh b/tests/queries/0_stateless/02915_lazy_loading_of_base_backups.sh new file mode 100755 index 00000000000..5f0f41a956b --- /dev/null +++ b/tests/queries/0_stateless/02915_lazy_loading_of_base_backups.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +a_backup_id=${CLICKHOUSE_TEST_UNIQUE_NAME}_a +a_backup="Disk('backups', '$a_backup_id')" + +b_backup_id=${CLICKHOUSE_TEST_UNIQUE_NAME}_b +b_backup="Disk('backups', '$b_backup_id')" + +c_backup_id=${CLICKHOUSE_TEST_UNIQUE_NAME}_c +c_backup="Disk('backups', '$c_backup_id')" + +${CLICKHOUSE_CLIENT} -nm --query " +DROP TABLE IF EXISTS tbl1; +DROP TABLE IF EXISTS tbl2; +DROP TABLE IF EXISTS tbl3; +" + +${CLICKHOUSE_CLIENT} -nm --query " +CREATE TABLE tbl1 (a Int32) ENGINE = MergeTree() ORDER BY tuple(); +" + +# The following BACKUP command must write backup 'a'. +${CLICKHOUSE_CLIENT} -nm --query " +BACKUP DATABASE ${CLICKHOUSE_DATABASE} TO $a_backup SETTINGS id='$a_backup_id'; +" | grep -o "BACKUP_CREATED" + +${CLICKHOUSE_CLIENT} -nm --query " +CREATE TABLE tbl2 (a Int32) ENGINE = MergeTree() ORDER BY tuple(); +" + +# The following BACKUP command must read backup 'a' and write backup 'b'. +${CLICKHOUSE_CLIENT} -nm --query " +BACKUP DATABASE ${CLICKHOUSE_DATABASE} TO $b_backup SETTINGS id='$b_backup_id', base_backup=$a_backup; +" | grep -o "BACKUP_CREATED" + +${CLICKHOUSE_CLIENT} -nm --query " +CREATE TABLE tbl3 (a Int32) ENGINE = MergeTree() ORDER BY tuple(); +" + +# The following BACKUP command must read only backup 'b' (and not 'a') and write backup 'c'. 
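The lazy loading under test relies on how incremental backups reference a base: a backup created with base_backup stores only what changed relative to that base, and a RESTORE opens a base backup only when it actually needs entries stored there. A rough sketch of the SQL shapes involved (database and backup names below are placeholders, not the ones generated by this script):

```sql
BACKUP DATABASE mydb TO Disk('backups', 'base');     -- full backup

BACKUP DATABASE mydb TO Disk('backups', 'incr')      -- incremental backup on top of 'base'
    SETTINGS base_backup = Disk('backups', 'base');

RESTORE DATABASE mydb FROM Disk('backups', 'incr');  -- reads 'incr', and 'base' only for entries kept there
```

The ProfileEvents columns BackupsOpenedForRead and BackupsOpenedForWrite queried from system.backups at the end of the script are what make that laziness observable.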
+${CLICKHOUSE_CLIENT} -nm --query " +BACKUP DATABASE ${CLICKHOUSE_DATABASE} TO $c_backup SETTINGS id='$c_backup_id', base_backup=$b_backup; +" | grep -o "BACKUP_CREATED" + +${CLICKHOUSE_CLIENT} -nm --query " +DROP TABLE tbl1; +DROP TABLE tbl2; +DROP TABLE tbl3; +" + +r1_restore_id=${CLICKHOUSE_TEST_UNIQUE_NAME}_r1 +r2_restore_id=${CLICKHOUSE_TEST_UNIQUE_NAME}_r2 +r3_restore_id=${CLICKHOUSE_TEST_UNIQUE_NAME}_r3 + +# The following RESTORE command must read all 3 backups 'a', 'b', 'c' because the table 'tbl1' was in the first backup. +${CLICKHOUSE_CLIENT} -nm --query " +RESTORE TABLE ${CLICKHOUSE_DATABASE}.tbl1 FROM $c_backup SETTINGS id='$r1_restore_id'; +" | grep -o "RESTORED" + +# The following RESTORE command must read only 2 backups 'b', 'c' (and not 'a') because the table 'tbl2' was in the second backup. +${CLICKHOUSE_CLIENT} -nm --query " +RESTORE TABLE ${CLICKHOUSE_DATABASE}.tbl2 FROM $c_backup SETTINGS id='$r2_restore_id'; +" | grep -o "RESTORED" + +# The following RESTORE command must read only 1 backup 'c' (and not 'a' or 'b') because the table 'tbl3' was in the third backup. +${CLICKHOUSE_CLIENT} -nm --query " +RESTORE TABLE ${CLICKHOUSE_DATABASE}.tbl3 FROM $c_backup SETTINGS id='$r3_restore_id'; +" | grep -o "RESTORED" + +all_ids="['$a_backup_id', '$b_backup_id', '$c_backup_id', '$r1_restore_id', '$r2_restore_id', '$r3_restore_id']" +id_prefix_len=`expr "${CLICKHOUSE_TEST_UNIQUE_NAME}_" : '.*'` + +${CLICKHOUSE_CLIENT} -nm --query " +SELECT substr(id, 1 + $id_prefix_len) as short_id, ProfileEvents['BackupsOpenedForRead'], ProfileEvents['BackupsOpenedForWrite'] FROM system.backups WHERE id IN ${all_ids} ORDER BY short_id +" + +${CLICKHOUSE_CLIENT} -nm --query " +DROP TABLE tbl1; +DROP TABLE tbl2; +DROP TABLE tbl3; +" diff --git a/tests/queries/0_stateless/data_csv/csv_with_cr_end_of_line.csv b/tests/queries/0_stateless/data_csv/csv_with_cr_end_of_line.csv new file mode 100644 index 00000000000..077ca2c84c5 --- /dev/null +++ b/tests/queries/0_stateless/data_csv/csv_with_cr_end_of_line.csv @@ -0,0 +1,2 @@ +A,110,208819249 +B,112,208819248 C,123,783434434 diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index a1012678faf..aedb267b3fb 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 543 +personal_ws-1.1 en 2633 AArch ACLs ALTERs @@ -356,6 +356,7 @@ IOUringPendingEvents IOWriterThreads IOWriterThreadsActive IPTrie +IProcessor IPv Identifiant Incrementing @@ -2285,6 +2286,8 @@ stochasticlinearregression stochasticlogisticregression storages storig +stringJaccardIndex +stringJaccardIndexUTF stringToH stripelog strtod diff --git a/utils/check-style/check-doc-aspell b/utils/check-style/check-doc-aspell index 952dbd5b507..b5a3958e6cf 100755 --- a/utils/check-style/check-doc-aspell +++ b/utils/check-style/check-doc-aspell @@ -6,9 +6,9 @@ shopt -s globstar # Perform spell checking on the docs if [[ ${1:-} == "--help" ]] || [[ ${1:-} == "-h" ]]; then - echo "Usage $0 [--help|-h] [-i]" + echo "Usage $0 [--help|-h] [-i [filename]]" echo " --help|-h: print this help" - echo " -i: interactive mode" + echo " -i: interactive mode. 
If filename is specified, check only this file, otherwise check all files" exit 0 fi @@ -18,14 +18,21 @@ CHECK_LANG=en ASPELL_IGNORE_PATH="${ROOT_PATH}/utils/check-style/aspell-ignore/${CHECK_LANG}" -STATUS=0 -for fname in ${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md; do - if [[ ${1:-} == "-i" ]]; then +if [[ ${1:-} == "-i" ]]; then + if [[ ! -z ${2:-} ]]; then + FILES_TO_CHECK=${ROOT_PATH}/docs/${CHECK_LANG}/${2} + else + FILES_TO_CHECK=${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md + fi + for fname in ${FILES_TO_CHECK}; do echo "Checking $fname" aspell --personal=aspell-dict.txt --add-sgml-skip=code --encoding=utf-8 --mode=markdown -W 3 --lang=${CHECK_LANG} --home-dir=${ASPELL_IGNORE_PATH} -c "$fname" - continue - fi + done + exit +fi +STATUS=0 +for fname in ${ROOT_PATH}/docs/${CHECK_LANG}/**/*.md; do errors=$(cat "$fname" \ | aspell list \ -W 3 \ diff --git a/utils/check-style/check-style b/utils/check-style/check-style index b728602ef40..f87d2e292b5 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -394,6 +394,11 @@ find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | grep -vP $EXCLUDE_DIRS | xargs grep -P '__builtin_unreachable' && echo "Use UNREACHABLE() from defines.h instead" +# Forbid mt19937() and random_device() which are outdated and slow +find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -P '(std::mt19937|std::mersenne_twister_engine|std::random_device)' && echo "Use pcg64_fast (from pcg_random.h) and randomSeed (from Common/randomSeed.h) instead" + # Require checking return value of close(), # since it can hide fd misuse and break other places. find $ROOT_PATH/{src,programs,utils} -name '*.h' -or -name '*.cpp' | diff --git a/utils/clickhouse-diagnostics/README.md b/utils/clickhouse-diagnostics/README.md index aed5e19ee45..9a86ad535fd 100644 --- a/utils/clickhouse-diagnostics/README.md +++ b/utils/clickhouse-diagnostics/README.md @@ -2574,16 +2574,43 @@ Settings: {} SELECT '\n' || arrayStringConcat( arrayMap( - x, - y -> concat(x, ': ', y), + x, y, z -> concat(x, ': ', y, ' @ ', z), arrayMap(x -> addressToLine(x), trace), - arrayMap(x -> demangle(addressToSymbol(x)), trace)), + arrayMap(x -> demangle(addressToSymbol(x)), trace), + arrayMap(x -> '0x' || hex(x), trace)), '\n') AS trace FROM system.stack_trace ``` **result** ``` -ClickhouseError("Code: 446. DB::Exception: default: Introspection functions are disabled, because setting 'allow_introspection_functions' is set to 0: While processing concat('\\n', arrayStringConcat(arrayMap((x, y) -> concat(x, ': ', y), arrayMap(x -> addressToLine(x), trace), arrayMap(x -> demangle(addressToSymbol(x)), trace)), '\\n')) AS trace. 
(FUNCTION_NOT_ALLOWED) (version 21.11.8.4 (official build))",) +Row 1: +────── +trace: +: @ 0x7F6694A91117 +: @ 0x7F6694A93A41 +./build/./contrib/llvm-project/libcxx/src/condition_variable.cpp:47: std::__1::condition_variable::wait(std::__1::unique_lock&) @ 0x16F4A56F +./build/./contrib/llvm-project/libcxx/include/atomic:958: BaseDaemon::waitForTerminationRequest() @ 0x0B85564B +./build/./contrib/llvm-project/libcxx/include/vector:434: DB::Server::main(std::__1::vector, std::__1::allocator>, std::__1::allocator, std::__1::allocator>>> const&) @ 0x0B6644CE +./build/./base/poco/Util/src/Application.cpp:0: Poco::Util::Application::run() @ 0x1489B8A6 +./build/./programs/server/Server.cpp:402: DB::Server::run() @ 0x0B651E91 +./build/./base/poco/Util/src/ServerApplication.cpp:132: Poco::Util::ServerApplication::run(int, char**) @ 0x148AF4F1 +./build/./programs/server/Server.cpp:0: mainEntryClickHouseServer(int, char**) @ 0x0B64FA96 +./build/./programs/main.cpp:0: main @ 0x06AB8C92 +: @ 0x7F6694A29D90 +: @ 0x7F6694A29E40 +./build/./programs/clickhouse: _start @ 0x06AB802E + +Row 2: +────── +trace: +: @ 0x7F6694B14A0C +./build/./src/IO/ReadBufferFromFileDescriptor.cpp:0: DB::ReadBufferFromFileDescriptor::readImpl(char*, unsigned long, unsigned long, unsigned long) @ 0x0B622EAB +./build/./src/IO/ReadBufferFromFileDescriptor.cpp:126: DB::ReadBufferFromFileDescriptor::nextImpl() @ 0x0B6231A0 +./build/./src/IO/ReadBuffer.h:70: SignalListener::run() @ 0x0B85631D +./build/./base/poco/Foundation/include/Poco/SharedPtr.h:139: Poco::ThreadImpl::runnableEntry(void*) @ 0x149CA102 +: @ 0x7F6694A94AC3 +: @ 0x7F6694B26A40 + ``` #### uname **command** diff --git a/utils/clickhouse-diagnostics/clickhouse-diagnostics b/utils/clickhouse-diagnostics/clickhouse-diagnostics index 5cacbf1d4d4..8e23cc8d0e1 100755 --- a/utils/clickhouse-diagnostics/clickhouse-diagnostics +++ b/utils/clickhouse-diagnostics/clickhouse-diagnostics @@ -453,10 +453,10 @@ LIMIT 10 SELECT_STACK_TRACES = r"""SELECT '\n' || arrayStringConcat( arrayMap( - x, - y -> concat(x, ': ', y), + x, y, z -> concat(x, ': ', y, ' @ ', z), arrayMap(x -> addressToLine(x), trace), - arrayMap(x -> demangle(addressToSymbol(x)), trace)), + arrayMap(x -> demangle(addressToSymbol(x)), trace), + arrayMap(x -> '0x' || hex(x), trace)), '\n') AS trace FROM system.stack_trace """ diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index bc994502017..13855c6d94e 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -127,6 +127,9 @@ void Runner::parseHostsFromConfig(const Poco::Util::AbstractConfiguration & conf if (config.has(key + ".connection_timeout_ms")) connection_info.connection_timeout_ms = config.getInt(key + ".connection_timeout_ms"); + + if (config.has(key + ".use_compression")) + connection_info.use_compression = config.getBool(key + ".use_compression"); }; fill_connection_details("connections", default_connection_info); @@ -430,8 +433,9 @@ std::shared_ptr Runner::getConnection(const ConnectionI nodes.push_back(node); zkutil::ZooKeeperArgs args; args.session_timeout_ms = connection_info.session_timeout_ms; - args.connection_timeout_ms = connection_info.operation_timeout_ms; - args.operation_timeout_ms = connection_info.connection_timeout_ms; + args.connection_timeout_ms = connection_info.connection_timeout_ms; + args.operation_timeout_ms = connection_info.operation_timeout_ms; + args.use_compression = connection_info.use_compression; return std::make_shared(nodes, args, nullptr); } diff --git 
a/utils/keeper-bench/Runner.h b/utils/keeper-bench/Runner.h index 14d9b13938b..4f4a75e6ecf 100644 --- a/utils/keeper-bench/Runner.h +++ b/utils/keeper-bench/Runner.h @@ -79,6 +79,7 @@ private: int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS; int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS; int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS; + bool use_compression = false; size_t sessions = 1; }; diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 6c7028abbf4..0f2684cd91d 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,7 +1,11 @@ +v23.10.3.5-stable 2023-11-10 +v23.10.2.13-stable 2023-11-08 v23.10.1.1976-stable 2023-11-02 +v23.9.4.11-stable 2023-11-08 v23.9.3.12-stable 2023-10-31 v23.9.2.56-stable 2023-10-19 v23.9.1.1854-stable 2023-09-29 +v23.8.6.16-lts 2023-11-08 v23.8.5.16-lts 2023-10-31 v23.8.4.69-lts 2023-10-19 v23.8.3.48-lts 2023-09-27 @@ -27,6 +31,7 @@ v23.4.4.16-stable 2023-06-17 v23.4.3.48-stable 2023-06-12 v23.4.2.11-stable 2023-05-02 v23.4.1.1943-stable 2023-04-27 +v23.3.16.7-lts 2023-11-08 v23.3.15.29-lts 2023-10-31 v23.3.14.78-lts 2023-10-18 v23.3.13.6-lts 2023-09-05 diff --git a/utils/prepare-time-trace/prepare-time-trace.sh b/utils/prepare-time-trace/prepare-time-trace.sh index 7cacdec8c94..7e585db2000 100755 --- a/utils/prepare-time-trace/prepare-time-trace.sh +++ b/utils/prepare-time-trace/prepare-time-trace.sh @@ -11,12 +11,19 @@ < "${OUTPUT_DIR}/binary_sizes.txt" diff --git a/utils/self-extracting-executable/decompressor.cpp b/utils/self-extracting-executable/decompressor.cpp index c20c1f0cbb2..1f19a349d65 100644 --- a/utils/self-extracting-executable/decompressor.cpp +++ b/utils/self-extracting-executable/decompressor.cpp @@ -45,6 +45,7 @@ int doDecompress(char * input, char * output, off_t & in_offset, off_t & out_off std::cerr << "Error (ZSTD):" << decompressed_size << " " << ZSTD_getErrorName(decompressed_size) << std::endl; return 1; } + std::cerr << "." << std::flush; return 0; } @@ -173,7 +174,7 @@ bool isSudo() return geteuid() == 0; } -/// Read data about files and decomrpess them. +/// Read data about files and decompress them. int decompressFiles(int input_fd, char * path, char * name, bool & have_compressed_analoge, bool & has_exec, char * decompressed_suffix, uint64_t * decompressed_umask) { /// Read data about output file. @@ -332,6 +333,8 @@ int decompressFiles(int input_fd, char * path, char * name, bool & have_compress if (0 != munmap(input, info_in.st_size)) perror("munmap"); + + std::cerr << std::endl; return 0; } @@ -440,6 +443,8 @@ int main(int/* argc*/, char* argv[]) return 1; } + std::cerr << "Decompressing the binary" << std::flush; + std::stringstream lock_path; // STYLE_CHECK_ALLOW_STD_STRING_STREAM lock_path << "/tmp/" << name << ".decompression." << inode << ".lock"; int lock = open(lock_path.str().c_str(), O_CREAT | O_RDWR, 0666);
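On the clickhouse-diagnostics change above: the reworked stack-trace query now concatenates three pieces of information per frame (source line, demangled symbol, and raw hex address), which keeps partially symbolized traces usable. It can also be run by hand; a sketch, taken from the query in the diff (it assumes a server where introspection functions may be enabled, and line/symbol resolution depends on the binary shipping debug info):

```sql
SET allow_introspection_functions = 1;

SELECT '\n' || arrayStringConcat(
    arrayMap(
        x, y, z -> concat(x, ': ', y, ' @ ', z),
        arrayMap(x -> addressToLine(x), trace),
        arrayMap(x -> demangle(addressToSymbol(x)), trace),
        arrayMap(x -> '0x' || hex(x), trace)),
    '\n') AS trace
FROM system.stack_trace
LIMIT 2;
```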